#!/usr/bin/env python3
import math
from pathlib import Path
from urllib.parse import quote
from xml.sax.saxutils import escape
# ====== CONFIGURE THIS ======
# Prefix joined with every listed path to form its absolute URL.
BASE_URL = "http://pineapple.whatbox.ca:20123" # no trailing slash
# Text file read line by line; each non-blank line is one relative path
# (a leading "./" is stripped).
INPUT = "files.txt"
# Output files are named <basename>-1.xml, <basename>-2.xml, ... and the
# index is written as <basename>-index.xml.
SITEMAP_BASENAME = "sitemap" # sitemap-1.xml, etc.
# Maximum number of URLs written to a single sitemap file.
MAX_URLS_PER_SITEMAP = 50000 # protocol limit is 50,000
#
# I use this command to build files.txt while omitting some dirs/files:
# find -L . \( -path './pub/files/Head-Neck-CT' -o -path './pub/files/eurosec2020-pandacap-dataset' \) -prune -o -type f -print | sort -u > files.txt
#
# =============================
# Collect every non-blank line of INPUT as a relative path.
paths = []
with open(INPUT, "r", encoding="utf-8") as listing:
    for raw_line in listing:
        entry = raw_line.strip()
        if entry:
            # Drop a leading "./" so the path joins cleanly onto the base URL.
            if entry[:2] == "./":
                entry = entry[2:]
            # Only include real web URLs; optional filter by extension:
            # if not entry.endswith((".html", ".htm", ".pdf", ".epub")):
            #     continue
            paths.append(entry)
total = len(paths)
print(f"Found {total} paths")
# Split a sequence into fixed-size pieces
def chunks(lst, size):
    """Yield consecutive slices of ``lst``, each at most ``size`` items long.

    The final slice is shorter when ``len(lst)`` is not a multiple of
    ``size``; an empty ``lst`` yields nothing.
    """
    starts = range(0, len(lst), size)
    for begin in starts:
        end = begin + size
        yield lst[begin:end]
# Write one <urlset> sitemap file per chunk of at most MAX_URLS_PER_SITEMAP
# paths, recording each file name so the sitemap index can reference it.
out_files = []
sitemaps_dir = Path(".")
# Normalise once so exactly one slash separates BASE_URL from each path.
url_prefix = BASE_URL.rstrip("/") + "/"
for i, chunk in enumerate(chunks(paths, MAX_URLS_PER_SITEMAP), start=1):
    fname = f"{SITEMAP_BASENAME}-{i}.xml"
    out_path = sitemaps_dir / fname
    out_files.append(fname)
    with out_path.open("w", encoding="utf-8") as f:
        f.write('<?xml version="1.0" encoding="UTF-8"?>\n')
        f.write('<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">\n')
        for rel_path in chunk:
            # Build the absolute URL. The sitemap protocol requires URLs to
            # be percent-encoded first (spaces, non-ASCII, ...) and then
            # XML entity-escaped; "/" is kept as the path separator.
            url = url_prefix + quote(rel_path.lstrip("/"), safe="/")
            f.write("  <url>\n")
            f.write(f"    <loc>{escape(url)}</loc>\n")
            # <lastmod> is optional; you can add it if you want
            f.write("  </url>\n")
        f.write("</urlset>\n")
    print(f"Wrote {fname} with {len(chunk)} URLs")
# Create the sitemap index: one <sitemap> entry per file named in out_files,
# each located directly under BASE_URL.
index_name = f"{SITEMAP_BASENAME}-index.xml"
index_path = sitemaps_dir / index_name
with index_path.open("w", encoding="utf-8") as f:
    f.write('<?xml version="1.0" encoding="UTF-8"?>\n')
    f.write('<sitemapindex xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">\n')
    for fname in out_files:
        loc = BASE_URL.rstrip("/") + "/" + fname
        f.write("  <sitemap>\n")
        f.write(f"    <loc>{escape(loc)}</loc>\n")
        # <lastmod> is optional; you can add one if you want
        f.write("  </sitemap>\n")
    f.write("</sitemapindex>\n")
print(f"Wrote {index_name} referencing {len(out_files)} sitemap files")