#!/usr/bin/env python3
"""Generate sitemap XML files plus a sitemap index from a list of file paths.

Reads one relative path per line from ``INPUT``, turns each into an absolute
URL under ``BASE_URL``, and writes ``<SITEMAP_BASENAME>-<n>.xml`` files of at
most ``MAX_URLS_PER_SITEMAP`` URLs each into the current directory, followed
by ``<SITEMAP_BASENAME>-index.xml`` referencing all of them.
"""
import math
from pathlib import Path
from urllib.parse import quote
from xml.sax.saxutils import escape

# ====== CONFIGURE THIS ======
BASE_URL = "http://pineapple.whatbox.ca:20123"  # no trailing slash
INPUT = "files.txt"
SITEMAP_BASENAME = "sitemap"  # produces sitemap-1.xml, sitemap-2.xml, ...
MAX_URLS_PER_SITEMAP = 50000  # protocol limit is 50,000
#
# I use this command to omit dirs/files from files.txt
#   find -L . \( -path './pub/files/Head-Neck-CT' -o -path './pub/files/eurosec2020-pandacap-dataset' \) -prune -o -type f -print | sort -u > files.txt
#
# =============================

# Namespace required on the root element of both sitemap and index files.
SITEMAP_XMLNS = "http://www.sitemaps.org/schemas/sitemap/0.9"
XML_DECLARATION = '<?xml version="1.0" encoding="UTF-8"?>\n'


def normalize_path(line):
    """Return *line* stripped of surrounding whitespace and a leading ``./``.

    The result is an empty string for blank lines (and for a bare ``./``).
    """
    path = line.strip()
    if path.startswith("./"):
        path = path[2:]
    return path


def read_paths(input_path=INPUT):
    """Read the path list from *input_path* and return the non-empty entries.

    To restrict the sitemap to certain file types, filter here, e.g. keep only
    paths for which ``path.endswith((".html", ".htm", ".pdf", ".epub"))``.
    """
    with open(input_path, "r", encoding="utf-8") as f:
        normalized = (normalize_path(line) for line in f)
        return [path for path in normalized if path]


def chunks(lst, size):
    """Yield successive slices of *lst* holding at most *size* items each.

    Raises:
        ValueError: if *size* is not a positive integer.
    """
    if size <= 0:
        raise ValueError("size must be a positive integer")
    for i in range(0, len(lst), size):
        yield lst[i:i + size]


def build_url(rel_path, base_url=BASE_URL):
    """Return the absolute URL for *rel_path* under *base_url*.

    Exactly one slash separates the two parts. The path is percent-encoded
    (keeping ``/`` as the separator) because the input holds raw file names,
    which may contain spaces, ``#``, ``?``, ``%`` or non-ASCII characters that
    are not valid in a URL as-is.
    """
    return base_url.rstrip("/") + "/" + quote(rel_path.lstrip("/"), safe="/")


def render_urlset(urls):
    """Return the text of a ``<urlset>`` sitemap document listing *urls*."""
    parts = [XML_DECLARATION, f'<urlset xmlns="{SITEMAP_XMLNS}">\n']
    for url in urls:
        parts.append("  <url>\n")
        # escape() handles &, < and > so the URL is safe as XML text.
        parts.append(f"    <loc>{escape(url)}</loc>\n")
        # <lastmod> is optional; add it here if you want it.
        parts.append("  </url>\n")
    parts.append("</urlset>\n")
    return "".join(parts)


def render_sitemap_index(sitemap_urls):
    """Return the text of a ``<sitemapindex>`` document for *sitemap_urls*."""
    parts = [XML_DECLARATION, f'<sitemapindex xmlns="{SITEMAP_XMLNS}">\n']
    for loc in sitemap_urls:
        parts.append("  <sitemap>\n")
        parts.append(f"    <loc>{escape(loc)}</loc>\n")
        # <lastmod> is optional; add it here if you want it.
        parts.append("  </sitemap>\n")
    parts.append("</sitemapindex>\n")
    return "".join(parts)


def main():
    """Write the sitemap files and the sitemap index to the current directory."""
    paths = read_paths(INPUT)
    print(f"Found {len(paths)} paths")

    out_files = []
    sitemaps_dir = Path(".")

    # One sitemap file per chunk of at most MAX_URLS_PER_SITEMAP URLs.
    for i, chunk in enumerate(chunks(paths, MAX_URLS_PER_SITEMAP), start=1):
        fname = f"{SITEMAP_BASENAME}-{i}.xml"
        urls = [build_url(rel_path) for rel_path in chunk]
        with (sitemaps_dir / fname).open("w", encoding="utf-8") as f:
            f.write(render_urlset(urls))
        out_files.append(fname)
        print(f"Wrote {fname} with {len(chunk)} URLs")

    # Sitemap index pointing at every file written above.
    index_name = f"{SITEMAP_BASENAME}-index.xml"
    sitemap_urls = [build_url(fname) for fname in out_files]
    with (sitemaps_dir / index_name).open("w", encoding="utf-8") as f:
        f.write(render_sitemap_index(sitemap_urls))
    print(f"Wrote {index_name} referencing {len(out_files)} sitemap files")


if __name__ == "__main__":
    main()