#!/usr/bin/env python3
"""WWII source harvester / normalizer for AP Second World War Operational Atlas.

Purpose:
- keep source metadata separate from event claims;
- normalize LOC/NARA/Wikidata/OHM records into source_point GeoJSON;
- write chunks by year, theater, and source for lazy loading.

This script is intentionally conservative. API records should be marked as
source metadata until reviewed and promoted into event/frontline geometry.
"""
from __future__ import annotations
import argparse, datetime, hashlib, json, re, urllib.parse, urllib.request
from collections import defaultdict
from pathlib import Path

# Directory one level above the folder containing this script; main() reads
# ww2_source_points.geojson from here and writes the chunk directories here.
ROOT = Path(__file__).resolve().parents[1]

def slug(value: object) -> str:
    """Return a lowercase, hyphen-separated key built from *value*.

    Every run of characters outside ``a-z0-9`` collapses to a single ``-``
    and leading/trailing hyphens are stripped. ``None`` and values that
    contain no usable characters yield ``"unknown"``.

    Args:
        value: Any object; it is converted with ``str()`` unless it is None.

    Returns:
        The slug, or ``"unknown"`` when nothing usable remains.
    """
    # str(None) would produce the literal slug "none"; treat a missing value
    # as empty so it falls through to the "unknown" fallback instead.
    text = "" if value is None else str(value)
    return re.sub(r"[^a-z0-9]+", "-", text.lower()).strip("-") or "unknown"

def year_of(date_value: str) -> str:
    """Extract the first matching four-digit year from *date_value*.

    Falsy input (``None``, ``""``, ``0``) is treated as an empty string.
    The pattern accepts 1930 through 1949; anything else, or no match at
    all, gives ``"undated"``.
    """
    text = str(date_value) if date_value else ""
    # NOTE(review): the "|1945" alternative is already covered by the first
    # branch (19[3-4][0-9]); the pattern is kept unchanged to preserve output.
    found = re.search(r"(19[3-4][0-9]|1945)", text)
    if found is None:
        return "undated"
    return found.group(1)

def source_feature(record_id, title, lon, lat, date, theater, source, url, category, **extra):
    """Build a GeoJSON Point feature describing one source record.

    The fixed properties come first, followed by four review fields that
    take their value from *extra* when supplied and a default otherwise.
    Any remaining keyword arguments are merged in last, so they may add new
    properties or override the ones set earlier.

    Returns:
        A ``{"type": "Feature", ...}`` dict whose geometry is a Point at
        ``[lon, lat]``.
    """
    # Review fields and the value used when the caller does not pass one.
    review_defaults = (
        ("confidence", "source_metadata"),
        ("review_status", "source_registered"),
        ("geometry_status", "coverage_centroid"),
        ("summary", "Source metadata point; not an event claim."),
    )
    properties = dict(
        id=record_id,
        name=title,
        type="source_point",
        theater=theater,
        date=date,
        source=source,
        source_url=url,
        source_category=category,
    )
    for field, fallback in review_defaults:
        properties[field] = extra.pop(field, fallback)
    # Whatever is left in extra is caller-defined and applied on top.
    properties.update(extra)
    geometry = {"type": "Point", "coordinates": [lon, lat]}
    return {"type": "Feature", "properties": properties, "geometry": geometry}

def loc_search(query, rows=100):
    """Query the loc.gov search endpoint and return the decoded JSON reply.

    Some deployments may require user-agent/proxy handling.

    Args:
        query: Search text, sent as the ``q`` parameter.
        rows: Value sent as the ``c`` parameter.

    Returns:
        The parsed JSON body of the response.
    """
    params = urllib.parse.urlencode({"fo": "json", "q": query, "c": rows})
    endpoint = f"https://www.loc.gov/search/?{params}"
    request_headers = {"User-Agent": "AP-WWII-Atlas/0.6 source metadata harvester"}
    request = urllib.request.Request(endpoint, headers=request_headers)
    with urllib.request.urlopen(request, timeout=30) as reply:
        parsed = json.load(reply)
    return parsed

def nara_search(query, rows=100):
    """Query the NARA Catalog v2 record search and return the decoded JSON reply.

    Exact response shapes can evolve; keep raw snapshots.

    Args:
        query: Search text, sent as the ``q`` parameter.
        rows: Value sent as the ``rows`` parameter.

    Returns:
        The parsed JSON body of the response.
    """
    # NOTE(review): confirm against the current NARA Catalog API documentation
    # that this endpoint accepts ``rows`` and whether it requires an API key.
    params = urllib.parse.urlencode({"q": query, "rows": rows})
    endpoint = f"https://catalog.archives.gov/api/v2/records/search?{params}"
    request_headers = {"User-Agent": "AP-WWII-Atlas/0.6 source metadata harvester"}
    request = urllib.request.Request(endpoint, headers=request_headers)
    with urllib.request.urlopen(request, timeout=30) as reply:
        parsed = json.load(reply)
    return parsed

def write_chunks(features, out_dir):
    """Write per-year, per-theater and per-source GeoJSON chunks plus a manifest.

    Each feature is placed in three groups: by the year found in its
    ``date`` property, by the slug of its ``theater`` and by the slug of its
    ``source``. Every group key becomes one FeatureCollection file under the
    matching sub-directory of *out_dir*, and
    ``source-chunk-manifest.json`` lists each chunk with its record count,
    URL and SHA-1.

    Existing ``*.geojson`` files in the three chunk sub-directories are
    deleted before the new ones are written.

    Args:
        features: Sequence of GeoJSON feature dicts (``len()`` is taken).
            A feature whose ``properties`` is missing or null is treated as
            having no properties.
        out_dir: ``pathlib.Path`` of the output directory; created if absent.
    """
    # Manifest group name -> sub-directory that holds that group's chunks.
    layout = {"by_year": "sources-by-year", "by_theater": "sources-by-theater", "by_source": "sources-by-source"}

    out_dir.mkdir(parents=True, exist_ok=True)
    for rel in layout.values():
        chunk_dir = out_dir / rel
        chunk_dir.mkdir(exist_ok=True)
        # Remove chunks from earlier runs so stale keys do not linger.
        for old in chunk_dir.glob("*.geojson"):
            old.unlink()

    groups = {group_name: defaultdict(list) for group_name in layout}
    for feat in features:
        # GeoJSON allows "properties": null; fall back to an empty mapping.
        props = feat.get("properties") or {}
        groups["by_year"][year_of(props.get("date"))].append(feat)
        groups["by_theater"][slug(props.get("theater"))].append(feat)
        groups["by_source"][slug(props.get("source"))].append(feat)

    manifest = {
        "version": "generated",
        "updated": datetime.date.today().isoformat(),
        "totals": {"source_points": len(features)},
        "chunks": {},
    }
    for group_name, rel in layout.items():
        entries = []
        for key, rows in sorted(groups[group_name].items()):
            payload = {"type": "FeatureCollection", "features": rows, "metadata": {"chunk_key": key, "records": len(rows)}}
            # Encode once, then write and hash those exact bytes. Text-mode
            # writing translates "\n" to the platform line ending, which would
            # make the file on disk differ from the hashed content.
            encoded = json.dumps(payload, ensure_ascii=False, indent=2).encode("utf-8")
            (out_dir / rel / f"{key}.geojson").write_bytes(encoded)
            entries.append({
                "key": key,
                "records": len(rows),
                "url": f"/data/world-war-ii-atlas/{rel}/{key}.geojson",
                "sha1": hashlib.sha1(encoded).hexdigest(),
            })
        manifest["chunks"][group_name] = entries
    (out_dir / "source-chunk-manifest.json").write_text(json.dumps(manifest, ensure_ascii=False, indent=2), encoding="utf-8")

def main():
    """Command-line entry point.

    With ``--rechunk``, loads ``ww2_source_points.geojson`` from ROOT and
    regenerates the chunk files and manifest there. Without it, prints a
    usage hint and does nothing else.
    """
    cli = argparse.ArgumentParser()
    cli.add_argument("--rechunk", action="store_true", help="Regenerate chunks from ww2_source_points.geojson")
    options = cli.parse_args()

    if not options.rechunk:
        print("Use --rechunk now. LOC/NARA harvest modes should be run deliberately and reviewed before merge.")
        return

    source = ROOT / "ww2_source_points.geojson"
    data = json.loads(source.read_text(encoding="utf-8"))
    write_chunks(data["features"], ROOT)
    print(f"rechunked {len(data['features'])} source points")

# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()
