#!/usr/bin/env python3
"""
IEEE 802 Archive Indexer — VLAN era focus (1994–1998)
Version: 1.3
- FIX: Eliminates the f-string in the LLM prompt that raised "ValueError: Invalid format specifier" whenever a document contained braces `{}`.
- Quiet by default: per-file errors are suppressed unless --verbose is set.
- Keeps v1.2 features: robust discovery, rows are never dropped, WG/author/location/date extraction, WG filter, human-readable sizes, row anchors, per-year stats.

Quick start
-----------
# Dry run without LLM
python run.py --root ./ --years 1995 1996 1997 --base-url https://files.serialport.org/ieee802 --out index --llm off --limit 20

# With LLM (summaries+tags)
export OPENAI_API_KEY="sk-..."
python run.py --root ./ --years 1995 1996 1997 --base-url https://files.serialport.org/ieee802 --out index \
  --llm on --llm-endpoint "https://api.openai.com/v1/chat/completions" --llm-model "gpt-4o-mini" \
  --llm-key-env OPENAI_API_KEY --max-workers 4  # add --verbose to see per-file errors
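
External tool dependencies (optional): pdftotext (poppler-utils) for PDFs and ps2pdf
(ghostscript) for PostScript. Files whose text cannot be extracted are still listed in
the index, just without sniffed metadata or LLM summaries.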
"""

from __future__ import annotations
import argparse
import concurrent.futures
import dataclasses
import html
import json
import os
import re
import subprocess
import sys
import textwrap
import traceback
from pathlib import Path
from typing import List, Tuple

import urllib.request

def llm_healthcheck(endpoint, model, api_key, timeout=20):
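    """Probe the chat-completions endpoint with a tiny request and report connectivity.

    Returns True only when a non-empty reply comes back; all failures are logged to stderr.
    """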
    if not api_key:
        print("[llm] disabled: no API key resolved", file=sys.stderr); return False
    body = {
        "model": model,
        "messages": [{"role":"user","content": "Say OK"}],
        "max_tokens": 5,
    }
    try:
        req = urllib.request.Request(
            endpoint,
            data=json.dumps(body).encode("utf-8"),
            headers={"Content-Type":"application/json", "Authorization": f"Bearer {api_key}"}
        )
        with urllib.request.urlopen(req, timeout=timeout) as r:
            data = json.loads(r.read().decode("utf-8"))
        txt = data.get("choices", [{}])[0].get("message", {}).get("content", "")
        ok = isinstance(txt, str) and len(txt) > 0
        print(f"[llm] connectivity: {'OK' if ok else 'FAILED'}", file=sys.stderr)
        return ok
    except Exception as e:
        print(f"[llm] connectivity: FAILED ({e})", file=sys.stderr)
        return False

# ----------------------------
# CLI
# ----------------------------

def parse_args():
    p = argparse.ArgumentParser(description="Index IEEE 802 archives (1994–1998) into static HTML + JSON")
    p.add_argument("--root", required=True, help="Local filesystem root of your ieee802 mirror")
    p.add_argument("--years", nargs="*", type=int, default=[1994,1995,1996,1997,1998], help="Years to include")
    p.add_argument("--out", required=True, help="Output folder for generated HTML/JSON")
    p.add_argument("--base-url", required=True, help="Public/base URL that corresponds to --root for link generation")
    p.add_argument("--llm", choices=["on","off"], default="on", help="Use LLM summaries/tags")
    p.add_argument("--llm-endpoint", default="https://api.openai.com/v1/chat/completions", help="OpenAI‑compatible chat completions endpoint")
    p.add_argument("--llm-model", default="gpt-4o-mini", help="Model name for the API")
    p.add_argument("--llm-key-env", default="OPENAI_API_KEY", help="Env var containing the API key (optional)")
    p.add_argument("--llm-key", default="", help="Provide the API key directly (optional)")
    p.add_argument("--max-workers", type=int, default=os.cpu_count() or 4, help="Parallel workers (auto-capped to 4 when LLM on)")
    p.add_argument("--limit", type=int, default=0, help="Optional: cap number of files per year (0 = no cap)")
    p.add_argument("--timeout", type=int, default=180, help="Seconds: per-file extraction/LLM timeout")
    p.add_argument("--verbose", action="store_true", help="Print per-file errors to stderr")
    return p.parse_args()

# ----------------------------
# Data
# ----------------------------

@dataclasses.dataclass
class DocMeta:
    year: int
    rel_path: str
    file_name: str
    ext: str
    size_bytes: int
    url: str
    title: str = ""
    doc_id: str = ""
    date_str: str = ""
    authors: str = ""
    wg: str = ""
    location: str = ""
    summary: str = ""
    tags: List[str] = dataclasses.field(default_factory=list)
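
# Rows are serialized with dataclasses.asdict() into <out>/<year>/index.json, e.g. (illustrative values):
#   {"year": 1996, "file_name": "vlan-req.pdf", "ext": ".pdf", "doc_id": "96/012",
#    "wg": "802.1", "title": "...", "summary": "...", "tags": ["vlan", "802.1q"], ...}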

# ----------------------------
# Patterns
# ----------------------------

ID_PAT = re.compile(r"\b(9[4-9]|00|01)[-/]?\d{2,4}\b")
DATE_PAT = re.compile(r"(?i)\b(?:\d{1,2}[\-/]\d{1,2}[\-/](?:19|20)?\d{2}|(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Sept|Oct|Nov|Dec)[a-z]*\s+\d{1,2},\s*(?:19|20)?\d{2})\b")
WG_PAT = re.compile(r"\b802\.(\d+[a-z]?)\b", re.I)
TITLE_PAT = re.compile(r"(?im)^(?:\s*(?:title|subject|document title|doc title)\s*[:\-]\s*)(.+)$")
AUTHOR_HINT_PAT = re.compile(r"(?im)^(?:\s*(?:author(?:s)?|editor(?:s)?|from|contributors?|chair)\s*[:\-]\s*)(.+)$")
LOCATION_PAT = re.compile(r"(?im)\b(?:(meeting|interim|plenary)[^\n,]*,?\s*)([A-Za-z .'-]+,\s*[A-Za-z .'-]+(?:,\s*[A-Za-z .'-]+)?)\b")

TEXT_EXTS = {".txt", ".text"}
PDF_EXTS = {".pdf"}
PS_EXTS = {".ps", ".eps"}
ALL_EXTS = TEXT_EXTS | PDF_EXTS | PS_EXTS

SAFE_HTML = {
    "css": (
        "body{font-family:ui-sans-serif,system-ui,-apple-system,Segoe UI,Roboto;max-width:1100px;margin:2rem auto;padding:0 1rem}"
        "header{display:flex;justify-content:space-between;align-items:center;gap:1rem;flex-wrap:wrap}"
        "h1{font-size:1.6rem;margin:.2rem 0}"
        ".muted{opacity:.7}"
        "input[type=text]{width:360px;max-width:100%;padding:.6rem .8rem;border:1px solid #ddd;border-radius:.6rem}"
        "select{padding:.45rem .6rem;border:1px solid #ddd;border-radius:.6rem}"
        "table{width:100%;border-collapse:collapse;margin-top:1rem}"
        "th,td{border-bottom:1px solid #eee;text-align:left;padding:.6rem .4rem;vertical-align:top}"
        "th{font-weight:600}"
        ".tag{display:inline-block;font-size:.8rem;border:1px solid #ddd;border-radius:999px;padding:.1rem .5rem;margin:.1rem}"
        ".small{font-size:.85rem}"
        ".kicker{font-size:.9rem;color:#444}"
        ".dim{color:#666}"
        ".pill{background:#f5f5f5;border-radius:999px;padding:.15rem .5rem}"
        "footer{margin:2rem 0;font-size:.85rem;color:#666}"
        ".count{font-weight:600}"
        ".noresults{padding:1rem 0;color:#666}"
        ".docid{font-variant-numeric:tabular-nums}"
        ".sum{white-space:pre-wrap}"
        ".nowrap{white-space:nowrap}"
        ".wg{background:#eef7ff;border:1px solid #cfe6ff}"
        ".file{opacity:.7;font-size:.8rem}"
        "a.anchor{color:#aaa;text-decoration:none;margin-left:.3rem}"
    ),
    "js": (
        "const q=document.getElementById('q');const wg=document.getElementById('wg');"
        "const rows=Array.from(document.querySelectorAll('tbody tr'));const count=document.getElementById('count');"
        "function norm(s){return (s||'').toLowerCase();}"
        "function filter(){const terms=norm(q.value).split(/\\s+/).filter(Boolean);const wgVal=wg.value;let visible=0;"
        "rows.forEach(tr=>{const hay=norm(tr.dataset.hay);const okText=terms.every(t=>hay.includes(t));"
        "const okWg=wgVal===''||(tr.dataset.wg||'')===wgVal;const ok=okText&&okWg;tr.style.display=ok?'':'none';if(ok)visible++;});"
        "count.textContent=visible;document.getElementById('noresults').style.display=visible?'none':'block';}"
        "q.addEventListener('input',filter);wg.addEventListener('change',filter);filter();"
    )
}
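
# Each table row carries data-hay (a free-text search haystack) and data-wg attributes.
# The inline JS lowercases the query and data-hay, requires every whitespace-separated
# term to appear in the haystack, and, when a WG is selected, requires data-wg to match
# it exactly.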

# ----------------------------
# Utilities
# ----------------------------

def run_cmd(cmd: List[str], timeout: int) -> Tuple[int, bytes, bytes]:
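    """Run an external command, returning (returncode, stdout, stderr); a timeout is reported as rc 124."""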
    try:
        p = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, timeout=timeout)
        return p.returncode, p.stdout, p.stderr
    except subprocess.TimeoutExpired:
        return 124, b"", b"timeout"
    except FileNotFoundError:
        # External tool (pdftotext/ps2pdf) not installed; treat as a failed extraction
        return 127, b"", b"command not found"


def extract_text(path: Path, timeout: int) -> str:
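    """Best-effort plain-text extraction.

    .txt/.text files are read directly (first ~4 MB); PDFs go through `pdftotext`,
    PostScript through `ps2pdf` + `pdftotext`. Returns "" when extraction fails.
    """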
    ext = path.suffix.lower()
    if ext in TEXT_EXTS:
        try:
            data = path.read_bytes()
            return data[:4_000_000].decode(errors='replace')
        except Exception:
            return ""
    if ext in PDF_EXTS:
        code, out, _ = run_cmd(["pdftotext", "-q", str(path), "-"], timeout)
        return out.decode(errors='replace') if code == 0 else ""
    if ext in PS_EXTS:
        tmp_pdf = path.with_suffix('.__tmp__.pdf')
        code, _, _ = run_cmd(["ps2pdf", str(path), str(tmp_pdf)], timeout)
        if code != 0 or not tmp_pdf.exists():
            try: tmp_pdf.unlink()
            except Exception: pass
            return ""
        code2, out2, _ = run_cmd(["pdftotext", "-q", str(tmp_pdf), "-"], timeout)
        try: tmp_pdf.unlink()
        except Exception: pass
        return out2.decode(errors='replace') if code2 == 0 else ""
    return ""


def human_bytes(n: int) -> str:
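    """Format a byte count for display, e.g. 512 -> "512 bytes", 1536 -> "1.5 KB"."""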
    units = ["bytes","KB","MB","GB","TB"]
    i = 0
    x = float(max(0, n))
    while x >= 1024 and i < len(units)-1:
        x /= 1024
        i += 1
    return f"{x:.0f} {units[i]}" if i==0 else f"{x:.1f} {units[i]}"

# ----------------------------
# Heuristics
# ----------------------------

def sniff_meta_from_text(txt: str) -> tuple[str,str,str,str,str,str]:
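    """Heuristically pull (title, authors, date_str, doc_id, wg, location) from the first ~48 KB of text."""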
    head = txt[:48_000] if txt else ""
    title = authors = date_str = doc_id = wg = location = ""

    m = TITLE_PAT.search(head)
    if m: title = m.group(1).strip()

    m = AUTHOR_HINT_PAT.search(head)
    if m: authors = m.group(1).strip()

    m = ID_PAT.search(head)
    if m: doc_id = m.group(0).replace('-', '/')

    wgs = WG_PAT.findall(head)
    if wgs: wg = f"802.{wgs[0]}".upper()

    m = DATE_PAT.search(head)
    if m: date_str = m.group(0)

    m = LOCATION_PAT.search(head)
    if m: location = m.group(2).strip()

    if not title:
        for line in head.splitlines():
            s = line.strip()
            if s and len(s) > 6 and not s.lower().startswith(("page ", "ieee")):
                title = s[:200]
                break

    return title, authors, date_str, doc_id, wg, location

# ----------------------------
# LLM (safe prompt construction)
# ----------------------------

def resolve_api_key(env_name: str, direct: str) -> str:
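    """Prefer a key passed on the command line; otherwise read it from the named env var."""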
    return direct or os.getenv(env_name, "")


def llm_summarize(endpoint: str, model: str, api_key: str, content: str, timeout: int) -> tuple[str, List[str]]:
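    """Ask an OpenAI-compatible chat endpoint for a JSON {"summary", "tags"} object.

    Returns (summary, tags). If the model replies with plain text, that text becomes the
    summary; any network/API error yields ("", []).
    """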

    header = (
        "You are indexing historical IEEE 802 documents related to VLANs (802.1Q) during 1994–1998.\n"
        "1) Provide a crisp 2–4 sentence abstract emphasizing VLAN‑relevant contributions (requirements, terminology, tagging proposals, 802.1p/Q drafts, debates, interoperability, multicast/broadcast handling, bridging interactions, scope).\n"
        "2) Return 5–10 lowercase tags (e.g., vlan, 802.1q, 802.1p, bridging, tagging, frame format, encapsulation, priority, multicast, management, par, conformance, requirements).\n"
        "Output JSON: {\"summary\": str, \"tags\": [str, ...]}.\n\n"
        "Document excerpt (may be partial):\n\n"
    )
    # IMPORTANT: no f-string interpolation; just concatenate strings
    prompt = header + (content[:8000] if content else "")

    body = {
        "model": model,
        "messages": [
            {"role":"system","content":"You create archival‑quality indexes for IEEE 802 documents."},
            {"role":"user","content": prompt}
        ],
        "temperature": 0.2,
        "max_tokens": 320
    }
    req = urllib.request.Request(endpoint, data=json.dumps(body).encode("utf-8"), headers={
        "Content-Type":"application/json",
        "Authorization":f"Bearer {api_key}"
    })
    try:
        with urllib.request.urlopen(req, timeout=timeout) as r:
            resp = json.loads(r.read().decode("utf-8"))
        msg = resp.get("choices", [{}])[0].get("message", {}).get("content", "")
        try:
            data = json.loads(msg)
            summary = (data.get("summary","") or "").strip()
            tags = [t for t in (data.get("tags") or []) if isinstance(t, str)]
            return summary, tags[:10]
        except Exception:
            # If model returns plain text, keep it as summary
            return msg.strip(), []
    except Exception:
        # Swallow network/API errors quietly; return empty
        return "", []

# ----------------------------
# Discovery & register parsing
# ----------------------------

def discover_files(root: Path, year: int, limit: int = 0) -> List[Path]:
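    """Locate candidate documents for one year.

    Tries, in order: (1) the docsYYYY / docsYYYYa directories, (2) any path containing a
    docsYYYY[a-z]? segment, (3) a dYY segment (e.g. d95). Results are sorted and, when
    limit > 0, capped to the first `limit` files.
    """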
    found: List[Path] = []

    # 1) Prefer scanning docs{YYYY} and docs{YYYY}a
    for dirname in (f"docs{year}", f"docs{year}a"):
        base = root / dirname
        if base.exists():
            for p in base.rglob("*"):
                if p.is_file() and p.suffix.lower() in ALL_EXTS:
                    found.append(p)

    # 2) Fallback: global path match for "/docsYYYY[a]?/"
    if not found:
        pat = re.compile(rf"(^|/)docs{year}[a-z]?(/|$)", re.I)
        for p in root.rglob("*"):
            if p.is_file() and p.suffix.lower() in ALL_EXTS:
                rel = p.relative_to(root).as_posix()
                if pat.search(rel):
                    found.append(p)

    # 3) Last resort: accept dYY segment (e.g., d95) anywhere in path
    if not found:
        yy = str(year)[-2:]
        pat2 = re.compile(rf"(^|/)d{yy}(/|_|-)", re.I)
        for p in root.rglob("*"):
            if p.is_file() and p.suffix.lower() in ALL_EXTS:
                rel = p.relative_to(root).as_posix()
                if pat2.search(rel):
                    found.append(p)

    found.sort()
    if limit > 0:
        found = found[:limit]

    # DEBUG
    print(f"[discover] {year}: {len(found)} files", file=sys.stderr)
    for sample in found[:5]:
        print(f"[discover] {year} sample: {sample.relative_to(root).as_posix()}", file=sys.stderr)

    return found


def load_register_texts(root: Path, year: int) -> List[str]:
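    """Collect the text of register/index files whose path mentions the given year."""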
    texts: List[str] = []
    for name in (f"index{str(year)[-2:]}.txt", "index.txt", "register.txt", f"{year} register.txt"):
        for p in root.rglob(name):
            rel = p.relative_to(root).as_posix()
            if str(year) not in rel:
                continue
            try:
                texts.append(p.read_text(errors='replace'))
            except Exception:
                pass
    return texts


def refine_from_register(register_blob: str, filename_noext: str, title: str, doc_id: str, wg: str) -> tuple[str,str,str]:
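    """Fill missing doc_id/WG/title from register lines that mention this file's base name."""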
    if not register_blob:
        return title, doc_id, wg
    pat = re.compile(re.escape(filename_noext), re.I)
    for line in register_blob.splitlines():
        if not pat.search(line):
            continue
        m_id = ID_PAT.search(line)
        if m_id:
            doc_id = doc_id or m_id.group(0).replace('-', '/')
        m_wg = WG_PAT.search(line)
        if m_wg:
            wg = wg or f"802.{m_wg.group(1)}".upper()
        parts = re.split(r"\s[–—-]\s", line, maxsplit=1)
        if len(parts) == 2 and len(parts[1].strip()) > 6:
            title = title or parts[1].strip()
    return title, doc_id, wg

# ----------------------------
# Build
# ----------------------------

def build_per_year_index(args) -> None:
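    """Index each requested year: discover files, extract metadata (and optional LLM
    summaries) in a thread pool, sort with 802.1 documents first, write
    <out>/<year>/index.json and index.html, then a root index.html linking the years."""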
    root = Path(args.root)
    out = Path(args.out)
    out.mkdir(parents=True, exist_ok=True)

    all_year_pages: List[tuple[int,str]] = []

    for year in args.years:
        print(f"\n[Year {year}] Scanning…", file=sys.stderr)
        files = discover_files(root, year, args.limit)
        register_blob = "\n\n".join(load_register_texts(root, year))

        items: List[DocMeta] = []

        def process_file(p: Path) -> DocMeta:
            rel = p.relative_to(root).as_posix()
            url = f"{args.base_url.rstrip('/')}/{rel}"
            size = p.stat().st_size
            ext = p.suffix.lower()
            title=authors=date_str=doc_id=wg=location=summary=""
            tags: List[str] = []
            try:
                text = extract_text(p, args.timeout)
                t,a,d,i,w,l = sniff_meta_from_text(text)
                title, authors, date_str, doc_id, wg, location = t,a,d,i,w,l
                base_noext = re.sub(r"\.[^.]+$", "", p.name)
                if register_blob:
                    title, doc_id, wg = refine_from_register(register_blob, base_noext, title, doc_id, wg)
                if args.llm == "on":
                    api_key = resolve_api_key(args.llm_key_env, args.llm_key)
                    if api_key and text:
                        s, tg = llm_summarize(args.llm_endpoint, args.llm_model, api_key, text, args.timeout)
                        summary, tags = s, tg
            except Exception as e:
                if args.verbose:
                    print(f"[error] {year} {rel}: {e}", file=sys.stderr)
                    traceback.print_exc(file=sys.stderr)
            return DocMeta(
                year=year, rel_path=rel, file_name=p.name, ext=ext, size_bytes=size, url=url,
                title=title or p.name, doc_id=doc_id, date_str=date_str, authors=authors,
                wg=wg, location=location, summary=summary, tags=tags
            )

        workers = min(args.max_workers, 4) if args.llm == "on" else args.max_workers
        with concurrent.futures.ThreadPoolExecutor(max_workers=workers) as ex:
            for meta in ex.map(process_file, files):
                items.append(meta)

        # Sort: prioritize 802.1, then by doc id if present, else filename
        def sort_key(m: DocMeta):
            wg_rank = 0 if (m.wg or '').upper().startswith('802.1') else 1
            m_id = re.match(r"(\d{2})/(\d{2,4})", m.doc_id or '')
            if m_id:
                return (wg_rank, int(m_id.group(1)), int(m_id.group(2)))
            return (wg_rank, 99, m.file_name.lower())
        items.sort(key=sort_key)

        summarized = sum(1 for x in items if x.summary)
        print(f"[stats] {year}: items={len(items)} summarized={summarized}", file=sys.stderr)

        year_dir = out / str(year)
        year_dir.mkdir(parents=True, exist_ok=True)
        (year_dir/"index.json").write_text(json.dumps([dataclasses.asdict(x) for x in items], ensure_ascii=False, indent=2), encoding='utf-8')
        (year_dir/"index.html").write_text(render_year_html(year, items, args.base_url), encoding='utf-8')

        all_year_pages.append((year, f"{year}/index.html"))
        print(f"[Year {year}] Wrote {year_dir/'index.html'} (items: {len(items)})", file=sys.stderr)

    (out/"index.html").write_text(render_root_html(all_year_pages), encoding='utf-8')
    print(f"\nDone. Root: {out/'index.html'}", file=sys.stderr)

# ----------------------------
# HTML
# ----------------------------

def esc(s: str) -> str:
    return html.escape(s or "")


def render_year_html(year: int, items: List[DocMeta], base_url: str) -> str:
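    """Render one year's static page: a WG dropdown, a client-side text filter over each
    row's data-hay attribute, and one table row per document."""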
    wgs = sorted({(m.wg or '').upper() for m in items if m.wg})
    wg_options = "<option value=\"\">All WGs</option>" + "".join(f"<option value=\"{esc(w)}\">{esc(w)}</option>" for w in wgs)

    rows = []
    for m in items:
        hay = " ".join([m.file_name, m.rel_path, m.title, m.doc_id, m.date_str, m.authors, m.summary, m.location, m.wg, " ".join(m.tags)])
        tags_html = "".join(f"<span class=tag>{esc(t)}</span>" for t in (m.tags or []))
        title = esc(m.title) if m.title else esc(m.file_name)
        anchor = esc((m.doc_id or m.file_name).replace(' ', '_'))
        wg_badge = f"<span class=pill wg>{esc(m.wg)}</span>" if m.wg else ""
        meta_bits = " ".join(x for x in [esc(m.date_str), esc(m.location)] if x)
        size_str = esc(human_bytes(m.size_bytes))
        filetype = esc(m.ext)
        rows.append(f"""
        <tr data-hay="{esc(hay)}" data-wg="{esc((m.wg or '').upper())}" id="{anchor}">
          <td class=nowrap>
            <span class="pill docid">{esc(m.doc_id) if m.doc_id else ''}</span>
            {wg_badge}
          </td>
          <td>
            <div><a href="{esc(m.url)}" target="_blank" rel="noopener">{title}</a>
              <a class=anchor href="#{anchor}" title="Copy link to this row">#</a>
              <span class=file>{filetype}</span>
            </div>
            <div class="sum dim small">{esc(m.summary)}</div>
            <div>{tags_html}</div>
          </td>
          <td class=small>
            <div class=dim>{meta_bits}</div>
            <div class=dim>{esc(m.authors)}</div>
          </td>
          <td class=small>{size_str}</td>
        </tr>
        """)

    table = "\n".join(rows) if rows else "<tr><td colspan=4 class=noresults>No documents found for this year.</td></tr>"

    return f"""
<!doctype html>
<html lang=en>
<meta charset=utf-8>
<meta name=viewport content="width=device-width, initial-scale=1">
<title>IEEE 802 Archive Index — {year}</title>
<style>{SAFE_HTML['css']}</style>
<header>
  <h1>IEEE 802 Archive Index <span class=muted>— {year}</span></h1>
  <div style="display:flex;gap:.5rem;align-items:center;flex-wrap:wrap">
    <input id=q type=text placeholder="Filter by keyword, tag, doc id, author…" aria-label="Search">
    <select id=wg aria-label="Working Group filter">{wg_options}</select>
  </div>
</header>
<div class=kicker>Files mirror: <code>{esc(base_url)}</code> • Showing <span id=count class=count>{len(items)}</span> documents</div>
<table>
  <thead>
    <tr><th>ID / WG</th><th>Title &amp; Summary</th><th>Meta</th><th>Size</th></tr>
  </thead>
  <tbody>
    {table}
  </tbody>
</table>
<div id=noresults class=noresults style="display:none">No matching documents.</div>
<footer>
  Generated by <code>run.py</code>. This page is static; search is client‑side.</footer>
<script>{SAFE_HTML['js']}</script>
</html>
"""


def render_root_html(year_pages: List[tuple[int,str]]) -> str:
    items = "".join(f"<li><a href=\"{esc(rel)}\">{year}</a></li>" for (year, rel) in sorted(year_pages))
    return f"""
<!doctype html>
<html lang=en>
<meta charset=utf-8>
<meta name=viewport content="width=device-width, initial-scale=1">
<title>IEEE 802 Archive Index — 1994–1998</title>
<style>{SAFE_HTML['css']}</style>
<header>
  <h1>IEEE 802 Archive Index <span class=muted>— 1994–1998</span></h1>
</header>
<p>Select a year:</p>
<ul>
  {items}
</ul>
<footer class=dim>Focus: VLAN history (802.1Q) documents circa 1994–1998.</footer>
</html>
"""

# ----------------------------
# Main
# ----------------------------

def main():
    args = parse_args()
    if args.llm == "on":
        api_key = resolve_api_key(args.llm_key_env, args.llm_key)
        llm_healthcheck(args.llm_endpoint, args.llm_model, api_key)
    build_per_year_index(args)


if __name__ == "__main__":
    def build_per_year_index(args) -> None:
        root = Path(args.root)
        out = Path(args.out)
        out.mkdir(parents=True, exist_ok=True)

        all_year_pages: List[tuple[int,str]] = []

        for year in args.years:
            print(f"\n[Year {year}] Scanning…", file=sys.stderr)
            files = discover_files(root, year, args.limit)
            register_blob = "\n\n".join(load_register_texts(root, year))

            items: List[DocMeta] = []

            def process_file(p: Path) -> DocMeta:
                rel = p.relative_to(root).as_posix()
                url = f"{args.base_url.rstrip('/')}/{rel}"
                size = p.stat().st_size
                ext = p.suffix.lower()
                title=authors=date_str=doc_id=wg=location=summary=""
                tags: List[str] = []
                try:
                    text = extract_text(p, args.timeout)
                    t,a,d,i,w,l = sniff_meta_from_text(text)
                    title, authors, date_str, doc_id, wg, location = t,a,d,i,w,l
                    base_noext = re.sub(r"\.[^.]+$", "", p.name)
                    if register_blob:
                        title, doc_id, wg = refine_from_register(register_blob, base_noext, title, doc_id, wg)
                    if args.llm == "on":
                        api_key = resolve_api_key(args.llm_key_env, args.llm_key)
                        if api_key and text:
                            s, tg = llm_summarize(args.llm_endpoint, args.llm_model, api_key, text, args.timeout)
                            summary, tags = s, tg
                except Exception as e:
                    if args.verbose:
                        print(f"[error] {year} {rel}: {e}", file=sys.stderr)
                        traceback.print_exc(file=sys.stderr)
                return DocMeta(
                    year=year, rel_path=rel, file_name=p.name, ext=ext, size_bytes=size, url=url,
                    title=title or p.name, doc_id=doc_id, date_str=date_str, authors=authors,
                    wg=wg, location=location, summary=summary, tags=tags
                )

            workers = min(args.max_workers, 4) if args.llm == "on" else args.max_workers
            with concurrent.futures.ThreadPoolExecutor(max_workers=workers) as ex:
                for meta in ex.map(process_file, files):
                    items.append(meta)

            def sort_key(m: DocMeta):
                wg_rank = 0 if (m.wg or '').upper().startswith('802.1') else 1
                m_id = re.match(r"(\d{2})/(\d{2,4})", m.doc_id or '')
                if m_id:
                    return (wg_rank, int(m_id.group(1)), int(m_id.group(2)))
                return (wg_rank, 99, m.file_name.lower())
            items.sort(key=sort_key)

            summarized = sum(1 for x in items if x.summary)
            print(f"[stats] {year}: items={len(items)} summarized={summarized}", file=sys.stderr)

            year_dir = out / str(year)
            year_dir.mkdir(parents=True, exist_ok=True)
            (year_dir/"index.json").write_text(json.dumps([dataclasses.asdict(x) for x in items], ensure_ascii=False, indent=2), encoding='utf-8')
            (year_dir/"index.html").write_text(render_year_html(year, items, args.base_url), encoding='utf-8')

            all_year_pages.append((year, f"{year}/index.html"))
            print(f"[Year {year}] Wrote {year_dir/'index.html'} (items: {len(items)})", file=sys.stderr)

        (out/"index.html").write_text(render_root_html(all_year_pages), encoding='utf-8')
        print(f"\nDone. Root: {out/'index.html'}", file=sys.stderr)

    main()

