#!/usr/bin/env python3
"""
IEEE 802 Archive Indexer (focused on VLAN history 1994–1998)

What it does
------------
• Recursively scans local folders for years 1994–1998 (default) that mirror your `https://files.serialport.org/ieee802/` tree.
• Extracts text from .txt, .pdf (via pdftotext), and .ps (via ps2pdf → pdftotext).
• Heuristically derives title/author/date/ID from the first chunk of text.
• Optionally calls an OpenAI‑compatible API to generate a 2–4 sentence summary and tags.
• Emits per‑year static HTML index files (with a client‑side search box), plus a top‑level `index.html`.
• Also writes a machine‑readable `index.json` per year for future search engines.
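
Each per‑year `index.json` entry mirrors the `DocMeta` dataclass defined below; values are
best‑effort and may be empty strings/lists, roughly:

  {"year": 1995, "rel_path": "1995/…", "file_name": "…", "ext": ".pdf", "size_bytes": 12345,
   "url": "…", "title": "…", "doc_id": "95/055", "date_str": "…", "authors": "…",
   "summary": "…", "tags": ["vlan", "802.1q"]}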

Usage examples
--------------
python ieee802_indexer.py \
  --root "/data/mirrors/ieee802" \
  --years 1994 1995 1996 1997 1998 \
  --base-url "https://files.serialport.org/ieee802" \
  --out "/data/mirrors/ieee802-index" \
  --llm on \
  --llm-endpoint "https://api.openai.com/v1/chat/completions" \
  --llm-model "gpt-4o-mini" \
  --llm-key-env OPENAI_API_KEY

If you want to skip LLM summaries for now, set `--llm off`.
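
For example (the years default to 1994–1998):

python ieee802_indexer.py \
  --root "/data/mirrors/ieee802" \
  --out "/data/mirrors/ieee802-index" \
  --base-url "https://files.serialport.org/ieee802" \
  --llm off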

Requirements
------------
• Python 3.9+
• External tools on PATH: `pdftotext` (Poppler), `ps2pdf` (Ghostscript)
• (Optional) Network access + an API key if `--llm on`

Notes
-----
• We favor any per‑year "register/index" text files we find (e.g., index95.txt) when extracting titles/IDs.
• The HTML is self‑contained, no external JS/CDN.
"""

from __future__ import annotations
import argparse
import concurrent.futures
import dataclasses
import html
import json
import os
import re
import subprocess
import sys
import textwrap
from datetime import datetime
from pathlib import Path
from typing import Dict, List, Optional, Tuple

# ----------------------------
# Config & CLI
# ----------------------------

def parse_args():
    p = argparse.ArgumentParser(description="Index IEEE 802 archives (1994–1998) into static HTML + JSON")
    p.add_argument("--root", required=True, help="Local filesystem root of your ieee802 mirror (e.g., /data/ieee802)")
    p.add_argument("--years", nargs="*", type=int, default=[1994,1995,1996,1997,1998], help="Years to include")
    p.add_argument("--out", required=True, help="Output folder for generated HTML/JSON")
    p.add_argument("--base-url", required=True, help="Public/base URL that corresponds to --root for link generation")
    p.add_argument("--llm", choices=["on","off"], default="on", help="Use LLM summaries/tags")
    p.add_argument("--llm-endpoint", default="https://api.openai.com/v1/chat/completions", help="OpenAI‑compatible chat completions endpoint")
    p.add_argument("--llm-model", default="gpt-4o-mini", help="Model name for the API")
    p.add_argument("--llm-key-env", default="OPENAI_API_KEY", help="Env var containing the API key")
    p.add_argument("--max-workers", type=int, default=os.cpu_count() or 4, help="Parallel workers for extraction")
    p.add_argument("--limit", type=int, default=0, help="Optional: limit number of files per year (0 = no limit)")
    p.add_argument("--timeout", type=int, default=180, help="Seconds: per-file extraction/LLM timeout")
    return p.parse_args()

# ----------------------------
# Data structures
# ----------------------------

@dataclasses.dataclass
class DocMeta:
    year: int
    rel_path: str        # path relative to root
    file_name: str
    ext: str
    size_bytes: int
    url: str             # absolute URL using base-url
    title: str = ""
    doc_id: str = ""      # e.g., 95/055
    date_str: str = ""     # best-effort human date
    authors: str = ""
    summary: str = ""
    tags: List[str] = dataclasses.field(default_factory=list)

# ----------------------------
# Helpers
# ----------------------------

# Common filenames/labels for per-year "document register" listings (currently
# informational only; load_register_texts() below searches for concrete filenames).
APACHE_LISTING_INDEXES = {"index.txt", "index95.txt", "readme.txt", "document register", "document register 1995", "register"}

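# Metadata heuristics used by sniff_meta_from_text(): document IDs such as "95/055"
# or "95-055" (ID_PAT), loose numeric or "Month DD, YYYY" dates (DATE_PAT), and
# explicit "Title:" / "Author:" header lines (TITLE_PAT / AUTHOR_PAT).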
ID_PAT = re.compile(r"\b(9[4-8])[/-]\d{2,4}\b")
DATE_PAT = re.compile(r"(?i)\b(?:\d{1,2}[\-/]\d{1,2}[\-/](?:19)?\d{2}|(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Sept|Oct|Nov|Dec)[a-z]*\s+\d{1,2},\s*(?:19)?\d{2})\b")
TITLE_PAT = re.compile(r"(?im)^(?:\s*(?:title|subject|document title|doc title)\s*[:\-]\s*)(.+)$")
AUTHOR_PAT = re.compile(r"(?im)^(?:\s*(?:author|authors|contributors?)\s*[:\-]\s*)(.+)$")

TEXT_EXTS = {".txt", ".text"}
PDF_EXTS = {".pdf"}
PS_EXTS = {".ps", ".eps"}

SAFE_HTML = {
    "css": """
    body{font-family:ui-sans-serif,system-ui,-apple-system,Segoe UI,Roboto;max-width:1100px;margin:2rem auto;padding:0 1rem}
    header{display:flex;justify-content:space-between;align-items:center;gap:1rem;flex-wrap:wrap}
    h1{font-size:1.6rem;margin:.2rem 0}
    .muted{opacity:.7}
    input[type=text]{width:360px;max-width:100%;padding:.6rem .8rem;border:1px solid #ddd;border-radius:.6rem}
    table{width:100%;border-collapse:collapse;margin-top:1rem}
    th,td{border-bottom:1px solid #eee;text-align:left;padding:.6rem .4rem;vertical-align:top}
    th{font-weight:600}
    .tag{display:inline-block;font-size:.8rem;border:1px solid #ddd;border-radius:999px;padding:.1rem .5rem;margin:.1rem}
    .small{font-size:.85rem}
    .kicker{font-size:.9rem;color:#444}
    .dim{color:#666}
    .pill{background:#f5f5f5;border-radius:999px;padding:.15rem .5rem}
    footer{margin:2rem 0;font-size:.85rem;color:#666}
    .count{font-weight:600}
    .noresults{padding:1rem 0;color:#666}
    .docid{font-variant-numeric:tabular-nums}
    .sum{white-space:pre-wrap}
    .nowrap{white-space:nowrap}
    """,
    "js": """
    const q = document.getElementById('q');
    const rows = Array.from(document.querySelectorAll('tbody tr'));
    const count = document.getElementById('count');
    function norm(s){return (s||'').toLowerCase();}
    function filter(){
      const terms = norm(q.value).split(/\s+/).filter(Boolean);
      let visible=0;
      rows.forEach(tr=>{
        const hay = norm(tr.dataset.hay);
        const ok = terms.every(t=>hay.includes(t));
        tr.style.display = ok ? '' : 'none';
        if(ok) visible++;
      });
      count.textContent = visible;
      document.getElementById('noresults').style.display = visible? 'none':'block';
    }
    q.addEventListener('input', filter);
    filter();
    """
}

# ----------------------------
# Extraction
# ----------------------------

def run_cmd(cmd: List[str], timeout: int) -> Tuple[int, bytes, bytes]:
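    """Run an external command and return (returncode, stdout, stderr); (124, b"", b"timeout") on timeout."""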
    try:
        p = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, timeout=timeout)
        return p.returncode, p.stdout, p.stderr
    except subprocess.TimeoutExpired:
        return 124, b"", b"timeout"


def extract_text(path: Path, timeout: int) -> str:
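    """Best-effort text extraction: read .txt/.text directly (first ~2 MB), convert .pdf
    via pdftotext, and .ps/.eps via ps2pdf followed by pdftotext. Returns an empty string
    for unknown extensions or on any failure."""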
    ext = path.suffix.lower()
    if ext in TEXT_EXTS:
        try:
            with open(path, 'rb') as f:
                data = f.read(2_000_000)  # read up to ~2MB
            return data.decode(errors='replace')
        except Exception:
            return ""
    elif ext in PDF_EXTS:
        code, out, err = run_cmd(["pdftotext", "-q", str(path), "-"], timeout)
        return out.decode(errors='replace') if code == 0 else ""
    elif ext in PS_EXTS:
        # ps2pdf to temp, then pdftotext
        tmp_pdf = path.with_suffix('.__tmp__.pdf')
        code, out, err = run_cmd(["ps2pdf", str(path), str(tmp_pdf)], timeout)
        if code != 0 or not tmp_pdf.exists():
            if tmp_pdf.exists():
                try: tmp_pdf.unlink()
                except Exception: pass
            return ""
        code2, out2, err2 = run_cmd(["pdftotext", "-q", str(tmp_pdf), "-"], timeout)
        try: tmp_pdf.unlink()
        except Exception: pass
        return out2.decode(errors='replace') if code2 == 0 else ""
    else:
        return ""


def sniff_meta_from_text(txt: str) -> Tuple[str,str,str,str]:
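    """Scan the first ~32 KB of extracted text for (title, authors, date_str, doc_id).
    Falls back to the first plausible non-boilerplate line as the title; any field may be ""."""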
    head = txt[:32_000] if txt else ""
    title = ""
    authors = ""
    date_str = ""
    doc_id = ""

    m = TITLE_PAT.search(head)
    if m:
        title = m.group(1).strip()

    m = AUTHOR_PAT.search(head)
    if m:
        authors = m.group(1).strip()

    m = ID_PAT.search(head)
    if m:
        # normalize "9X-NNN" to "9X/NNN"
        doc_id = m.group(0).replace("-", "/")

    m = DATE_PAT.search(head)
    if m:
        date_str = m.group(0)

    # If no explicit title, take first non-empty line as a fallback (trimmed)
    if not title:
        for line in head.splitlines():
            s = line.strip()
            if s and len(s) > 6 and not s.lower().startswith(("page ", "ieee")):
                title = s[:200]
                break

    return title, authors, date_str, doc_id

# ----------------------------
# Optional LLM Summaries
# ----------------------------

def llm_summarize(endpoint: str, model: str, api_key: str, content: str, timeout: int) -> Tuple[str, List[str]]:
    """Call an OpenAI‑compatible Chat Completions endpoint to summarize and tag a document chunk.
    Returns (summary, tags).
    """
    import urllib.request  # deferred import: only needed when --llm on
    # Dedent the static instructions first, then append the excerpt, so that unindented
    # document text cannot defeat textwrap.dedent().
    prompt = textwrap.dedent("""\
        You are indexing historical IEEE 802 documents related to the emergence of VLANs (802.1Q) circa 1994–1998.

        TASK:
        1) Provide a crisp 2–4 sentence abstract focusing on VLAN‑relevant contributions (requirements, terminology, tagging proposals, 802.1p/Q drafts, debates, interoperability, multicast/broadcast handling, bridging interactions, scope).
        2) Return 5–10 short tags (lowercase) like: vlan, 802.1q, 802.1p, bridging, tagging, frame format, encapsulation, priority, multicast, management, par, conformance, requirements.

        Output JSON with keys: summary, tags (array). Keep it neutral and archival in tone.

        Document excerpt (may be partial):

        """) + content[:8000]
    body = {
        "model": model,
        "messages": [
            {"role": "system", "content": "You create archival‑quality indexes for IEEE 802 documents."},
            {"role": "user", "content": prompt}
        ],
        "temperature": 0.2,
        "max_tokens": 300
    }
    req = urllib.request.Request(endpoint, data=json.dumps(body).encode('utf-8'), headers={
        'Content-Type': 'application/json',
        'Authorization': f'Bearer {api_key}'
    })
    try:
        with urllib.request.urlopen(req, timeout=timeout) as r:
            resp = json.loads(r.read().decode('utf-8'))
        # OpenAI‑style response
        msg = resp.get('choices', [{}])[0].get('message', {}).get('content', '')
        # Try to parse JSON content from the assistant, stripping optional Markdown code fences.
        raw = msg.strip()
        if raw.startswith("```"):
            raw = re.sub(r"^```(?:json)?\s*|\s*```$", "", raw)
        try:
            data = json.loads(raw)
            summary = data.get('summary', '').strip()
            tags = [t.strip() for t in data.get('tags', []) if isinstance(t, str)]
            return summary, tags
        except Exception:
            # Fallback: treat the whole reply as a plain-text summary.
            return msg.strip(), []
    except Exception:
        # Network error, timeout, or unexpected response shape: skip the summary.
        return "", []

# ----------------------------
# Scanner
# ----------------------------

def discover_files(root: Path, year: int, limit: int = 0) -> List[Path]:
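    """Collect .txt/.pdf/.ps files under root whose path (relative to root) mentions the
    four-digit year, sorted; truncated to `limit` entries when limit > 0. Note: this walks
    the whole mirror once per year."""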
    candidates: List[Path] = []
    if not root.exists():
        return candidates
    for p in root.rglob('*'):
        try:
            if not p.is_file():
                continue
            if p.suffix.lower() in TEXT_EXTS | PDF_EXTS | PS_EXTS:
                # only pick files that contain the year in their relative path
                rel = p.relative_to(root).as_posix()
                if str(year) in rel:
                    candidates.append(p)
        except Exception:
            continue
    candidates.sort()
    if limit > 0:
        candidates = candidates[:limit]
    return candidates


def load_register_texts(root: Path, year: int) -> List[str]:
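    """Read any per-year register/index text files (e.g. index95.txt) found under the year's
    subtree; their lines are later matched against filenames to refine titles and doc IDs."""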
    texts = []
    for name in (f"index{str(year)[-2:]}.txt", "index.txt", "register.txt", f"{year} register.txt"):
        for p in root.rglob(name):
            # Only consider files under this year
            rel = p.relative_to(root).as_posix()
            if str(year) not in rel:
                continue
            try:
                with open(p, 'r', errors='replace') as f:
                    texts.append(f.read())
            except Exception:
                pass
    return texts


def build_per_year_index(args) -> None:
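    """For each requested year: discover files, extract text and metadata in parallel
    (optionally adding LLM summaries/tags), then write <out>/<year>/index.json and
    index.html, plus a top-level index.html linking all years."""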
    root = Path(args.root)
    out = Path(args.out)
    out.mkdir(parents=True, exist_ok=True)

    all_year_pages = []

    for year in args.years:
        print(f"\n[Year {year}] Scanning…", file=sys.stderr)
        files = discover_files(root, year, args.limit)
        registers = load_register_texts(root, year)
        register_blob = "\n\n".join(registers)

        items: List[DocMeta] = []

        def process_file(p: Path) -> Optional[DocMeta]:
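            """Extract text and derive metadata for a single file; returns None on any failure."""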
            try:
                rel = p.relative_to(root).as_posix()
                url = f"{args.base_url.rstrip('/')}/{rel}"
                size = p.stat().st_size
                text = extract_text(p, args.timeout)

                # Seed meta
                title, authors, date_str, doc_id = sniff_meta_from_text(text)

                # If we have a register blob, try to refine title/doc_id via filename hints
                base = p.name
                base_noext = re.sub(r"\.[^.]+$", "", base)
                if register_blob:
                    # look for lines containing base name or probable ID
                    pattern = re.compile(re.escape(base_noext), re.I)
                    for line in register_blob.splitlines():
                        if pattern.search(line):
                            # naive title split: "95/055 – Title …"
                            m_id = ID_PAT.search(line)
                            if m_id:
                                doc_id = doc_id or m_id.group(0).replace('-', '/')
                            # capture after dash
                            m_t = re.split(r"\s[–—-]\s", line, maxsplit=1)
                            if len(m_t) == 2 and len(m_t[1].strip()) > 6:
                                title = title or m_t[1].strip()

                meta = DocMeta(
                    year=year,
                    rel_path=rel,
                    file_name=p.name,
                    ext=p.suffix.lower(),
                    size_bytes=size,
                    url=url,
                    title=title,
                    doc_id=doc_id,
                    date_str=date_str,
                    authors=authors
                )

                # LLM summarize if enabled
                if args.llm == "on":
                    api_key = os.getenv(args.llm_key_env, "")
                    if api_key and text:
                        summary, tags = llm_summarize(args.llm_endpoint, args.llm_model, api_key, text, args.timeout)
                        meta.summary = summary
                        meta.tags = tags[:10]
                return meta
            except Exception:
                # A single bad file should not abort the whole year.
                return None

        with concurrent.futures.ThreadPoolExecutor(max_workers=args.max_workers) as ex:
            for meta in ex.map(process_file, files):
                if meta:
                    items.append(meta)

        # Sort: by doc_id if present, else filename
        def sort_key(m: DocMeta):
            # Attempt to sort like 95/055 → (95, 55)
            m_id = re.match(r"(\d{2})/(\d{2,4})", m.doc_id)
            if m_id:
                return (int(m_id.group(1)), int(m_id.group(2)))
            return (99, m.file_name)
        items.sort(key=sort_key)

        # Write JSON
        year_dir = out / str(year)
        year_dir.mkdir(parents=True, exist_ok=True)
        json_path = year_dir / "index.json"
        with open(json_path, 'w', encoding='utf-8') as f:
            json.dump([dataclasses.asdict(x) for x in items], f, ensure_ascii=False, indent=2)

        # Write HTML
        html_path = year_dir / "index.html"
        with open(html_path, 'w', encoding='utf-8') as f:
            f.write(render_year_html(year, items, base_url=args.base_url))

        all_year_pages.append((year, f"{year}/index.html"))
        print(f"[Year {year}] Wrote {html_path} (items: {len(items)})", file=sys.stderr)

    # Root HTML
    root_html = Path(args.out) / "index.html"
    with open(root_html, 'w', encoding='utf-8') as f:
        f.write(render_root_html(all_year_pages))
    print(f"\nDone. Root: {root_html}", file=sys.stderr)

# ----------------------------
# HTML Rendering
# ----------------------------

def esc(s: str) -> str:
    return html.escape(s or "")


def render_year_html(year: int, items: List[DocMeta], base_url: str) -> str:
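    """Render one year's documents as a self-contained HTML page; the data-hay attribute
    on each row feeds the client-side filter."""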
    rows = []
    for m in items:
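        # Everything the client-side filter can match against, stored in data-hay.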
        hay = " ".join([
            m.file_name, m.rel_path, m.title, m.doc_id, m.date_str, m.authors, m.summary, " ".join(m.tags)
        ])
        tags_html = "".join(f"<span class=tag>{esc(t)}</span>" for t in (m.tags or []))
        title = esc(m.title) if m.title else esc(m.file_name)
        rows.append(f"""
        <tr data-hay="{esc(hay)}">
          <td class=nowrap><span class="pill docid">{esc(m.doc_id) if m.doc_id else ''}</span></td>
          <td>
            <div><a href="{esc(m.url)}" target="_blank" rel="noopener">{title}</a></div>
            <div class="sum dim small">{esc(m.summary)}</div>
            <div>{tags_html}</div>
          </td>
          <td class=small>
            <div class=dim>{esc(m.date_str)}</div>
            <div class=dim>{esc(m.authors)}</div>
          </td>
          <td class=small>{m.size_bytes:,}</td>
          <td class=small>{esc(m.ext)}</td>
        </tr>
        """)

    table = "\n".join(rows) if rows else "<tr><td colspan=5 class=noresults>No documents found for this year.</td></tr>"

    return f"""
<!doctype html>
<html lang=en>
<meta charset=utf-8>
<meta name="viewport" content="width=device-width, initial-scale=1">
<title>IEEE 802 Archive Index — {year}</title>
<style>{SAFE_HTML['css']}</style>
<header>
  <h1>IEEE 802 Archive Index <span class=muted>— {year}</span></h1>
  <div>
    <input id=q type=text placeholder="Filter by keyword, tag, doc id, author…" aria-label="Search">
  </div>
</header>
<div class=kicker>Files mirror: <code>{esc(base_url)}</code> • Showing <span id=count class=count>{len(items)}</span> documents</div>
<table>
  <thead>
    <tr><th>ID</th><th>Title &amp; Summary</th><th>Meta</th><th>Bytes</th><th>Type</th></tr>
  </thead>
  <tbody>
    {table}
  </tbody>
</table>
<div id=noresults class=noresults style="display:none">No matching documents.</div>
<footer>
  Generated by <code>ieee802_indexer.py</code>. This page is static; search is client‑side.
</footer>
<script>{SAFE_HTML['js']}</script>
</html>
"""


def render_root_html(year_pages: List[Tuple[int,str]]) -> str:
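    """Render the top-level page linking to each generated per-year index.html."""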
    items = "".join(
        f"<li><a href=\"{esc(rel)}\">{year}</a></li>" for (year, rel) in sorted(year_pages)
    )
    return f"""
<!doctype html>
<html lang=en>
<meta charset=utf-8>
<meta name="viewport" content="width=device-width, initial-scale=1">
<title>IEEE 802 Archive Index — 1994–1998</title>
<style>{SAFE_HTML['css']}</style>
<header>
  <h1>IEEE 802 Archive Index <span class=muted>— 1994–1998</span></h1>
</header>
<p>Select a year:</p>
<ul>
  {items}
</ul>
<footer class=dim>Focus: VLAN history (802.1Q) documents circa 1994–1998.</footer>
</html>
"""

# ----------------------------
# Main
# ----------------------------

def main():
    args = parse_args()
    build_per_year_index(args)

if __name__ == "__main__":
    main()

