#!/usr/bin/env python3
"""
IEEE 802.1 Document Scanner
Scans specified year folders and creates an inventory of PDF and TXT files
"""

import os
import sqlite3
import json
from pathlib import Path
from datetime import datetime

# Load configuration
def load_config(path='.env'):
    """Load KEY=VALUE settings from a dotenv-style text file.

    Blank lines, lines starting with ``#`` and lines without ``=`` are
    ignored. Every other line is split on its first ``=``; the key and the
    value are stripped of surrounding whitespace, and one pair of matching
    single or double quotes around the value is removed.

    Args:
        path: File to read. Defaults to ``.env`` in the current working
            directory.

    Returns:
        A dict mapping setting names to their string values. The dict is
        empty when the file does not exist.
    """
    config = {}
    if not os.path.exists(path):
        return config

    # Decode explicitly so parsing does not depend on the platform locale;
    # the "-sig" variant also drops a UTF-8 byte-order mark, which would
    # otherwise become part of the first key.
    with open(path, 'r', encoding='utf-8-sig') as f:
        for line in f:
            line = line.strip()
            if not line or line.startswith('#') or '=' not in line:
                continue
            key, value = line.split('=', 1)
            value = value.strip()
            # Dotenv files commonly quote values (KEY="some value"); keeping
            # the quotes would make them part of the setting itself.
            if len(value) >= 2 and value[0] == value[-1] and value[0] in '"\'':
                value = value[1:-1]
            config[key.strip()] = value
    return config

# Settings are read from the optional .env file; the fallbacks below apply
# when a key is absent.
config = load_config()
BASE_PATH = config.get('BASE_PATH', '/home/mark/files.serialport.org/ieee802')
# Normalize the comma-separated year list once: trim whitespace around each
# entry and drop empty ones (e.g. from a trailing comma) so they are neither
# printed nor passed on as years to scan.
YEARS = [
    year.strip()
    for year in config.get('YEARS', '1994,1995,1996,1997,1998').split(',')
    if year.strip()
]

# Initialize database
def init_database(db_path='ieee_docs.db'):
    """Open the inventory database and create the ``documents`` table if needed.

    Args:
        db_path: SQLite database to open. Defaults to ``ieee_docs.db`` in
            the current working directory; ``':memory:'`` gives a
            throwaway in-memory database.

    Returns:
        An open ``sqlite3.Connection``. The caller is responsible for
        closing it.

    Raises:
        sqlite3.Error: If the schema cannot be created. The connection is
            closed before the error propagates, so it does not leak.
    """
    conn = sqlite3.connect(db_path)
    try:
        # filepath is UNIQUE so that a file is stored at most once.
        conn.execute('''CREATE TABLE IF NOT EXISTS documents
                 (id INTEGER PRIMARY KEY AUTOINCREMENT,
                  filepath TEXT UNIQUE,
                  filename TEXT,
                  year TEXT,
                  extension TEXT,
                  filesize INTEGER,
                  scan_date TEXT,
                  processed INTEGER DEFAULT 0,
                  text_extracted INTEGER DEFAULT 0,
                  summary TEXT,
                  subgroup TEXT,
                  doc_number TEXT,
                  metadata TEXT)''')
        conn.commit()
    except sqlite3.Error:
        conn.close()
        raise
    return conn

def scan_directory(base_path, years):
    """Collect PDF and TXT files from the ``docs<year>`` folders under a base path.

    Each year is stripped of surrounding whitespace and mapped to the folder
    ``<base_path>/docs<year>``, which is walked recursively. Files whose
    extension is ``.pdf`` or ``.txt`` (case-insensitive) are recorded.
    Missing folders and files whose size cannot be read are reported on
    stdout and skipped rather than aborting the scan.

    Args:
        base_path: Directory containing the ``docs<year>`` folders.
        years: Iterable of year strings.

    Returns:
        A list of dicts with the keys ``filepath``, ``filename``, ``year``,
        ``extension`` (lower-cased, with the leading dot), ``filesize`` (in
        bytes) and ``scan_date`` (ISO-8601 local timestamp). Within a year
        folder, entries follow a sorted top-down traversal.
    """
    wanted_extensions = ('.pdf', '.txt')
    documents = []

    for year in years:
        year = year.strip()
        if not year:
            # An empty entry (e.g. from a trailing comma in the year list)
            # would otherwise resolve to the bare "docs" folder.
            continue

        year_folder = os.path.join(base_path, f'docs{year}')

        if not os.path.exists(year_folder):
            print(f"Warning: {year_folder} does not exist, skipping...")
            continue

        print(f"Scanning {year_folder}...")

        for root, dirs, files in os.walk(year_folder):
            # Sorting in place makes os.walk descend in a stable order, so
            # the result does not depend on filesystem listing order.
            dirs.sort()
            for filename in sorted(files):
                ext = os.path.splitext(filename)[1].lower()
                if ext not in wanted_extensions:
                    continue

                filepath = os.path.join(root, filename)
                try:
                    filesize = os.path.getsize(filepath)
                except OSError as exc:
                    # Broken symlinks and files removed mid-scan must not
                    # abort the whole inventory.
                    print(f"Warning: cannot read {filepath} ({exc}), skipping...")
                    continue

                documents.append({
                    'filepath': filepath,
                    'filename': filename,
                    'year': year,
                    'extension': ext,
                    'filesize': filesize,
                    'scan_date': datetime.now().isoformat(),
                })

    return documents

def insert_documents(conn, documents):
    """Store scanned document records, refreshing rows that already exist.

    Every record is first inserted as a new row. When the insert is rejected
    with an integrity error (the filepath is already stored), the scan date
    and file size of the existing row are overwritten instead. All changes
    are committed once, after the last record.

    Returns a ``(inserted, updated)`` tuple counting both outcomes.
    """
    insert_sql = '''INSERT INTO documents 
                        (filepath, filename, year, extension, filesize, scan_date)
                        VALUES (?, ?, ?, ?, ?, ?)'''
    update_sql = '''UPDATE documents 
                        SET scan_date = ?, filesize = ?
                        WHERE filepath = ?'''

    cursor = conn.cursor()
    new_rows = 0
    refreshed_rows = 0

    for record in documents:
        row = (
            record['filepath'],
            record['filename'],
            record['year'],
            record['extension'],
            record['filesize'],
            record['scan_date'],
        )
        try:
            cursor.execute(insert_sql, row)
        except sqlite3.IntegrityError:
            # Already inventoried: only size and scan date are refreshed.
            cursor.execute(
                update_sql,
                (record['scan_date'], record['filesize'], record['filepath']),
            )
            refreshed_rows += 1
        else:
            new_rows += 1

    conn.commit()
    return new_rows, refreshed_rows

def main():
    """Run one scan: inventory the configured year folders and print a report.

    Opens the database, scans ``BASE_PATH`` for the years in ``YEARS``,
    stores the results, then prints the totals for this run followed by a
    per-year, per-extension breakdown of everything in the database. The
    database connection is closed even when one of these steps raises.
    """
    print("IEEE 802.1 Document Scanner")
    print("=" * 50)
    print(f"Base path: {BASE_PATH}")
    print(f"Years to scan: {', '.join(YEARS)}")
    print()

    # Initialize database
    conn = init_database()
    try:
        # Scan directories
        documents = scan_directory(BASE_PATH, YEARS)

        # Insert into database
        inserted, updated = insert_documents(conn, documents)

        # Print statistics
        print()
        print("=" * 50)
        print("Scan Complete!")
        print(f"Total documents found: {len(documents)}")
        print(f"New documents: {inserted}")
        print(f"Updated documents: {updated}")

        # Show breakdown by year and type; this covers every row in the
        # database, not only the files found in this run.
        c = conn.cursor()
        c.execute('''SELECT year, extension, COUNT(*) 
                 FROM documents 
                 GROUP BY year, extension 
                 ORDER BY year, extension''')

        print("\nBreakdown by year:")
        current_year = None
        for year, ext, count in c.fetchall():
            # Rows arrive ordered by year, so a change of year starts a new
            # heading.
            if year != current_year:
                print(f"\n{year}:")
                current_year = year
            print(f"  {ext}: {count} files")
    finally:
        conn.close()

# Run the scan only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()
