import java.io.*;
import java.net.*;
import java.util.*;


/**
 * The ParserChooser class provides methods for choosing the proper parser
 * based on a file's name, mime type, or magic
 *
 * @version $Id: ParserChooser.java,v 1.10 2002/09/22 23:13:29 blsecres Exp $
 * @author Ben Secrest &lt;blsecres@users.sourceforge.net&gt;
 */
public class ParserChooser {
    /** The default logging level for this module */
    private static final int LOGLEVEL = 9;

    /** The number of bytes to read while looking for file magic */
    private static final int MAGIC_HEADER_SIZE = 1024;

    /** The parser for use with Debian Packages. */
    private FileParser debParser;

    /** The parser for use with HTML documents. */
    private FileParser htmlParser;

    /** The parser for use with PDF documents. */
    private FileParser pdfParser;

    /** The parser for use with PNG images. */
    private FileParser pngParser;

    /** The parser for use with JPEG images. */
    private FileParser jpegParser;

    /** The parser for use with GIF images. */
    private FileParser gifParser;

    /** The parser for use with MP3 audio. */
    private FileParser mp3Parser;

    /** File extension (String) to parser (FileParser) map */
    private HashMap extensionMap;

    /** Mime type (String) to parser (FileParser) map */
    private HashMap mimeTypeMap;

    /** File magic (FileMagic) to parser (FileParser) map */
    private HashMap magicMap;

    /** The set of items to parse from files */
    private IGKeySet wantedItems;

    /** The links extracted from the last file parsed */
    private String[] links;

    /** Determine if links should be extracted */
    private boolean wantLinks;

    /** The logging object */
    private IGLog log;

    /**
     * A HashMap of http headers (name to value) to send to remote servers,
     * or <tt>null</tt> if none have been configured
     */
    private HashMap httpHeaders;


    /**
     * Constructs a new ParserChooser object, creating one instance of every
     * known parser and registering each under its extensions, mime types
     * and magic.
     * @param logObj The object for logging
     * @param wanted The set of desired variables to be extracted by each
     *   parser
     */
    public ParserChooser(IGLog logObj, IGKeySet wanted) {
        log = logObj;
        wantedItems = wanted;

        if (LOGLEVEL >= IGLog.PROCEDURE) {
            log.add(IGLog.PROCEDURE, "ParserChooser.ParserChooser(IGLog, "
                    + "IGKeySet)");
        }
        if (LOGLEVEL >= IGLog.PROGRESS) {
            log.addResource(IGLog.PROGRESS, "PC_BUILD_FP", null);
        }

        // the maps must exist before any parser is registered
        extensionMap = new HashMap(6);
        mimeTypeMap = new HashMap(6);
        magicMap = new HashMap(6);

        mp3Parser = new MP3Parser();
        initializeParser(mp3Parser);

        pngParser = new PNGParser();
        initializeParser(pngParser);

        jpegParser = new JPEGParser();
        initializeParser(jpegParser);

        gifParser = new GIFParser();
        initializeParser(gifParser);

        pdfParser = new PDFParser();
        initializeParser(pdfParser);

        debParser = new DebParser();
        initializeParser(debParser);

        //BLS htmlParser = new HTMLParser_FSM();
        htmlParser = new HTMLParser_RE();
        initializeParser(htmlParser);

        wantLinks = false;
        httpHeaders = null;
    }


    /**
     * Set the HTTP headers to send to a remote server.  Each entry is
     * expected in <tt>name=value</tt> form and is split at the first
     * <tt>'='</tt>.  Entries that are <tt>null</tt>, lack an <tt>'='</tt>,
     * or have an empty name or an empty value are reported as warnings and
     * skipped.  Passing <tt>null</tt> leaves any previously configured
     * headers untouched.
     * @param headers A string array of headers as read from the config file
     */
    public void setHTTPHeaders(String[] headers) {
        if (LOGLEVEL >= IGLog.PROCEDURE) {
            log.add(IGLog.PROCEDURE, "ParserChooser.setHTTPHeaders(String[])");
        }

        if (headers == null) {
            return;
        }

        httpHeaders = new HashMap(headers.length);

        // convert the string array into a HashMap for easier access
        for (int i = 0; i < headers.length; i++) {
            String header = headers[i];
            int index = (header == null ? -1 : header.indexOf('='));

            // index <= 0 covers both "no separator" and "empty name"; the
            // second test rejects an empty value
            if (index <= 0 || index == header.length() - 1) {
                log.addWarning(46, "PC_BAD_HTTP_HEADER",
                        new String[]{String.valueOf(header)});
                continue;
            }

            if (LOGLEVEL >= IGLog.PROGRESS) {
                log.addResource(IGLog.PROGRESS, "PC_SET_HTTP_HEADER",
                        new String[]{header});
            }

            httpHeaders.put(header.substring(0, index),
                    header.substring(index + 1));
        }
    }


    /**
     * Parse a file, filling in the requested information.
     * <p>
     * Remote files (locations recognised by <tt>IGMisc.isURL</tt>) select a
     * parser by the mime type of the response; local files select one by
     * file extension.  If that yields nothing, the first
     * {@link #MAGIC_HEADER_SIZE} bytes are read and matched against the
     * magic registered by each parser.
     * @param file The IGFile structure to fill in
     * @throws FileNotFoundException if a remote HTTP server answers 404 (or
     *   a local file cannot be opened for magic detection)
     * @throws IOException on any other I/O failure, or if no parser could
     *   be found for the file
     */
    void parse(IGFile file) throws IOException, FileNotFoundException {
        if (LOGLEVEL >= IGLog.PROCEDURE) {
            log.add(IGLog.PROCEDURE, "ParserChooser.parse(IGFile[" +
                    file.getLocation() + "])");
        }

        IGLog.ui.scanProgress(file.getLocation());

        String filename = file.getLocation();

        if (LOGLEVEL >= IGLog.SECTION) {
            log.addResource(IGLog.SECTION, "PC_PARSE_FILE",
                    new String[]{filename});
        }

        boolean useNetwork = IGMisc.isURL(filename);    // local or remote file
        FileParser parser = null;       // object used to parse files
        InputStream stream = null;      // stream for reading magic
        URL url = null;                 // url for remote files
        URLConnection connection = null;// connection to remote hosts

        // behave differently if the filename references a local or remote file
        if (useNetwork) {
            url = new URL(filename);
            connection = openConnection(url);

            // check for a 404 response from the web server
            if (connection instanceof HttpURLConnection
                    && ((HttpURLConnection) connection).getResponseCode()
                        == HttpURLConnection.HTTP_NOT_FOUND) {
                throw new FileNotFoundException(filename);
            }

            // determine file's size; getContentLength() reports -1 when the
            // server does not supply a length, which is stored verbatim
            if (wantedItems.wants(IGKey.FILE_SIZE)) {
                file.put(IGKey.FILE_SIZE,
                        Integer.toString(connection.getContentLength()));
            }

            // get content type for mime type based parser selection; the
            // connection returns null when the type is unknown, in which
            // case selection falls through to magic detection below
            String contentType = connection.getContentType();

            if (contentType != null) {
                // isolate the mime type portion of 'mime/type; charset=XYZ'
                int separator = contentType.indexOf(';');

                if (separator != -1) {
                    contentType = contentType.substring(0, separator);
                }
                contentType = contentType.trim();

                if (LOGLEVEL >= IGLog.PROGRESS) {
                    log.addResource(IGLog.PROGRESS, "PC_FIND_PARSER",
                            new String[]{log.getString("PC_MIME_TYPE"),
                                contentType});
                }

                // get a parser based on mime type
                parser = (FileParser) mimeTypeMap.get(contentType);
            }
        } else /* ! useNetwork */ {
            // find parser based on local file extension
            int dot = filename.lastIndexOf('.');

            // determine the file's length
            if (wantedItems.wants(IGKey.FILE_SIZE)) {
                file.put(IGKey.FILE_SIZE,
                        Long.toString(new File(filename).length()));
            }

            // when there is no '.', this is the whole file name; it is only
            // used for the log message in that case
            String extension = filename.substring(dot + 1);

            if (LOGLEVEL >= IGLog.PROGRESS) {
                log.addResource(IGLog.PROGRESS, "PC_FIND_PARSER",
                        new String[]{log.getString("PC_FILE_EXT"),
                            extension});
            }

            // get a parser based on file extension
            if (dot != -1) {
                parser = (FileParser) extensionMap.get(extension);
            }
        }

        // no parser found based on first choice methods, check magic
        if (parser == null) {
            // use a buffered stream so it can hopefully be reset
            stream = new BufferedInputStream(useNetwork
                    ? connection.getInputStream()
                    : new FileInputStream(filename));

            if (LOGLEVEL >= IGLog.INFO) {
                log.addResource(IGLog.INFO, "PC_USE_MAGIC", null);
            }

            byte[] magicBuffer = new byte[MAGIC_HEADER_SIZE];

            if (useNetwork) {
                // mark the stream so it can be reset after reading a header
                stream.mark(MAGIC_HEADER_SIZE);

                try {
                    readMagic(stream, magicBuffer);
                } catch (IOException ioe) {
                    // do not leave the connection's stream open on failure
                    closeQuietly(stream);
                    throw ioe;
                }

                parser = getParserByMagic(magicBuffer);

                // parser may need some of data that was read, reset stream to
                // beginning of file
                try {
                    stream.reset();
                } catch (IOException ioe) {
                    // reset failed: drop the consumed stream and fetch the
                    // document again, with the same request headers
                    closeQuietly(stream);
                    stream = openConnection(url).getInputStream();
                }
            } else {
                // local files need to be opened by the parser using the
                // correct character encoding, so always close this stream
                try {
                    readMagic(stream, magicBuffer);
                } finally {
                    stream.close();
                }

                parser = getParserByMagic(magicBuffer);
            }
        }

        if (parser == null) {
            // a remote stream opened for magic detection is still open here
            if (useNetwork) {
                closeQuietly(stream);
            }

            throw new IOException("No parser for file.");
        }

        if (LOGLEVEL >= IGLog.RESULT) {
            log.addResource(IGLog.RESULT, "PC_USE_PARSER",
                    new String[]{parser.getClass().getName()});
        }

        // pass on the current hyperlink extraction preference
        if (parser instanceof LinkExtractor) {
            ((LinkExtractor) parser).wantURLs(wantLinks);
        }

        /*
         * parse the document using the input stream and character
         * encoding version of FileParser.parse()
         */
        if (useNetwork) {
            try {
                parser.parse(file, (stream != null ? stream
                            : connection.getInputStream()));
            } catch (StreamResetException sre) {
                /*
                 * there was an error resetting the stream, the file must
                 * be retrieved again
                 */
                try {
                    parser.parse(file, openConnection(url).getInputStream());
                } catch (StreamResetException sre2) {
                    throw new IOException(
                            "Stream Reset Exception shouldn't happen");
                }
            }
        } else {
            parser.parse(file);
        }

        if (wantedItems.wants(IGKey.FILE_NAME)) {
            file.put(IGKey.FILE_NAME, IGMisc.basename(file.getLocation()));
        }

        // keep a copy of the links from the last document parsed
        if (wantLinks && (parser instanceof LinkExtractor)) {
            links = ((LinkExtractor) parser).getLinks(file);
        } else {
            links = null;
        }
    }


    /**
     * Allow access to the last set of links extracted from a file.  The
     * value returned is whatever the most recent call to parse() stored;
     * the argument is only used for the log message.
     * @param file The IGFile to get links for
     * @return A string array of links, or <tt>null</tt> if the last parse
     *   did not extract any
     */
    public String[] getLinks(IGFile file) {
        if (LOGLEVEL >= IGLog.PROCEDURE) {
            log.add(IGLog.PROCEDURE, "ParserChooser.getLinks(IGFile["
                    + file.getLocation() + "])");
        }
        return links;
    }


    /**
     * Set link extraction preference
     * @param pref <tt>true</tt> to extract links, <tt>false</tt> to ignore
     *  them
     */
    public void wantLinks(boolean pref) {
        wantLinks = pref;
    }


    /**
     * Initialize a parser for use: hand it the wanted items and the log,
     * then register it under each extension, mime type and magic it
     * reports.
     * @param parser The FileParser to initialize
     */
    private void initializeParser(FileParser parser) {
        String[] criteria;

        parser.setWantedItems(wantedItems);
        parser.setLog(log);

        criteria = parser.getExtensions();
        if (criteria != null) {
            for (int i = 0; i < criteria.length; i++) {
                extensionMap.put(criteria[i], parser);
            }
        }

        criteria = parser.getMimeTypes();
        if (criteria != null) {
            for (int i = 0; i < criteria.length; i++) {
                mimeTypeMap.put(criteria[i], parser);
            }
        }

        // a null key would break the lookup loop in getParserByMagic()
        Object magic = parser.getMagic();
        if (magic != null) {
            magicMap.put(magic, parser);
        }
    }


    /**
     * Find a parser based on the various magic object given by the parsers
     * @param buffer The buffer to search for magic
     * @return The correct FileParser object for the given magic buffer or
     *  <tt>null</tt> if one is not found
     */
    private FileParser getParserByMagic(byte[] buffer) {
        for (Iterator i = magicMap.entrySet().iterator(); i.hasNext(); ) {
            Map.Entry entry = (Map.Entry) i.next();
            FileMagic fileMagic = (FileMagic) entry.getKey();

            if (fileMagic.magicMatches(buffer)) {
                return (FileParser) entry.getValue();
            }
        }

        return null;
    }


    /**
     * Open a connection to a remote document, applying the configured HTTP
     * request headers when the connection is an HTTP connection.
     * @param url The location to connect to
     * @return The (not yet connected) URLConnection
     * @throws IOException if the connection cannot be opened
     */
    private URLConnection openConnection(URL url) throws IOException {
        URLConnection connection = url.openConnection();

        if (httpHeaders != null && connection instanceof HttpURLConnection) {
            for (Iterator i = httpHeaders.entrySet().iterator();
                    i.hasNext(); ) {
                Map.Entry entry = (Map.Entry) i.next();

                connection.setRequestProperty(entry.getKey().toString(),
                        entry.getValue().toString());
            }
        }

        return connection;
    }


    /**
     * Fill a buffer from a stream, reading repeatedly because a single
     * read() may return fewer bytes than requested.  Stops early at end of
     * stream, leaving the remainder of the buffer untouched.
     * @param in The stream to read from
     * @param buffer The buffer to fill
     * @return The number of bytes actually read
     * @throws IOException if reading fails
     */
    private static int readMagic(InputStream in, byte[] buffer)
            throws IOException {
        int total = 0;

        while (total < buffer.length) {
            int count = in.read(buffer, total, buffer.length - total);

            if (count < 0) {
                break;
            }
            total += count;
        }

        return total;
    }


    /**
     * Close a stream that is being abandoned, ignoring any failure.
     * @param in The stream to close, may be <tt>null</tt>
     */
    private static void closeQuietly(InputStream in) {
        if (in == null) {
            return;
        }

        try {
            in.close();
        } catch (IOException ignored) {
            // the stream is being discarded; nothing useful can be done
        }
    }
}
