yacy_search_server/source/net/yacy/document/parser/augment/AugmentParser.java

package net.yacy.document.parser.augment;

import java.io.IOException;
import java.io.InputStream;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Set;

import net.yacy.document.Document;
import net.yacy.document.parser.rdfa.impl.RDFaParser;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.search.Switchboard;
import de.anomic.data.ymark.YMarkUtil;


public class AugmentParser extends RDFaParser {

	public AugmentParser(String name) {
		super(name);

		System.out.println("augmented parser was initialized");

		this.SUPPORTED_EXTENSIONS.remove("htm");
		this.SUPPORTED_EXTENSIONS.remove("html");
		this.SUPPORTED_EXTENSIONS.remove("shtml");
		this.SUPPORTED_EXTENSIONS.remove("xhtml");
		this.SUPPORTED_EXTENSIONS.remove("php");
		this.SUPPORTED_EXTENSIONS.remove("php3");
		this.SUPPORTED_EXTENSIONS.remove("php4");
		this.SUPPORTED_EXTENSIONS.remove("php5");
		this.SUPPORTED_EXTENSIONS.remove("cfm");
		this.SUPPORTED_EXTENSIONS.remove("asp");
		this.SUPPORTED_EXTENSIONS.remove("aspx");
		this.SUPPORTED_EXTENSIONS.remove("tex");
		this.SUPPORTED_EXTENSIONS.remove("txt");
		this.SUPPORTED_EXTENSIONS.remove("jsp");
		this.SUPPORTED_EXTENSIONS.remove("mf");
		this.SUPPORTED_EXTENSIONS.remove("pl");
		this.SUPPORTED_EXTENSIONS.remove("py");
		this.SUPPORTED_MIME_TYPES.remove("text/html");
		this.SUPPORTED_MIME_TYPES.remove("text/xhtml+xml");
		this.SUPPORTED_MIME_TYPES.remove("application/xhtml+xml");
		this.SUPPORTED_MIME_TYPES.remove("application/x-httpd-php");
		this.SUPPORTED_MIME_TYPES.remove("application/x-tex");
		this.SUPPORTED_MIME_TYPES.remove("text/plain");
		this.SUPPORTED_MIME_TYPES.remove("text/sgml");
		this.SUPPORTED_MIME_TYPES.remove("text/csv");

		this.SUPPORTED_EXTENSIONS.add("html");
		this.SUPPORTED_EXTENSIONS.add("php");
		this.SUPPORTED_MIME_TYPES.add("text/html");
		this.SUPPORTED_MIME_TYPES.add("text/xhtml+xml");
		this.SUPPORTED_EXTENSIONS.add("html");
		this.SUPPORTED_EXTENSIONS.add("htm");
	}

	@Override
	public Document[] parse(DigestURI url, String mimeType,
			String charset, InputStream source) throws Failure,
			InterruptedException {

		Document[] htmlDocs = super.parse(url, mimeType, charset, source);
		try {
			source.reset();
		} catch (IOException e) {
			// TODO Auto-generated catch block
			e.printStackTrace();
		}

		Document alreadyParsedDocument = htmlDocs[0];

		Document superDoc = analyze(alreadyParsedDocument, url, mimeType, charset, source);


		Document augmentDoc = parseAndAugment(url, mimeType, charset, source);


		Document[] retDocs = new Document[htmlDocs.length + 2];
		for (int i = 0; i < htmlDocs.length; i++) {
			retDocs[i] = htmlDocs[i];
		}

		retDocs[retDocs.length - 1] = augmentDoc;
		retDocs[retDocs.length - 2] = superDoc;

		return retDocs;

	}

	private Document analyze (Document alreadyParsedDocument, DigestURI url,
			String mimeType, String charset, InputStream source) {

		Document newDoc = new Document(url, mimeType, charset, null, null, null, "", "",
				"", null, "", 0, 0, null, null, null, null, false);

		// if the magic word appears in the document, perform extra actions.


		if (alreadyParsedDocument.getKeywords().contains("magicword")) {
			String all = "";

			all = "yacylatest";
			newDoc = new Document(url, mimeType, charset, null, null, null, "", "",
					"", null, "", 0, 0, all.getBytes(), null, null, null, false);
		}

		return newDoc;
	}


	private Document parseAndAugment(DigestURI url,
			String mimeType, String charset, InputStream source) {

		String all = "";

		Document newDoc = new Document(url, mimeType, charset, null, null, null, "", "",
				"", null, "", 0, 0, all.getBytes(), null, null, null, false);


		Iterator<net.yacy.kelondro.blob.Tables.Row> it;
		try {
			it = Switchboard.getSwitchboard().tables.iterator("aggregatedtags");

			it = Switchboard.getSwitchboard().tables.orderBy(it, -1, "timestamp_creation").iterator();

			while (it.hasNext()) {
				net.yacy.kelondro.blob.Tables.Row r = it.next();

				if (r.get("url", "").equals (url.toNormalform(false, false))) {

					Set<String> tags = new HashSet<String>();

					for (String s : YMarkUtil.keysStringToSet(r.get("scitag", ""))) {

						tags.add(s);

					}


					newDoc.addTags(tags);

				}
			}


		} catch (IOException e) {
			// TODO Auto-generated catch block
			e.printStackTrace();
		}


		return newDoc;
	}


}