mirror of
https://github.com/yacy/yacy_search_server.git
synced 2024-09-19 00:01:41 +02:00
Merge branch 'master' of git://gitorious.org/~reger/yacy/bbyacy-rc1
This commit is contained in:
commit
b991685782
File diff suppressed because it is too large
Load Diff
|
@ -79,7 +79,7 @@ public class Document {
|
|||
private List<String> titles; // the document titles, taken from title and/or h1 tag; shall appear as headline of search result
|
||||
private final StringBuilder creator; // author or copyright
|
||||
private final String publisher; // publisher
|
||||
private final List<String> sections; // if present: more titles/headlines appearing in the document
|
||||
private List<String> sections; // if present: more titles/headlines appearing in the document
|
||||
private final StringBuilder description; // an abstract, if present: short content description
|
||||
private Object text; // the clear text, all that is visible
|
||||
private final Map<MultiProtocolURI, Properties> anchors; // all links embedded as clickeable entities (anchor tags)
|
||||
|
@ -631,7 +631,17 @@ dc_rights
|
|||
|
||||
public void addSubDocuments(final Document[] docs) throws IOException {
|
||||
for (final Document doc: docs) {
|
||||
this.sections.addAll(Arrays.asList(doc.getSectionTitles()));
|
||||
// check the class because addAll is not supported if the list was initialized via Arrays.asList (fixed-size list)
|
||||
if (this.sections.getClass() == java.util.LinkedList.class) {
|
||||
this.sections.addAll(doc.sections);
|
||||
} else {
|
||||
/* sections might be initialized via Arrays.asList (which returns a fixed-size list that does not support addAll),
|
||||
so a new list must be assigned */
|
||||
LinkedList<String> tmplist = new LinkedList();
|
||||
tmplist.addAll(this.sections);
|
||||
tmplist.addAll(doc.sections);
|
||||
this.sections = tmplist;
|
||||
}
|
||||
this.titles.addAll(doc.titles());
|
||||
this.keywords.addAll(doc.getKeywords());
|
||||
|
||||
|
|
|
@ -1,106 +1,113 @@
|
|||
package net.yacy.document.parser.augment;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.util.HashSet;
|
||||
import java.util.Iterator;
|
||||
import java.util.Set;
|
||||
|
||||
import net.yacy.data.ymark.YMarkUtil;
|
||||
import net.yacy.document.AbstractParser;
|
||||
import net.yacy.document.Document;
|
||||
import net.yacy.document.Parser;
|
||||
import net.yacy.document.parser.rdfa.impl.RDFaParser;
|
||||
import net.yacy.kelondro.data.meta.DigestURI;
|
||||
import net.yacy.kelondro.logging.Log;
|
||||
import net.yacy.search.Switchboard;
|
||||
|
||||
|
||||
public class AugmentParser extends AbstractParser implements Parser {
|
||||
|
||||
RDFaParser rdfaParser;
|
||||
|
||||
/**
 * Creates the parser: names it "AugmentParser", instantiates the RDFa parser
 * held in {@code rdfaParser}, logs the initialization and registers the
 * supported file extensions and MIME types.
 */
public AugmentParser() {
    super("AugmentParser");
    this.rdfaParser = new RDFaParser();

    Log.logInfo("AugmentedParser", "augmented parser was initialized");

    // "html" used to be added a second time further down; one registration is
    // sufficient, the remaining entries keep their original order
    this.SUPPORTED_EXTENSIONS.add("html");
    this.SUPPORTED_EXTENSIONS.add("php");
    this.SUPPORTED_MIME_TYPES.add("text/html");
    this.SUPPORTED_MIME_TYPES.add("text/xhtml+xml");
    this.SUPPORTED_EXTENSIONS.add("htm");
}
|
||||
|
||||
@Override
|
||||
public Document[] parse(DigestURI url, String mimeType, String charset, InputStream source) throws Failure, InterruptedException {
|
||||
|
||||
Document[] htmlDocs = this.rdfaParser.parse(url, mimeType, charset, source);
|
||||
try {
|
||||
source.reset();
|
||||
} catch (IOException e) {
|
||||
Log.logException(e);
|
||||
}
|
||||
|
||||
Document alreadyParsedDocument = htmlDocs[0];
|
||||
Document superDoc = analyze(alreadyParsedDocument, url, mimeType, charset);
|
||||
Document augmentDoc = parseAndAugment(url, mimeType, charset);
|
||||
Document[] retDocs = new Document[htmlDocs.length + 2];
|
||||
for (int i = 0; i < htmlDocs.length; i++) {
|
||||
retDocs[i] = htmlDocs[i];
|
||||
}
|
||||
|
||||
retDocs[retDocs.length - 1] = augmentDoc;
|
||||
retDocs[retDocs.length - 2] = superDoc;
|
||||
return retDocs;
|
||||
}
|
||||
|
||||
private static Document analyze (Document alreadyParsedDocument, DigestURI url,
|
||||
String mimeType, String charset) {
|
||||
|
||||
Document newDoc = new Document(url, mimeType, charset, null, null, null, singleList(""), "",
|
||||
"", null, "", 0, 0, null, null, null, null, false);
|
||||
|
||||
// if the magic word appears in the document, perform extra actions.
|
||||
if (alreadyParsedDocument.getKeywords().contains("magicword")) {
|
||||
String all = "";
|
||||
all = "yacylatest";
|
||||
newDoc = new Document(url, mimeType, charset, null, null, null, singleList(""), "",
|
||||
"", null, "", 0, 0, all, null, null, null, false);
|
||||
}
|
||||
|
||||
return newDoc;
|
||||
}
|
||||
|
||||
private Document parseAndAugment(DigestURI url, String mimeType, String charset) {
|
||||
|
||||
String all = "";
|
||||
Document newDoc = new Document(url, mimeType, charset, null, null, null, singleList(""), "",
|
||||
"", null, "", 0, 0, all, null, null, null, false);
|
||||
|
||||
Iterator<net.yacy.kelondro.blob.Tables.Row> it;
|
||||
try {
|
||||
it = Switchboard.getSwitchboard().tables.iterator("aggregatedtags");
|
||||
it = Switchboard.getSwitchboard().tables.orderBy(it, -1, "timestamp_creation").iterator();
|
||||
while (it.hasNext()) {
|
||||
net.yacy.kelondro.blob.Tables.Row r = it.next();
|
||||
if (r.get("url", "").equals (url.toNormalform(false))) {
|
||||
Set<String> tags = new HashSet<String>();
|
||||
for (String s : YMarkUtil.keysStringToSet(r.get("scitag", ""))) {
|
||||
tags.add(s);
|
||||
}
|
||||
newDoc.addTags(tags);
|
||||
}
|
||||
}
|
||||
|
||||
} catch (IOException e) {
|
||||
Log.logException(e);
|
||||
}
|
||||
|
||||
package net.yacy.document.parser.augment;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.util.HashSet;
|
||||
import java.util.Iterator;
|
||||
import java.util.Set;
|
||||
|
||||
import net.yacy.data.ymark.YMarkUtil;
|
||||
import net.yacy.document.AbstractParser;
|
||||
import net.yacy.document.Document;
|
||||
import net.yacy.document.Parser;
|
||||
import net.yacy.document.parser.rdfa.impl.RDFaParser;
|
||||
import net.yacy.kelondro.data.meta.DigestURI;
|
||||
import net.yacy.kelondro.logging.Log;
|
||||
import net.yacy.search.Switchboard;
|
||||
|
||||
|
||||
public class AugmentParser extends AbstractParser implements Parser {
|
||||
|
||||
RDFaParser rdfaParser;
|
||||
|
||||
/**
 * Creates the parser: names it "AugmentParser", instantiates the RDFa parser
 * held in {@code rdfaParser}, logs the initialization and registers the
 * supported file extensions and MIME types.
 */
public AugmentParser() {
    super("AugmentParser");
    this.rdfaParser = new RDFaParser();

    Log.logInfo("AugmentedParser", "augmented parser was initialized");

    // "html" used to be added a second time further down; one registration is
    // sufficient, the remaining entries keep their original order
    this.SUPPORTED_EXTENSIONS.add("html");
    this.SUPPORTED_EXTENSIONS.add("php");
    this.SUPPORTED_MIME_TYPES.add("text/html");
    this.SUPPORTED_MIME_TYPES.add("text/xhtml+xml");
    this.SUPPORTED_EXTENSIONS.add("htm");
}
|
||||
|
||||
@Override
|
||||
public Document[] parse(DigestURI url, String mimeType, String charset, InputStream source) throws Failure, InterruptedException {
|
||||
|
||||
Document[] htmlDocs = this.rdfaParser.parse(url, mimeType, charset, source);
|
||||
try {
|
||||
source.reset();
|
||||
} catch (IOException e) {
|
||||
Log.logException(e);
|
||||
}
|
||||
|
||||
Document alreadyParsedDocument = htmlDocs[0];
|
||||
Document superDoc = analyze(alreadyParsedDocument, url, mimeType, charset);
|
||||
Document augmentDoc = parseAndAugment(url, mimeType, charset);
|
||||
Document[] retDocs = new Document[htmlDocs.length + 1];
|
||||
for (int i = 1; i < htmlDocs.length; i++) {
|
||||
retDocs[i - 1] = htmlDocs[i];
|
||||
}
|
||||
|
||||
retDocs[retDocs.length - 1] = augmentDoc;
|
||||
retDocs[retDocs.length - 2] = superDoc;
|
||||
try { // merge additional result docs into the parse main document
|
||||
alreadyParsedDocument.addSubDocuments(retDocs);
|
||||
} catch (IOException ex) {
|
||||
Log.logException(ex);
|
||||
}
|
||||
Document[] finalretDocs = new Document[1]; // return the merged document
|
||||
finalretDocs[0] = alreadyParsedDocument;
|
||||
return finalretDocs;
|
||||
}
|
||||
|
||||
private static Document analyze (Document alreadyParsedDocument, DigestURI url,
|
||||
String mimeType, String charset) {
|
||||
|
||||
Document newDoc = new Document(url, mimeType, charset, null, null, null, singleList(""), "",
|
||||
"", null, "", 0, 0, null, null, null, null, false);
|
||||
|
||||
// if the magic word appears in the document, perform extra actions.
|
||||
if (alreadyParsedDocument.getKeywords().contains("magicword")) {
|
||||
String all = "";
|
||||
all = "yacylatest";
|
||||
newDoc = new Document(url, mimeType, charset, null, null, null, singleList(""), "",
|
||||
"", null, "", 0, 0, all, null, null, null, false);
|
||||
}
|
||||
|
||||
return newDoc;
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
private Document parseAndAugment(DigestURI url, String mimeType, String charset) {
|
||||
|
||||
String all = "";
|
||||
Document newDoc = new Document(url, mimeType, charset, null, null, null, singleList(""), "",
|
||||
"", null, "", 0, 0, all, null, null, null, false);
|
||||
|
||||
Iterator<net.yacy.kelondro.blob.Tables.Row> it;
|
||||
try {
|
||||
it = Switchboard.getSwitchboard().tables.iterator("aggregatedtags");
|
||||
it = Switchboard.getSwitchboard().tables.orderBy(it, -1, "timestamp_creation").iterator();
|
||||
while (it.hasNext()) {
|
||||
net.yacy.kelondro.blob.Tables.Row r = it.next();
|
||||
if (r.get("url", "").equals (url.toNormalform(false))) {
|
||||
Set<String> tags = new HashSet<String>();
|
||||
for (String s : YMarkUtil.keysStringToSet(r.get("scitag", ""))) {
|
||||
tags.add(s);
|
||||
}
|
||||
newDoc.addTags(tags);
|
||||
}
|
||||
}
|
||||
|
||||
} catch (IOException e) {
|
||||
Log.logException(e);
|
||||
}
|
||||
|
||||
return newDoc;
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue
Block a user