added an html field scraper which reads text from html entities of a
given css class and extends a given vocabulary with a term consisting
of the text content of the html class tag. Additionally, the term is
included in the semantic facet of the document. This allows the
creation of a faceted search on documents without the pre-creation of
vocabularies; instead, the vocabulary is created on the fly, possibly
for use in other crawls. If term scraping for a specific vocabulary
succeeds on a document, that vocabulary is excluded from
auto-annotation of the page.
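
For illustration (the vocabulary and class names here are hypothetical):
if a vocabulary 'plants' is assigned the css class 'botanic-name', then a
crawled page containing <div class="botanic-name">Rosa gallica</div>
extends 'plants' with the term 'Rosa gallica', and the document is
annotated with that term in the 'plants' facet.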

To use this feature, do the following:
- create a vocabulary on /Vocabulary_p.html (if it does not exist yet)
- in /CrawlStartExpert.html you will now see the vocabularies as a column
in a table. The second column provides text fields where you can enter
the name of the html entity class from which the literal for the
corresponding vocabulary shall be scraped
- when doing a search, you will see the content of the scraped fields in
a navigation facet for the given vocabulary
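
Internally the form input is stored with the crawl profile as a JSON
object mapping each vocabulary to its scraping properties, e.g. (with the
hypothetical names from above) {"plants":{"class":"botanic-name"}}.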
Michael Peter Christen 2015-01-30 13:20:56 +01:00
parent 1cb290170e
commit b5ac29c9a5
59 changed files with 419 additions and 141 deletions


@ -197,7 +197,6 @@ public class ConfigHeuristics_p {
return prop;
}
@SuppressWarnings("unused")
private static void writeopensearchcfg(final Switchboard sb, final serverObjects post) {
// read index schema table flags


@ -443,7 +443,7 @@
<fieldset>
<legend>Robot Behaviour</legend>
<dl>
<dt><label for="collection">Use Special User Agent and robot identification</label></dt>
<dt><label>Use Special User Agent and robot identification</label></dt>
<dd>
<span class="info" style="float:right"><img src="env/grafics/i16.gif" width="16" height="16" alt="info"/><span style="right:0px;">
You are running YaCy in non-p2p mode and because YaCy can be used as replacement for commercial search appliances
@ -460,6 +460,30 @@
</dl>
</fieldset>
#(/agentSelect)#
#(vocabularySelect)#::
<fieldset>
<legend>Enrich Vocabulary</legend>
<dl>
<dt><label>Scraping Fields</label></dt>
<dd>
<span class="info" style="float:right"><img src="env/grafics/i16.gif" width="16" height="16" alt="info"/><span style="right:0px;">
You can use class names to enrich the terms of a vocabulary based on the text content that appears on web pages. Please write the names of classes into the matrix.
</span></span>
<table class="table table-condensed">
<tr><th>Vocabulary</th><th>Class</th></tr>
#{vocabularyset}#
<tr>
<td>#[name]#</td>
<td><input name="vocabulary_#[name]#_class" id="vocabulary_#[name]#_class" type="text" size="55" maxlength="1028" value="#[value]#" /></td>
</tr>
#{/vocabularyset}#
</table>
</dd>
</dl>
</fieldset>
#(/vocabularySelect)#
<fieldset>
<legend>Snapshot Creation</legend>
<dl>


@ -25,12 +25,15 @@
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
import java.util.ArrayList;
import java.util.Collection;
import java.util.List;
import net.yacy.cora.lod.vocabulary.Tagging;
import net.yacy.cora.protocol.ClientIdentification;
import net.yacy.cora.protocol.RequestHeader;
import net.yacy.cora.util.Html2Image;
import net.yacy.crawler.data.CrawlProfile;
import net.yacy.document.LibraryProvider;
import net.yacy.search.Switchboard;
import net.yacy.search.schema.CollectionSchema;
import net.yacy.server.serverObjects;
@ -508,9 +511,22 @@ public class CrawlStartExpert {
prop.put("agentSelect_list", agentNames.size());
}
prop.put("agentSelect_defaultAgentName",
ClientIdentification.yacyInternetCrawlerAgentName);
prop.put("agentSelect_defaultAgentName", ClientIdentification.yacyInternetCrawlerAgentName);
// ---------- Enrich Vocabulary
Collection<Tagging> vocs = LibraryProvider.autotagging.getVocabularies();
if (vocs.size() == 0) {
prop.put("vocabularySelect", 0);
} else {
prop.put("vocabularySelect", 1);
int count = 0;
for (Tagging v: vocs) {
prop.put("vocabularySelect_vocabularyset_" + count + "_name", v.getName());
prop.put("vocabularySelect_vocabularyset_" + count + "_value", "");
count++;
}
prop.put("vocabularySelect_vocabularyset", count);
}
// ---------- Snapshot generation
boolean wkhtmltopdfAvailable = Html2Image.wkhtmltopdfAvailable();


@ -42,6 +42,8 @@ import net.yacy.cora.federate.yacy.CacheStrategy;
import net.yacy.cora.protocol.ClientIdentification;
import net.yacy.cora.protocol.RequestHeader;
import net.yacy.cora.util.ConcurrentLog;
import net.yacy.cora.util.JSONException;
import net.yacy.cora.util.JSONObject;
import net.yacy.cora.util.SpaceExceededException;
import net.yacy.crawler.CrawlSwitchboard;
import net.yacy.crawler.data.Cache;
@ -51,6 +53,7 @@ import net.yacy.crawler.retrieval.SitemapImporter;
import net.yacy.crawler.robots.RobotsTxt;
import net.yacy.data.WorkTables;
import net.yacy.document.Document;
import net.yacy.document.VocabularyScraper;
import net.yacy.document.parser.html.ContentScraper;
import net.yacy.document.parser.html.TransformerWriter;
import net.yacy.kelondro.index.RowHandleSet;
@ -445,6 +448,27 @@ public class Crawler_p {
boolean snapshotsLoadImage = post.getBoolean("snapshotsLoadImage");
boolean snapshotsReplaceOld = post.getBoolean("snapshotsReplaceOld");
// get vocabulary scraper info
JSONObject vocabulary_scraper = new JSONObject(); // key = vocabulary_name, value = properties with key = type (i.e. 'class') and value = keyword in context
for (String key: post.keySet()) {
if (key.startsWith("vocabulary_")) {
if (key.endsWith("_class")) {
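// cut off the "vocabulary_" prefix (11 chars) and the "_class" suffix (6 chars) to get the vocabulary name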
String vocabulary = key.substring(11, key.length() - 6);
String value = post.get(key);
if (value != null && value.length() > 0) {
JSONObject props;
try {
props = vocabulary_scraper.getJSONObject(vocabulary);
} catch (JSONException e) {
props = new JSONObject();
vocabulary_scraper.put(vocabulary, props);
}
props.put("class", value);
}
}
}
}
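// example (illustrative): a form value vocabulary_plants_class=botanic-name yields {"plants":{"class":"botanic-name"}}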
// prepare a new crawling profile
final CrawlProfile profile;
byte[] handle;
@ -476,7 +500,8 @@ public class Crawler_p {
snapshotsReplaceOld,
cachePolicy,
collection,
agentName);
agentName,
new VocabularyScraper(vocabulary_scraper));
handle = ASCII.getBytes(profile.handle());
// before we fire up a new crawl, we make sure that another crawl with the same name is not running
@ -559,7 +584,7 @@ public class Crawler_p {
try {
// check if the crawl filter works correctly
Pattern.compile(newcrawlingMustMatch);
final ContentScraper scraper = new ContentScraper(new DigestURL(crawlingFile), 10000000);
final ContentScraper scraper = new ContentScraper(new DigestURL(crawlingFile), 10000000, new VocabularyScraper());
final Writer writer = new TransformerWriter(null, null, scraper, null, false);
if (crawlingFile != null && crawlingFile.exists()) {
FileUtils.copy(new FileInputStream(crawlingFile), writer);


@ -155,7 +155,8 @@ public class QuickCrawlLink_p {
-1, false, true,
CacheStrategy.IFFRESH,
collection,
ClientIdentification.yacyIntranetCrawlerAgentName);
ClientIdentification.yacyIntranetCrawlerAgentName,
null);
sb.crawler.putActive(pe.handle().getBytes(), pe);
} catch (final Exception e) {
// mist


@ -29,7 +29,7 @@ import net.yacy.visualization.RasterPlotter.DrawMode;
public class osm {
public static EncodedImage respond(@SuppressWarnings("unused") final RequestHeader header, final serverObjects post, @SuppressWarnings("unused") final serverSwitch env) {
public static EncodedImage respond(final RequestHeader header, final serverObjects post, @SuppressWarnings("unused") final serverSwitch env) {
int zoom = 10;
double lat = 50.11670d;


@ -23,6 +23,7 @@ package net.yacy.cora.language.synonyms;
import java.io.File;
import java.io.IOException;
import java.util.Collection;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.ConcurrentHashMap;
@ -94,6 +95,13 @@ public class AutotaggingLibrary {
return this.vocabularies.get(name);
}
public Set<String> getVocabularyNames() {
// this must return a clone of the set to prevent the vocabularies from being destroyed as a side effect
HashSet<String> names = new HashSet<>();
names.addAll(this.vocabularies.keySet());
return names;
}
public Collection<Tagging> getVocabularies() {
return this.vocabularies.values();
}
@ -143,14 +151,17 @@ public class AutotaggingLibrary {
return 4;
}
public Tagging.Metatag getTagFromTerm(String term) {
public Tagging.Metatag getTagFromTerm(Set<String> vocabularies, String term) {
if (this.vocabularies.isEmpty()) return null;
Tagging.Metatag tag;
term = Tagging.normalizeTerm(term);
for (Map.Entry<String, Tagging> v: this.vocabularies.entrySet()) {
tag = v.getValue().getMetatagFromSynonym(term);
for (String vocabularyName: vocabularies) {
Tagging t = this.vocabularies.get(vocabularyName);
if (t != null) {
tag = t.getMetatagFromSynonym(term);
if (tag != null) return tag;
}
}
return null;
}


@ -275,6 +275,7 @@ public class Tagging {
public void put(String term, String synonyms, String objectlink) throws IOException {
if (this.propFile == null) return;
synchronized (this) {
TempFile tmp = new TempFile();
BlockingQueue<String> list = Files.concurentLineReader(this.propFile);
String line;
@ -302,6 +303,7 @@ public class Tagging {
tmp.file.renameTo(this.propFile);
init();
}
}
public void delete(String term) throws IOException {
if (this.propFile == null) return;


@ -295,7 +295,8 @@ public final class CrawlSwitchboard {
-1, false, true,
CacheStrategy.IFFRESH,
"robot_" + CRAWL_PROFILE_PROXY,
ClientIdentification.yacyProxyAgentName);
ClientIdentification.yacyProxyAgentName,
null);
this.profilesActiveCrawls.put(
UTF8.getBytes(this.defaultProxyProfile.handle()),
this.defaultProxyProfile);
@ -325,7 +326,8 @@ public final class CrawlSwitchboard {
-1, false, true,
CacheStrategy.IFFRESH,
"robot_" + CRAWL_PROFILE_REMOTE,
ClientIdentification.yacyInternetCrawlerAgentName);
ClientIdentification.yacyInternetCrawlerAgentName,
null);
this.profilesActiveCrawls.put(
UTF8.getBytes(this.defaultRemoteProfile.handle()),
this.defaultRemoteProfile);
@ -355,7 +357,8 @@ public final class CrawlSwitchboard {
-1, false, true,
CacheStrategy.IFEXIST,
"robot_" + CRAWL_PROFILE_SNIPPET_LOCAL_TEXT,
ClientIdentification.yacyIntranetCrawlerAgentName);
ClientIdentification.yacyIntranetCrawlerAgentName,
null);
this.profilesActiveCrawls.put(
UTF8.getBytes(this.defaultTextSnippetLocalProfile.handle()),
this.defaultTextSnippetLocalProfile);
@ -385,7 +388,8 @@ public final class CrawlSwitchboard {
-1, false, true,
CacheStrategy.IFEXIST,
"robot_" + CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT,
ClientIdentification.yacyIntranetCrawlerAgentName);
ClientIdentification.yacyIntranetCrawlerAgentName,
null);
this.profilesActiveCrawls.put(
UTF8.getBytes(this.defaultTextSnippetGlobalProfile.handle()),
this.defaultTextSnippetGlobalProfile);
@ -416,7 +420,8 @@ public final class CrawlSwitchboard {
-1, false, true,
CacheStrategy.IFEXIST,
"robot_" + CRAWL_PROFILE_GREEDY_LEARNING_TEXT,
ClientIdentification.browserAgentName);
ClientIdentification.browserAgentName,
null);
this.profilesActiveCrawls.put(
UTF8.getBytes(this.defaultTextSnippetGlobalProfile.handle()),
this.defaultTextSnippetGlobalProfile);
@ -446,7 +451,8 @@ public final class CrawlSwitchboard {
-1, false, true,
CacheStrategy.IFEXIST,
"robot_" + CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA,
ClientIdentification.yacyIntranetCrawlerAgentName);
ClientIdentification.yacyIntranetCrawlerAgentName,
null);
this.profilesActiveCrawls.put(
UTF8.getBytes(this.defaultMediaSnippetLocalProfile.handle()),
this.defaultMediaSnippetLocalProfile);
@ -476,7 +482,8 @@ public final class CrawlSwitchboard {
-1, false, true,
CacheStrategy.IFEXIST,
"robot_" + CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA,
ClientIdentification.yacyIntranetCrawlerAgentName);
ClientIdentification.yacyIntranetCrawlerAgentName,
null);
this.profilesActiveCrawls.put(
UTF8.getBytes(this.defaultMediaSnippetGlobalProfile.handle()),
this.defaultMediaSnippetGlobalProfile);
@ -506,7 +513,8 @@ public final class CrawlSwitchboard {
-1, false, true,
CacheStrategy.NOCACHE,
"robot_" + CRAWL_PROFILE_SURROGATE,
ClientIdentification.yacyIntranetCrawlerAgentName);
ClientIdentification.yacyIntranetCrawlerAgentName,
null);
this.profilesActiveCrawls.put(
UTF8.getBytes(this.defaultSurrogateProfile.handle()),
this.defaultSurrogateProfile);
@ -539,7 +547,8 @@ public final class CrawlSwitchboard {
-1, false, true,
CacheStrategy.NOCACHE,
collection,
ClientIdentification.yacyIntranetCrawlerAgentName);
ClientIdentification.yacyIntranetCrawlerAgentName,
null);
this.profilesActiveCrawls.put(UTF8.getBytes(genericPushProfile.handle()), genericPushProfile);
this.defaultPushProfiles.put(collection, genericPushProfile);
return genericPushProfile;


@ -45,6 +45,7 @@ import net.yacy.cora.protocol.ClientIdentification;
import net.yacy.cora.util.CommonPattern;
import net.yacy.cora.util.ConcurrentLog;
import net.yacy.crawler.CrawlSwitchboard;
import net.yacy.document.VocabularyScraper;
import net.yacy.kelondro.data.word.Word;
import net.yacy.search.query.QueryParams;
import net.yacy.server.serverObjects;
@ -78,6 +79,7 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
public static final String REMOTE_INDEXING = "remoteIndexing";
public static final String CACHE_STRAGEGY = "cacheStrategy";
public static final String COLLECTIONS = "collections";
public static final String SCRAPER = "scraper";
public static final String CRAWLER_URL_MUSTMATCH = "crawlerURLMustMatch";
public static final String CRAWLER_URL_MUSTNOTMATCH = "crawlerURLMustNotMatch";
public static final String CRAWLER_IP_MUSTMATCH = "crawlerIPMustMatch";
@ -99,6 +101,7 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
private Pattern indexcontentmustmatch = null, indexcontentmustnotmatch = null;
private final Map<String, AtomicInteger> doms;
private final VocabularyScraper scraper;
/**
* Constructor which creates CrawlPofile from parameters.
@ -151,7 +154,8 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
final boolean snapshotsReplaceOld,
final CacheStrategy cacheStrategy,
final String collections,
final String userAgentName) {
final String userAgentName,
final VocabularyScraper scraper) {
super(40);
if (name == null || name.isEmpty()) {
throw new NullPointerException("name must not be null or empty");
@ -189,16 +193,27 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
put(SNAPSHOTS_REPLACEOLD, snapshotsReplaceOld);
put(CACHE_STRAGEGY, cacheStrategy.toString());
put(COLLECTIONS, CommonPattern.SPACE.matcher(collections.trim()).replaceAll(""));
// we transform the scraper information into a JSON object
this.scraper = scraper == null ? new VocabularyScraper() : scraper;
String jsonString = this.scraper.toString();
assert jsonString != null && jsonString.length() > 0 && jsonString.charAt(0) == '{' : "jsonString = " + jsonString;
put(SCRAPER, jsonString);
}
/**
* Constructor which creats a CrawlProfile from values in a Map.
* Constructor which creates a CrawlProfile from values in a Map.
* @param ext contains values
*/
public CrawlProfile(final Map<String, String> ext) {
super(ext == null ? 1 : ext.size());
if (ext != null) putAll(ext);
this.doms = new ConcurrentHashMap<String, AtomicInteger>();
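// restore the vocabulary scraper from the JSON string stored under the SCRAPER key;
// profiles created before this feature get an empty scraper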
String jsonString = ext.get(SCRAPER);
this.scraper = jsonString == null || jsonString.length() == 0 ? new VocabularyScraper() : new VocabularyScraper(jsonString);
}
public VocabularyScraper scraper() {
return this.scraper;
}
public void domInc(final String domain) {


@ -44,6 +44,7 @@ import net.yacy.crawler.data.ResultURLs.EventOrigin;
import net.yacy.document.Document;
import net.yacy.document.Parser;
import net.yacy.document.TextParser;
import net.yacy.document.VocabularyScraper;
import net.yacy.search.Switchboard;
public class Response {
@ -864,7 +865,7 @@ public class Response {
final String supportError = TextParser.supports(url(), this.responseHeader == null ? null : this.responseHeader.mime());
if (supportError != null) throw new Parser.Failure("no parser support:" + supportError, url());
try {
return TextParser.parseSource(new AnchorURL(url()), this.responseHeader == null ? null : this.responseHeader.mime(), this.responseHeader == null ? "UTF-8" : this.responseHeader.getCharacterEncoding(), this.request.depth(), this.content);
return TextParser.parseSource(new AnchorURL(url()), this.responseHeader == null ? null : this.responseHeader.mime(), this.responseHeader == null ? "UTF-8" : this.responseHeader.getCharacterEncoding(), new VocabularyScraper(), this.request.depth(), this.content);
} catch (final Exception e) {
return null;
}


@ -52,6 +52,7 @@ import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.util.ConcurrentLog;
import net.yacy.data.BookmarksDB.Bookmark;
import net.yacy.data.BookmarksDB.Tag;
import net.yacy.document.VocabularyScraper;
import net.yacy.document.parser.html.ContentScraper;
import net.yacy.document.parser.html.TransformerWriter;
import net.yacy.kelondro.data.word.Word;
@ -138,7 +139,7 @@ public class BookmarkHelper {
final Set<String> tags=ListManager.string2set(tag); //this allow multiple default tags
try {
//load the links
final ContentScraper scraper = new ContentScraper(baseURL, 10000);
final ContentScraper scraper = new ContentScraper(baseURL, 10000, new VocabularyScraper());
//OutputStream os = new htmlFilterOutputStream(null, scraper, null, false);
final Writer writer = new TransformerWriter(null, null, scraper, null, false);
FileUtils.copy(input,writer);


@ -87,7 +87,7 @@ public class YMarkAutoTagger implements Runnable, Thread.UncaughtExceptionHandle
}
//get words from document
final Map<String, Word> words = new Condenser(document, true, true, LibraryProvider.dymLib, false, false).words();
final Map<String, Word> words = new Condenser(document, null, true, true, LibraryProvider.dymLib, false, false).words();
// generate potential tags from document title, description and subject
final int bufferSize = document.dc_title().length() + document.dc_description().length + document.dc_subject(' ').length() + 32;


@ -189,7 +189,8 @@ public class YMarkCrawlStart extends HashMap<String,String>{
-1, false, true,
CacheStrategy.IFFRESH,
"robot_" + CrawlSwitchboard.CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA,
ClientIdentification.yacyIntranetCrawlerAgentName); // TODO: make this a default profile in CrawlSwitchboard
ClientIdentification.yacyIntranetCrawlerAgentName,
null); // TODO: make this a default profile in CrawlSwitchboard
sb.crawler.putActive(pe.handle().getBytes(), pe);
return sb.crawlStacker.stackCrawl(new Request(
sb.peers.mySeed().hash.getBytes(),


@ -45,6 +45,7 @@ import net.yacy.cora.document.analysis.Classification.ContentDomain;
import net.yacy.cora.document.analysis.EnhancedTextProfileSignature;
import net.yacy.cora.document.encoding.ASCII;
import net.yacy.cora.document.id.AnchorURL;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.document.id.MultiProtocolURL;
import net.yacy.cora.federate.solr.Ranking;
import net.yacy.cora.language.synonyms.SynonymLibrary;
@ -91,6 +92,7 @@ public final class Condenser {
public Condenser(
final Document document,
final VocabularyScraper scraper,
final boolean indexText,
final boolean indexMedia,
final WordCache meaningLib,
@ -122,7 +124,7 @@ public final class Condenser {
if (indexText) {
String text = document.getTextString();
if (findDatesInContent) this.dates_in_content = DateDetection.parse(text);
createCondensement(text, meaningLib, doAutotagging);
createCondensement(document.dc_source(), text, meaningLib, doAutotagging, scraper);
// the phrase counter:
// phrase 0 are words taken from the URL
// phrase 1 is the MainTitle
@ -249,12 +251,12 @@ public final class Condenser {
this.exact_signature = EnhancedTextProfileSignature.getSignatureLong(text);
}
private Condenser(final String text, final WordCache meaningLib, boolean doAutotagging) {
private Condenser(final DigestURL root, final String text, final WordCache meaningLib, final boolean doAutotagging, final VocabularyScraper scraper) {
this.languageIdentificator = null; // we don't need that here
// analysis = new Properties();
this.words = new TreeMap<String, Word>();
this.synonyms = new HashSet<String>();
createCondensement(text, meaningLib, doAutotagging);
createCondensement(root, text, meaningLib, doAutotagging, scraper);
}
private void insertTextToWords(
@ -324,7 +326,7 @@ public final class Condenser {
return this.languageIdentificator.getLanguage();
}
private void createCondensement(final String text, final WordCache meaningLib, boolean doAutotagging) {
private void createCondensement(final DigestURL root, final String text, final WordCache meaningLib, boolean doAutotagging, final VocabularyScraper scraper) {
assert text != null;
final Set<String> currsentwords = new HashSet<String>();
String word = "";
@ -355,7 +357,29 @@ public final class Condenser {
// get tags from autotagging
if (doAutotagging) {
for (int wordc = 1; wordc <= wordcache.length + 1; wordc++) {
Set<String> vocabularyNames = LibraryProvider.autotagging.getVocabularyNames();
//Collection<Tagging> vocabularies = LibraryProvider.autotagging.getVocabularies();
//assert vocabularyNames.size() == vocabularies.size();
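// consume the terms which the VocabularyScraper collected for this document during html parsing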
Map<String, String> vocMap = scraper.removeVocMap(root);
if (vocMap != null) {
for (Map.Entry<String, String> entry: vocMap.entrySet()) {
String navigatorName = entry.getKey();
String term = entry.getValue();
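// exclude this vocabulary from the auto-annotation loop below, because a term was already scraped for it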
vocabularyNames.remove(navigatorName);
Tagging vocabulary = LibraryProvider.autotagging.getVocabulary(navigatorName);
if (vocabulary != null) {
// extend the vocabulary
String obj = vocabulary.getObjectlink(term);
if (obj == null) try {vocabulary.put(term, "", root.toNormalform(true));} catch (IOException e) {} // this makes IO, be careful!
// create annotation
tag = vocabulary.getMetatagFromTerm(term);
Set<Tagging.Metatag> tagset = new HashSet<>();
tagset.add(tag);
this.tags.put(navigatorName, tagset);
}
}
}
if (vocabularyNames.size() > 0) for (int wordc = 1; wordc <= wordcache.length + 1; wordc++) {
// wordc is number of words that are tested
StringBuilder sb = new StringBuilder();
if (wordc == 1) {
@ -368,7 +392,7 @@ public final class Condenser {
}
String testterm = sb.toString().trim();
//System.out.println("Testing: " + testterm);
tag = LibraryProvider.autotagging.getTagFromTerm(testterm);
tag = LibraryProvider.autotagging.getTagFromTerm(vocabularyNames, testterm);
if (tag != null) {
String navigatorName = tag.getVocabularyName();
Set<Tagging.Metatag> tagset = this.tags.get(navigatorName);
@ -461,7 +485,7 @@ public final class Condenser {
public static Map<String, Word> getWords(final String text, final WordCache meaningLib) {
// returns a word/indexWord relation map
if (text == null) return null;
return new Condenser(text, meaningLib, false).words();
return new Condenser(null, text, meaningLib, false, null).words();
}
public static void main(final String[] args) {

View File

@ -48,6 +48,7 @@ public interface Parser {
* @param url the url of the source
* @param mimeType the mime type of the source, if known
* @param charset the charset of the source, if known
* @param scraper an entity scraper to detect facets from text annotation context
* @param source a input stream
* @return a list of documents that result from parsing the source
* @throws Parser.Failure
@ -57,6 +58,7 @@ public interface Parser {
AnchorURL url,
String mimeType,
String charset,
VocabularyScraper scraper,
InputStream source
) throws Parser.Failure, InterruptedException;


@ -166,6 +166,7 @@ public final class TextParser {
final AnchorURL location,
final String mimeType,
final String charset,
final VocabularyScraper scraper,
final int depth,
final File sourceFile
) throws InterruptedException, Parser.Failure {
@ -180,7 +181,7 @@ public final class TextParser {
throw new Parser.Failure(errorMsg, location);
}
sourceStream = new BufferedInputStream(new FileInputStream(sourceFile));
docs = parseSource(location, mimeType, charset, depth, sourceFile.length(), sourceStream);
docs = parseSource(location, mimeType, charset, scraper, depth, sourceFile.length(), sourceStream);
} catch (final Exception e) {
if (e instanceof InterruptedException) throw (InterruptedException) e;
if (e instanceof Parser.Failure) throw (Parser.Failure) e;
@ -197,6 +198,7 @@ public final class TextParser {
final AnchorURL location,
String mimeType,
final String charset,
final VocabularyScraper scraper,
final int depth,
final byte[] content
) throws Parser.Failure {
@ -212,7 +214,7 @@ public final class TextParser {
}
assert !idioms.isEmpty() : "no parsers applied for url " + location.toNormalform(true);
Document[] docs = parseSource(location, mimeType, idioms, charset, depth, content);
Document[] docs = parseSource(location, mimeType, idioms, charset, scraper, depth, content);
return docs;
}
@ -221,6 +223,7 @@ public final class TextParser {
final AnchorURL location,
String mimeType,
final String charset,
final VocabularyScraper scraper,
final int depth,
final long contentLength,
final InputStream sourceStream
@ -241,7 +244,7 @@ public final class TextParser {
// then we use only one stream-oriented parser.
if (idioms.size() == 1 || contentLength > Integer.MAX_VALUE) {
// use a specific stream-oriented parser
return parseSource(location, mimeType, idioms.iterator().next(), charset, sourceStream);
return parseSource(location, mimeType, idioms.iterator().next(), charset, scraper, sourceStream);
}
// in case that we know more parsers we first transform the content into a byte[] and use that as base
@ -252,7 +255,7 @@ public final class TextParser {
} catch (final IOException e) {
throw new Parser.Failure(e.getMessage(), location);
}
Document[] docs = parseSource(location, mimeType, idioms, charset, depth, b);
Document[] docs = parseSource(location, mimeType, idioms, charset, scraper, depth, b);
return docs;
}
@ -262,6 +265,7 @@ public final class TextParser {
final String mimeType,
final Parser parser,
final String charset,
final VocabularyScraper scraper,
final InputStream sourceStream
) throws Parser.Failure {
if (AbstractParser.log.isFine()) AbstractParser.log.fine("Parsing '" + location + "' from stream");
@ -271,7 +275,7 @@ public final class TextParser {
if (AbstractParser.log.isFine()) AbstractParser.log.fine("Parsing " + location + " with mimeType '" + mimeType + "' and file extension '" + fileExt + "'.");
try {
final Document[] docs = parser.parse(location, mimeType, documentCharset, sourceStream);
final Document[] docs = parser.parse(location, mimeType, documentCharset, scraper, sourceStream);
return docs;
} catch (final Exception e) {
throw new Parser.Failure("parser failed: " + parser.getName(), location);
@ -283,6 +287,7 @@ public final class TextParser {
final String mimeType,
final Set<Parser> parsers,
final String charset,
final VocabularyScraper scraper,
final int depth,
final byte[] sourceArray
) throws Parser.Failure {
@ -305,7 +310,7 @@ public final class TextParser {
bis = new ByteArrayInputStream(sourceArray);
}
try {
docs = parser.parse(location, mimeType, documentCharset, bis);
docs = parser.parse(location, mimeType, documentCharset, scraper, bis);
} catch (final Parser.Failure e) {
failedParser.put(parser, e);
//log.logWarning("tried parser '" + parser.getName() + "' to parse " + location.toNormalform(true, false) + " but failed: " + e.getMessage(), e);


@ -0,0 +1,90 @@
/**
* VocabularyScraper
* Copyright 2015 by Michael Peter Christen
* First released 30.01.2015 at http://yacy.net
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program in the file lgpl21.txt
* If not, see <http://www.gnu.org/licenses/>.
*/
package net.yacy.document;
import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.util.JSONException;
import net.yacy.cora.util.JSONObject;
import net.yacy.kelondro.io.CharBuffer;
public class VocabularyScraper {
private final JSONObject scraperDefinition;
private Map<String, String> classVocabulary; // a mapping from class names to the vocabulary where this class should be mapped
private final Map<DigestURL, ConcurrentHashMap<String, String>> vocMap; // a mapping from a document to a map from vocabularies to terms
public VocabularyScraper() {
this.classVocabulary = null;
this.scraperDefinition = new JSONObject();
this.vocMap = new ConcurrentHashMap<>();
}
public VocabularyScraper(JSONObject init) {
// init must be a property list of property lists: the key of the top property list is the name of the vocabulary; each embedded property maps an entity type (i.e. 'class') to the entity name to be scraped
this.scraperDefinition = init == null ? new JSONObject() : init;
this.vocMap = new ConcurrentHashMap<>();
if (this.scraperDefinition.length() == 0) {
this.classVocabulary = null;
} else {
this.classVocabulary = new ConcurrentHashMap<>();
for (String voc: this.scraperDefinition.keySet()) {
JSONObject props = this.scraperDefinition.getJSONObject(voc);
try {
String classtype = props.getString("class");
this.classVocabulary.put(classtype, voc);
} catch (JSONException e) {}
}
if (this.classVocabulary.size() == 0) this.classVocabulary = null;
}
}
public VocabularyScraper(String init) {
this(new JSONObject(init));
}
@Override
public String toString() {
return this.scraperDefinition.toString();
}
public void check(DigestURL root, String className, CharBuffer content) {
if (this.classVocabulary == null) return;
String voc = this.classVocabulary.get(className);
if (voc == null) return;
// record the mapping
ConcurrentHashMap<String, String> vocmap = this.vocMap.get(root);
if (vocmap == null) {
synchronized (this) {
vocmap = new ConcurrentHashMap<>();
this.vocMap.put(root, vocmap);
}
}
if (!vocmap.containsKey(voc)) vocmap.put(voc, content.toString()); // we put only the first occurrence of the entity into the vocmap
}
public Map<String, String> removeVocMap(DigestURL root) {
return this.vocMap.remove(root);
}
}
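
A minimal usage sketch of this class (url, vocabulary and class names are
illustrative assumptions, not part of the commit); the ContentScraper calls
check() for closing pair tags that carry a class attribute, and the
Condenser later consumes the collected terms via removeVocMap():

import java.net.MalformedURLException;
import java.util.Map;

import net.yacy.cora.document.id.DigestURL;
import net.yacy.document.VocabularyScraper;

public class VocabularyScraperSketch {
    public static void main(String[] args) throws MalformedURLException {
        // scrape terms for the vocabulary "plants" from elements with class "botanic-name"
        VocabularyScraper vs = new VocabularyScraper("{\"plants\":{\"class\":\"botanic-name\"}}");
        DigestURL root = new DigestURL("http://localhost/page.html");
        // while parsing <div class="botanic-name">Rosa gallica</div>,
        // ContentScraper.scrapeTag1() would invoke:
        //     vs.check(root, "botanic-name", tagContent);
        // where tagContent is the CharBuffer holding "Rosa gallica"
        Map<String, String> vocMap = vs.removeVocMap(root);
        // after such a check() call, vocMap would map "plants" -> "Rosa gallica";
        // it is null when nothing was scraped for this document
    }
}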


@ -62,6 +62,7 @@ import net.yacy.data.wiki.WikiParser;
import net.yacy.document.Document;
import net.yacy.document.Parser;
import net.yacy.document.TextParser;
import net.yacy.document.VocabularyScraper;
import net.yacy.document.content.SurrogateReader;
import org.apache.commons.compress.compressors.bzip2.BZip2CompressorInputStream;
@ -523,7 +524,7 @@ public class MediawikiImporter extends Thread implements Importer {
public void genDocument() throws Parser.Failure {
try {
this.url = new AnchorURL(this.urlStub + this.title);
final Document[] parsed = TextParser.parseSource(this.url, "text/html", "UTF-8", 1, UTF8.getBytes(this.html));
final Document[] parsed = TextParser.parseSource(this.url, "text/html", "UTF-8", new VocabularyScraper(), 1, UTF8.getBytes(this.html));
this.document = Document.mergeDocuments(this.url, "text/html", parsed);
// the wiki parser is not able to find the proper title in the source text, so it must be set here
this.document.setTitle(this.title);


@ -43,6 +43,7 @@ import net.yacy.cora.util.ConcurrentLog;
import net.yacy.document.AbstractParser;
import net.yacy.document.Document;
import net.yacy.document.Parser;
import net.yacy.document.VocabularyScraper;
public class apkParser extends AbstractParser implements Parser {
@ -53,7 +54,7 @@ public class apkParser extends AbstractParser implements Parser {
}
@Override
public Document[] parse(final AnchorURL location, final String mimeType, final String charset, final InputStream source) throws Parser.Failure, InterruptedException {
public Document[] parse(final AnchorURL location, final String mimeType, final String charset, final VocabularyScraper scraper, final InputStream source) throws Parser.Failure, InterruptedException {
/*
* things to discover:


@ -41,6 +41,7 @@ import net.yacy.cora.util.ConcurrentLog;
import net.yacy.document.AbstractParser;
import net.yacy.document.Document;
import net.yacy.document.Parser;
import net.yacy.document.VocabularyScraper;
import org.jaudiotagger.audio.AudioFile;
import org.jaudiotagger.audio.AudioFileIO;
@ -70,7 +71,7 @@ public class audioTagParser extends AbstractParser implements Parser {
@Override
public Document[] parse(final AnchorURL location, final String mimeType,
final String charset, final InputStream source)
final String charset, final VocabularyScraper scraper, final InputStream source)
throws Parser.Failure, InterruptedException {
String filename = location.getFileName();


@ -13,6 +13,7 @@ import net.yacy.data.ymark.YMarkUtil;
import net.yacy.document.AbstractParser;
import net.yacy.document.Document;
import net.yacy.document.Parser;
import net.yacy.document.VocabularyScraper;
import net.yacy.document.parser.rdfa.impl.RDFaParser;
import net.yacy.kelondro.blob.Tables;
import net.yacy.search.Switchboard;
@ -37,9 +38,9 @@ public class AugmentParser extends AbstractParser implements Parser {
}
@Override
public Document[] parse(AnchorURL url, String mimeType, String charset, InputStream source) throws Parser.Failure, InterruptedException {
public Document[] parse(AnchorURL url, String mimeType, String charset, final VocabularyScraper scraper, InputStream source) throws Parser.Failure, InterruptedException {
Document[] htmlDocs = this.rdfaParser.parse(url, mimeType, charset, source);
Document[] htmlDocs = this.rdfaParser.parse(url, mimeType, charset, scraper, source);
for (final Document doc : htmlDocs) {
/* analyze(doc, url, mimeType, charset); // enrich document text */


@ -36,6 +36,7 @@ import net.yacy.document.AbstractParser;
import net.yacy.document.Document;
import net.yacy.document.Parser;
import net.yacy.document.TextParser;
import net.yacy.document.VocabularyScraper;
import net.yacy.kelondro.util.FileUtils;
import org.apache.commons.compress.compressors.bzip2.BZip2CompressorInputStream;
@ -57,7 +58,7 @@ public class bzipParser extends AbstractParser implements Parser {
@Override
public Document[] parse(final AnchorURL location, final String mimeType,
final String charset, final InputStream source)
final String charset, final VocabularyScraper scraper, final InputStream source)
throws Parser.Failure, InterruptedException {
File tempFile = null;
@ -94,7 +95,7 @@ public class bzipParser extends AbstractParser implements Parser {
out.close();
// creating a new parser class to parse the unzipped content
docs = TextParser.parseSource(location, null, null, 999, tempFile);
docs = TextParser.parseSource(location, null, null, scraper, 999, tempFile);
} catch (final Exception e) {
if (e instanceof InterruptedException) throw (InterruptedException) e;
if (e instanceof Parser.Failure) throw (Parser.Failure) e;


@ -38,6 +38,7 @@ import net.yacy.cora.util.CommonPattern;
import net.yacy.document.AbstractParser;
import net.yacy.document.Document;
import net.yacy.document.Parser;
import net.yacy.document.VocabularyScraper;
/**
* a parser for comma-separated values
@ -52,7 +53,7 @@ public class csvParser extends AbstractParser implements Parser {
}
@Override
public Document[] parse(AnchorURL location, String mimeType, String charset, InputStream source) throws Parser.Failure, InterruptedException {
public Document[] parse(AnchorURL location, String mimeType, String charset, final VocabularyScraper scraper, InputStream source) throws Parser.Failure, InterruptedException {
// construct a document using all cells of the document
// the first row is used as headline
// all lines are artificially terminated by a '.' to separate them as sentence for the condenser.


@ -35,6 +35,7 @@ import net.yacy.cora.util.CommonPattern;
import net.yacy.document.AbstractParser;
import net.yacy.document.Document;
import net.yacy.document.Parser;
import net.yacy.document.VocabularyScraper;
import org.apache.poi.hwpf.extractor.WordExtractor;
@ -57,7 +58,7 @@ public class docParser extends AbstractParser implements Parser {
@SuppressWarnings("deprecation")
@Override
public Document[] parse(final AnchorURL location, final String mimeType,
final String charset, final InputStream source)
final String charset, final VocabularyScraper scraper, final InputStream source)
throws Parser.Failure, InterruptedException {
final WordExtractor extractor;


@ -29,6 +29,7 @@ import net.yacy.cora.document.id.AnchorURL;
import net.yacy.document.AbstractParser;
import net.yacy.document.Document;
import net.yacy.document.Parser;
import net.yacy.document.VocabularyScraper;
import net.yacy.kelondro.util.MemoryControl;
import org.apache.poi.util.StringUtil;
@ -60,7 +61,7 @@ public class dwgParser extends AbstractParser implements Parser {
}
@Override
public Document[] parse(final AnchorURL location, final String mimeType, final String charset, final InputStream source) throws Parser.Failure, InterruptedException {
public Document[] parse(final AnchorURL location, final String mimeType, final String charset, final VocabularyScraper scraper, final InputStream source) throws Parser.Failure, InterruptedException {
// check memory for parser
if (!MemoryControl.request(200 * 1024 * 1024, true))


@ -32,6 +32,7 @@ import net.yacy.cora.document.id.MultiProtocolURL;
import net.yacy.document.AbstractParser;
import net.yacy.document.Document;
import net.yacy.document.Parser;
import net.yacy.document.VocabularyScraper;
/**
* this parser can parse just anything because it uses only the uri/file/path information
@ -46,7 +47,7 @@ public class genericParser extends AbstractParser implements Parser {
@Override
public Document[] parse(final AnchorURL location, final String mimeType,
final String charset, final InputStream source1)
final String charset, final VocabularyScraper scraper, final InputStream source1)
throws Parser.Failure, InterruptedException {
String filename = location.getFileName();
final Document[] docs = new Document[]{new Document(


@ -37,6 +37,7 @@ import net.yacy.document.AbstractParser;
import net.yacy.document.Document;
import net.yacy.document.Parser;
import net.yacy.document.TextParser;
import net.yacy.document.VocabularyScraper;
import net.yacy.kelondro.util.FileUtils;
@ -55,7 +56,7 @@ public class gzipParser extends AbstractParser implements Parser {
}
@Override
public Document[] parse(final AnchorURL location, final String mimeType, final String charset, final InputStream source) throws Parser.Failure, InterruptedException {
public Document[] parse(final AnchorURL location, final String mimeType, final String charset, final VocabularyScraper scraper, final InputStream source) throws Parser.Failure, InterruptedException {
File tempFile = null;
Document[] docs = null;
@ -79,7 +80,7 @@ public class gzipParser extends AbstractParser implements Parser {
out.close();
// creating a new parser class to parse the unzipped content
docs = TextParser.parseSource(location, null, null, 999, tempFile);
docs = TextParser.parseSource(location, null, null, scraper, 999, tempFile);
} catch (final Exception e) {
if (e instanceof InterruptedException) throw (InterruptedException) e;
if (e instanceof Parser.Failure) throw (Parser.Failure) e;


@ -59,6 +59,7 @@ import net.yacy.cora.storage.SizeLimitedSet;
import net.yacy.cora.util.ConcurrentLog;
import net.yacy.cora.util.NumberTools;
import net.yacy.document.SentenceReader;
import net.yacy.document.VocabularyScraper;
import net.yacy.document.parser.htmlParser;
import net.yacy.document.parser.html.Evaluation.Element;
import net.yacy.document.parser.images.genericImageParser;
@ -88,7 +89,6 @@ public class ContentScraper extends AbstractScraper implements Scraper {
public enum TagName {
html(TagType.singleton), // scraped as singleton to get attached properties like 'lang'
body(TagType.singleton), // scraped as singleton to get attached properties like 'class'
div(TagType.singleton), // scraped as singleton to get attached properties like 'id'
img(TagType.singleton),
base(TagType.singleton),
frame(TagType.singleton),
@ -115,7 +115,8 @@ public class ContentScraper extends AbstractScraper implements Scraper {
i(TagType.pair),
li(TagType.pair),
script(TagType.pair),
style(TagType.pair);
span(TagType.pair),
div(TagType.pair);
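// span and div are scraped as pair tags (div was a singleton before) so that
// their full text content reaches the VocabularyScraper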
public TagType type;
private TagName(final TagType type) {
@ -185,6 +186,7 @@ public class ContentScraper extends AbstractScraper implements Scraper {
private double lon, lat;
private AnchorURL canonical, publisher;
private final int maxLinks;
private final VocabularyScraper vocabularyScraper;
private int breadcrumbs;
@ -203,14 +205,21 @@ public class ContentScraper extends AbstractScraper implements Scraper {
*/
private final Evaluation evaluationScores;
/**
* scrape a document
* @param root the document root url
* @param maxLinks the maximum number of links to scrape
* @param vocabularyScraper a scraper which maps class names to vocabulary names, used to scrape content from DOM elements with an associated class name
*/
@SuppressWarnings("unchecked")
public ContentScraper(final DigestURL root, int maxLinks) {
public ContentScraper(final DigestURL root, int maxLinks, final VocabularyScraper vocabularyScraper) {
// the root value here will not be used to load the resource.
// it is only the reference for relative links
super(linkTags0, linkTags1);
assert root != null;
this.root = root;
this.maxLinks = maxLinks;
this.vocabularyScraper = vocabularyScraper;
this.evaluationScores = new Evaluation();
this.rss = new SizeLimitedMap<DigestURL, String>(maxLinks);
this.css = new SizeLimitedMap<DigestURL, String>(maxLinks);
@ -392,15 +401,8 @@ public class ContentScraper extends AbstractScraper implements Scraper {
this.frames.add(src);
this.evaluationScores.match(Element.framepath, src.toNormalform(true));
} else if (tag.name.equalsIgnoreCase("body")) {
final String c = tag.opts.getProperty("class", EMPTY_STRING);
this.evaluationScores.match(Element.bodyclass, c);
} else if (tag.name.equalsIgnoreCase("div")) {
final String id = tag.opts.getProperty("id", EMPTY_STRING);
this.evaluationScores.match(Element.divid, id);
final String itemtype = tag.opts.getProperty("itemtype", EMPTY_STRING);
if (itemtype.equals("http://data-vocabulary.org/Breadcrumb")) {
breadcrumbs++;
}
final String classprop = tag.opts.getProperty("class", EMPTY_STRING);
this.evaluationScores.match(Element.bodyclass, classprop);
} else if (tag.name.equalsIgnoreCase("meta")) {
final String content = tag.opts.getProperty("content", EMPTY_STRING);
String name = tag.opts.getProperty("name", EMPTY_STRING);
@ -509,6 +511,9 @@ public class ContentScraper extends AbstractScraper implements Scraper {
@Override
public void scrapeTag1(Tag tag) {
final String classprop = tag.opts.getProperty("class", EMPTY_STRING);
//System.out.println("class = " + classprop);
this.vocabularyScraper.check(this.root, classprop, tag.content);
// System.out.println("ScrapeTag1: tag.tagname=" + tag.tagname + ", opts=" + tag.opts.toString() + ", text=" + UTF8.String(text));
if (tag.name.equalsIgnoreCase("a") && tag.content.length() < 2048) {
String href = tag.opts.getProperty("href", EMPTY_STRING);
@ -536,7 +541,14 @@ public class ContentScraper extends AbstractScraper implements Scraper {
this.evaluationScores.match(Element.apath, href);
}
final String h;
if ((tag.name.equalsIgnoreCase("h1")) && (tag.content.length() < 1024)) {
if (tag.name.equalsIgnoreCase("div")) {
final String id = tag.opts.getProperty("id", EMPTY_STRING);
this.evaluationScores.match(Element.divid, id);
final String itemtype = tag.opts.getProperty("itemtype", EMPTY_STRING);
if (itemtype.equals("http://data-vocabulary.org/Breadcrumb")) {
breadcrumbs++;
}
} else if ((tag.name.equalsIgnoreCase("h1")) && (tag.content.length() < 1024)) {
h = cleanLine(CharacterCoding.html2unicode(stripAllTags(tag.content.getChars())));
if (h.length() > 0) this.headlines[0].add(h);
} else if((tag.name.equalsIgnoreCase("h2")) && (tag.content.length() < 1024)) {
@ -601,7 +613,7 @@ public class ContentScraper extends AbstractScraper implements Scraper {
// start a new scraper to parse links inside this text
// parsing the content
final ContentScraper scraper = new ContentScraper(this.root, this.maxLinks);
final ContentScraper scraper = new ContentScraper(this.root, this.maxLinks, this.vocabularyScraper);
final TransformerWriter writer = new TransformerWriter(null, null, scraper, null, false);
try {
FileUtils.copy(new CharArrayReader(inlineHtml), writer);
@ -1090,13 +1102,13 @@ public class ContentScraper extends AbstractScraper implements Scraper {
if (page == null) throw new IOException("no content in file " + file.toString());
// scrape document to look up charset
final ScraperInputStream htmlFilter = new ScraperInputStream(new ByteArrayInputStream(page),"UTF-8", new DigestURL("http://localhost"),null,false, maxLinks);
final ScraperInputStream htmlFilter = new ScraperInputStream(new ByteArrayInputStream(page), "UTF-8", new VocabularyScraper(), new DigestURL("http://localhost"), null, false, maxLinks);
String charset = htmlParser.patchCharsetEncoding(htmlFilter.detectCharset());
htmlFilter.close();
if (charset == null) charset = Charset.defaultCharset().toString();
// scrape content
final ContentScraper scraper = new ContentScraper(new DigestURL("http://localhost"), maxLinks);
final ContentScraper scraper = new ContentScraper(new DigestURL("http://localhost"), maxLinks, new VocabularyScraper());
final Writer writer = new TransformerWriter(null, null, scraper, null, false);
FileUtils.copy(new ByteArrayInputStream(page), writer, Charset.forName(charset));
writer.close();


@ -37,6 +37,7 @@ import java.util.Properties;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.util.CommonPattern;
import net.yacy.document.VocabularyScraper;
public class ScraperInputStream extends InputStream implements ScraperListener {
@ -59,6 +60,7 @@ public class ScraperInputStream extends InputStream implements ScraperListener {
public ScraperInputStream(
final InputStream inStream,
final String inputStreamCharset,
final VocabularyScraper vocabularyScraper,
final DigestURL rooturl,
final Transformer transformer,
final boolean passbyIfBinarySuspect,
@ -68,7 +70,7 @@ public class ScraperInputStream extends InputStream implements ScraperListener {
this.bufferedIn = new BufferedInputStream(inStream, (int) preBufferSize);
this.bufferedIn.mark((int) preBufferSize);
final ContentScraper scraper = new ContentScraper(rooturl, maxLinks);
final ContentScraper scraper = new ContentScraper(rooturl, maxLinks, vocabularyScraper);
scraper.registerHtmlFilterEventListener(this);
try {


@ -45,6 +45,7 @@ import net.yacy.cora.util.CommonPattern;
import net.yacy.document.AbstractParser;
import net.yacy.document.Document;
import net.yacy.document.Parser;
import net.yacy.document.VocabularyScraper;
import net.yacy.document.parser.html.ContentScraper;
import net.yacy.document.parser.html.ImageEntry;
import net.yacy.document.parser.html.ScraperInputStream;
@ -86,13 +87,13 @@ public class htmlParser extends AbstractParser implements Parser {
public Document[] parse(
final AnchorURL location,
final String mimeType,
final String documentCharset,
final String documentCharset, final VocabularyScraper vocscraper,
final InputStream sourceStream) throws Parser.Failure, InterruptedException {
try {
// first get a document from the parsed html
Charset[] detectedcharsetcontainer = new Charset[]{null};
final ContentScraper scraper = parseToScraper(location, documentCharset, detectedcharsetcontainer, sourceStream, maxLinks);
final ContentScraper scraper = parseToScraper(location, documentCharset, vocscraper, detectedcharsetcontainer, sourceStream, maxLinks);
// parseToScraper also detects/corrects/sets charset from html content tag
final Document document = transformScraper(location, mimeType, detectedcharsetcontainer[0].name(), scraper);
@ -150,7 +151,7 @@ public class htmlParser extends AbstractParser implements Parser {
return ppd;
}
public static ContentScraper parseToScraper(final DigestURL location, final String documentCharset, String input, int maxLinks) throws IOException {
public static ContentScraper parseToScraper(final DigestURL location, final String documentCharset, final VocabularyScraper vocabularyScraper, String input, int maxLinks) throws IOException {
Charset[] detectedcharsetcontainer = new Charset[]{null};
InputStream sourceStream;
try {
@ -160,7 +161,7 @@ public class htmlParser extends AbstractParser implements Parser {
}
ContentScraper scraper;
try {
scraper = parseToScraper(location, documentCharset, detectedcharsetcontainer, sourceStream, maxLinks);
scraper = parseToScraper(location, documentCharset, vocabularyScraper, detectedcharsetcontainer, sourceStream, maxLinks);
} catch (Failure e) {
throw new IOException(e.getMessage());
}
@ -170,6 +171,7 @@ public class htmlParser extends AbstractParser implements Parser {
public static ContentScraper parseToScraper(
final DigestURL location,
final String documentCharset,
final VocabularyScraper vocabularyScraper,
Charset[] detectedcharsetcontainer,
InputStream sourceStream,
final int maxLinks) throws Parser.Failure, IOException {
@ -186,7 +188,7 @@ public class htmlParser extends AbstractParser implements Parser {
if (charset == null) {
ScraperInputStream htmlFilter = null;
try {
htmlFilter = new ScraperInputStream(sourceStream, documentCharset, location, null, false, maxLinks);
htmlFilter = new ScraperInputStream(sourceStream, documentCharset, vocabularyScraper, location, null, false, maxLinks);
sourceStream = htmlFilter;
charset = htmlFilter.detectCharset();
} catch (final IOException e1) {
@ -220,7 +222,7 @@ public class htmlParser extends AbstractParser implements Parser {
}
// parsing the content
final ContentScraper scraper = new ContentScraper(location, maxLinks);
final ContentScraper scraper = new ContentScraper(location, maxLinks, vocabularyScraper);
final TransformerWriter writer = new TransformerWriter(null,null,scraper,null,false, Math.max(64, Math.min(4096, sourceStream.available())));
try {
FileUtils.copy(sourceStream, writer, detectedcharsetcontainer[0]);
@ -322,7 +324,7 @@ public class htmlParser extends AbstractParser implements Parser {
try {
url = new AnchorURL(args[0]);
final byte[] content = url.get(ClientIdentification.yacyInternetCrawlerAgent, null, null);
final Document[] document = new htmlParser().parse(url, "text/html", "utf-8", new ByteArrayInputStream(content));
final Document[] document = new htmlParser().parse(url, "text/html", "utf-8", new VocabularyScraper(), new ByteArrayInputStream(content));
final String title = document[0].dc_title();
System.out.println(title);
} catch (final MalformedURLException e) {


@ -53,6 +53,7 @@ import net.yacy.cora.util.ConcurrentLog;
import net.yacy.document.AbstractParser;
import net.yacy.document.Document;
import net.yacy.document.Parser;
import net.yacy.document.VocabularyScraper;
import net.yacy.document.parser.html.ImageEntry;
import net.yacy.document.parser.images.bmpParser.IMAGEMAP;
import net.yacy.kelondro.util.FileUtils;
@ -92,7 +93,7 @@ public class genericImageParser extends AbstractParser implements Parser {
public Document[] parse(
final AnchorURL location,
final String mimeType,
final String documentCharset,
final String documentCharset, final VocabularyScraper scraper,
final InputStream sourceStream) throws Parser.Failure, InterruptedException {
ImageInfo ii = null;
@ -314,7 +315,7 @@ public class genericImageParser extends AbstractParser implements Parser {
AnchorURL uri;
try {
uri = new AnchorURL("http://localhost/" + image.getName());
final Document[] document = parser.parse(uri, "image/" + MultiProtocolURL.getFileExtension(uri.getFileName()), "UTF-8", new FileInputStream(image));
final Document[] document = parser.parse(uri, "image/" + MultiProtocolURL.getFileExtension(uri.getFileName()), "UTF-8", new VocabularyScraper(), new FileInputStream(image));
System.out.println(document[0].toString());
} catch (final MalformedURLException e) {
e.printStackTrace();


@ -33,6 +33,7 @@ import com.drew.metadata.Directory;
import com.drew.metadata.Metadata;
import com.drew.metadata.Tag;
import com.drew.metadata.exif.GpsDirectory;
import java.io.IOException;
import java.io.InputStream;
import java.io.BufferedInputStream;
@ -42,11 +43,13 @@ import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import net.yacy.cora.document.id.AnchorURL;
import net.yacy.cora.document.id.MultiProtocolURL;
import net.yacy.document.AbstractParser;
import net.yacy.document.Document;
import net.yacy.document.Parser;
import net.yacy.document.VocabularyScraper;
/**
@ -84,7 +87,7 @@ public class metadataImageParser extends AbstractParser implements Parser {
public Document[] parse(
final AnchorURL location,
final String mimeType,
final String documentCharset,
final String documentCharset, final VocabularyScraper scraper,
final InputStream sourceStream) throws Parser.Failure, InterruptedException {
String title = null;


@ -28,6 +28,7 @@ import net.yacy.cora.document.id.MultiProtocolURL;
import net.yacy.document.AbstractParser;
import net.yacy.document.Document;
import net.yacy.document.Parser;
import net.yacy.document.VocabularyScraper;
/**
* This parser is used if we know that the content is text but the exact format is unknown.
@ -59,10 +60,10 @@ public class linkScraperParser extends AbstractParser implements Parser {
}
@Override
public Document[] parse(final AnchorURL location, final String mimeType,
final String charset, final InputStream source)
final String charset, final VocabularyScraper scraper, final InputStream source)
throws Parser.Failure, InterruptedException {
Document[] htmlParserDocs = new htmlParser().parse(location, mimeType, charset, source);
Document[] htmlParserDocs = new htmlParser().parse(location, mimeType, charset, scraper, source);
Document htmlParserDoc = htmlParserDocs == null ? null : Document.mergeDocuments(location, mimeType, htmlParserDocs);


@ -39,6 +39,7 @@ import net.yacy.cora.document.id.AnchorURL;
import net.yacy.document.AbstractParser;
import net.yacy.document.Document;
import net.yacy.document.Parser;
import net.yacy.document.VocabularyScraper;
import org.xml.sax.Attributes;
import org.xml.sax.SAXException;
@ -71,7 +72,7 @@ public class mmParser extends AbstractParser implements Parser {
@Override
public Document[] parse(final AnchorURL location, final String mimeType,
final String charset, final InputStream source)
final String charset, final VocabularyScraper scraper, final InputStream source)
throws Parser.Failure, InterruptedException
{
final StringBuilder sb = new StringBuilder();


@ -48,6 +48,7 @@ import net.yacy.cora.document.id.DigestURL;
import net.yacy.document.AbstractParser;
import net.yacy.document.Document;
import net.yacy.document.Parser;
import net.yacy.document.VocabularyScraper;
import net.yacy.document.parser.xml.ODContentHandler;
import net.yacy.document.parser.xml.ODMetaHandler;
import net.yacy.kelondro.io.CharBuffer;
@ -215,7 +216,7 @@ public class odtParser extends AbstractParser implements Parser {
}
@Override
public Document[] parse(final AnchorURL location, final String mimeType, final String charset, final InputStream source) throws Parser.Failure, InterruptedException {
public Document[] parse(final AnchorURL location, final String mimeType, final String charset, final VocabularyScraper scraper, final InputStream source) throws Parser.Failure, InterruptedException {
File dest = null;
try {
// creating a tempfile


@ -48,6 +48,7 @@ import net.yacy.cora.util.ConcurrentLog;
import net.yacy.document.AbstractParser;
import net.yacy.document.Document;
import net.yacy.document.Parser;
import net.yacy.document.VocabularyScraper;
import net.yacy.document.parser.xml.ODContentHandler;
import net.yacy.document.parser.xml.ODMetaHandler;
import net.yacy.kelondro.io.CharBuffer;
@ -201,7 +202,7 @@ public class ooxmlParser extends AbstractParser implements Parser {
}
@Override
public Document[] parse(final AnchorURL location, final String mimeType, final String charset, final InputStream source) throws Parser.Failure, InterruptedException {
public Document[] parse(final AnchorURL location, final String mimeType, final String charset, final VocabularyScraper scraper, final InputStream source) throws Parser.Failure, InterruptedException {
File dest = null;
try {
// creating a tempfile


@ -59,6 +59,7 @@ import net.yacy.cora.util.ConcurrentLog;
import net.yacy.document.AbstractParser;
import net.yacy.document.Document;
import net.yacy.document.Parser;
import net.yacy.document.VocabularyScraper;
import net.yacy.kelondro.io.CharBuffer;
import net.yacy.kelondro.util.FileUtils;
import net.yacy.kelondro.util.MemoryControl;
@ -85,7 +86,7 @@ public class pdfParser extends AbstractParser implements Parser {
}
@Override
public Document[] parse(final AnchorURL location, final String mimeType, final String charset, final InputStream source) throws Parser.Failure, InterruptedException {
public Document[] parse(final AnchorURL location, final String mimeType, final String charset, final VocabularyScraper scraper, final InputStream source) throws Parser.Failure, InterruptedException {
// check memory for parser
if (!MemoryControl.request(200 * 1024 * 1024, false))
@ -375,7 +376,7 @@ public class pdfParser extends AbstractParser implements Parser {
final AbstractParser parser = new pdfParser();
Document document = null;
try {
document = Document.mergeDocuments(null, "application/pdf", parser.parse(null, "application/pdf", null, new FileInputStream(pdfFile)));
document = Document.mergeDocuments(null, "application/pdf", parser.parse(null, "application/pdf", null, new VocabularyScraper(), new FileInputStream(pdfFile)));
} catch (final Parser.Failure e) {
System.err.println("Cannot parse file " + pdfFile.getAbsolutePath());
ConcurrentLog.logException(e);
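
The main() hunk above also shows what callers without a crawl context are expected to do: pass a fresh, empty VocabularyScraper, which leaves the parse result without vocabulary enrichment. A standalone invocation following that pattern could look like this sketch; the parse and merge calls mirror pdfParser.main() above, only the wrapper class and the command-line handling are assumed.

import java.io.File;
import java.io.FileInputStream;

import net.yacy.document.AbstractParser;
import net.yacy.document.Document;
import net.yacy.document.VocabularyScraper;
import net.yacy.document.parser.pdfParser;

// Hypothetical wrapper; the parse call itself mirrors pdfParser.main() above.
public class StandalonePdfExample {
    public static void main(final String[] args) throws Exception {
        final File pdfFile = new File(args[0]);
        final AbstractParser parser = new pdfParser();
        // an empty VocabularyScraper: no vocabulary terms are scraped or annotated
        final Document document = Document.mergeDocuments(null, "application/pdf",
                parser.parse(null, "application/pdf", null,
                        new VocabularyScraper(), new FileInputStream(pdfFile)));
        System.out.println(document.dc_title()); // dc_* accessor naming as used elsewhere in YaCy
    }
}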

View File

@ -37,6 +37,7 @@ import net.yacy.cora.util.ConcurrentLog;
import net.yacy.document.AbstractParser;
import net.yacy.document.Document;
import net.yacy.document.Parser;
import net.yacy.document.VocabularyScraper;
import org.apache.poi.hslf.extractor.PowerPointExtractor;
@ -62,7 +63,7 @@ public class pptParser extends AbstractParser implements Parser {
*/
@Override
public Document[] parse(final AnchorURL location, final String mimeType,
final String charset, final InputStream source) throws Parser.Failure,
final String charset, final VocabularyScraper scraper, final InputStream source) throws Parser.Failure,
InterruptedException {
try {
/*

View File

@ -41,6 +41,7 @@ import net.yacy.cora.document.id.DigestURL;
import net.yacy.document.AbstractParser;
import net.yacy.document.Document;
import net.yacy.document.Parser;
import net.yacy.document.VocabularyScraper;
import net.yacy.kelondro.util.FileUtils;
@ -258,7 +259,7 @@ public class psParser extends AbstractParser implements Parser {
@Override
public Document[] parse(final AnchorURL location, final String mimeType,
final String charset, final InputStream source)
final String charset, final VocabularyScraper scraper, final InputStream source)
throws Parser.Failure, InterruptedException {
File tempFile = null;

View File

@ -34,6 +34,7 @@ import net.yacy.cora.document.id.AnchorURL;
import net.yacy.document.AbstractParser;
import net.yacy.document.Document;
import net.yacy.document.Parser;
import net.yacy.document.VocabularyScraper;
public class rdfParser extends AbstractParser implements Parser {
@ -46,7 +47,7 @@ public class rdfParser extends AbstractParser implements Parser {
@Override
public Document[] parse(final AnchorURL url, final String mimeType,
final String charset, final InputStream source)
final String charset, final VocabularyScraper scraper, final InputStream source)
throws Failure, InterruptedException {

View File

@ -23,6 +23,7 @@ import net.yacy.cora.util.ConcurrentLog;
import net.yacy.document.AbstractParser;
import net.yacy.document.Document;
import net.yacy.document.Parser;
import net.yacy.document.VocabularyScraper;
import net.yacy.document.parser.htmlParser;
import net.yacy.document.parser.rdfa.IRDFaTriple;
@ -48,10 +49,10 @@ public class RDFaParser extends AbstractParser implements Parser {
@Override
public Document[] parse(AnchorURL url, String mimeType,
String charset, InputStream source) throws Failure,
String charset, final VocabularyScraper scraper, InputStream source) throws Failure,
InterruptedException {
Document[] htmlDocs = parseHtml(url, mimeType, charset, source);
Document[] htmlDocs = parseHtml(url, mimeType, charset, scraper, source);
// TODO: current hardcoded restriction: apply rdfa parser only on selected sources.
@ -97,12 +98,12 @@ public class RDFaParser extends AbstractParser implements Parser {
}
private Document[] parseHtml(AnchorURL url, String mimeType,
String charset, InputStream source) throws Failure,
String charset, VocabularyScraper scraper, InputStream source) throws Failure,
InterruptedException {
Document[] htmlDocs = null;
try {
htmlDocs = this.hp.parse(url, mimeType, charset, source);
htmlDocs = this.hp.parse(url, mimeType, charset, scraper, source);
source.reset();
} catch (final IOException e1) {
@ -179,7 +180,7 @@ public class RDFaParser extends AbstractParser implements Parser {
if (aReader != null) {
RDFaParser aParser = new RDFaParser();
try {
aParser.parse(new AnchorURL(args[0]),"","",aURL.openStream());
aParser.parse(new AnchorURL(args[0]), "", "", new VocabularyScraper(), aURL.openStream());
} catch (final FileNotFoundException e) {
e.printStackTrace();
} catch (final IOException e) {

View File

@ -43,6 +43,7 @@ import net.yacy.document.AbstractParser;
import net.yacy.document.Document;
import net.yacy.document.Parser;
import net.yacy.document.TextParser;
import net.yacy.document.VocabularyScraper;
import net.yacy.document.parser.html.ImageEntry;
public class rssParser extends AbstractParser implements Parser {
@ -59,7 +60,7 @@ public class rssParser extends AbstractParser implements Parser {
@Override
public Document[] parse(final AnchorURL url, final String mimeType,
final String charset, final InputStream source)
final String charset, final VocabularyScraper scraper, final InputStream source)
throws Failure, InterruptedException {
RSSReader rssReader;
try {

View File

@ -37,6 +37,7 @@ import net.yacy.cora.document.id.AnchorURL;
import net.yacy.document.AbstractParser;
import net.yacy.document.Document;
import net.yacy.document.Parser;
import net.yacy.document.VocabularyScraper;
public class rtfParser extends AbstractParser implements Parser {
@ -53,7 +54,7 @@ public class rtfParser extends AbstractParser implements Parser {
@Override
public Document[] parse(final AnchorURL location, final String mimeType,
final String charset, final InputStream source)
final String charset, final VocabularyScraper scraper, final InputStream source)
throws Parser.Failure, InterruptedException {
try {

View File

@ -40,6 +40,7 @@ import net.yacy.document.AbstractParser;
import net.yacy.document.Document;
import net.yacy.document.Parser;
import net.yacy.document.TextParser;
import net.yacy.document.VocabularyScraper;
import net.yacy.kelondro.util.FileUtils;
import SevenZip.ArchiveExtractCallback;
import SevenZip.IInStream;
@ -105,7 +106,7 @@ public class sevenzipParser extends AbstractParser implements Parser {
}
@Override
public Document[] parse(final AnchorURL location, final String mimeType, final String charset, final InputStream source) throws Parser.Failure, InterruptedException {
public Document[] parse(final AnchorURL location, final String mimeType, final String charset, final VocabularyScraper scraper, final InputStream source) throws Parser.Failure, InterruptedException {
try {
final ByteArrayOutputStream cfos = new ByteArrayOutputStream();
FileUtils.copy(source, cfos);
@ -171,7 +172,7 @@ public class sevenzipParser extends AbstractParser implements Parser {
// below for reversion of the effects
final AnchorURL url = AnchorURL.newAnchor(this.doc.dc_source(), this.prefix + "/" + super.filePath);
final String mime = TextParser.mimeOf(super.filePath.substring(super.filePath.lastIndexOf('.') + 1));
theDocs = TextParser.parseSource(url, mime, null, this.doc.getDepth() + 1, this.cfos.toByteArray());
theDocs = TextParser.parseSource(url, mime, null, new VocabularyScraper(), this.doc.getDepth() + 1, this.cfos.toByteArray());
this.doc.addSubDocuments(theDocs);
}

View File

@ -35,6 +35,7 @@ import net.yacy.cora.document.id.AnchorURL;
import net.yacy.document.AbstractParser;
import net.yacy.document.Document;
import net.yacy.document.Parser;
import net.yacy.document.VocabularyScraper;
// this is a new implementation of this parser idiom using multiple documents as result set
@ -58,7 +59,7 @@ public class sidAudioParser extends AbstractParser implements Parser {
@Override
public Document[] parse(final AnchorURL location, final String mimeType,
final String charset, final InputStream source)
final String charset, final VocabularyScraper scraper, final InputStream source)
throws Parser.Failure, InterruptedException {
try {
final int available = source.available();

View File

@ -51,6 +51,7 @@ import net.yacy.document.AbstractParser;
import net.yacy.document.Document;
import net.yacy.document.Parser;
import net.yacy.document.TextParser;
import net.yacy.document.VocabularyScraper;
import net.yacy.document.parser.html.ImageEntry;
import net.yacy.kelondro.io.ByteCountInputStream;
@ -70,7 +71,7 @@ public class sitemapParser extends AbstractParser implements Parser {
@Override
public Document[] parse(final AnchorURL url, final String mimeType,
final String charset, final InputStream source)
final String charset, final VocabularyScraper scraper, final InputStream source)
throws Failure, InterruptedException {
final List<Document> docs = new ArrayList<Document>();
SitemapReader sitemap = new SitemapReader(source, ClientIdentification.yacyInternetCrawlerAgent);

View File

@ -37,6 +37,7 @@ import net.yacy.cora.document.id.AnchorURL;
import net.yacy.document.AbstractParser;
import net.yacy.document.Document;
import net.yacy.document.Parser;
import net.yacy.document.VocabularyScraper;
import pt.tumba.parser.swf.SWF2HTML;
public class swfParser extends AbstractParser implements Parser {
@ -56,7 +57,7 @@ public class swfParser extends AbstractParser implements Parser {
*/
@Override
public Document[] parse(final AnchorURL location, final String mimeType,
final String charset, final InputStream source)
final String charset, final VocabularyScraper scraper, final InputStream source)
throws Parser.Failure, InterruptedException
{

View File

@ -40,6 +40,7 @@ import net.yacy.document.AbstractParser;
import net.yacy.document.Document;
import net.yacy.document.Parser;
import net.yacy.document.TextParser;
import net.yacy.document.VocabularyScraper;
import net.yacy.kelondro.util.FileUtils;
import org.apache.tools.tar.TarEntry;
@ -61,7 +62,7 @@ public class tarParser extends AbstractParser implements Parser {
}
@Override
public Document[] parse(final AnchorURL url, final String mimeType, final String charset, InputStream source) throws Parser.Failure, InterruptedException {
public Document[] parse(final AnchorURL url, final String mimeType, final String charset, final VocabularyScraper scraper, InputStream source) throws Parser.Failure, InterruptedException {
final List<Document> docacc = new ArrayList<Document>();
Document[] subDocs = null;
@ -90,7 +91,7 @@ public class tarParser extends AbstractParser implements Parser {
try {
tmp = FileUtils.createTempFile(this.getClass(), name);
FileUtils.copy(tis, tmp, entry.getSize());
subDocs = TextParser.parseSource(AnchorURL.newAnchor(url, "#" + name), mime, null, 999, tmp);
subDocs = TextParser.parseSource(AnchorURL.newAnchor(url, "#" + name), mime, null, scraper, 999, tmp);
if (subDocs == null) continue;
for (final Document d: subDocs) docacc.add(d);
} catch (final Parser.Failure e) {
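
Archive parsers differ in one detail worth noting: tarParser here (and zipParser further down) pass the caller's scraper into TextParser.parseSource() for every entry, so documents embedded in an archive are scraped against the same vocabularies as the archive itself, whereas sevenzipParser above constructs a fresh VocabularyScraper inside its extraction callback, where the caller's instance is not in scope. A condensed sketch of the per-entry pattern; every library call appears in the hunks above or in sevenzipParser, only the helper class and parameter names are assumed.

import java.io.File;
import java.io.InputStream;

import net.yacy.cora.document.id.AnchorURL;
import net.yacy.document.Document;
import net.yacy.document.TextParser;
import net.yacy.document.VocabularyScraper;
import net.yacy.kelondro.util.FileUtils;

// Hypothetical helper condensing the per-entry logic of tarParser/zipParser.
final class ArchiveEntryExample {
    static Document[] parseEntry(final AnchorURL archiveUrl, final String entryName,
            final InputStream entryStream, final long entrySize,
            final VocabularyScraper scraper) throws Exception {
        // mime type derived from the entry's file extension, as in sevenzipParser above
        final String mime = TextParser.mimeOf(entryName.substring(entryName.lastIndexOf('.') + 1));
        final File tmp = FileUtils.createTempFile(ArchiveEntryExample.class, entryName);
        FileUtils.copy(entryStream, tmp, entrySize);
        // same scraper instance as the enclosing archive, so embedded documents
        // are matched against the same vocabulary class configuration
        return TextParser.parseSource(AnchorURL.newAnchor(archiveUrl, "#" + entryName),
                mime, null, scraper, 999, tmp);
    }
}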

View File

@ -40,6 +40,7 @@ import net.yacy.document.Condenser;
import net.yacy.document.Document;
import net.yacy.document.LibraryProvider;
import net.yacy.document.Parser;
import net.yacy.document.VocabularyScraper;
import net.yacy.kelondro.data.word.Word;
import net.yacy.kelondro.util.BDecoder;
import net.yacy.kelondro.util.BDecoder.BObject;
@ -56,7 +57,7 @@ public class torrentParser extends AbstractParser implements Parser {
}
@Override
public Document[] parse(AnchorURL location, String mimeType, String charset, InputStream source)
public Document[] parse(AnchorURL location, String mimeType, String charset, final VocabularyScraper scraper, InputStream source)
throws Parser.Failure, InterruptedException {
byte[] b = null;
try {
@ -119,8 +120,8 @@ public class torrentParser extends AbstractParser implements Parser {
try {
byte[] b = FileUtils.read(new File(args[0]));
torrentParser parser = new torrentParser();
Document[] d = parser.parse(new AnchorURL("http://localhost/test.torrent"), null, "UTF-8", new ByteArrayInputStream(b));
Condenser c = new Condenser(d[0], true, true, LibraryProvider.dymLib, false, false);
Document[] d = parser.parse(new AnchorURL("http://localhost/test.torrent"), null, "UTF-8", new VocabularyScraper(), new ByteArrayInputStream(b));
Condenser c = new Condenser(d[0], null, true, true, LibraryProvider.dymLib, false, false);
Map<String, Word> w = c.words();
for (Map.Entry<String, Word> e: w.entrySet()) System.out.println("Word: " + e.getKey() + " - " + e.getValue().posInText);
} catch (final IOException e) {
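
The main() hunk above also demonstrates the second half of the change: Condenser now takes the VocabularyScraper as its second constructor argument, which is how scraped terms reach the semantic facet of a document during condensing; passing null (as here, and in Segment and DocumentIndex below) leaves condensing unchanged. A minimal sketch, with only the wrapper class assumed:

import net.yacy.document.Condenser;
import net.yacy.document.Document;
import net.yacy.document.LibraryProvider;
import net.yacy.document.VocabularyScraper;

// Hypothetical wrapper; the constructor call mirrors torrentParser.main() above.
final class CondenserExample {
    static Condenser condense(final Document document, final VocabularyScraper scraper) {
        // argument order: document, scraper, indexText, indexMedia, dictionary,
        // then two boolean flags; the last one gates date extraction, as the
        // CollectionSchema.dates_in_content_sxt checks in Switchboard suggest
        return new Condenser(document, scraper, true, true, LibraryProvider.dymLib, false, false);
    }
}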

View File

@ -46,6 +46,7 @@ import net.yacy.cora.util.CommonPattern;
import net.yacy.document.AbstractParser;
import net.yacy.document.Document;
import net.yacy.document.Parser;
import net.yacy.document.VocabularyScraper;
/**
* Vcard specification: http://www.imc.org/pdi/vcard-21.txt
@ -65,7 +66,7 @@ public class vcfParser extends AbstractParser implements Parser {
}
@Override
public Document[] parse(final AnchorURL url, final String mimeType, final String charset, final InputStream source)
public Document[] parse(final AnchorURL url, final String mimeType, final String charset, final VocabularyScraper scraper, final InputStream source)
throws Parser.Failure, InterruptedException {
try {

View File

@ -37,6 +37,7 @@ import net.yacy.cora.util.ConcurrentLog;
import net.yacy.document.AbstractParser;
import net.yacy.document.Document;
import net.yacy.document.Parser;
import net.yacy.document.VocabularyScraper;
import org.apache.poi.hdgf.extractor.VisioTextExtractor;
import org.apache.poi.hpsf.SummaryInformation;
@ -66,7 +67,7 @@ public class vsdParser extends AbstractParser implements Parser {
* all extracted information about the parsed document
*/
@Override
public Document[] parse(final AnchorURL location, final String mimeType, final String charset, final InputStream source)
public Document[] parse(final AnchorURL location, final String mimeType, final String charset, final VocabularyScraper scraper, final InputStream source)
throws Parser.Failure, InterruptedException {
Document theDoc = null;

View File

@ -36,6 +36,7 @@ import net.yacy.cora.util.ConcurrentLog;
import net.yacy.document.AbstractParser;
import net.yacy.document.Document;
import net.yacy.document.Parser;
import net.yacy.document.VocabularyScraper;
import org.apache.poi.hssf.eventusermodel.HSSFEventFactory;
import org.apache.poi.hssf.eventusermodel.HSSFListener;
@ -68,7 +69,7 @@ public class xlsParser extends AbstractParser implements Parser {
*/
@Override
public Document[] parse(final AnchorURL location, final String mimeType,
final String charset, final InputStream source) throws Parser.Failure,
final String charset, final VocabularyScraper scraper, final InputStream source) throws Parser.Failure,
InterruptedException {
return new XLSHSSFListener().parse(location, mimeType, charset, source);
}

View File

@ -38,6 +38,7 @@ import net.yacy.document.AbstractParser;
import net.yacy.document.Document;
import net.yacy.document.Parser;
import net.yacy.document.TextParser;
import net.yacy.document.VocabularyScraper;
import net.yacy.kelondro.util.FileUtils;
import net.yacy.kelondro.util.MemoryControl;
@ -62,7 +63,7 @@ public class zipParser extends AbstractParser implements Parser {
@Override
public Document[] parse(final AnchorURL url, final String mimeType,
final String charset, final InputStream source)
final String charset, final VocabularyScraper scraper, final InputStream source)
throws Parser.Failure, InterruptedException {
// check memory for parser
if (!MemoryControl.request(200 * 1024 * 1024, false))
@ -89,7 +90,7 @@ public class zipParser extends AbstractParser implements Parser {
FileUtils.copy(zis, tmp, entry.getSize());
final DigestURL virtualURL = DigestURL.newURL(url, "#" + name);
//this.log.logInfo("ZIP file parser: " + virtualURL.toNormalform(false, false));
docs = TextParser.parseSource(new AnchorURL(virtualURL), mime, null, 999, tmp);
docs = TextParser.parseSource(new AnchorURL(virtualURL), mime, null, scraper, 999, tmp);
if (docs == null) continue;
for (final Document d: docs) docacc.add(d);
} catch (final Parser.Failure e) {

View File

@ -1045,7 +1045,6 @@ public class YaCyDefaultServlet extends HttpServlet {
upload.setFileSizeMax(SIZE_FILE_THRESHOLD);
try {
// Parse the request to get form field items
@SuppressWarnings("unchecked")
List<FileItem> fileItems = upload.parseRequest(request);
// Process the uploaded file items
Iterator<FileItem> i = fileItems.iterator();

View File

@ -418,7 +418,7 @@ public final class LoaderDispatcher {
final String supportError = TextParser.supports(url, responseHeader.mime());
if (supportError != null) throw new IOException("no parser support: " + supportError);
try {
documents = TextParser.parseSource(url, responseHeader.mime(), responseHeader.getCharacterEncoding(), response.depth(), response.getContent());
documents = TextParser.parseSource(url, responseHeader.mime(), responseHeader.getCharacterEncoding(), response.profile().scraper(), response.depth(), response.getContent());
if (documents == null) throw new IOException("document == null");
} catch (final Exception e) {
throw new IOException("parser error: " + e.getMessage());
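
Inside the crawler no scraper is ever constructed ad hoc: LoaderDispatcher here, and Switchboard below, obtain it from the crawl profile via response.profile().scraper(), which is where the per-vocabulary class names entered on /CrawlStartExpert.html end up. A condensed sketch of this parse step follows; every call on response and responseHeader appears in the hunk above, while the wrapper class, the Response/ResponseHeader parameter types and their import paths are assumptions.

import java.io.IOException;

import net.yacy.cora.document.id.AnchorURL;
import net.yacy.cora.protocol.ResponseHeader; // assumed package, by analogy with RequestHeader
import net.yacy.crawler.retrieval.Response;   // assumed package
import net.yacy.document.Document;
import net.yacy.document.TextParser;

// Hypothetical extraction of the parse step in LoaderDispatcher.
final class LoaderParseExample {
    static Document[] parseResponse(final AnchorURL url, final Response response,
            final ResponseHeader responseHeader) throws IOException {
        final String supportError = TextParser.supports(url, responseHeader.mime());
        if (supportError != null) throw new IOException("no parser support: " + supportError);
        try {
            return TextParser.parseSource(url, responseHeader.mime(),
                    responseHeader.getCharacterEncoding(),
                    response.profile().scraper(), // scraper configured in the crawl profile
                    response.depth(), response.getContent());
        } catch (final Exception e) {
            throw new IOException("parser error: " + e.getMessage());
        }
    }
}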

View File

@ -2570,6 +2570,7 @@ public final class Switchboard extends serverSwitch {
new AnchorURL(response.url()),
response.getMimeType(),
response.getCharacterEncoding(),
response.profile().scraper(),
response.depth(),
response.getContent());
if ( documents == null ) {
@ -2750,7 +2751,7 @@ public final class Switchboard extends serverSwitch {
for ( int i = 0; i < in.documents.length; i++ ) {
condenser[i] =
new Condenser(
in.documents[i], in.queueEntry.profile().indexText(),
in.documents[i], in.queueEntry.profile().scraper(), in.queueEntry.profile().indexText(),
in.queueEntry.profile().indexMedia(),
LibraryProvider.dymLib, true, this.index.fulltext().getDefaultConfiguration().contains(CollectionSchema.dates_in_content_sxt));
@ -3189,7 +3190,7 @@ public final class Switchboard extends serverSwitch {
throw new Parser.Failure("indexing is denied", url);
}
final Condenser condenser = new Condenser(
document, true, true, LibraryProvider.dymLib, true,
document, null, true, true, LibraryProvider.dymLib, true,
Switchboard.this.index.fulltext().getDefaultConfiguration().contains(CollectionSchema.dates_in_content_sxt));
ResultImages.registerImages(url, document, true);
Switchboard.this.webStructure.generateCitationReference(url, document);

View File

@ -42,6 +42,7 @@ import net.yacy.document.Condenser;
import net.yacy.document.Document;
import net.yacy.document.LibraryProvider;
import net.yacy.document.TextParser;
import net.yacy.document.VocabularyScraper;
import net.yacy.kelondro.workflow.WorkflowProcessor;
import net.yacy.search.schema.CollectionConfiguration;
import net.yacy.search.schema.WebgraphConfiguration;
@ -149,7 +150,7 @@ public class DocumentIndex extends Segment {
length = -1;
}
try {
documents = TextParser.parseSource(url, null, null, 0, length, url.getInputStream(ClientIdentification.yacyInternetCrawlerAgent, null, null));
documents = TextParser.parseSource(url, null, null, new VocabularyScraper(), 0, length, url.getInputStream(ClientIdentification.yacyInternetCrawlerAgent, null, null));
} catch (final Exception e ) {
throw new IOException("cannot parse " + url.toNormalform(false) + ": " + e.getMessage());
}
@ -158,7 +159,7 @@ public class DocumentIndex extends Segment {
int c = 0;
for ( final Document document : documents ) {
if (document == null) continue;
final Condenser condenser = new Condenser(document, true, true, LibraryProvider.dymLib, true, true);
final Condenser condenser = new Condenser(document, null, true, true, LibraryProvider.dymLib, true, true);
rows[c++] =
super.storeDocument(
url,

View File

@ -761,7 +761,7 @@ public class Segment {
}
// get the word set
Set<String> words = null;
words = new Condenser(document, true, true, null, false, false).words().keySet();
words = new Condenser(document, null, true, true, null, false, false).words().keySet();
// delete all word references
int count = 0;