mirror of
https://github.com/yacy/yacy_search_server.git
synced 2024-09-19 00:01:41 +02:00
refactoring: better abstraction of reference and metadata prototypes.
This prepares for the introduction of further index tables, which are currently used only for reverse text indexes. The next application of the reverse index will be a citation index. Moved to version 0.74. git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@5777 6c8d7289-2bf4-0310-a012-ef5d649a1542
This commit is contained in:
parent
ab656687d7
commit
c2359f20dd
|
@ -3,7 +3,7 @@ javacSource=1.5
|
|||
javacTarget=1.5
|
||||
|
||||
# Release Configuration
|
||||
releaseVersion=0.73
|
||||
releaseVersion=0.74
|
||||
stdReleaseFile=yacy_v${releaseVersion}_${DSTAMP}_${releaseNr}.tar.gz
|
||||
embReleaseFile=yacy_emb_v${releaseVersion}_${DSTAMP}_${releaseNr}.tar.gz
|
||||
proReleaseFile=yacy_pro_v${releaseVersion}_${DSTAMP}_${releaseNr}.tar.gz
|
||||
|
|
|
@ -45,11 +45,11 @@ import java.util.regex.Matcher;
|
|||
import java.util.regex.Pattern;
|
||||
import java.util.regex.PatternSyntaxException;
|
||||
|
||||
import de.anomic.data.AbstractBlacklist;
|
||||
import de.anomic.data.Blacklist;
|
||||
import de.anomic.data.DefaultBlacklist;
|
||||
import de.anomic.data.listManager;
|
||||
import de.anomic.http.httpRequestHeader;
|
||||
import de.anomic.kelondro.text.Blacklist;
|
||||
import de.anomic.kelondro.text.AbstractBlacklist;
|
||||
import de.anomic.kelondro.text.DefaultBlacklist;
|
||||
import de.anomic.kelondro.util.Log;
|
||||
import de.anomic.plasma.plasmaSwitchboard;
|
||||
import de.anomic.server.serverObjects;
|
||||
|
|
|
@ -32,9 +32,9 @@
|
|||
import java.io.File;
|
||||
import java.net.MalformedURLException;
|
||||
|
||||
import de.anomic.data.Blacklist;
|
||||
import de.anomic.data.listManager;
|
||||
import de.anomic.http.httpRequestHeader;
|
||||
import de.anomic.kelondro.text.Blacklist;
|
||||
import de.anomic.plasma.plasmaSwitchboard;
|
||||
import de.anomic.server.serverObjects;
|
||||
import de.anomic.server.serverSwitch;
|
||||
|
|
|
@ -38,10 +38,10 @@ import java.util.ArrayList;
|
|||
import java.util.Arrays;
|
||||
import java.util.List;
|
||||
|
||||
import de.anomic.data.AbstractBlacklist;
|
||||
import de.anomic.data.Blacklist;
|
||||
import de.anomic.data.listManager;
|
||||
import de.anomic.http.httpRequestHeader;
|
||||
import de.anomic.kelondro.text.Blacklist;
|
||||
import de.anomic.kelondro.text.AbstractBlacklist;
|
||||
import de.anomic.kelondro.util.Log;
|
||||
import de.anomic.plasma.plasmaSwitchboard;
|
||||
import de.anomic.server.serverObjects;
|
||||
|
|
|
@ -41,8 +41,7 @@ import de.anomic.data.listManager;
|
|||
import de.anomic.data.userDB;
|
||||
import de.anomic.data.bookmarksDB.Tag;
|
||||
import de.anomic.http.httpRequestHeader;
|
||||
import de.anomic.kelondro.text.MetadataRowContainer;
|
||||
import de.anomic.kelondro.text.URLMetadata;
|
||||
import de.anomic.kelondro.text.metadataPrototype.URLMetadataRow;
|
||||
import de.anomic.kelondro.util.DateFormatter;
|
||||
import de.anomic.kelondro.util.Log;
|
||||
import de.anomic.plasma.plasmaParserDocument;
|
||||
|
@ -184,10 +183,10 @@ public class Bookmarks {
|
|||
final bookmarksDB.Bookmark bookmark = sb.bookmarksDB.getBookmark(urlHash);
|
||||
if (bookmark == null) {
|
||||
// try to get the bookmark from the LURL database
|
||||
final MetadataRowContainer urlentry = sb.webIndex.metadata().load(urlHash, null, 0);
|
||||
final URLMetadataRow urlentry = sb.webIndex.metadata().load(urlHash, null, 0);
|
||||
plasmaParserDocument document = null;
|
||||
if (urlentry != null) {
|
||||
final URLMetadata metadata = urlentry.metadata();
|
||||
final URLMetadataRow.Components metadata = urlentry.metadata();
|
||||
document = plasmaSnippetCache.retrieveDocument(metadata.url(), true, 5000, true, false);
|
||||
prop.put("mode_edit", "0"); // create mode
|
||||
prop.put("mode_url", metadata.url().toNormalform(false, true));
|
||||
|
|
|
@ -31,8 +31,7 @@ import java.util.Iterator;
|
|||
import java.util.Locale;
|
||||
|
||||
import de.anomic.http.httpRequestHeader;
|
||||
import de.anomic.kelondro.text.MetadataRowContainer;
|
||||
import de.anomic.kelondro.text.URLMetadata;
|
||||
import de.anomic.kelondro.text.metadataPrototype.URLMetadataRow;
|
||||
import de.anomic.kelondro.util.Log;
|
||||
import de.anomic.plasma.plasmaSwitchboard;
|
||||
import de.anomic.server.serverObjects;
|
||||
|
@ -170,8 +169,8 @@ public class CrawlResults {
|
|||
String urlHash, initiatorHash, executorHash;
|
||||
String urlstr, urltxt;
|
||||
yacySeed initiatorSeed, executorSeed;
|
||||
MetadataRowContainer urle;
|
||||
URLMetadata metadata;
|
||||
URLMetadataRow urle;
|
||||
URLMetadataRow.Components metadata;
|
||||
|
||||
int i, cnt = 0;
|
||||
for (i = sb.crawlResults.getStackSize(tabletype) - 1; i >= (sb.crawlResults.getStackSize(tabletype) - lines); i--) {
|
||||
|
|
|
@ -34,21 +34,21 @@ import java.util.HashSet;
|
|||
import java.util.Iterator;
|
||||
import java.util.Set;
|
||||
|
||||
import de.anomic.data.AbstractBlacklist;
|
||||
import de.anomic.data.listManager;
|
||||
import de.anomic.http.httpRequestHeader;
|
||||
import de.anomic.kelondro.order.Bitfield;
|
||||
import de.anomic.kelondro.text.MetadataRowContainer;
|
||||
import de.anomic.kelondro.text.Reference;
|
||||
import de.anomic.kelondro.text.ReferenceContainer;
|
||||
import de.anomic.kelondro.text.ReferenceContainerCache;
|
||||
import de.anomic.kelondro.text.ReferenceRow;
|
||||
import de.anomic.kelondro.text.Word;
|
||||
import de.anomic.kelondro.text.AbstractBlacklist;
|
||||
import de.anomic.kelondro.text.metadataPrototype.URLMetadataRow;
|
||||
import de.anomic.kelondro.text.referencePrototype.WordReferenceRow;
|
||||
import de.anomic.plasma.plasmaSearchAPI;
|
||||
import de.anomic.plasma.plasmaSearchEvent;
|
||||
import de.anomic.plasma.plasmaSearchRankingProcess;
|
||||
import de.anomic.plasma.plasmaSwitchboard;
|
||||
import de.anomic.plasma.plasmaWordIndex;
|
||||
import de.anomic.plasma.parser.Word;
|
||||
import de.anomic.server.serverObjects;
|
||||
import de.anomic.server.serverSwitch;
|
||||
import de.anomic.yacy.yacyClient;
|
||||
|
@ -126,7 +126,7 @@ public class IndexControlRWIs_p {
|
|||
// generate an urlx array
|
||||
ReferenceContainer index = null;
|
||||
index = sb.webIndex.index().get(keyhash, null);
|
||||
final Iterator<ReferenceRow> en = index.entries();
|
||||
final Iterator<WordReferenceRow> en = index.entries();
|
||||
int i = 0;
|
||||
urlx = new String[index.size()];
|
||||
while (en.hasNext()) {
|
||||
|
@ -207,11 +207,11 @@ public class IndexControlRWIs_p {
|
|||
final long starttime = System.currentTimeMillis();
|
||||
index = sb.webIndex.index().get(keyhash, null);
|
||||
// built urlCache
|
||||
final Iterator<ReferenceRow> urlIter = index.entries();
|
||||
final HashMap<String, MetadataRowContainer> knownURLs = new HashMap<String, MetadataRowContainer>();
|
||||
final Iterator<WordReferenceRow> urlIter = index.entries();
|
||||
final HashMap<String, URLMetadataRow> knownURLs = new HashMap<String, URLMetadataRow>();
|
||||
final HashSet<String> unknownURLEntries = new HashSet<String>();
|
||||
Reference iEntry;
|
||||
MetadataRowContainer lurl;
|
||||
URLMetadataRow lurl;
|
||||
while (urlIter.hasNext()) {
|
||||
iEntry = urlIter.next();
|
||||
lurl = sb.webIndex.metadata().load(iEntry.urlHash(), null, 0);
|
||||
|
@ -251,7 +251,7 @@ public class IndexControlRWIs_p {
|
|||
prop.put("keyhashsimilar", "1");
|
||||
while (containerIt.hasNext() && i < 256) {
|
||||
container = containerIt.next();
|
||||
prop.put("keyhashsimilar_rows_"+rows+"_cols_"+cols+"_wordHash", container.getWordHash());
|
||||
prop.put("keyhashsimilar_rows_"+rows+"_cols_"+cols+"_wordHash", container.getTermHash());
|
||||
cols++;
|
||||
if (cols==8) {
|
||||
prop.put("keyhashsimilar_rows_"+rows+"_cols", cols);
|
||||
|
@ -278,7 +278,7 @@ public class IndexControlRWIs_p {
|
|||
yacyURL url;
|
||||
for (int i=0; i<urlx.length; i++) {
|
||||
urlHashes.add(urlx[i]);
|
||||
final MetadataRowContainer e = sb.webIndex.metadata().load(urlx[i], null, 0);
|
||||
final URLMetadataRow e = sb.webIndex.metadata().load(urlx[i], null, 0);
|
||||
sb.webIndex.metadata().remove(urlx[i]);
|
||||
if (e != null) {
|
||||
url = e.metadata().url();
|
||||
|
@ -306,7 +306,7 @@ public class IndexControlRWIs_p {
|
|||
yacyURL url;
|
||||
for (int i=0; i<urlx.length; i++) {
|
||||
urlHashes.add(urlx[i]);
|
||||
final MetadataRowContainer e = sb.webIndex.metadata().load(urlx[i], null, 0);
|
||||
final URLMetadataRow e = sb.webIndex.metadata().load(urlx[i], null, 0);
|
||||
sb.webIndex.metadata().remove(urlx[i]);
|
||||
if (e != null) {
|
||||
url = e.metadata().url();
|
||||
|
|
|
@ -33,9 +33,8 @@ import java.util.Iterator;
|
|||
import de.anomic.http.httpRequestHeader;
|
||||
import de.anomic.kelondro.order.Base64Order;
|
||||
import de.anomic.kelondro.order.RotateIterator;
|
||||
import de.anomic.kelondro.text.MetadataRowContainer;
|
||||
import de.anomic.kelondro.text.URLMetadata;
|
||||
import de.anomic.kelondro.text.MetadataRepository;
|
||||
import de.anomic.kelondro.text.metadataPrototype.URLMetadataRow;
|
||||
import de.anomic.kelondro.util.DateFormatter;
|
||||
import de.anomic.plasma.plasmaSwitchboard;
|
||||
import de.anomic.server.serverObjects;
|
||||
|
@ -116,7 +115,7 @@ public class IndexControlURLs_p {
|
|||
}
|
||||
|
||||
if (post.containsKey("urlhashdelete")) {
|
||||
final MetadataRowContainer entry = sb.webIndex.metadata().load(urlhash, null, 0);
|
||||
final URLMetadataRow entry = sb.webIndex.metadata().load(urlhash, null, 0);
|
||||
if (entry == null) {
|
||||
prop.putHTML("result", "No Entry for URL hash " + urlhash + "; nothing deleted.");
|
||||
} else {
|
||||
|
@ -150,7 +149,7 @@ public class IndexControlURLs_p {
|
|||
final yacyURL url = new yacyURL(urlstring, null);
|
||||
urlhash = url.hash();
|
||||
prop.put("urlhash", urlhash);
|
||||
final MetadataRowContainer entry = sb.webIndex.metadata().load(urlhash, null, 0);
|
||||
final URLMetadataRow entry = sb.webIndex.metadata().load(urlhash, null, 0);
|
||||
if (entry == null) {
|
||||
prop.putHTML("urlstring", "unknown url: " + urlstring);
|
||||
prop.put("urlhash", "");
|
||||
|
@ -167,7 +166,7 @@ public class IndexControlURLs_p {
|
|||
}
|
||||
|
||||
if (post.containsKey("urlhashsearch")) {
|
||||
final MetadataRowContainer entry = sb.webIndex.metadata().load(urlhash, null, 0);
|
||||
final URLMetadataRow entry = sb.webIndex.metadata().load(urlhash, null, 0);
|
||||
if (entry == null) {
|
||||
prop.putHTML("result", "No Entry for URL hash " + urlhash);
|
||||
} else {
|
||||
|
@ -182,9 +181,9 @@ public class IndexControlURLs_p {
|
|||
// generate list
|
||||
if (post.containsKey("urlhashsimilar")) {
|
||||
try {
|
||||
final Iterator<MetadataRowContainer> entryIt = new RotateIterator<MetadataRowContainer>(sb.webIndex.metadata().entries(true, urlhash), new String(Base64Order.zero((urlhash == null ? 0 : urlhash.length()))), sb.webIndex.index().size());
|
||||
final Iterator<URLMetadataRow> entryIt = new RotateIterator<URLMetadataRow>(sb.webIndex.metadata().entries(true, urlhash), new String(Base64Order.zero((urlhash == null ? 0 : urlhash.length()))), sb.webIndex.index().size());
|
||||
final StringBuilder result = new StringBuilder("Sequential List of URL-Hashes:<br />");
|
||||
MetadataRowContainer entry;
|
||||
URLMetadataRow entry;
|
||||
int i = 0;
|
||||
int rows = 0, cols = 0;
|
||||
prop.put("urlhashsimilar", "1");
|
||||
|
@ -286,15 +285,15 @@ public class IndexControlURLs_p {
|
|||
return prop;
|
||||
}
|
||||
|
||||
private static serverObjects genUrlProfile(final plasmaSwitchboard switchboard, final MetadataRowContainer entry, final String urlhash) {
|
||||
private static serverObjects genUrlProfile(final plasmaSwitchboard switchboard, final URLMetadataRow entry, final String urlhash) {
|
||||
final serverObjects prop = new serverObjects();
|
||||
if (entry == null) {
|
||||
prop.put("genUrlProfile", "1");
|
||||
prop.put("genUrlProfile_urlhash", urlhash);
|
||||
return prop;
|
||||
}
|
||||
final URLMetadata metadata = entry.metadata();
|
||||
final MetadataRowContainer le = ((entry.referrerHash() == null) || (entry.referrerHash().length() != yacySeedDB.commonHashLength)) ? null : switchboard.webIndex.metadata().load(entry.referrerHash(), null, 0);
|
||||
final URLMetadataRow.Components metadata = entry.metadata();
|
||||
final URLMetadataRow le = ((entry.referrerHash() == null) || (entry.referrerHash().length() != yacySeedDB.commonHashLength)) ? null : switchboard.webIndex.metadata().load(entry.referrerHash(), null, 0);
|
||||
if (metadata.url() == null) {
|
||||
prop.put("genUrlProfile", "1");
|
||||
prop.put("genUrlProfile_urlhash", urlhash);
|
||||
|
|
|
@ -31,11 +31,11 @@ import java.util.Date;
|
|||
import java.util.HashMap;
|
||||
import java.util.Iterator;
|
||||
|
||||
import de.anomic.data.Blacklist;
|
||||
import de.anomic.http.httpRequestHeader;
|
||||
import de.anomic.kelondro.index.Row;
|
||||
import de.anomic.kelondro.index.Row.Entry;
|
||||
import de.anomic.kelondro.order.NaturalOrder;
|
||||
import de.anomic.kelondro.text.Blacklist;
|
||||
import de.anomic.kelondro.util.DateFormatter;
|
||||
import de.anomic.kelondro.util.ScoreCluster;
|
||||
import de.anomic.plasma.plasmaSwitchboard;
|
||||
|
|
|
@ -31,11 +31,11 @@ import java.util.Date;
|
|||
import java.util.HashMap;
|
||||
import java.util.Iterator;
|
||||
|
||||
import de.anomic.data.Blacklist;
|
||||
import de.anomic.http.httpRequestHeader;
|
||||
import de.anomic.kelondro.index.Row;
|
||||
import de.anomic.kelondro.index.Row.Entry;
|
||||
import de.anomic.kelondro.order.NaturalOrder;
|
||||
import de.anomic.kelondro.text.Blacklist;
|
||||
import de.anomic.kelondro.util.DateFormatter;
|
||||
import de.anomic.kelondro.util.ScoreCluster;
|
||||
import de.anomic.plasma.plasmaSwitchboard;
|
||||
|
|
|
@ -39,16 +39,15 @@ import de.anomic.htmlFilter.htmlFilterCharacterCoding;
|
|||
import de.anomic.http.httpClient;
|
||||
import de.anomic.http.httpRequestHeader;
|
||||
import de.anomic.http.httpResponseHeader;
|
||||
import de.anomic.kelondro.text.Document;
|
||||
import de.anomic.kelondro.text.MetadataRowContainer;
|
||||
import de.anomic.kelondro.text.URLMetadata;
|
||||
import de.anomic.kelondro.text.metadataPrototype.URLMetadataRow;
|
||||
import de.anomic.kelondro.util.FileUtils;
|
||||
import de.anomic.plasma.plasmaCondenser;
|
||||
import de.anomic.plasma.plasmaHTCache;
|
||||
import de.anomic.plasma.plasmaParserDocument;
|
||||
import de.anomic.plasma.plasmaSnippetCache;
|
||||
import de.anomic.plasma.plasmaSwitchboard;
|
||||
import de.anomic.plasma.parser.Document;
|
||||
import de.anomic.plasma.parser.ParserException;
|
||||
import de.anomic.plasma.parser.Condenser;
|
||||
import de.anomic.server.serverObjects;
|
||||
import de.anomic.server.serverSwitch;
|
||||
import de.anomic.yacy.yacyURL;
|
||||
|
@ -95,7 +94,7 @@ public class ViewFile {
|
|||
final String urlHash = post.get("urlHash","");
|
||||
if (urlHash.length() > 0) {
|
||||
// getting the urlEntry that belongs to the url hash
|
||||
MetadataRowContainer urlEntry = null;
|
||||
URLMetadataRow urlEntry = null;
|
||||
urlEntry = sb.webIndex.metadata().load(urlHash, null, 0);
|
||||
if (urlEntry == null) {
|
||||
prop.put("error", "2");
|
||||
|
@ -104,7 +103,7 @@ public class ViewFile {
|
|||
}
|
||||
|
||||
// getting the url that belongs to the entry
|
||||
final URLMetadata metadata = urlEntry.metadata();
|
||||
final URLMetadataRow.Components metadata = urlEntry.metadata();
|
||||
if ((metadata == null) || (metadata.url() == null)) {
|
||||
prop.put("error", "3");
|
||||
prop.put("viewMode", VIEW_MODE_NO_TEXT);
|
||||
|
@ -114,7 +113,7 @@ public class ViewFile {
|
|||
descr = metadata.dc_title();
|
||||
urlEntry.wordCount();
|
||||
size = urlEntry.size();
|
||||
pre = urlEntry.flags().get(plasmaCondenser.flag_cat_indexof);
|
||||
pre = urlEntry.flags().get(Condenser.flag_cat_indexof);
|
||||
}
|
||||
|
||||
// alternatively, get the url simply from a url String
|
||||
|
@ -312,7 +311,7 @@ public class ViewFile {
|
|||
// Search word highlighting
|
||||
while (sentences.hasNext()) {
|
||||
sentence = sentences.next().toString();
|
||||
Enumeration<StringBuilder> tokens = plasmaCondenser.wordTokenizer(sentence, "UTF-8");
|
||||
Enumeration<StringBuilder> tokens = Condenser.wordTokenizer(sentence, "UTF-8");
|
||||
while (tokens.hasMoreElements()) {
|
||||
token = tokens.nextElement().toString();
|
||||
if (token.length() > 0) {
|
||||
|
|
|
@ -2,9 +2,9 @@
|
|||
import java.io.File;
|
||||
import java.util.List;
|
||||
|
||||
import de.anomic.data.AbstractBlacklist;
|
||||
import de.anomic.data.listManager;
|
||||
import de.anomic.http.httpRequestHeader;
|
||||
import de.anomic.kelondro.text.AbstractBlacklist;
|
||||
import de.anomic.server.serverObjects;
|
||||
import de.anomic.server.serverSwitch;
|
||||
|
||||
|
|
|
@ -31,7 +31,7 @@ import java.util.TreeSet;
|
|||
|
||||
import de.anomic.http.httpRequestHeader;
|
||||
import de.anomic.kelondro.text.ReferenceContainer;
|
||||
import de.anomic.kelondro.text.ReferenceRow;
|
||||
import de.anomic.kelondro.text.referencePrototype.WordReferenceRow;
|
||||
import de.anomic.kelondro.util.DateFormatter;
|
||||
import de.anomic.plasma.plasmaSearchQuery;
|
||||
import de.anomic.plasma.plasmaSwitchboard;
|
||||
|
@ -85,8 +85,8 @@ public final class timeline {
|
|||
localSearchContainerMaps[1].values(),
|
||||
maxdist);
|
||||
|
||||
Iterator<ReferenceRow> i = index.entries();
|
||||
ReferenceRow entry;
|
||||
Iterator<WordReferenceRow> i = index.entries();
|
||||
WordReferenceRow entry;
|
||||
int c = 0;
|
||||
Date lm;
|
||||
String lms;
|
||||
|
|
|
@ -28,8 +28,7 @@
|
|||
import java.net.MalformedURLException;
|
||||
|
||||
import de.anomic.http.httpRequestHeader;
|
||||
import de.anomic.kelondro.text.MetadataRowContainer;
|
||||
import de.anomic.kelondro.text.URLMetadata;
|
||||
import de.anomic.kelondro.text.metadataPrototype.URLMetadataRow;
|
||||
import de.anomic.plasma.plasmaSwitchboard;
|
||||
import de.anomic.server.serverObjects;
|
||||
import de.anomic.server.serverSwitch;
|
||||
|
@ -69,14 +68,14 @@ public class yacydoc {
|
|||
}
|
||||
if (urlhash == null || urlhash.length() == 0) return prop;
|
||||
|
||||
final MetadataRowContainer entry = sb.webIndex.metadata().load(urlhash, null, 0);
|
||||
final URLMetadataRow entry = sb.webIndex.metadata().load(urlhash, null, 0);
|
||||
if (entry == null) return prop;
|
||||
|
||||
final URLMetadata metadata = entry.metadata();
|
||||
final URLMetadataRow.Components metadata = entry.metadata();
|
||||
if (metadata.url() == null) {
|
||||
return prop;
|
||||
}
|
||||
final MetadataRowContainer le = ((entry.referrerHash() == null) || (entry.referrerHash().length() != yacySeedDB.commonHashLength)) ? null : sb.webIndex.metadata().load(entry.referrerHash(), null, 0);
|
||||
final URLMetadataRow le = ((entry.referrerHash() == null) || (entry.referrerHash().length() != yacySeedDB.commonHashLength)) ? null : sb.webIndex.metadata().load(entry.referrerHash(), null, 0);
|
||||
|
||||
prop.putXML("dc_title", metadata.dc_title());
|
||||
prop.putXML("dc_creator", metadata.dc_creator());
|
||||
|
|
|
@ -38,11 +38,11 @@ import java.util.HashSet;
|
|||
import java.util.List;
|
||||
|
||||
import de.anomic.crawler.HTTPLoader;
|
||||
import de.anomic.data.AbstractBlacklist;
|
||||
import de.anomic.data.listManager;
|
||||
import de.anomic.htmlFilter.htmlFilterCharacterCoding;
|
||||
import de.anomic.http.httpClient;
|
||||
import de.anomic.http.httpRequestHeader;
|
||||
import de.anomic.kelondro.text.AbstractBlacklist;
|
||||
import de.anomic.kelondro.util.FileUtils;
|
||||
import de.anomic.plasma.plasmaSwitchboard;
|
||||
import de.anomic.server.serverObjects;
|
||||
|
|
|
@ -31,8 +31,7 @@ import java.io.IOException;
|
|||
|
||||
import de.anomic.crawler.ZURL;
|
||||
import de.anomic.http.httpRequestHeader;
|
||||
import de.anomic.kelondro.text.MetadataRowContainer;
|
||||
import de.anomic.kelondro.text.URLMetadata;
|
||||
import de.anomic.kelondro.text.metadataPrototype.URLMetadataRow;
|
||||
import de.anomic.kelondro.util.Log;
|
||||
import de.anomic.plasma.plasmaSwitchboard;
|
||||
import de.anomic.server.serverObjects;
|
||||
|
@ -113,14 +112,14 @@ public final class crawlReceipt {
|
|||
}
|
||||
|
||||
// generating a new loaded URL entry
|
||||
final MetadataRowContainer entry = MetadataRowContainer.importEntry(propStr);
|
||||
final URLMetadataRow entry = URLMetadataRow.importEntry(propStr);
|
||||
if (entry == null) {
|
||||
log.logWarning("crawlReceipt: RECEIVED wrong RECEIPT (entry null) from peer " + iam + "\n\tURL properties: "+ propStr);
|
||||
prop.put("delay", "3600");
|
||||
return prop;
|
||||
}
|
||||
|
||||
final URLMetadata metadata = entry.metadata();
|
||||
final URLMetadataRow.Components metadata = entry.metadata();
|
||||
if (metadata.url() == null) {
|
||||
log.logWarning("crawlReceipt: RECEIVED wrong RECEIPT (url null) for hash " + entry.hash() + " from peer " + iam + "\n\tURL properties: "+ propStr);
|
||||
prop.put("delay", "3600");
|
||||
|
|
|
@ -32,9 +32,9 @@ import java.util.HashSet;
|
|||
import java.util.Iterator;
|
||||
import java.util.List;
|
||||
|
||||
import de.anomic.data.Blacklist;
|
||||
import de.anomic.http.httpRequestHeader;
|
||||
import de.anomic.kelondro.text.ReferenceRow;
|
||||
import de.anomic.kelondro.text.Blacklist;
|
||||
import de.anomic.kelondro.text.referencePrototype.WordReferenceRow;
|
||||
import de.anomic.kelondro.util.FileUtils;
|
||||
import de.anomic.kelondro.util.Log;
|
||||
import de.anomic.plasma.plasmaSwitchboard;
|
||||
|
@ -127,7 +127,7 @@ public final class transferRWI {
|
|||
int p;
|
||||
String wordHash;
|
||||
String urlHash;
|
||||
ReferenceRow iEntry;
|
||||
WordReferenceRow iEntry;
|
||||
final HashSet<String> unknownURL = new HashSet<String>();
|
||||
final HashSet<String> knownURL = new HashSet<String>();
|
||||
final String[] wordhashes = new String[v.size()];
|
||||
|
@ -147,7 +147,7 @@ public final class transferRWI {
|
|||
}
|
||||
wordHash = estring.substring(0, p);
|
||||
wordhashes[received] = wordHash;
|
||||
iEntry = new ReferenceRow(estring.substring(p));
|
||||
iEntry = new WordReferenceRow(estring.substring(p));
|
||||
urlHash = iEntry.urlHash();
|
||||
|
||||
// block blacklisted entries
|
||||
|
|
|
@ -29,10 +29,9 @@
|
|||
import java.io.IOException;
|
||||
import java.text.ParseException;
|
||||
|
||||
import de.anomic.data.Blacklist;
|
||||
import de.anomic.http.httpRequestHeader;
|
||||
import de.anomic.kelondro.text.MetadataRowContainer;
|
||||
import de.anomic.kelondro.text.URLMetadata;
|
||||
import de.anomic.kelondro.text.Blacklist;
|
||||
import de.anomic.kelondro.text.metadataPrototype.URLMetadataRow;
|
||||
import de.anomic.kelondro.util.DateFormatter;
|
||||
import de.anomic.plasma.plasmaSwitchboard;
|
||||
import de.anomic.server.serverCore;
|
||||
|
@ -85,7 +84,7 @@ public final class transferURL {
|
|||
final int sizeBefore = sb.webIndex.metadata().size();
|
||||
// read the urls from the other properties and store
|
||||
String urls;
|
||||
MetadataRowContainer lEntry;
|
||||
URLMetadataRow lEntry;
|
||||
for (int i = 0; i < urlc; i++) {
|
||||
serverCore.checkInterruption();
|
||||
|
||||
|
@ -98,7 +97,7 @@ public final class transferURL {
|
|||
}
|
||||
|
||||
// parse new lurl-entry
|
||||
lEntry = MetadataRowContainer.importEntry(urls);
|
||||
lEntry = URLMetadataRow.importEntry(urls);
|
||||
if (lEntry == null) {
|
||||
yacyCore.log.logWarning("transferURL: received invalid URL (entry null) from peer " + otherPeerName + "\n\tURL Property: " + urls);
|
||||
blocked++;
|
||||
|
@ -106,7 +105,7 @@ public final class transferURL {
|
|||
}
|
||||
|
||||
// check if entry is well-formed
|
||||
final URLMetadata metadata = lEntry.metadata();
|
||||
final URLMetadataRow.Components metadata = lEntry.metadata();
|
||||
if (metadata.url() == null) {
|
||||
yacyCore.log.logWarning("transferURL: received invalid URL from peer " + otherPeerName + "\n\tURL Property: " + urls);
|
||||
blocked++;
|
||||
|
|
|
@ -30,8 +30,7 @@ import java.util.Date;
|
|||
import de.anomic.crawler.CrawlEntry;
|
||||
import de.anomic.crawler.NoticedURL;
|
||||
import de.anomic.http.httpRequestHeader;
|
||||
import de.anomic.kelondro.text.MetadataRowContainer;
|
||||
import de.anomic.kelondro.text.URLMetadata;
|
||||
import de.anomic.kelondro.text.metadataPrototype.URLMetadataRow;
|
||||
import de.anomic.kelondro.util.DateFormatter;
|
||||
import de.anomic.plasma.plasmaSwitchboard;
|
||||
import de.anomic.server.serverObjects;
|
||||
|
@ -109,8 +108,8 @@ public class urls {
|
|||
if (urlhashes.length() % 12 != 0) return prop;
|
||||
final int count = urlhashes.length() / 12;
|
||||
int c = 0;
|
||||
MetadataRowContainer entry;
|
||||
URLMetadata metadata;
|
||||
URLMetadataRow entry;
|
||||
URLMetadataRow.Components metadata;
|
||||
yacyURL referrer;
|
||||
for (int i = 0; i < count; i++) {
|
||||
entry = sb.webIndex.metadata().load(urlhashes.substring(12 * i, 12 * (i + 1)), null, 0);
|
||||
|
|
|
@ -33,13 +33,10 @@ import java.util.TreeSet;
|
|||
|
||||
import de.anomic.http.httpRequestHeader;
|
||||
import de.anomic.kelondro.order.Bitfield;
|
||||
import de.anomic.kelondro.text.MetadataRowContainer;
|
||||
import de.anomic.kelondro.text.URLMetadata;
|
||||
import de.anomic.kelondro.text.Word;
|
||||
import de.anomic.kelondro.text.metadataPrototype.URLMetadataRow;
|
||||
import de.anomic.kelondro.util.MemoryControl;
|
||||
import de.anomic.kelondro.util.SetTools;
|
||||
import de.anomic.kelondro.util.Log;
|
||||
import de.anomic.plasma.plasmaCondenser;
|
||||
import de.anomic.plasma.plasmaParserDocument;
|
||||
import de.anomic.plasma.plasmaProfiling;
|
||||
import de.anomic.plasma.plasmaSearchEvent;
|
||||
|
@ -48,6 +45,8 @@ import de.anomic.plasma.plasmaSearchRankingProfile;
|
|||
import de.anomic.plasma.plasmaSnippetCache;
|
||||
import de.anomic.plasma.plasmaSwitchboard;
|
||||
import de.anomic.plasma.plasmaSwitchboardConstants;
|
||||
import de.anomic.plasma.parser.Word;
|
||||
import de.anomic.plasma.parser.Condenser;
|
||||
import de.anomic.server.serverCore;
|
||||
import de.anomic.server.serverDomains;
|
||||
import de.anomic.server.serverObjects;
|
||||
|
@ -164,7 +163,7 @@ public class yacysearch {
|
|||
Bitfield constraint = (post != null && post.containsKey("constraint") && post.get("constraint", "").length() > 0) ? new Bitfield(4, post.get("constraint", "______")) : null;
|
||||
if (indexof) {
|
||||
constraint = new Bitfield(4);
|
||||
constraint.set(plasmaCondenser.flag_cat_indexof, true);
|
||||
constraint.set(Condenser.flag_cat_indexof, true);
|
||||
}
|
||||
|
||||
// SEARCH
|
||||
|
@ -342,9 +341,9 @@ public class yacysearch {
|
|||
return prop;
|
||||
}
|
||||
final String recommendHash = post.get("recommendref", ""); // urlhash
|
||||
final MetadataRowContainer urlentry = sb.webIndex.metadata().load(recommendHash, null, 0);
|
||||
final URLMetadataRow urlentry = sb.webIndex.metadata().load(recommendHash, null, 0);
|
||||
if (urlentry != null) {
|
||||
final URLMetadata metadata = urlentry.metadata();
|
||||
final URLMetadataRow.Components metadata = urlentry.metadata();
|
||||
plasmaParserDocument document;
|
||||
document = plasmaSnippetCache.retrieveDocument(metadata.url(), true, 5000, true, false);
|
||||
if (document != null) {
|
||||
|
|
|
@ -38,13 +38,13 @@ import java.util.concurrent.ConcurrentHashMap;
|
|||
|
||||
import de.anomic.http.httpClient;
|
||||
import de.anomic.kelondro.table.FlexWidthArray;
|
||||
import de.anomic.kelondro.text.Document;
|
||||
import de.anomic.kelondro.util.DateFormatter;
|
||||
import de.anomic.kelondro.util.FileUtils;
|
||||
import de.anomic.kelondro.util.Log;
|
||||
import de.anomic.plasma.plasmaParser;
|
||||
import de.anomic.plasma.plasmaSwitchboard;
|
||||
import de.anomic.plasma.plasmaSwitchboardConstants;
|
||||
import de.anomic.plasma.parser.Document;
|
||||
import de.anomic.server.serverProcessorJob;
|
||||
import de.anomic.xml.RSSFeed;
|
||||
import de.anomic.xml.RSSMessage;
|
||||
|
|
|
@ -31,8 +31,8 @@ package de.anomic.crawler;
|
|||
import java.net.UnknownHostException;
|
||||
import java.util.Date;
|
||||
|
||||
import de.anomic.kelondro.text.MetadataRowContainer;
|
||||
import de.anomic.kelondro.text.Blacklist;
|
||||
import de.anomic.data.Blacklist;
|
||||
import de.anomic.kelondro.text.metadataPrototype.URLMetadataRow;
|
||||
import de.anomic.kelondro.util.Log;
|
||||
import de.anomic.plasma.plasmaSwitchboard;
|
||||
import de.anomic.plasma.plasmaWordIndex;
|
||||
|
@ -244,7 +244,7 @@ public final class CrawlStacker {
|
|||
// check if the url is double registered
|
||||
final String dbocc = nextQueue.urlExists(entry.url().hash());
|
||||
if (dbocc != null || wordIndex.metadata().exists(entry.url().hash())) {
|
||||
final MetadataRowContainer oldEntry = wordIndex.metadata().load(entry.url().hash(), null, 0);
|
||||
final URLMetadataRow oldEntry = wordIndex.metadata().load(entry.url().hash(), null, 0);
|
||||
final boolean recrawl = (oldEntry != null) && (profile.recrawlIfOlder() > oldEntry.loaddate().getTime());
|
||||
// do double-check
|
||||
if ((dbocc != null) && (!recrawl)) {
|
||||
|
|
|
@ -35,13 +35,13 @@ import java.util.Date;
|
|||
import de.anomic.http.httpRequestHeader;
|
||||
import de.anomic.http.httpResponseHeader;
|
||||
import de.anomic.http.httpdProxyCacheEntry;
|
||||
import de.anomic.kelondro.text.Document;
|
||||
import de.anomic.kelondro.util.DateFormatter;
|
||||
import de.anomic.kelondro.util.Log;
|
||||
import de.anomic.net.ftpc;
|
||||
import de.anomic.plasma.plasmaHTCache;
|
||||
import de.anomic.plasma.plasmaParser;
|
||||
import de.anomic.plasma.plasmaSwitchboard;
|
||||
import de.anomic.plasma.parser.Document;
|
||||
import de.anomic.yacy.yacyURL;
|
||||
|
||||
public class FTPLoader {
|
||||
|
|
|
@ -28,17 +28,17 @@ package de.anomic.crawler;
|
|||
import java.io.IOException;
|
||||
import java.util.Date;
|
||||
|
||||
import de.anomic.data.Blacklist;
|
||||
import de.anomic.http.httpClient;
|
||||
import de.anomic.http.httpResponse;
|
||||
import de.anomic.http.httpRequestHeader;
|
||||
import de.anomic.http.httpResponseHeader;
|
||||
import de.anomic.http.httpdProxyCacheEntry;
|
||||
import de.anomic.kelondro.text.Blacklist;
|
||||
import de.anomic.kelondro.text.Document;
|
||||
import de.anomic.kelondro.util.Log;
|
||||
import de.anomic.plasma.plasmaHTCache;
|
||||
import de.anomic.plasma.plasmaParser;
|
||||
import de.anomic.plasma.plasmaSwitchboard;
|
||||
import de.anomic.plasma.parser.Document;
|
||||
import de.anomic.yacy.yacyURL;
|
||||
|
||||
public final class HTTPLoader {
|
||||
|
|
|
@ -39,7 +39,7 @@ import de.anomic.kelondro.index.Row;
|
|||
import de.anomic.kelondro.order.Base64Order;
|
||||
import de.anomic.kelondro.order.NaturalOrder;
|
||||
import de.anomic.kelondro.table.Stack;
|
||||
import de.anomic.kelondro.text.MetadataRowContainer;
|
||||
import de.anomic.kelondro.text.metadataPrototype.URLMetadataRow;
|
||||
import de.anomic.kelondro.util.DateFormatter;
|
||||
import de.anomic.kelondro.util.Log;
|
||||
import de.anomic.plasma.plasmaHTCache;
|
||||
|
@ -352,7 +352,7 @@ public class IndexingStack {
|
|||
if (referrerURL == null) {
|
||||
// FIXME the equals seems to be incorrect: String.equals(boolean)
|
||||
if ((referrerHash == null) || ((initiator != null) && (referrerHash.equals(initiator.length() == 0)))) return null;
|
||||
final MetadataRowContainer entry = wordIndex.metadata().load(referrerHash, null, 0);
|
||||
final URLMetadataRow entry = wordIndex.metadata().load(referrerHash, null, 0);
|
||||
if (entry == null) referrerURL = null; else referrerURL = entry.metadata().url();
|
||||
}
|
||||
return referrerURL;
|
||||
|
|
|
@ -23,7 +23,7 @@
|
|||
|
||||
package de.anomic.crawler;
|
||||
|
||||
import de.anomic.kelondro.text.Document;
|
||||
import de.anomic.plasma.parser.Document;
|
||||
import de.anomic.server.serverSemaphore;
|
||||
import de.anomic.yacy.yacyURL;
|
||||
|
||||
|
|
|
@ -33,9 +33,9 @@ import java.util.Iterator;
|
|||
import java.util.Map;
|
||||
import java.util.concurrent.ConcurrentHashMap;
|
||||
|
||||
import de.anomic.kelondro.text.Document;
|
||||
import de.anomic.kelondro.util.Log;
|
||||
import de.anomic.plasma.plasmaSwitchboard;
|
||||
import de.anomic.plasma.parser.Document;
|
||||
import de.anomic.server.serverCore;
|
||||
import de.anomic.server.serverProcessorJob;
|
||||
|
||||
|
|
|
@ -40,7 +40,7 @@ import java.util.LinkedList;
|
|||
import java.util.List;
|
||||
|
||||
import de.anomic.kelondro.order.Bitfield;
|
||||
import de.anomic.kelondro.text.MetadataRowContainer;
|
||||
import de.anomic.kelondro.text.metadataPrototype.URLMetadataRow;
|
||||
import de.anomic.kelondro.util.ScoreCluster;
|
||||
import de.anomic.kelondro.util.Log;
|
||||
import de.anomic.yacy.yacySeedDB;
|
||||
|
@ -82,7 +82,7 @@ public final class ResultURLs {
|
|||
gcrawlResultDomains = new ScoreCluster<String>();
|
||||
}
|
||||
|
||||
public synchronized void stack(final MetadataRowContainer e, final String initiatorHash, final String executorHash, final int stackType) {
|
||||
public synchronized void stack(final URLMetadataRow e, final String initiatorHash, final String executorHash, final int stackType) {
|
||||
assert initiatorHash != null;
|
||||
assert executorHash != null;
|
||||
if (e == null) { return; }
|
||||
|
@ -305,7 +305,7 @@ public final class ResultURLs {
|
|||
final ResultURLs results = new ResultURLs();
|
||||
try {
|
||||
final yacyURL url = new yacyURL("http", "www.yacy.net", 80, "/");
|
||||
final MetadataRowContainer urlRef = new MetadataRowContainer(url, "YaCy Homepage", "", "", "", new Date(), new Date(), new Date(), "", new byte[] {}, 123, 42, '?', new Bitfield(), "de", 0, 0, 0, 0, 0, 0);
|
||||
final URLMetadataRow urlRef = new URLMetadataRow(url, "YaCy Homepage", "", "", "", new Date(), new Date(), new Date(), "", new byte[] {}, 123, 42, '?', new Bitfield(), "de", 0, 0, 0, 0, 0, 0);
|
||||
int stackNo = 1;
|
||||
System.out.println("valid test:\n=======");
|
||||
// add
|
||||
|
|
|
@ -25,7 +25,7 @@
|
|||
// along with this program; if not, write to the Free Software
|
||||
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
||||
|
||||
package de.anomic.kelondro.text;
|
||||
package de.anomic.data;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.IOException;
|
|
@ -24,7 +24,7 @@
|
|||
// along with this program; if not, write to the Free Software
|
||||
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
||||
|
||||
package de.anomic.kelondro.text;
|
||||
package de.anomic.data;
|
||||
|
||||
import java.io.File;
|
||||
import java.util.Arrays;
|
|
@ -24,7 +24,7 @@
|
|||
// along with this program; if not, write to the Free Software
|
||||
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
||||
|
||||
package de.anomic.kelondro.text;
|
||||
package de.anomic.data;
|
||||
|
||||
import java.io.File;
|
||||
import java.util.ArrayList;
|
||||
|
@ -35,6 +35,7 @@ import java.util.regex.PatternSyntaxException;
|
|||
|
||||
|
||||
|
||||
|
||||
public class DefaultBlacklist extends AbstractBlacklist implements Blacklist {
|
||||
|
||||
public DefaultBlacklist(final File rootPath) {
|
|
@ -45,7 +45,7 @@ import de.anomic.http.httpClient;
|
|||
import de.anomic.http.httpResponse;
|
||||
import de.anomic.http.httpRequestHeader;
|
||||
import de.anomic.http.httpdByteCountInputStream;
|
||||
import de.anomic.kelondro.text.MetadataRowContainer;
|
||||
import de.anomic.kelondro.text.metadataPrototype.URLMetadataRow;
|
||||
import de.anomic.kelondro.util.DateFormatter;
|
||||
import de.anomic.kelondro.util.Log;
|
||||
import de.anomic.plasma.plasmaSwitchboard;
|
||||
|
@ -260,7 +260,7 @@ public class SitemapParser extends DefaultHandler {
|
|||
final String dbocc = this.sb.urlExists(nexturlhash);
|
||||
if ((dbocc != null) && (dbocc.equalsIgnoreCase("loaded"))) {
|
||||
// the url was already loaded. we need to check the date
|
||||
final MetadataRowContainer oldEntry = this.sb.webIndex.metadata().load(nexturlhash, null, 0);
|
||||
final URLMetadataRow oldEntry = this.sb.webIndex.metadata().load(nexturlhash, null, 0);
|
||||
if (oldEntry != null) {
|
||||
final Date modDate = oldEntry.moddate();
|
||||
// check if modDate is null
|
||||
|
|
|
@ -55,9 +55,9 @@ import de.anomic.kelondro.index.IntegerHandleIndex;
|
|||
import de.anomic.kelondro.order.Base64Order;
|
||||
import de.anomic.kelondro.text.IndexCollection;
|
||||
import de.anomic.kelondro.text.MetadataRepository;
|
||||
import de.anomic.kelondro.text.MetadataRowContainer;
|
||||
import de.anomic.kelondro.text.ReferenceRow;
|
||||
import de.anomic.kelondro.text.MetadataRepository.Export;
|
||||
import de.anomic.kelondro.text.metadataPrototype.URLMetadataRow;
|
||||
import de.anomic.kelondro.text.referencePrototype.WordReferenceRow;
|
||||
import de.anomic.kelondro.util.MemoryControl;
|
||||
import de.anomic.yacy.yacyURL;
|
||||
|
||||
|
@ -396,7 +396,7 @@ public class URLAnalysis {
|
|||
"collection",
|
||||
12,
|
||||
Base64Order.enhancedCoder,
|
||||
ReferenceRow.urlEntryRow);
|
||||
WordReferenceRow.urlEntryRow);
|
||||
System.out.println("COLLECTION INDEX REFERENCE COLLECTION starting dump of statistics");
|
||||
idx.dump(new File(statisticPath));
|
||||
System.out.println("COLLECTION INDEX REFERENCE COLLECTION finished dump, wrote " + idx.size() + " entries to " + statisticPath);
|
||||
|
@ -407,9 +407,9 @@ public class URLAnalysis {
|
|||
|
||||
public static int diffurlcol(String metadataPath, String statisticFile, String diffFile) throws IOException {
|
||||
System.out.println("COLLECTION INDEX DIFF URL-COL startup");
|
||||
IntegerHandleIndex idx = new IntegerHandleIndex(MetadataRowContainer.rowdef.primaryKeyLength, MetadataRowContainer.rowdef.objectOrder, new File(statisticFile), 0);
|
||||
IntegerHandleIndex idx = new IntegerHandleIndex(URLMetadataRow.rowdef.primaryKeyLength, URLMetadataRow.rowdef.objectOrder, new File(statisticFile), 0);
|
||||
MetadataRepository mr = new MetadataRepository(new File(metadataPath));
|
||||
HandleSet hs = new HandleSet(MetadataRowContainer.rowdef.primaryKeyLength, MetadataRowContainer.rowdef.objectOrder, 0, 1000000);
|
||||
HandleSet hs = new HandleSet(URLMetadataRow.rowdef.primaryKeyLength, URLMetadataRow.rowdef.objectOrder, 0, 1000000);
|
||||
System.out.println("COLLECTION INDEX DIFF URL-COL loaded dump, starting diff");
|
||||
long start = System.currentTimeMillis();
|
||||
long update = start - 7000;
|
||||
|
@ -436,7 +436,7 @@ public class URLAnalysis {
|
|||
// format: 0=text, 1=html, 2=rss/xml
|
||||
System.out.println("URL EXPORT startup");
|
||||
MetadataRepository mr = new MetadataRepository(new File(metadataPath));
|
||||
HandleSet hs = (diffFile == null) ? null : new HandleSet(MetadataRowContainer.rowdef.primaryKeyLength, MetadataRowContainer.rowdef.objectOrder, new File(diffFile), 0);
|
||||
HandleSet hs = (diffFile == null) ? null : new HandleSet(URLMetadataRow.rowdef.primaryKeyLength, URLMetadataRow.rowdef.objectOrder, new File(diffFile), 0);
|
||||
System.out.println("URL EXPORT loaded dump, starting export");
|
||||
Export e = mr.export(new File(export), ".*", hs, format, false);
|
||||
try {
|
||||
|
@ -451,7 +451,7 @@ public class URLAnalysis {
|
|||
System.out.println("URL DELETE startup");
|
||||
MetadataRepository mr = new MetadataRepository(new File(metadataPath));
|
||||
int mrSize = mr.size();
|
||||
HandleSet hs = new HandleSet(MetadataRowContainer.rowdef.primaryKeyLength, MetadataRowContainer.rowdef.objectOrder, new File(diffFile), 0);
|
||||
HandleSet hs = new HandleSet(URLMetadataRow.rowdef.primaryKeyLength, URLMetadataRow.rowdef.objectOrder, new File(diffFile), 0);
|
||||
System.out.println("URL DELETE loaded dump, starting deletion of " + hs.size() + " entries from " + mrSize);
|
||||
for (byte[] refhash: hs) {
|
||||
mr.remove(new String(refhash));
|
||||
|
|
|
@ -68,12 +68,12 @@ import de.anomic.kelondro.blob.BLOBTree;
|
|||
import de.anomic.kelondro.blob.MapView;
|
||||
import de.anomic.kelondro.order.CloneableIterator;
|
||||
import de.anomic.kelondro.order.NaturalOrder;
|
||||
import de.anomic.kelondro.text.Word;
|
||||
import de.anomic.kelondro.util.DateFormatter;
|
||||
import de.anomic.kelondro.util.kelondroException;
|
||||
import de.anomic.kelondro.util.Log;
|
||||
import de.anomic.kelondro.util.FileUtils;
|
||||
import de.anomic.plasma.plasmaSwitchboard;
|
||||
import de.anomic.plasma.parser.Word;
|
||||
import de.anomic.server.serverBusyThread;
|
||||
import de.anomic.server.serverInstantBusyThread;
|
||||
import de.anomic.yacy.yacyNewsPool;
|
||||
|
|
|
@ -42,8 +42,7 @@ import java.util.List;
|
|||
import java.util.Set;
|
||||
import java.util.Vector;
|
||||
|
||||
import de.anomic.kelondro.text.AbstractBlacklist;
|
||||
import de.anomic.kelondro.text.Blacklist.blacklistFile;
|
||||
import de.anomic.data.Blacklist.blacklistFile;
|
||||
import de.anomic.plasma.plasmaSwitchboard;
|
||||
import de.anomic.server.serverCore;
|
||||
|
||||
|
|
|
@ -29,9 +29,9 @@ package de.anomic.http;
|
|||
import java.util.Date;
|
||||
|
||||
import de.anomic.crawler.CrawlProfile;
|
||||
import de.anomic.kelondro.text.Document;
|
||||
import de.anomic.kelondro.util.DateFormatter;
|
||||
import de.anomic.plasma.plasmaHTCache;
|
||||
import de.anomic.plasma.parser.Document;
|
||||
import de.anomic.yacy.yacyURL;
|
||||
|
||||
public class httpdProxyCacheEntry implements Document {
|
||||
|
|
|
@ -72,10 +72,9 @@ import java.util.logging.Logger;
|
|||
import java.util.zip.GZIPOutputStream;
|
||||
|
||||
import de.anomic.crawler.HTTPLoader;
|
||||
import de.anomic.data.Blacklist;
|
||||
import de.anomic.htmlFilter.htmlFilterContentTransformer;
|
||||
import de.anomic.htmlFilter.htmlFilterTransformer;
|
||||
import de.anomic.kelondro.text.Blacklist;
|
||||
import de.anomic.kelondro.text.Document;
|
||||
import de.anomic.kelondro.util.DateFormatter;
|
||||
import de.anomic.kelondro.util.Log;
|
||||
import de.anomic.kelondro.util.FileUtils;
|
||||
|
@ -83,6 +82,7 @@ import de.anomic.plasma.plasmaHTCache;
|
|||
import de.anomic.plasma.plasmaParser;
|
||||
import de.anomic.plasma.plasmaSwitchboard;
|
||||
import de.anomic.plasma.plasmaSwitchboardConstants;
|
||||
import de.anomic.plasma.parser.Document;
|
||||
import de.anomic.server.serverCore;
|
||||
import de.anomic.server.serverDomains;
|
||||
import de.anomic.server.serverObjects;
|
||||
|
|
|
@ -40,13 +40,13 @@ import de.anomic.http.httpChunkedInputStream;
|
|||
import de.anomic.http.httpRequestHeader;
|
||||
import de.anomic.http.httpResponseHeader;
|
||||
import de.anomic.http.httpdProxyCacheEntry;
|
||||
import de.anomic.kelondro.text.Document;
|
||||
import de.anomic.kelondro.util.DateFormatter;
|
||||
import de.anomic.kelondro.util.Log;
|
||||
import de.anomic.kelondro.util.FileUtils;
|
||||
import de.anomic.plasma.plasmaHTCache;
|
||||
import de.anomic.plasma.plasmaParser;
|
||||
import de.anomic.plasma.plasmaSwitchboard;
|
||||
import de.anomic.plasma.parser.Document;
|
||||
import de.anomic.server.serverCore;
|
||||
import de.anomic.server.serverHandler;
|
||||
import de.anomic.server.serverCore.Session;
|
||||
|
|
|
@ -609,30 +609,30 @@ public class BLOBArray implements BLOB {
|
|||
while (true) {
|
||||
assert c1 != null;
|
||||
assert c2 != null;
|
||||
e = ordering.compare(c1.getWordHash().getBytes(), c2.getWordHash().getBytes());
|
||||
e = ordering.compare(c1.getTermHash().getBytes(), c2.getTermHash().getBytes());
|
||||
if (e < 0) {
|
||||
writer.add(c1.getWordHash().getBytes(), c1.exportCollection());
|
||||
writer.add(c1.getTermHash().getBytes(), c1.exportCollection());
|
||||
if (i1.hasNext()) {
|
||||
c1o = c1;
|
||||
c1 = i1.next();
|
||||
assert ordering.compare(c1.getWordHash().getBytes(), c1o.getWordHash().getBytes()) > 0;
|
||||
assert ordering.compare(c1.getTermHash().getBytes(), c1o.getTermHash().getBytes()) > 0;
|
||||
continue;
|
||||
}
|
||||
break;
|
||||
}
|
||||
if (e > 0) {
|
||||
writer.add(c2.getWordHash().getBytes(), c2.exportCollection());
|
||||
writer.add(c2.getTermHash().getBytes(), c2.exportCollection());
|
||||
if (i2.hasNext()) {
|
||||
c2o = c2;
|
||||
c2 = i2.next();
|
||||
assert ordering.compare(c2.getWordHash().getBytes(), c2o.getWordHash().getBytes()) > 0;
|
||||
assert ordering.compare(c2.getTermHash().getBytes(), c2o.getTermHash().getBytes()) > 0;
|
||||
continue;
|
||||
}
|
||||
break;
|
||||
}
|
||||
assert e == 0;
|
||||
// merge the entries
|
||||
writer.add(c1.getWordHash().getBytes(), (c1.merge(c2)).exportCollection());
|
||||
writer.add(c1.getTermHash().getBytes(), (c1.merge(c2)).exportCollection());
|
||||
if (i1.hasNext() && i2.hasNext()) {
|
||||
c1 = i1.next();
|
||||
c2 = i2.next();
|
||||
|
@ -647,22 +647,22 @@ public class BLOBArray implements BLOB {
|
|||
assert !(i1.hasNext() && i2.hasNext());
|
||||
while (i1.hasNext()) {
|
||||
//System.out.println("FLUSH REMAINING 1: " + c1.getWordHash());
|
||||
writer.add(c1.getWordHash().getBytes(), c1.exportCollection());
|
||||
writer.add(c1.getTermHash().getBytes(), c1.exportCollection());
|
||||
if (i1.hasNext()) {
|
||||
c1o = c1;
|
||||
c1 = i1.next();
|
||||
assert ordering.compare(c1.getWordHash().getBytes(), c1o.getWordHash().getBytes()) > 0;
|
||||
assert ordering.compare(c1.getTermHash().getBytes(), c1o.getTermHash().getBytes()) > 0;
|
||||
continue;
|
||||
}
|
||||
break;
|
||||
}
|
||||
while (i2.hasNext()) {
|
||||
//System.out.println("FLUSH REMAINING 2: " + c2.getWordHash());
|
||||
writer.add(c2.getWordHash().getBytes(), c2.exportCollection());
|
||||
writer.add(c2.getTermHash().getBytes(), c2.exportCollection());
|
||||
if (i2.hasNext()) {
|
||||
c2o = c2;
|
||||
c2 = i2.next();
|
||||
assert ordering.compare(c2.getWordHash().getBytes(), c2o.getWordHash().getBytes()) > 0;
|
||||
assert ordering.compare(c2.getTermHash().getBytes(), c2o.getTermHash().getBytes()) > 0;
|
||||
continue;
|
||||
}
|
||||
break;
|
||||
|
|
|
@ -44,7 +44,7 @@ import de.anomic.kelondro.text.IndexBuffer;
|
|||
import de.anomic.kelondro.text.IndexCollection;
|
||||
import de.anomic.kelondro.text.ReferenceContainer;
|
||||
import de.anomic.kelondro.text.ReferenceContainerOrder;
|
||||
import de.anomic.kelondro.text.ReferenceRow;
|
||||
import de.anomic.kelondro.text.referencePrototype.WordReferenceRow;
|
||||
import de.anomic.kelondro.util.FileUtils;
|
||||
import de.anomic.kelondro.util.MemoryControl;
|
||||
import de.anomic.kelondro.util.Log;
|
||||
|
@ -94,21 +94,21 @@ public final class BufferedIndexCollection extends AbstractBufferedIndex impleme
|
|||
12,
|
||||
Base64Order.enhancedCoder,
|
||||
maxCollectionPartition,
|
||||
ReferenceRow.urlEntryRow,
|
||||
WordReferenceRow.urlEntryRow,
|
||||
useCommons);
|
||||
}
|
||||
|
||||
/* methods for interface Index */
|
||||
|
||||
public void add(final ReferenceContainer entries) {
|
||||
assert (entries.row().objectsize == ReferenceRow.urlEntryRow.objectsize);
|
||||
assert (entries.row().objectsize == WordReferenceRow.urlEntryRow.objectsize);
|
||||
|
||||
// add the entry
|
||||
buffer.add(entries);
|
||||
cacheFlushControl();
|
||||
}
|
||||
|
||||
public void add(final String wordHash, final ReferenceRow entry) throws IOException {
|
||||
public void add(final String wordHash, final WordReferenceRow entry) throws IOException {
|
||||
// add the entry
|
||||
buffer.add(wordHash, entry);
|
||||
cacheFlushControl();
|
||||
|
@ -151,10 +151,10 @@ public final class BufferedIndexCollection extends AbstractBufferedIndex impleme
|
|||
for (int i = 0; i < d.size(); i++) {
|
||||
// for each element in the double-set, take that one that is the most recent one
|
||||
set = d.get(i);
|
||||
ReferenceRow e, elm = null;
|
||||
WordReferenceRow e, elm = null;
|
||||
long lm = 0;
|
||||
for (int j = 0; j < set.size(); j++) {
|
||||
e = new ReferenceRow(set.get(j, true));
|
||||
e = new WordReferenceRow(set.get(j, true));
|
||||
if ((elm == null) || (e.lastModified() > lm)) {
|
||||
elm = e;
|
||||
lm = e.lastModified();
|
||||
|
@ -164,7 +164,7 @@ public final class BufferedIndexCollection extends AbstractBufferedIndex impleme
|
|||
container.addUnique(elm.toKelondroEntry());
|
||||
}
|
||||
}
|
||||
if (container.size() < beforeDouble) System.out.println("*** DEBUG DOUBLECHECK - removed " + (beforeDouble - container.size()) + " index entries from word container " + container.getWordHash());
|
||||
if (container.size() < beforeDouble) System.out.println("*** DEBUG DOUBLECHECK - removed " + (beforeDouble - container.size()) + " index entries from word container " + container.getTermHash());
|
||||
|
||||
return container;
|
||||
}
|
||||
|
@ -172,7 +172,7 @@ public final class BufferedIndexCollection extends AbstractBufferedIndex impleme
|
|||
public ReferenceContainer delete(final String wordHash) {
|
||||
final ReferenceContainer c = new ReferenceContainer(
|
||||
wordHash,
|
||||
ReferenceRow.urlEntryRow,
|
||||
WordReferenceRow.urlEntryRow,
|
||||
buffer.count(wordHash));
|
||||
c.addAllUnique(buffer.delete(wordHash));
|
||||
c.addAllUnique(collections.delete(wordHash));
|
||||
|
|
|
@ -34,6 +34,7 @@ import java.util.TreeSet;
|
|||
|
||||
import de.anomic.kelondro.order.ByteOrder;
|
||||
import de.anomic.kelondro.order.CloneableIterator;
|
||||
import de.anomic.kelondro.text.referencePrototype.WordReferenceRow;
|
||||
|
||||
public interface Index {
|
||||
|
||||
|
@ -52,72 +53,72 @@ public interface Index {
|
|||
* if no references to the word are stored, then a new entry is added,
|
||||
* if there are already references to the word hash stored,
|
||||
* then the old and the new references are merged
|
||||
* @param wordHash
|
||||
* @param termHash
|
||||
* @param entry
|
||||
* @throws IOException
|
||||
*/
|
||||
public void add(final String wordHash, final ReferenceRow entry) throws IOException;
|
||||
public void add(final String termHash, final WordReferenceRow entry) throws IOException;
|
||||
|
||||
/**
|
||||
* check if there are references stored to the given word hash
|
||||
* @param wordHash
|
||||
* @param termHash
|
||||
* @return true if references exist, false if not
|
||||
*/
|
||||
public boolean has(String wordHash); // should only be used if in case that true is returned the getContainer is NOT called
|
||||
public boolean has(String termHash); // should only be used if in case that true is returned the getContainer is NOT called
|
||||
|
||||
/**
|
||||
* count the number of references for the given word
|
||||
* do not use this method to check the existence of a reference by comparing
|
||||
* the result with zero, use hasReferences instead.
|
||||
* @param wordHash
|
||||
* @param termHash
|
||||
* @return the number of references to the given word
|
||||
*/
|
||||
public int count(final String wordHash);
|
||||
public int count(final String termHash);
|
||||
|
||||
/**
|
||||
* get the references to a given word.
|
||||
* if referenceselection is not null, then all url references which are not
|
||||
* in referenceselection are removed from the container
|
||||
* @param wordHash
|
||||
* @param termHash
|
||||
* @param referenceselection
|
||||
* @return the references
|
||||
* @throws IOException
|
||||
*/
|
||||
public ReferenceContainer get(String wordHash, Set<String> referenceselection) throws IOException;
|
||||
public ReferenceContainer get(String termHash, Set<String> referenceselection) throws IOException;
|
||||
|
||||
/**
|
||||
* delete all references for a word
|
||||
* @param wordHash
|
||||
* @param termHash
|
||||
* @return the deleted references
|
||||
* @throws IOException
|
||||
*/
|
||||
public ReferenceContainer delete(String wordHash) throws IOException;
|
||||
public ReferenceContainer delete(String termHash) throws IOException;
|
||||
|
||||
/**
|
||||
* remove a specific reference entry
|
||||
* @param wordHash
|
||||
* @param termHash
|
||||
* @param referenceHash the key for the reference entry to be removed
|
||||
* @return
|
||||
* @throws IOException
|
||||
*/
|
||||
public boolean remove(String wordHash, String referenceHash) throws IOException;
|
||||
public boolean remove(String termHash, String referenceHash) throws IOException;
|
||||
|
||||
/**
|
||||
* remove a set of reference entries for a given word
|
||||
* @param wordHash the key for the references
|
||||
* @param termHash the key for the references
|
||||
* @param referenceHash the reference entry keys
|
||||
* @return
|
||||
* @throws IOException
|
||||
*/
|
||||
public int remove(String wordHash, Set<String> referenceHashes) throws IOException;
|
||||
public int remove(String termHash, Set<String> referenceHashes) throws IOException;
|
||||
|
||||
public int remove(final Set<String> wordHashes, final String urlHash) throws IOException;
|
||||
public int remove(final Set<String> termHashes, final String urlHash) throws IOException;
|
||||
|
||||
public void remove(final Set<String> wordHashes, final Set<String> urlHashes) throws IOException;
|
||||
public void remove(final Set<String> termHashes, final Set<String> urlHashes) throws IOException;
|
||||
|
||||
/**
|
||||
* iterate all references from the beginning of a specific word hash
|
||||
* @param startWordHash
|
||||
* @param startHash
|
||||
* @param rot if true, then rotate at the end to the beginning
|
||||
* @param ram
|
||||
* @return
|
||||
|
|
|
@ -35,6 +35,7 @@ import java.util.Set;
|
|||
import de.anomic.kelondro.index.Row;
|
||||
import de.anomic.kelondro.order.ByteOrder;
|
||||
import de.anomic.kelondro.order.CloneableIterator;
|
||||
import de.anomic.kelondro.text.referencePrototype.WordReferenceRow;
|
||||
import de.anomic.kelondro.util.MemoryControl;
|
||||
import de.anomic.kelondro.util.ScoreCluster;
|
||||
import de.anomic.kelondro.util.Log;
|
||||
|
@ -94,8 +95,8 @@ public final class IndexBuffer extends AbstractIndex implements Index, IndexRead
|
|||
} else if (dumpFile.exists()) {
|
||||
// initialize scores for cache organization
|
||||
for (final ReferenceContainer ic : (Iterable<ReferenceContainer>) heap.references(null, false)) {
|
||||
this.hashDate.setScore(ic.getWordHash(), intTime(ic.lastWrote()));
|
||||
this.hashScore.setScore(ic.getWordHash(), ic.size());
|
||||
this.hashDate.setScore(ic.getTermHash(), intTime(ic.lastWrote()));
|
||||
this.hashScore.setScore(ic.getTermHash(), ic.size());
|
||||
}
|
||||
} else {
|
||||
heap.initWriteMode();
|
||||
|
@ -197,7 +198,7 @@ public final class IndexBuffer extends AbstractIndex implements Index, IndexRead
|
|||
}
|
||||
if (hash == null) {
|
||||
final ReferenceContainer ic = heap.references(null, false).next();
|
||||
if (ic != null) hash = ic.getWordHash();
|
||||
if (ic != null) hash = ic.getTermHash();
|
||||
}
|
||||
return hash;
|
||||
|
||||
|
@ -304,11 +305,11 @@ public final class IndexBuffer extends AbstractIndex implements Index, IndexRead
|
|||
|
||||
// put new words into cache
|
||||
heap.add(container);
|
||||
hashScore.setScore(container.getWordHash(), heap.count(container.getWordHash()));
|
||||
hashDate.setScore(container.getWordHash(), intTime(System.currentTimeMillis()));
|
||||
hashScore.setScore(container.getTermHash(), heap.count(container.getTermHash()));
|
||||
hashDate.setScore(container.getTermHash(), intTime(System.currentTimeMillis()));
|
||||
}
|
||||
|
||||
public void add(final String wordHash, final ReferenceRow entry) throws IOException {
|
||||
public void add(final String wordHash, final WordReferenceRow entry) throws IOException {
|
||||
if (entry == null || heap == null) return;
|
||||
|
||||
// put new words into cache
|
||||
|
@ -335,7 +336,7 @@ public final class IndexBuffer extends AbstractIndex implements Index, IndexRead
|
|||
public synchronized long getBufferSizeBytes() {
|
||||
// calculate the real size in bytes of the index cache
|
||||
long cacheBytes = 0;
|
||||
final long entryBytes = ReferenceRow.urlEntryRow.objectsize;
|
||||
final long entryBytes = WordReferenceRow.urlEntryRow.objectsize;
|
||||
final Iterator<ReferenceContainer> it = references(null, false);
|
||||
while (it.hasNext()) cacheBytes += it.next().size() * entryBytes;
|
||||
return cacheBytes;
|
||||
|
|
|
@ -36,6 +36,7 @@ import de.anomic.kelondro.order.ByteOrder;
|
|||
import de.anomic.kelondro.order.CloneableIterator;
|
||||
import de.anomic.kelondro.order.MergeIterator;
|
||||
import de.anomic.kelondro.order.Order;
|
||||
import de.anomic.kelondro.text.referencePrototype.WordReferenceRow;
|
||||
import de.anomic.kelondro.util.MemoryControl;
|
||||
import de.anomic.server.serverProfiling;
|
||||
|
||||
|
@ -65,15 +66,15 @@ public final class IndexCell extends AbstractBufferedIndex implements BufferedIn
|
|||
|
||||
public IndexCell(
|
||||
final File cellPath,
|
||||
final ByteOrder wordOrder,
|
||||
final ByteOrder termOrder,
|
||||
final Row payloadrow,
|
||||
final int maxRamEntries,
|
||||
final long targetFileSize,
|
||||
final long maxFileSize,
|
||||
IODispatcher merger
|
||||
) throws IOException {
|
||||
this.array = new ReferenceContainerArray(cellPath, wordOrder, payloadrow, merger);
|
||||
this.ram = new ReferenceContainerCache(payloadrow, wordOrder);
|
||||
this.array = new ReferenceContainerArray(cellPath, termOrder, payloadrow, merger);
|
||||
this.ram = new ReferenceContainerCache(payloadrow, termOrder);
|
||||
this.ram.initWriteMode();
|
||||
this.maxRamEntries = maxRamEntries;
|
||||
this.merger = merger;
|
||||
|
@ -99,25 +100,25 @@ public final class IndexCell extends AbstractBufferedIndex implements BufferedIn
|
|||
cleanCache();
|
||||
}
|
||||
|
||||
public synchronized void add(String hash, ReferenceRow entry) throws IOException {
|
||||
public synchronized void add(String hash, WordReferenceRow entry) throws IOException {
|
||||
this.ram.add(hash, entry);
|
||||
serverProfiling.update("wordcache", Long.valueOf(this.ram.size()), true);
|
||||
cleanCache();
|
||||
}
|
||||
|
||||
/**
|
||||
* checks if there is any container for this wordHash, either in RAM or any BLOB
|
||||
* checks if there is any container for this termHash, either in RAM or any BLOB
|
||||
*/
|
||||
public boolean has(String wordHash) {
|
||||
if (this.ram.has(wordHash)) return true;
|
||||
return this.array.has(wordHash);
|
||||
public boolean has(String termHash) {
|
||||
if (this.ram.has(termHash)) return true;
|
||||
return this.array.has(termHash);
|
||||
}
|
||||
|
||||
public int count(String wordHash) {
|
||||
ReferenceContainer c0 = this.ram.get(wordHash, null);
|
||||
public int count(String termHash) {
|
||||
ReferenceContainer c0 = this.ram.get(termHash, null);
|
||||
ReferenceContainer c1;
|
||||
try {
|
||||
c1 = this.array.get(wordHash);
|
||||
c1 = this.array.get(termHash);
|
||||
} catch (IOException e) {
|
||||
c1 = null;
|
||||
}
|
||||
|
@ -133,9 +134,9 @@ public final class IndexCell extends AbstractBufferedIndex implements BufferedIn
|
|||
* all containers in the BLOBs and the RAM are merged and returned
|
||||
* @throws IOException
|
||||
*/
|
||||
public ReferenceContainer get(String wordHash, Set<String> urlselection) throws IOException {
|
||||
ReferenceContainer c0 = this.ram.get(wordHash, null);
|
||||
ReferenceContainer c1 = this.array.get(wordHash);
|
||||
public ReferenceContainer get(String termHash, Set<String> urlselection) throws IOException {
|
||||
ReferenceContainer c0 = this.ram.get(termHash, null);
|
||||
ReferenceContainer c1 = this.array.get(termHash);
|
||||
if (c1 == null) {
|
||||
if (c0 == null) return null;
|
||||
return c0;
|
||||
|
@ -149,14 +150,14 @@ public final class IndexCell extends AbstractBufferedIndex implements BufferedIn
|
|||
* the deleted containers are merged and returned as result of the method
|
||||
* @throws IOException
|
||||
*/
|
||||
public ReferenceContainer delete(String wordHash) throws IOException {
|
||||
ReferenceContainer c0 = this.ram.delete(wordHash);
|
||||
ReferenceContainer c1 = this.array.get(wordHash);
|
||||
public ReferenceContainer delete(String termHash) throws IOException {
|
||||
ReferenceContainer c0 = this.ram.delete(termHash);
|
||||
ReferenceContainer c1 = this.array.get(termHash);
|
||||
if (c1 == null) {
|
||||
if (c0 == null) return null;
|
||||
return c0;
|
||||
}
|
||||
this.array.delete(wordHash);
|
||||
this.array.delete(termHash);
|
||||
cleanCache();
|
||||
if (c0 == null) return c1;
|
||||
return c1.merge(c0);
|
||||
|
@ -169,13 +170,13 @@ public final class IndexCell extends AbstractBufferedIndex implements BufferedIn
|
|||
* new BLOBs. This returns the sum of all url references that have been removed
|
||||
* @throws IOException
|
||||
*/
|
||||
public int remove(String wordHash, Set<String> urlHashes) throws IOException {
|
||||
int reduced = this.array.replace(wordHash, new RemoveRewriter(urlHashes));
|
||||
public int remove(String termHash, Set<String> urlHashes) throws IOException {
|
||||
int reduced = this.array.replace(termHash, new RemoveRewriter(urlHashes));
|
||||
return reduced / this.array.rowdef().objectsize;
|
||||
}
|
||||
|
||||
public boolean remove(String wordHash, String urlHash) throws IOException {
|
||||
int reduced = this.array.replace(wordHash, new RemoveRewriter(urlHash));
|
||||
public boolean remove(String termHash, String urlHash) throws IOException {
|
||||
int reduced = this.array.replace(termHash, new RemoveRewriter(urlHash));
|
||||
return reduced > 0;
|
||||
}
|
||||
|
||||
|
@ -199,14 +200,14 @@ public final class IndexCell extends AbstractBufferedIndex implements BufferedIn
|
|||
|
||||
}
|
||||
|
||||
public CloneableIterator<ReferenceContainer> references(String startWordHash, boolean rot) {
|
||||
public CloneableIterator<ReferenceContainer> references(String starttermHash, boolean rot) {
|
||||
final Order<ReferenceContainer> containerOrder = new ReferenceContainerOrder(this.ram.rowdef().getOrdering().clone());
|
||||
containerOrder.rotate(new ReferenceContainer(startWordHash, this.ram.rowdef(), 0));
|
||||
containerOrder.rotate(new ReferenceContainer(starttermHash, this.ram.rowdef(), 0));
|
||||
return new MergeIterator<ReferenceContainer>(
|
||||
this.ram.references(startWordHash, rot),
|
||||
this.ram.references(starttermHash, rot),
|
||||
new MergeIterator<ReferenceContainer>(
|
||||
this.ram.references(startWordHash, false),
|
||||
this.array.wordContainerIterator(startWordHash, false, false),
|
||||
this.ram.references(starttermHash, false),
|
||||
this.array.wordContainerIterator(starttermHash, false, false),
|
||||
containerOrder,
|
||||
ReferenceContainer.containerMergeMethod,
|
||||
true),
|
||||
|
@ -215,15 +216,15 @@ public final class IndexCell extends AbstractBufferedIndex implements BufferedIn
|
|||
true);
|
||||
}
|
||||
|
||||
public CloneableIterator<ReferenceContainer> references(String startWordHash, boolean rot, boolean ram) {
|
||||
public CloneableIterator<ReferenceContainer> references(String startTermHash, boolean rot, boolean ram) {
|
||||
final Order<ReferenceContainer> containerOrder = new ReferenceContainerOrder(this.ram.rowdef().getOrdering().clone());
|
||||
containerOrder.rotate(new ReferenceContainer(startWordHash, this.ram.rowdef(), 0));
|
||||
containerOrder.rotate(new ReferenceContainer(startTermHash, this.ram.rowdef(), 0));
|
||||
if (ram) {
|
||||
return this.ram.references(startWordHash, rot);
|
||||
return this.ram.references(startTermHash, rot);
|
||||
}
|
||||
return new MergeIterator<ReferenceContainer>(
|
||||
this.ram.references(startWordHash, false),
|
||||
this.array.wordContainerIterator(startWordHash, false, false),
|
||||
this.ram.references(startTermHash, false),
|
||||
this.array.wordContainerIterator(startTermHash, false, false),
|
||||
containerOrder,
|
||||
ReferenceContainer.containerMergeMethod,
|
||||
true);
|
||||
|
@ -317,27 +318,22 @@ public final class IndexCell extends AbstractBufferedIndex implements BufferedIn
|
|||
return System.currentTimeMillis();
|
||||
}
|
||||
|
||||
|
||||
public int getBufferMaxReferences() {
|
||||
return this.ram.maxReferences();
|
||||
}
|
||||
|
||||
|
||||
public long getBufferMinAge() {
|
||||
return System.currentTimeMillis();
|
||||
}
|
||||
|
||||
|
||||
public int getBufferSize() {
|
||||
return this.ram.size();
|
||||
}
|
||||
|
||||
|
||||
public long getBufferSizeBytes() {
|
||||
return 10000 * this.ram.size(); // guessed; we don't know that exactly because there is no statistics here (expensive, not necessary)
|
||||
}
|
||||
|
||||
|
||||
public void setBufferMaxWordCount(int maxWords) {
|
||||
this.maxRamEntries = maxWords;
|
||||
}
|
||||
|
|
|
@ -54,6 +54,7 @@ import de.anomic.kelondro.order.RotateIterator;
|
|||
import de.anomic.kelondro.table.EcoTable;
|
||||
import de.anomic.kelondro.table.FixedWidthArray;
|
||||
import de.anomic.kelondro.table.FlexTable;
|
||||
import de.anomic.kelondro.text.referencePrototype.WordReferenceRow;
|
||||
import de.anomic.kelondro.util.FileUtils;
|
||||
import de.anomic.kelondro.util.MemoryControl;
|
||||
import de.anomic.kelondro.util.kelondroException;
|
||||
|
@ -250,10 +251,12 @@ public class IndexCollection extends AbstractIndex implements Index {
|
|||
}
|
||||
}
|
||||
|
||||
public void add(String wordhash, ReferenceRow entry) {
|
||||
public void add(String wordhash, WordReferenceRow entry) {
|
||||
if (entry == null) return;
|
||||
try {
|
||||
this.merge(new ReferenceContainer(wordhash, entry));
|
||||
ReferenceContainer container = new ReferenceContainer(wordhash, this.payloadrow, 1);
|
||||
container.add(entry);
|
||||
this.merge(container);
|
||||
} catch (final kelondroOutOfLimitsException e) {
|
||||
e.printStackTrace();
|
||||
} catch (final IOException e) {
|
||||
|
@ -704,7 +707,7 @@ public class IndexCollection extends AbstractIndex implements Index {
|
|||
|
||||
private synchronized void merge(final ReferenceContainer container) throws IOException, kelondroOutOfLimitsException {
|
||||
if ((container == null) || (container.size() == 0)) return;
|
||||
final byte[] key = container.getWordHash().getBytes();
|
||||
final byte[] key = container.getTermHash().getBytes();
|
||||
|
||||
// first find an old entry, if one exists
|
||||
Row.Entry indexrow = index.get(key);
|
||||
|
|
|
@ -41,7 +41,7 @@ import de.anomic.kelondro.text.Index;
|
|||
import de.anomic.kelondro.text.IndexCollection;
|
||||
import de.anomic.kelondro.text.ReferenceContainer;
|
||||
import de.anomic.kelondro.text.ReferenceContainerOrder;
|
||||
import de.anomic.kelondro.text.ReferenceRow;
|
||||
import de.anomic.kelondro.text.referencePrototype.WordReferenceRow;
|
||||
import de.anomic.kelondro.util.FileUtils;
|
||||
import de.anomic.kelondro.util.Log;
|
||||
|
||||
|
@ -66,7 +66,7 @@ public final class IndexCollectionMigration extends AbstractBufferedIndex implem
|
|||
this.cell = new IndexCell(
|
||||
celldir,
|
||||
wordOrdering,
|
||||
ReferenceRow.urlEntryRow,
|
||||
WordReferenceRow.urlEntryRow,
|
||||
entityCacheMaxSize,
|
||||
targetFileSize,
|
||||
maxFileSize,
|
||||
|
@ -104,7 +104,7 @@ public final class IndexCollectionMigration extends AbstractBufferedIndex implem
|
|||
12,
|
||||
Base64Order.enhancedCoder,
|
||||
BufferedIndexCollection.maxCollectionPartition,
|
||||
ReferenceRow.urlEntryRow,
|
||||
WordReferenceRow.urlEntryRow,
|
||||
false);
|
||||
if (this.collections.size() == 0) {
|
||||
// delete everything here
|
||||
|
@ -126,10 +126,10 @@ public final class IndexCollectionMigration extends AbstractBufferedIndex implem
|
|||
/* methods for interface Index */
|
||||
|
||||
public void add(final ReferenceContainer entries) throws IOException {
|
||||
assert (entries.row().objectsize == ReferenceRow.urlEntryRow.objectsize);
|
||||
assert (entries.row().objectsize == WordReferenceRow.urlEntryRow.objectsize);
|
||||
|
||||
if (this.collections != null) {
|
||||
ReferenceContainer e = this.collections.delete(entries.getWordHash());
|
||||
ReferenceContainer e = this.collections.delete(entries.getTermHash());
|
||||
if (e != null) {
|
||||
e.merge(entries);
|
||||
cell.add(e);
|
||||
|
@ -141,7 +141,7 @@ public final class IndexCollectionMigration extends AbstractBufferedIndex implem
|
|||
}
|
||||
}
|
||||
|
||||
public void add(final String wordHash, final ReferenceRow entry) throws IOException {
|
||||
public void add(final String wordHash, final WordReferenceRow entry) throws IOException {
|
||||
if (this.collections != null) {
|
||||
ReferenceContainer e = this.collections.delete(wordHash);
|
||||
if (e != null) {
|
||||
|
|
89
source/de/anomic/kelondro/text/Metadata.java
Normal file
89
source/de/anomic/kelondro/text/Metadata.java
Normal file
|
@ -0,0 +1,89 @@
|
|||
// Metadata.java
|
||||
// (C) 2009 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany
|
||||
// first published 03.04.2009 on http://yacy.net
|
||||
//
|
||||
// This is a part of YaCy, a peer-to-peer based web search engine
|
||||
//
|
||||
// $LastChangedDate: 2009-03-20 16:44:59 +0100 (Fr, 20 Mrz 2009) $
|
||||
// $LastChangedRevision: 5736 $
|
||||
// $LastChangedBy: borg-0300 $
|
||||
//
|
||||
// LICENSE
|
||||
//
|
||||
// This program is free software; you can redistribute it and/or modify
|
||||
// it under the terms of the GNU General Public License as published by
|
||||
// the Free Software Foundation; either version 2 of the License, or
|
||||
// (at your option) any later version.
|
||||
//
|
||||
// This program is distributed in the hope that it will be useful,
|
||||
// but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
// GNU General Public License for more details.
|
||||
//
|
||||
// You should have received a copy of the GNU General Public License
|
||||
// along with this program; if not, write to the Free Software
|
||||
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
||||
|
||||
package de.anomic.kelondro.text;
|
||||
|
||||
import java.util.Date;
|
||||
|
||||
import de.anomic.crawler.CrawlEntry;
|
||||
import de.anomic.kelondro.index.Row;
|
||||
import de.anomic.kelondro.order.Bitfield;
|
||||
import de.anomic.kelondro.text.Reference;
|
||||
|
||||
/**
 * Accessor contract for a single metadata record.
 *
 * The interface declares only getters plus three conversions: to a
 * {@link Row.Entry} ({@link #toRowEntry()}), to a {@link CrawlEntry}
 * ({@link #toBalancerEntry(String)}) and to a String
 * ({@link #toString()} / {@link #toString(String)}).
 *
 * NOTE(review): the meaning of the individual fields (hash, dates, link
 * counters, ...) is not visible here; presumably they mirror the columns of
 * the URL metadata row implementation - confirm against that class.
 */
public interface Metadata {
|
||||
|
||||
|
||||
// conversion of this record into a table row entry
public Row.Entry toRowEntry();
|
||||
|
||||
public String hash();
|
||||
|
||||
public long ranking();
|
||||
|
||||
// date accessors; NOTE(review): presumably modification, load and freshness dates - confirm
public Date moddate();
|
||||
|
||||
public Date loaddate();
|
||||
|
||||
public Date freshdate();
|
||||
|
||||
public String referrerHash();
|
||||
|
||||
public String md5();
|
||||
|
||||
public char doctype();
|
||||
|
||||
public String language();
|
||||
|
||||
public int size();
|
||||
|
||||
public Bitfield flags();
|
||||
|
||||
public int wordCount();
|
||||
|
||||
// integer counters; NOTE(review): names suggest link counts per target type (local, other, image, audio, video, app) - confirm
public int llocal();
|
||||
|
||||
public int lother();
|
||||
|
||||
public int limage();
|
||||
|
||||
public int laudio();
|
||||
|
||||
public int lvideo();
|
||||
|
||||
public int lapp();
|
||||
|
||||
public String snippet();
|
||||
|
||||
public Reference word();
|
||||
|
||||
// comparison against another Metadata record; the ordering criterion is defined by the implementation
public boolean isOlder(final Metadata other);
|
||||
|
||||
// string form that takes a snippet argument; the output format is defined by the implementation
public String toString(final String snippet);
|
||||
|
||||
// conversion of this record into a CrawlEntry for the given initiator hash
public CrawlEntry toBalancerEntry(final String initiatorHash);
|
||||
|
||||
public String toString();
|
||||
|
||||
}
|
|
@ -38,6 +38,7 @@ import java.util.Iterator;
|
|||
import java.util.Map;
|
||||
import java.util.TreeSet;
|
||||
|
||||
import de.anomic.data.Blacklist;
|
||||
import de.anomic.htmlFilter.htmlFilterCharacterCoding;
|
||||
import de.anomic.http.httpClient;
|
||||
import de.anomic.http.httpResponse;
|
||||
|
@ -48,6 +49,7 @@ import de.anomic.kelondro.index.Row;
|
|||
import de.anomic.kelondro.index.ObjectIndex;
|
||||
import de.anomic.kelondro.order.CloneableIterator;
|
||||
import de.anomic.kelondro.table.SplitTable;
|
||||
import de.anomic.kelondro.text.metadataPrototype.URLMetadataRow;
|
||||
import de.anomic.kelondro.util.ScoreCluster;
|
||||
import de.anomic.kelondro.util.Log;
|
||||
import de.anomic.yacy.yacyURL;
|
||||
|
@ -62,7 +64,7 @@ public final class MetadataRepository implements Iterable<byte[]> {
|
|||
|
||||
public MetadataRepository(final File path) {
|
||||
this.location = path;
|
||||
this.urlIndexFile = new Cache(new SplitTable(this.location, "urls", MetadataRowContainer.rowdef, false));
|
||||
this.urlIndexFile = new Cache(new SplitTable(this.location, "urls", URLMetadataRow.rowdef, false));
|
||||
this.exportthread = null; // will have a export thread assigned if exporter is running
|
||||
this.statsDump = null;
|
||||
|
||||
|
@ -97,7 +99,7 @@ public final class MetadataRepository implements Iterable<byte[]> {
|
|||
return 0;
|
||||
}
|
||||
|
||||
public synchronized MetadataRowContainer load(final String urlHash, final Reference searchedWord, final long ranking) {
|
||||
public synchronized URLMetadataRow load(final String urlHash, final Reference searchedWord, final long ranking) {
|
||||
// generates a plasmaLURLEntry using the url hash
|
||||
// if the url cannot be found, this returns null
|
||||
if (urlHash == null) return null;
|
||||
|
@ -105,15 +107,15 @@ public final class MetadataRepository implements Iterable<byte[]> {
|
|||
try {
|
||||
final Row.Entry entry = urlIndexFile.get(urlHash.getBytes());
|
||||
if (entry == null) return null;
|
||||
return new MetadataRowContainer(entry, searchedWord, ranking);
|
||||
return new URLMetadataRow(entry, searchedWord, ranking);
|
||||
} catch (final IOException e) {
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
||||
public synchronized void store(final MetadataRowContainer entry) throws IOException {
|
||||
public synchronized void store(final URLMetadataRow entry) throws IOException {
|
||||
// Check if there is a more recent Entry already in the DB
|
||||
MetadataRowContainer oldEntry;
|
||||
URLMetadataRow oldEntry;
|
||||
try {
|
||||
if (exists(entry.hash())) {
|
||||
oldEntry = load(entry.hash(), null, 0);
|
||||
|
@ -166,17 +168,17 @@ public final class MetadataRepository implements Iterable<byte[]> {
|
|||
return keys(true, null);
|
||||
}
|
||||
|
||||
public CloneableIterator<MetadataRowContainer> entries() throws IOException {
|
||||
public CloneableIterator<URLMetadataRow> entries() throws IOException {
|
||||
// enumerates entry elements
|
||||
return new kiter();
|
||||
}
|
||||
|
||||
public CloneableIterator<MetadataRowContainer> entries(final boolean up, final String firstHash) throws IOException {
|
||||
public CloneableIterator<URLMetadataRow> entries(final boolean up, final String firstHash) throws IOException {
|
||||
// enumerates entry elements
|
||||
return new kiter(up, firstHash);
|
||||
}
|
||||
|
||||
public class kiter implements CloneableIterator<MetadataRowContainer> {
|
||||
public class kiter implements CloneableIterator<URLMetadataRow> {
|
||||
// enumerates entry elements
|
||||
private final Iterator<Row.Entry> iter;
|
||||
private final boolean error;
|
||||
|
@ -208,12 +210,12 @@ public final class MetadataRepository implements Iterable<byte[]> {
|
|||
return this.iter.hasNext();
|
||||
}
|
||||
|
||||
public final MetadataRowContainer next() {
|
||||
public final URLMetadataRow next() {
|
||||
Row.Entry e = null;
|
||||
if (this.iter == null) { return null; }
|
||||
if (this.iter.hasNext()) { e = this.iter.next(); }
|
||||
if (e == null) { return null; }
|
||||
return new MetadataRowContainer(e, null, 0);
|
||||
return new URLMetadataRow(e, null, 0);
|
||||
}
|
||||
|
||||
public final void remove() {
|
||||
|
@ -232,7 +234,7 @@ public final class MetadataRepository implements Iterable<byte[]> {
|
|||
final Log log = new Log("URLDBCLEANUP");
|
||||
final HashSet<String> damagedURLS = new HashSet<String>();
|
||||
try {
|
||||
final Iterator<MetadataRowContainer> eiter = entries(true, null);
|
||||
final Iterator<URLMetadataRow> eiter = entries(true, null);
|
||||
int iteratorCount = 0;
|
||||
while (eiter.hasNext()) try {
|
||||
eiter.next();
|
||||
|
@ -325,7 +327,7 @@ public final class MetadataRepository implements Iterable<byte[]> {
|
|||
public void run() {
|
||||
try {
|
||||
Log.logInfo("URLDBCLEANER", "UrldbCleaner-Thread startet");
|
||||
final Iterator<MetadataRowContainer> eiter = entries(true, null);
|
||||
final Iterator<URLMetadataRow> eiter = entries(true, null);
|
||||
while (eiter.hasNext() && run) {
|
||||
synchronized (this) {
|
||||
if (this.pause) {
|
||||
|
@ -338,13 +340,13 @@ public final class MetadataRepository implements Iterable<byte[]> {
|
|||
}
|
||||
}
|
||||
}
|
||||
final MetadataRowContainer entry = eiter.next();
|
||||
final URLMetadataRow entry = eiter.next();
|
||||
if (entry == null) {
|
||||
if (Log.isFine("URLDBCLEANER")) Log.logFine("URLDBCLEANER", "entry == null");
|
||||
} else if (entry.hash() == null) {
|
||||
if (Log.isFine("URLDBCLEANER")) Log.logFine("URLDBCLEANER", ++blacklistedUrls + " blacklisted (" + ((double) blacklistedUrls / totalSearchedUrls) * 100 + "%): " + "hash == null");
|
||||
} else {
|
||||
final URLMetadata metadata = entry.metadata();
|
||||
final URLMetadataRow.Components metadata = entry.metadata();
|
||||
totalSearchedUrls++;
|
||||
if (metadata.url() == null) {
|
||||
if (Log.isFine("URLDBCLEANER")) Log.logFine("URLDBCLEANER", ++blacklistedUrls + " blacklisted (" + ((double) blacklistedUrls / totalSearchedUrls) * 100 + "%): " + entry.hash() + "URL == null");
|
||||
|
@ -468,9 +470,9 @@ public final class MetadataRepository implements Iterable<byte[]> {
|
|||
count++;
|
||||
}
|
||||
} else {
|
||||
final Iterator<MetadataRowContainer> i = entries(); // iterates indexURLEntry objects
|
||||
MetadataRowContainer entry;
|
||||
URLMetadata metadata;
|
||||
final Iterator<URLMetadataRow> i = entries(); // iterates indexURLEntry objects
|
||||
URLMetadataRow entry;
|
||||
URLMetadataRow.Components metadata;
|
||||
String url;
|
||||
while (i.hasNext()) {
|
||||
entry = i.next();
|
||||
|
@ -552,7 +554,7 @@ public final class MetadataRepository implements Iterable<byte[]> {
|
|||
HashMap<String, hashStat> map = domainSampleCollector();
|
||||
|
||||
// fetch urls from the database to determine the host in clear text
|
||||
MetadataRowContainer urlref;
|
||||
URLMetadataRow urlref;
|
||||
if (count < 0 || count > map.size()) count = map.size();
|
||||
statsDump = new ArrayList<hostStat>();
|
||||
TreeSet<String> set = new TreeSet<String>();
|
||||
|
@ -582,12 +584,12 @@ public final class MetadataRepository implements Iterable<byte[]> {
|
|||
|
||||
// fetch urls from the database to determine the host in clear text
|
||||
Iterator<String> j = s.scores(false); // iterate urlhash-examples in reverse order (biggest first)
|
||||
MetadataRowContainer urlref;
|
||||
URLMetadataRow urlref;
|
||||
String urlhash;
|
||||
count += 10; // make some more to prevent that we have to do this again after deletions too soon.
|
||||
if (count < 0 || count > s.size()) count = s.size();
|
||||
statsDump = new ArrayList<hostStat>();
|
||||
URLMetadata comps;
|
||||
URLMetadataRow.Components comps;
|
||||
yacyURL url;
|
||||
while (j.hasNext()) {
|
||||
urlhash = j.next();
|
||||
|
|
|
@ -30,16 +30,6 @@ import de.anomic.kelondro.order.Bitfield;
|
|||
|
||||
public interface Reference {
|
||||
|
||||
// appearance flags, used in RWI entry
|
||||
// some names are derived from the Dublin Core Metadata tag set
|
||||
// the flags 0..23 are identical to the category flags in plasmaCondenser
|
||||
public static final int flag_app_dc_description= 24; // word appears in anchor description text (the reference to an url), or any alternative text field of a link
|
||||
public static final int flag_app_dc_title = 25; // word appears in title or headline or any description part
|
||||
public static final int flag_app_dc_creator = 26; // word appears in author
|
||||
public static final int flag_app_dc_subject = 27; // word appears in header tags or other descriptive part
|
||||
public static final int flag_app_dc_identifier = 28; // word appears in url or document identifier
|
||||
public static final int flag_app_emphasized = 29; // word is emphasized in text (i.e. bold, italics, special size)
|
||||
|
||||
public String toPropertyForm();
|
||||
|
||||
public String urlHash();
|
||||
|
|
|
@ -37,57 +37,55 @@ import java.util.TreeMap;
|
|||
import de.anomic.kelondro.index.Row;
|
||||
import de.anomic.kelondro.index.RowSet;
|
||||
import de.anomic.kelondro.order.Base64Order;
|
||||
import de.anomic.kelondro.text.referencePrototype.WordReferenceRow;
|
||||
import de.anomic.kelondro.text.referencePrototype.WordReferenceVars;
|
||||
import de.anomic.kelondro.util.ByteBuffer;
|
||||
|
||||
/**
|
||||
* A ReferenceContainer is a set of ReferenceRows entries. Since ReferenceRow entries are special
|
||||
* Row entries, a collection of ReferenceRows can be contained in a RowSet. This class extends
|
||||
* the RowSet with methods for the handling of special ReferenceRow Row entry objects.
|
||||
* A ReferenceContainer is a set of ReferenceRows entries for a specific term.
|
||||
* Since ReferenceRow entries are special Row entries, a collection of ReferenceRows
|
||||
* can be contained in a RowSet.
|
||||
* This class extends the RowSet with methods for the handling of
|
||||
* special ReferenceRow Row entry objects.
|
||||
*/
|
||||
public class ReferenceContainer extends RowSet {
|
||||
|
||||
private String wordHash;
|
||||
private String termHash;
|
||||
|
||||
public ReferenceContainer(final String wordHash, final RowSet collection) {
|
||||
public ReferenceContainer(final String termHash, final RowSet collection) {
|
||||
super(collection);
|
||||
this.wordHash = wordHash;
|
||||
this.termHash = termHash;
|
||||
}
|
||||
|
||||
public ReferenceContainer(String wordHash, ReferenceRow entry) {
|
||||
super(ReferenceRow.urlEntryRow, 1);
|
||||
this.add(entry);
|
||||
this.wordHash = wordHash;
|
||||
}
|
||||
|
||||
public ReferenceContainer(final String wordHash, final Row rowdef, final int objectCount) {
|
||||
public ReferenceContainer(final String termHash, final Row rowdef, final int objectCount) {
|
||||
super(rowdef, objectCount);
|
||||
this.wordHash = wordHash;
|
||||
this.termHash = termHash;
|
||||
this.lastTimeWrote = 0;
|
||||
}
|
||||
|
||||
public ReferenceContainer topLevelClone() {
|
||||
final ReferenceContainer newContainer = new ReferenceContainer(this.wordHash, this.rowdef, this.size());
|
||||
final ReferenceContainer newContainer = new ReferenceContainer(this.termHash, this.rowdef, this.size());
|
||||
newContainer.addAllUnique(this);
|
||||
return newContainer;
|
||||
}
|
||||
|
||||
public static ReferenceContainer emptyContainer(final String wordHash, final int elementCount) {
|
||||
return new ReferenceContainer(wordHash, ReferenceRow.urlEntryRow, elementCount);
|
||||
return new ReferenceContainer(wordHash, WordReferenceRow.urlEntryRow, elementCount);
|
||||
}
|
||||
|
||||
public void setWordHash(final String newWordHash) {
|
||||
this.wordHash = newWordHash;
|
||||
this.termHash = newWordHash;
|
||||
}
|
||||
|
||||
public long updated() {
|
||||
return super.lastWrote();
|
||||
}
|
||||
|
||||
public String getWordHash() {
|
||||
return wordHash;
|
||||
public String getTermHash() {
|
||||
return termHash;
|
||||
}
|
||||
|
||||
public void add(final ReferenceRow entry) {
|
||||
public void add(final WordReferenceRow entry) {
|
||||
// add without double-occurrence test
|
||||
assert entry.toKelondroEntry().objectsize() == super.rowdef.objectsize;
|
||||
this.addUnique(entry.toKelondroEntry());
|
||||
|
@ -95,11 +93,11 @@ public class ReferenceContainer extends RowSet {
|
|||
|
||||
public void add(final Reference entry, final long updateTime) {
|
||||
// add without double-occurrence test
|
||||
if (entry instanceof ReferenceRow) {
|
||||
assert ((ReferenceRow) entry).toKelondroEntry().objectsize() == super.rowdef.objectsize;
|
||||
this.add((ReferenceRow) entry);
|
||||
if (entry instanceof WordReferenceRow) {
|
||||
assert ((WordReferenceRow) entry).toKelondroEntry().objectsize() == super.rowdef.objectsize;
|
||||
this.add((WordReferenceRow) entry);
|
||||
} else {
|
||||
this.add(((ReferenceVars) entry).toRowEntry());
|
||||
this.add(((WordReferenceVars) entry).toRowEntry());
|
||||
}
|
||||
this.lastTimeWrote = updateTime;
|
||||
}
|
||||
|
@ -120,24 +118,24 @@ public class ReferenceContainer extends RowSet {
|
|||
}
|
||||
|
||||
public ReferenceContainer merge(final ReferenceContainer c) {
|
||||
return new ReferenceContainer(this.wordHash, super.merge(c));
|
||||
return new ReferenceContainer(this.termHash, super.merge(c));
|
||||
}
|
||||
|
||||
public Reference put(final ReferenceRow entry) {
|
||||
public Reference put(final WordReferenceRow entry) {
|
||||
assert entry.toKelondroEntry().objectsize() == super.rowdef.objectsize;
|
||||
final Row.Entry r = super.replace(entry.toKelondroEntry());
|
||||
if (r == null) return null;
|
||||
return new ReferenceRow(r);
|
||||
return new WordReferenceRow(r);
|
||||
}
|
||||
|
||||
public boolean putRecent(final ReferenceRow entry) {
|
||||
public boolean putRecent(final WordReferenceRow entry) {
|
||||
assert entry.toKelondroEntry().objectsize() == super.rowdef.objectsize;
|
||||
// returns true if the new entry was added, false if it already existed
|
||||
final Row.Entry oldEntryRow = this.replace(entry.toKelondroEntry());
|
||||
if (oldEntryRow == null) {
|
||||
return true;
|
||||
}
|
||||
final ReferenceRow oldEntry = new ReferenceRow(oldEntryRow);
|
||||
final WordReferenceRow oldEntry = new WordReferenceRow(oldEntryRow);
|
||||
if (entry.isOlder(oldEntry)) { // A more recent Entry is already in this container
|
||||
this.replace(oldEntry.toKelondroEntry()); // put it back
|
||||
return false;
|
||||
|
@ -151,7 +149,7 @@ public class ReferenceContainer extends RowSet {
|
|||
if (c == null) return 0;
|
||||
int x = 0;
|
||||
synchronized (c) {
|
||||
final Iterator<ReferenceRow> i = c.entries();
|
||||
final Iterator<WordReferenceRow> i = c.entries();
|
||||
while (i.hasNext()) {
|
||||
try {
|
||||
if (putRecent(i.next())) x++;
|
||||
|
@ -167,7 +165,7 @@ public class ReferenceContainer extends RowSet {
|
|||
public Reference get(final String urlHash) {
|
||||
final Row.Entry entry = this.get(urlHash.getBytes());
|
||||
if (entry == null) return null;
|
||||
return new ReferenceRow(entry);
|
||||
return new WordReferenceRow(entry);
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -178,7 +176,7 @@ public class ReferenceContainer extends RowSet {
|
|||
public Reference remove(final String urlHash) {
|
||||
final Row.Entry entry = remove(urlHash.getBytes());
|
||||
if (entry == null) return null;
|
||||
return new ReferenceRow(entry);
|
||||
return new WordReferenceRow(entry);
|
||||
}
|
||||
|
||||
public int removeEntries(final Set<String> urlHashes) {
|
||||
|
@ -188,12 +186,12 @@ public class ReferenceContainer extends RowSet {
|
|||
return count;
|
||||
}
|
||||
|
||||
public Iterator<ReferenceRow> entries() {
|
||||
public Iterator<WordReferenceRow> entries() {
|
||||
// returns an iterator of indexRWIEntry objects
|
||||
return new entryIterator();
|
||||
}
|
||||
|
||||
public class entryIterator implements Iterator<ReferenceRow> {
|
||||
public class entryIterator implements Iterator<WordReferenceRow> {
|
||||
|
||||
Iterator<Row.Entry> rowEntryIterator;
|
||||
|
||||
|
@ -205,10 +203,10 @@ public class ReferenceContainer extends RowSet {
|
|||
return rowEntryIterator.hasNext();
|
||||
}
|
||||
|
||||
public ReferenceRow next() {
|
||||
public WordReferenceRow next() {
|
||||
final Row.Entry rentry = rowEntryIterator.next();
|
||||
if (rentry == null) return null;
|
||||
return new ReferenceRow(rentry);
|
||||
return new WordReferenceRow(rentry);
|
||||
}
|
||||
|
||||
public void remove() {
|
||||
|
@ -342,11 +340,11 @@ public class ReferenceContainer extends RowSet {
|
|||
final int keylength = small.rowdef.width(0);
|
||||
assert (keylength == large.rowdef.width(0));
|
||||
final ReferenceContainer conj = new ReferenceContainer(null, small.rowdef, 0); // start with empty search result
|
||||
final Iterator<ReferenceRow> se = small.entries();
|
||||
ReferenceVars ie0;
|
||||
final Iterator<WordReferenceRow> se = small.entries();
|
||||
WordReferenceVars ie0;
|
||||
Reference ie1;
|
||||
while (se.hasNext()) {
|
||||
ie0 = new ReferenceVars(se.next());
|
||||
ie0 = new WordReferenceVars(se.next());
|
||||
ie1 = large.get(ie0.urlHash());
|
||||
if ((ie0 != null) && (ie1 != null)) {
|
||||
assert (ie0.urlHash().length() == keylength) : "ie0.urlHash() = " + ie0.urlHash();
|
||||
|
@ -366,13 +364,13 @@ public class ReferenceContainer extends RowSet {
|
|||
assert (keylength == i2.rowdef.width(0));
|
||||
final ReferenceContainer conj = new ReferenceContainer(null, i1.rowdef, 0); // start with empty search result
|
||||
if (!((i1.rowdef.getOrdering().signature().equals(i2.rowdef.getOrdering().signature())))) return conj; // ordering must be equal
|
||||
final Iterator<ReferenceRow> e1 = i1.entries();
|
||||
final Iterator<ReferenceRow> e2 = i2.entries();
|
||||
final Iterator<WordReferenceRow> e1 = i1.entries();
|
||||
final Iterator<WordReferenceRow> e2 = i2.entries();
|
||||
int c;
|
||||
if ((e1.hasNext()) && (e2.hasNext())) {
|
||||
ReferenceVars ie1;
|
||||
WordReferenceVars ie1;
|
||||
Reference ie2;
|
||||
ie1 = new ReferenceVars(e1.next());
|
||||
ie1 = new WordReferenceVars(e1.next());
|
||||
ie2 = e2.next();
|
||||
|
||||
while (true) {
|
||||
|
@ -381,14 +379,14 @@ public class ReferenceContainer extends RowSet {
|
|||
c = i1.rowdef.getOrdering().compare(ie1.urlHash().getBytes(), ie2.urlHash().getBytes());
|
||||
//System.out.println("** '" + ie1.getUrlHash() + "'.compareTo('" + ie2.getUrlHash() + "')="+c);
|
||||
if (c < 0) {
|
||||
if (e1.hasNext()) ie1 = new ReferenceVars(e1.next()); else break;
|
||||
if (e1.hasNext()) ie1 = new WordReferenceVars(e1.next()); else break;
|
||||
} else if (c > 0) {
|
||||
if (e2.hasNext()) ie2 = e2.next(); else break;
|
||||
} else {
|
||||
// we have found the same urls in different searches!
|
||||
ie1.join(ie2);
|
||||
if (ie1.worddistance() <= maxDistance) conj.add(ie1.toRowEntry());
|
||||
if (e1.hasNext()) ie1 = new ReferenceVars(e1.next()); else break;
|
||||
if (e1.hasNext()) ie1 = new WordReferenceVars(e1.next()); else break;
|
||||
if (e2.hasNext()) ie2 = e2.next(); else break;
|
||||
}
|
||||
}
|
||||
|
@ -420,7 +418,7 @@ public class ReferenceContainer extends RowSet {
|
|||
final int keylength = pivot.rowdef.width(0);
|
||||
assert (keylength == excl.rowdef.width(0));
|
||||
final boolean iterate_pivot = pivot.size() < excl.size();
|
||||
final Iterator<ReferenceRow> se = (iterate_pivot) ? pivot.entries() : excl.entries();
|
||||
final Iterator<WordReferenceRow> se = (iterate_pivot) ? pivot.entries() : excl.entries();
|
||||
Reference ie0, ie1;
|
||||
while (se.hasNext()) {
|
||||
ie0 = se.next();
|
||||
|
@ -439,13 +437,13 @@ public class ReferenceContainer extends RowSet {
|
|||
final int keylength = pivot.rowdef.width(0);
|
||||
assert (keylength == excl.rowdef.width(0));
|
||||
if (!((pivot.rowdef.getOrdering().signature().equals(excl.rowdef.getOrdering().signature())))) return pivot; // ordering must be equal
|
||||
final Iterator<ReferenceRow> e1 = pivot.entries();
|
||||
final Iterator<ReferenceRow> e2 = excl.entries();
|
||||
final Iterator<WordReferenceRow> e1 = pivot.entries();
|
||||
final Iterator<WordReferenceRow> e2 = excl.entries();
|
||||
int c;
|
||||
if ((e1.hasNext()) && (e2.hasNext())) {
|
||||
ReferenceVars ie1;
|
||||
WordReferenceVars ie1;
|
||||
Reference ie2;
|
||||
ie1 = new ReferenceVars(e1.next());
|
||||
ie1 = new WordReferenceVars(e1.next());
|
||||
ie2 = e2.next();
|
||||
|
||||
while (true) {
|
||||
|
@ -454,14 +452,14 @@ public class ReferenceContainer extends RowSet {
|
|||
c = pivot.rowdef.getOrdering().compare(ie1.urlHash().getBytes(), ie2.urlHash().getBytes());
|
||||
//System.out.println("** '" + ie1.getUrlHash() + "'.compareTo('" + ie2.getUrlHash() + "')="+c);
|
||||
if (c < 0) {
|
||||
if (e1.hasNext()) ie1 = new ReferenceVars(e1.next()); else break;
|
||||
if (e1.hasNext()) ie1 = new WordReferenceVars(e1.next()); else break;
|
||||
} else if (c > 0) {
|
||||
if (e2.hasNext()) ie2 = e2.next(); else break;
|
||||
} else {
|
||||
// we have found the same urls in different searches!
|
||||
ie1.join(ie2);
|
||||
e1.remove();
|
||||
if (e1.hasNext()) ie1 = new ReferenceVars(e1.next()); else break;
|
||||
if (e1.hasNext()) ie1 = new WordReferenceVars(e1.next()); else break;
|
||||
if (e2.hasNext()) ie2 = e2.next(); else break;
|
||||
}
|
||||
}
|
||||
|
@ -470,11 +468,11 @@ public class ReferenceContainer extends RowSet {
|
|||
}
|
||||
|
||||
public String toString() {
|
||||
return "C[" + wordHash + "] has " + this.size() + " entries";
|
||||
return "C[" + termHash + "] has " + this.size() + " entries";
|
||||
}
|
||||
|
||||
public int hashCode() {
|
||||
return (int) Base64Order.enhancedCoder.decodeLong(this.wordHash.substring(0, 4));
|
||||
return (int) Base64Order.enhancedCoder.decodeLong(this.termHash.substring(0, 4));
|
||||
}
|
||||
|
||||
|
||||
|
@ -483,7 +481,7 @@ public class ReferenceContainer extends RowSet {
|
|||
final long timeout = (maxtime < 0) ? Long.MAX_VALUE : System.currentTimeMillis() + maxtime;
|
||||
final TreeMap<String, String> doms = new TreeMap<String, String>();
|
||||
synchronized (inputContainer) {
|
||||
final Iterator<ReferenceRow> i = inputContainer.entries();
|
||||
final Iterator<WordReferenceRow> i = inputContainer.entries();
|
||||
Reference iEntry;
|
||||
String dom, paths;
|
||||
while (i.hasNext()) {
|
||||
|
|
|
@ -56,7 +56,7 @@ public final class ReferenceContainerArray {
|
|||
*/
|
||||
public ReferenceContainerArray(
|
||||
final File heapLocation,
|
||||
final ByteOrder wordOrder,
|
||||
final ByteOrder termOrder,
|
||||
final Row payloadrow,
|
||||
IODispatcher merger) throws IOException {
|
||||
this.payloadrow = payloadrow;
|
||||
|
@ -64,7 +64,7 @@ public final class ReferenceContainerArray {
|
|||
heapLocation,
|
||||
"index",
|
||||
payloadrow.primaryKeyLength,
|
||||
wordOrder,
|
||||
termOrder,
|
||||
0);
|
||||
assert merger != null;
|
||||
this.merger = merger;
|
||||
|
@ -182,8 +182,8 @@ public final class ReferenceContainerArray {
|
|||
* @return true, if the key is used in the heap; false otherwise
|
||||
* @throws IOException
|
||||
*/
|
||||
public synchronized boolean has(final String key) {
|
||||
return this.array.has(key.getBytes());
|
||||
public synchronized boolean has(final String termHash) {
|
||||
return this.array.has(termHash.getBytes());
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -192,13 +192,13 @@ public final class ReferenceContainerArray {
|
|||
* @return the indexContainer if one exists, null otherwise
|
||||
* @throws IOException
|
||||
*/
|
||||
public synchronized ReferenceContainer get(final String key) throws IOException {
|
||||
List<byte[]> entries = this.array.getAll(key.getBytes());
|
||||
public synchronized ReferenceContainer get(final String termHash) throws IOException {
|
||||
List<byte[]> entries = this.array.getAll(termHash.getBytes());
|
||||
if (entries == null || entries.size() == 0) return null;
|
||||
byte[] a = entries.remove(0);
|
||||
ReferenceContainer c = new ReferenceContainer(key, RowSet.importRowSet(a, payloadrow));
|
||||
ReferenceContainer c = new ReferenceContainer(termHash, RowSet.importRowSet(a, payloadrow));
|
||||
while (entries.size() > 0) {
|
||||
c = c.merge(new ReferenceContainer(key, RowSet.importRowSet(entries.remove(0), payloadrow)));
|
||||
c = c.merge(new ReferenceContainer(termHash, RowSet.importRowSet(entries.remove(0), payloadrow)));
|
||||
}
|
||||
return c;
|
||||
}
|
||||
|
@ -209,13 +209,13 @@ public final class ReferenceContainerArray {
|
|||
* @return the indexContainer if the cache contained the container, null otherwise
|
||||
* @throws IOException
|
||||
*/
|
||||
public synchronized void delete(final String wordHash) throws IOException {
|
||||
public synchronized void delete(final String termHash) throws IOException {
|
||||
// returns the index that had been deleted
|
||||
array.remove(wordHash.getBytes());
|
||||
array.remove(termHash.getBytes());
|
||||
}
|
||||
|
||||
public synchronized int replace(final String wordHash, ContainerRewriter rewriter) throws IOException {
|
||||
return array.replace(wordHash.getBytes(), new BLOBRewriter(wordHash, rewriter));
|
||||
public synchronized int replace(final String termHash, ContainerRewriter rewriter) throws IOException {
|
||||
return array.replace(termHash.getBytes(), new BLOBRewriter(termHash, rewriter));
|
||||
}
|
||||
|
||||
public class BLOBRewriter implements BLOB.Rewriter {
|
||||
|
|
|
@ -41,6 +41,7 @@ import de.anomic.kelondro.blob.HeapWriter;
|
|||
import de.anomic.kelondro.order.CloneableIterator;
|
||||
import de.anomic.kelondro.order.Base64Order;
|
||||
import de.anomic.kelondro.order.ByteOrder;
|
||||
import de.anomic.kelondro.text.referencePrototype.WordReferenceRow;
|
||||
import de.anomic.kelondro.util.FileUtils;
|
||||
import de.anomic.kelondro.util.Log;
|
||||
import de.anomic.kelondro.index.Row;
|
||||
|
@ -49,7 +50,7 @@ import de.anomic.kelondro.index.RowSet;
|
|||
public final class ReferenceContainerCache extends AbstractIndex implements Index, IndexReader, Iterable<ReferenceContainer> {
|
||||
|
||||
private final Row payloadrow;
|
||||
private final ByteOrder wordOrder;
|
||||
private final ByteOrder termOrder;
|
||||
private SortedMap<String, ReferenceContainer> cache;
|
||||
|
||||
/**
|
||||
|
@ -59,9 +60,9 @@ public final class ReferenceContainerCache extends AbstractIndex implements Inde
|
|||
* @param payloadrow
|
||||
* @param log
|
||||
*/
|
||||
public ReferenceContainerCache(final Row payloadrow, ByteOrder wordOrder) {
|
||||
public ReferenceContainerCache(final Row payloadrow, ByteOrder termOrder) {
|
||||
this.payloadrow = payloadrow;
|
||||
this.wordOrder = wordOrder;
|
||||
this.termOrder = termOrder;
|
||||
this.cache = null;
|
||||
}
|
||||
|
||||
|
@ -83,7 +84,7 @@ public final class ReferenceContainerCache extends AbstractIndex implements Inde
|
|||
* another dump reading afterwards is not possible
|
||||
*/
|
||||
public void initWriteMode() {
|
||||
this.cache = Collections.synchronizedSortedMap(new TreeMap<String, ReferenceContainer>(new ByteOrder.StringOrder(this.wordOrder)));
|
||||
this.cache = Collections.synchronizedSortedMap(new TreeMap<String, ReferenceContainer>(new ByteOrder.StringOrder(this.termOrder)));
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -94,14 +95,14 @@ public final class ReferenceContainerCache extends AbstractIndex implements Inde
|
|||
public void initWriteModeFromBLOB(final File blobFile) throws IOException {
|
||||
Log.logInfo("indexContainerRAMHeap", "restoring rwi blob dump '" + blobFile.getName() + "'");
|
||||
final long start = System.currentTimeMillis();
|
||||
this.cache = Collections.synchronizedSortedMap(new TreeMap<String, ReferenceContainer>(new ByteOrder.StringOrder(this.wordOrder)));
|
||||
this.cache = Collections.synchronizedSortedMap(new TreeMap<String, ReferenceContainer>(new ByteOrder.StringOrder(this.termOrder)));
|
||||
int urlCount = 0;
|
||||
synchronized (cache) {
|
||||
for (final ReferenceContainer container : new blobFileEntries(blobFile, this.payloadrow)) {
|
||||
// TODO: in this loop a lot of memory may be allocated. A check if the memory gets low is necessary. But what to do when the memory is low?
|
||||
if (container == null) break;
|
||||
//System.out.println("***DEBUG indexContainerHeap.initwriteModeFromBLOB*** container.size = " + container.size() + ", container.sorted = " + container.sorted());
|
||||
cache.put(container.getWordHash(), container);
|
||||
cache.put(container.getTermHash(), container);
|
||||
urlCount += container.size();
|
||||
}
|
||||
}
|
||||
|
@ -242,7 +243,7 @@ public final class ReferenceContainerCache extends AbstractIndex implements Inde
|
|||
for (ReferenceContainer container : cache.values()) {
|
||||
if (container.size() > max) {
|
||||
max = container.size();
|
||||
hash = container.getWordHash();
|
||||
hash = container.getTermHash();
|
||||
}
|
||||
}
|
||||
return hash;
|
||||
|
@ -253,7 +254,7 @@ public final class ReferenceContainerCache extends AbstractIndex implements Inde
|
|||
ArrayList<String> hashes = new ArrayList<String>();
|
||||
for (ReferenceContainer container : cache.values()) {
|
||||
if (container.size() >= bound) {
|
||||
hashes.add(container.getWordHash());
|
||||
hashes.add(container.getTermHash());
|
||||
}
|
||||
}
|
||||
return hashes;
|
||||
|
@ -281,7 +282,7 @@ public final class ReferenceContainerCache extends AbstractIndex implements Inde
|
|||
ArrayList<String> hashes = new ArrayList<String>();
|
||||
long limit = System.currentTimeMillis() - maxage;
|
||||
for (ReferenceContainer container : cache.values()) {
|
||||
if (container.lastWrote() < limit) hashes.add(container.getWordHash());
|
||||
if (container.lastWrote() < limit) hashes.add(container.getTermHash());
|
||||
}
|
||||
return hashes;
|
||||
}
|
||||
|
@ -372,9 +373,9 @@ public final class ReferenceContainerCache extends AbstractIndex implements Inde
|
|||
ReferenceContainer c = this.cache.get(key);
|
||||
if (c == null) return null;
|
||||
// because this is all in RAM, we must clone the entries (flat)
|
||||
ReferenceContainer c1 = new ReferenceContainer(c.getWordHash(), c.row(), c.size());
|
||||
Iterator<ReferenceRow> e = c.entries();
|
||||
ReferenceRow ee;
|
||||
ReferenceContainer c1 = new ReferenceContainer(c.getTermHash(), c.row(), c.size());
|
||||
Iterator<WordReferenceRow> e = c.entries();
|
||||
WordReferenceRow ee;
|
||||
while (e.hasNext()) {
|
||||
ee = e.next();
|
||||
if (urlselection.contains(ee.urlHash())) c1.add(ee);
|
||||
|
@ -441,7 +442,7 @@ public final class ReferenceContainerCache extends AbstractIndex implements Inde
|
|||
if (this.cache == null || container == null || container.size() == 0) return;
|
||||
|
||||
// put new words into cache
|
||||
final String wordHash = container.getWordHash();
|
||||
final String wordHash = container.getTermHash();
|
||||
ReferenceContainer entries = cache.get(wordHash); // null pointer exception? wordhash != null! must be cache==null
|
||||
int added = 0;
|
||||
if (entries == null) {
|
||||
|
@ -457,7 +458,7 @@ public final class ReferenceContainerCache extends AbstractIndex implements Inde
|
|||
return;
|
||||
}
|
||||
|
||||
public synchronized void add(final String wordHash, final ReferenceRow newEntry) {
|
||||
public synchronized void add(final String wordHash, final WordReferenceRow newEntry) {
|
||||
assert this.cache != null;
|
||||
ReferenceContainer container = cache.get(wordHash);
|
||||
if (container == null) container = new ReferenceContainer(wordHash, this.payloadrow, 1);
|
||||
|
@ -470,7 +471,7 @@ public final class ReferenceContainerCache extends AbstractIndex implements Inde
|
|||
}
|
||||
|
||||
public ByteOrder ordering() {
|
||||
return this.wordOrder;
|
||||
return this.termOrder;
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -38,7 +38,7 @@ public class ReferenceContainerOrder extends AbstractOrder<ReferenceContainer> i
|
|||
}
|
||||
|
||||
public boolean wellformed(final ReferenceContainer a) {
|
||||
return embeddedOrder.wellformed(a.getWordHash().getBytes());
|
||||
return embeddedOrder.wellformed(a.getTermHash().getBytes());
|
||||
}
|
||||
|
||||
public void direction(final boolean ascending) {
|
||||
|
@ -50,15 +50,15 @@ public class ReferenceContainerOrder extends AbstractOrder<ReferenceContainer> i
|
|||
}
|
||||
|
||||
public int compare(final ReferenceContainer a, final ReferenceContainer b) {
|
||||
return this.embeddedOrder.compare(a.getWordHash().getBytes(), b.getWordHash().getBytes());
|
||||
return this.embeddedOrder.compare(a.getTermHash().getBytes(), b.getTermHash().getBytes());
|
||||
}
|
||||
|
||||
public boolean equal(ReferenceContainer a, ReferenceContainer b) {
|
||||
return this.embeddedOrder.equal(a.getWordHash().getBytes(), b.getWordHash().getBytes());
|
||||
return this.embeddedOrder.equal(a.getTermHash().getBytes(), b.getTermHash().getBytes());
|
||||
}
|
||||
|
||||
public void rotate(final ReferenceContainer zero) {
|
||||
this.embeddedOrder.rotate(zero.getWordHash().getBytes());
|
||||
this.embeddedOrder.rotate(zero.getTermHash().getBytes());
|
||||
this.zero = new ReferenceContainer(new String(this.embeddedOrder.zero()), zero);
|
||||
}
|
||||
|
||||
|
@ -80,7 +80,7 @@ public class ReferenceContainerOrder extends AbstractOrder<ReferenceContainer> i
|
|||
}
|
||||
|
||||
public long cardinal(final ReferenceContainer key) {
|
||||
return this.embeddedOrder.cardinal(key.getWordHash().getBytes());
|
||||
return this.embeddedOrder.cardinal(key.getTermHash().getBytes());
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -32,15 +32,17 @@ import java.util.Iterator;
|
|||
import java.util.Map;
|
||||
|
||||
import de.anomic.kelondro.order.Bitfield;
|
||||
import de.anomic.kelondro.text.referencePrototype.WordReferenceRow;
|
||||
import de.anomic.kelondro.text.referencePrototype.WordReferenceVars;
|
||||
import de.anomic.kelondro.util.ScoreCluster;
|
||||
import de.anomic.plasma.plasmaCondenser;
|
||||
import de.anomic.plasma.plasmaSearchRankingProcess;
|
||||
import de.anomic.plasma.plasmaSearchRankingProfile;
|
||||
import de.anomic.plasma.parser.Condenser;
|
||||
import de.anomic.server.serverProcessor;
|
||||
import de.anomic.yacy.yacyURL;
|
||||
|
||||
public class ReferenceOrder {
|
||||
private ReferenceVars min, max;
|
||||
private WordReferenceVars min, max;
|
||||
private final plasmaSearchRankingProfile ranking;
|
||||
private final ScoreCluster<String> doms; // collected for "authority" heuristic
|
||||
private int maxdomcount;
|
||||
|
@ -55,10 +57,10 @@ public class ReferenceOrder {
|
|||
this.language = language;
|
||||
}
|
||||
|
||||
public ArrayList<ReferenceVars> normalizeWith(final ReferenceContainer container) {
|
||||
public ArrayList<WordReferenceVars> normalizeWith(final ReferenceContainer container) {
|
||||
// normalize ranking: find minimum and maximum of separate ranking criteria
|
||||
assert (container != null);
|
||||
ArrayList<ReferenceVars> result = null;
|
||||
ArrayList<WordReferenceVars> result = null;
|
||||
|
||||
//long s0 = System.currentTimeMillis();
|
||||
if ((serverProcessor.useCPU > 1) && (container.size() > 600)) {
|
||||
|
@ -112,7 +114,7 @@ public class ReferenceOrder {
|
|||
return (doms.getScore(urlHash.substring(6)) << 8) / (1 + this.maxdomcount);
|
||||
}
|
||||
|
||||
public long cardinal(final ReferenceVars t) {
|
||||
public long cardinal(final WordReferenceVars t) {
|
||||
//return Long.MAX_VALUE - preRanking(ranking, iEntry, this.entryMin, this.entryMax, this.searchWords);
|
||||
// the normalizedEntry must be a normalized indexEntry
|
||||
final Bitfield flags = t.flags();
|
||||
|
@ -136,17 +138,17 @@ public class ReferenceOrder {
|
|||
+ ((max.hitcount() == min.hitcount()) ? 0 : (((t.hitcount() - min.hitcount() ) << 8) / (max.hitcount() - min.hitcount()) ) << ranking.coeff_hitcount)
|
||||
+ tf
|
||||
+ ((ranking.coeff_authority > 12) ? (authority(t.urlHash()) << ranking.coeff_authority) : 0)
|
||||
+ ((flags.get(Reference.flag_app_dc_identifier)) ? 255 << ranking.coeff_appurl : 0)
|
||||
+ ((flags.get(Reference.flag_app_dc_title)) ? 255 << ranking.coeff_app_dc_title : 0)
|
||||
+ ((flags.get(Reference.flag_app_dc_creator)) ? 255 << ranking.coeff_app_dc_creator : 0)
|
||||
+ ((flags.get(Reference.flag_app_dc_subject)) ? 255 << ranking.coeff_app_dc_subject : 0)
|
||||
+ ((flags.get(Reference.flag_app_dc_description)) ? 255 << ranking.coeff_app_dc_description : 0)
|
||||
+ ((flags.get(Reference.flag_app_emphasized)) ? 255 << ranking.coeff_appemph : 0)
|
||||
+ ((flags.get(plasmaCondenser.flag_cat_indexof)) ? 255 << ranking.coeff_catindexof : 0)
|
||||
+ ((flags.get(plasmaCondenser.flag_cat_hasimage)) ? 255 << ranking.coeff_cathasimage : 0)
|
||||
+ ((flags.get(plasmaCondenser.flag_cat_hasaudio)) ? 255 << ranking.coeff_cathasaudio : 0)
|
||||
+ ((flags.get(plasmaCondenser.flag_cat_hasvideo)) ? 255 << ranking.coeff_cathasvideo : 0)
|
||||
+ ((flags.get(plasmaCondenser.flag_cat_hasapp)) ? 255 << ranking.coeff_cathasapp : 0)
|
||||
+ ((flags.get(WordReferenceRow.flag_app_dc_identifier)) ? 255 << ranking.coeff_appurl : 0)
|
||||
+ ((flags.get(WordReferenceRow.flag_app_dc_title)) ? 255 << ranking.coeff_app_dc_title : 0)
|
||||
+ ((flags.get(WordReferenceRow.flag_app_dc_creator)) ? 255 << ranking.coeff_app_dc_creator : 0)
|
||||
+ ((flags.get(WordReferenceRow.flag_app_dc_subject)) ? 255 << ranking.coeff_app_dc_subject : 0)
|
||||
+ ((flags.get(WordReferenceRow.flag_app_dc_description)) ? 255 << ranking.coeff_app_dc_description : 0)
|
||||
+ ((flags.get(WordReferenceRow.flag_app_emphasized)) ? 255 << ranking.coeff_appemph : 0)
|
||||
+ ((flags.get(Condenser.flag_cat_indexof)) ? 255 << ranking.coeff_catindexof : 0)
|
||||
+ ((flags.get(Condenser.flag_cat_hasimage)) ? 255 << ranking.coeff_cathasimage : 0)
|
||||
+ ((flags.get(Condenser.flag_cat_hasaudio)) ? 255 << ranking.coeff_cathasaudio : 0)
|
||||
+ ((flags.get(Condenser.flag_cat_hasvideo)) ? 255 << ranking.coeff_cathasvideo : 0)
|
||||
+ ((flags.get(Condenser.flag_cat_hasapp)) ? 255 << ranking.coeff_cathasapp : 0)
|
||||
+ ((patchUK(t.language).equals(this.language)) ? 255 << ranking.coeff_language : 0)
|
||||
+ ((yacyURL.probablyRootURL(t.urlHash())) ? 15 << ranking.coeff_urllength : 0);
|
||||
//if (searchWords != null) r += (yacyURL.probablyWordURL(t.urlHash(), searchWords) != null) ? 256 << ranking.coeff_appurl : 0;
|
||||
|
@ -161,13 +163,13 @@ public class ReferenceOrder {
|
|||
|
||||
public static class minmaxfinder extends Thread {
|
||||
|
||||
ReferenceVars entryMin;
|
||||
ReferenceVars entryMax;
|
||||
WordReferenceVars entryMin;
|
||||
WordReferenceVars entryMax;
|
||||
private final ReferenceContainer container;
|
||||
private final int start, end;
|
||||
private final HashMap<String, Integer> doms;
|
||||
private final Integer int1;
|
||||
ArrayList<ReferenceVars> decodedEntries;
|
||||
ArrayList<WordReferenceVars> decodedEntries;
|
||||
|
||||
public minmaxfinder(final ReferenceContainer container, final int start /*including*/, final int end /*excluding*/) {
|
||||
this.container = container;
|
||||
|
@ -175,19 +177,19 @@ public class ReferenceOrder {
|
|||
this.end = end;
|
||||
this.doms = new HashMap<String, Integer>();
|
||||
this.int1 = 1;
|
||||
this.decodedEntries = new ArrayList<ReferenceVars>();
|
||||
this.decodedEntries = new ArrayList<WordReferenceVars>();
|
||||
}
|
||||
|
||||
public void run() {
|
||||
// find min/max to obtain limits for normalization
|
||||
this.entryMin = null;
|
||||
this.entryMax = null;
|
||||
ReferenceVars iEntry;
|
||||
WordReferenceVars iEntry;
|
||||
int p = this.start;
|
||||
String dom;
|
||||
Integer count;
|
||||
while (p < this.end) {
|
||||
iEntry = new ReferenceVars(new ReferenceRow(container.get(p++, false)));
|
||||
iEntry = new WordReferenceVars(new WordReferenceRow(container.get(p++, false)));
|
||||
this.decodedEntries.add(iEntry);
|
||||
// find min/max
|
||||
if (this.entryMin == null) this.entryMin = iEntry.clone(); else this.entryMin.min(iEntry);
|
||||
|
@ -203,7 +205,7 @@ public class ReferenceOrder {
|
|||
}
|
||||
}
|
||||
|
||||
public ArrayList<ReferenceVars> decodedContainer() {
|
||||
public ArrayList<WordReferenceVars> decodedContainer() {
|
||||
return this.decodedEntries;
|
||||
}
|
||||
|
||||
|
|
|
@ -1,61 +0,0 @@
|
|||
// URLMetadata.java
|
||||
// (C) 2009 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany
|
||||
// first published 02.03.2009 on http://yacy.net
|
||||
//
|
||||
// This is a part of YaCy, a peer-to-peer based web search engine
|
||||
//
|
||||
// $LastChangedDate$
|
||||
// $LastChangedRevision$
|
||||
// $LastChangedBy$
|
||||
//
|
||||
// LICENSE
|
||||
//
|
||||
// This program is free software; you can redistribute it and/or modify
|
||||
// it under the terms of the GNU General Public License as published by
|
||||
// the Free Software Foundation; either version 2 of the License, or
|
||||
// (at your option) any later version.
|
||||
//
|
||||
// This program is distributed in the hope that it will be useful,
|
||||
// but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
// GNU General Public License for more details.
|
||||
//
|
||||
// You should have received a copy of the GNU General Public License
|
||||
// along with this program; if not, write to the Free Software
|
||||
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
||||
|
||||
package de.anomic.kelondro.text;
|
||||
|
||||
import java.net.MalformedURLException;
|
||||
|
||||
import de.anomic.yacy.yacyURL;
|
||||
|
||||
public class URLMetadata {
|
||||
private yacyURL url;
|
||||
private final String dc_title, dc_creator, dc_subject, ETag;
|
||||
|
||||
public URLMetadata(final String url, final String urlhash, final String title, final String author, final String tags, final String ETag) {
|
||||
try {
|
||||
this.url = new yacyURL(url, urlhash);
|
||||
} catch (final MalformedURLException e) {
|
||||
this.url = null;
|
||||
}
|
||||
this.dc_title = title;
|
||||
this.dc_creator = author;
|
||||
this.dc_subject = tags;
|
||||
this.ETag = ETag;
|
||||
}
|
||||
public URLMetadata(final yacyURL url, final String descr, final String author, final String tags, final String ETag) {
|
||||
this.url = url;
|
||||
this.dc_title = descr;
|
||||
this.dc_creator = author;
|
||||
this.dc_subject = tags;
|
||||
this.ETag = ETag;
|
||||
}
|
||||
public yacyURL url() { return this.url; }
|
||||
public String dc_title() { return this.dc_title; }
|
||||
public String dc_creator() { return this.dc_creator; }
|
||||
public String dc_subject() { return this.dc_subject; }
|
||||
public String ETag() { return this.ETag; }
|
||||
|
||||
}
|
|
@ -4,9 +4,9 @@
|
|||
//
|
||||
// This is a part of YaCy, a peer-to-peer based web search engine
|
||||
//
|
||||
// $LastChangedDate$
|
||||
// $LastChangedRevision$
|
||||
// $LastChangedBy$
|
||||
// $LastChangedDate: 2009-03-20 16:44:59 +0100 (Fr, 20 Mrz 2009) $
|
||||
// $LastChangedRevision: 5736 $
|
||||
// $LastChangedBy: borg-0300 $
|
||||
//
|
||||
// LICENSE
|
||||
//
|
||||
|
@ -24,7 +24,7 @@
|
|||
// along with this program; if not, write to the Free Software
|
||||
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
||||
|
||||
package de.anomic.kelondro.text;
|
||||
package de.anomic.kelondro.text.metadataPrototype;
|
||||
|
||||
import java.io.UnsupportedEncodingException;
|
||||
import java.net.MalformedURLException;
|
||||
|
@ -39,6 +39,9 @@ import de.anomic.kelondro.order.Base64Order;
|
|||
import de.anomic.kelondro.order.Bitfield;
|
||||
import de.anomic.kelondro.order.Digest;
|
||||
import de.anomic.kelondro.order.NaturalOrder;
|
||||
import de.anomic.kelondro.text.Metadata;
|
||||
import de.anomic.kelondro.text.Reference;
|
||||
import de.anomic.kelondro.text.referencePrototype.WordReferenceRow;
|
||||
import de.anomic.kelondro.util.DateFormatter;
|
||||
import de.anomic.kelondro.util.FileUtils;
|
||||
import de.anomic.kelondro.util.kelondroException;
|
||||
|
@ -48,7 +51,7 @@ import de.anomic.server.serverCodings;
|
|||
import de.anomic.tools.crypt;
|
||||
import de.anomic.yacy.yacyURL;
|
||||
|
||||
public class MetadataRowContainer {
|
||||
public class URLMetadataRow implements Metadata {
|
||||
|
||||
// this object stores attributes for URL entries
|
||||
|
||||
|
@ -119,7 +122,7 @@ public class MetadataRowContainer {
|
|||
private Reference word; // this is only used if the url is transported via remote search requests
|
||||
private final long ranking; // during generation of a search result this value is set
|
||||
|
||||
public MetadataRowContainer(
|
||||
public URLMetadataRow(
|
||||
final yacyURL url,
|
||||
final String dc_title,
|
||||
final String dc_creator,
|
||||
|
@ -198,14 +201,14 @@ public class MetadataRowContainer {
|
|||
}
|
||||
}
|
||||
|
||||
public MetadataRowContainer(final Row.Entry entry, final Reference searchedWord, final long ranking) {
|
||||
public URLMetadataRow(final Row.Entry entry, final Reference searchedWord, final long ranking) {
|
||||
this.entry = entry;
|
||||
this.snippet = null;
|
||||
this.word = searchedWord;
|
||||
this.ranking = ranking;
|
||||
}
|
||||
|
||||
public MetadataRowContainer(final Properties prop) {
|
||||
public URLMetadataRow(final Properties prop) {
|
||||
// generates a plasmaLURLEntry using the properties from the argument
|
||||
// the property names must correspond to the one from toString
|
||||
//System.out.println("DEBUG-ENTRY: prop=" + prop.toString());
|
||||
|
@ -264,17 +267,17 @@ public class MetadataRowContainer {
|
|||
this.word = null;
|
||||
if (prop.containsKey("word")) throw new kelondroException("old database structure is not supported");
|
||||
if (prop.containsKey("wi")) {
|
||||
this.word = new ReferenceRow(Base64Order.enhancedCoder.decodeString(prop.getProperty("wi", ""), "de.anomic.index.indexURLEntry.indexURLEntry()"));
|
||||
this.word = new WordReferenceRow(Base64Order.enhancedCoder.decodeString(prop.getProperty("wi", ""), "de.anomic.index.indexURLEntry.indexURLEntry()"));
|
||||
}
|
||||
this.ranking = 0;
|
||||
}
|
||||
|
||||
public static MetadataRowContainer importEntry(final String propStr) {
|
||||
public static URLMetadataRow importEntry(final String propStr) {
|
||||
if (propStr == null || !propStr.startsWith("{") || !propStr.endsWith("}")) {
|
||||
return null;
|
||||
}
|
||||
try {
|
||||
return new MetadataRowContainer(serverCodings.s2p(propStr.substring(1, propStr.length() - 1)));
|
||||
return new URLMetadataRow(serverCodings.s2p(propStr.substring(1, propStr.length() - 1)));
|
||||
} catch (final kelondroException e) {
|
||||
// wrong format
|
||||
return null;
|
||||
|
@ -283,7 +286,7 @@ public class MetadataRowContainer {
|
|||
|
||||
private StringBuilder corePropList() {
|
||||
// generate a parseable string; this is a simple property-list
|
||||
final URLMetadata metadata = this.metadata();
|
||||
final Components metadata = this.metadata();
|
||||
final StringBuilder s = new StringBuilder(300);
|
||||
//System.out.println("author=" + comp.author());
|
||||
try {
|
||||
|
@ -341,9 +344,9 @@ public class MetadataRowContainer {
|
|||
return this.ranking;
|
||||
}
|
||||
|
||||
public URLMetadata metadata() {
|
||||
public Components metadata() {
|
||||
final ArrayList<String> cl = FileUtils.strings(this.entry.getCol("comp", null), "UTF-8");
|
||||
return new URLMetadata(
|
||||
return new Components(
|
||||
(cl.size() > 0) ? (cl.get(0)).trim() : "",
|
||||
hash(),
|
||||
(cl.size() > 1) ? (cl.get(1)).trim() : "",
|
||||
|
@ -428,7 +431,7 @@ public class MetadataRowContainer {
|
|||
return word;
|
||||
}
|
||||
|
||||
public boolean isOlder(final MetadataRowContainer other) {
|
||||
public boolean isOlder(final Metadata other) {
|
||||
if (other == null) return false;
|
||||
final Date tmoddate = moddate();
|
||||
final Date omoddate = other.moddate();
|
||||
|
@ -487,4 +490,33 @@ public class MetadataRowContainer {
|
|||
//return "{" + core + "}";
|
||||
}
|
||||
|
||||
public class Components {
|
||||
private yacyURL url;
|
||||
private final String dc_title, dc_creator, dc_subject, ETag;
|
||||
|
||||
public Components(final String url, final String urlhash, final String title, final String author, final String tags, final String ETag) {
|
||||
try {
|
||||
this.url = new yacyURL(url, urlhash);
|
||||
} catch (final MalformedURLException e) {
|
||||
this.url = null;
|
||||
}
|
||||
this.dc_title = title;
|
||||
this.dc_creator = author;
|
||||
this.dc_subject = tags;
|
||||
this.ETag = ETag;
|
||||
}
|
||||
public Components(final yacyURL url, final String descr, final String author, final String tags, final String ETag) {
|
||||
this.url = url;
|
||||
this.dc_title = descr;
|
||||
this.dc_creator = author;
|
||||
this.dc_subject = tags;
|
||||
this.ETag = ETag;
|
||||
}
|
||||
public yacyURL url() { return this.url; }
|
||||
public String dc_title() { return this.dc_title; }
|
||||
public String dc_creator() { return this.dc_creator; }
|
||||
public String dc_subject() { return this.dc_subject; }
|
||||
public String ETag() { return this.ETag; }
|
||||
|
||||
}
|
||||
}
|
|
@ -4,9 +4,9 @@
|
|||
//
|
||||
// This is a part of YaCy, a peer-to-peer based web search engine
|
||||
//
|
||||
// $LastChangedDate$
|
||||
// $LastChangedRevision$
|
||||
// $LastChangedBy$
|
||||
// $LastChangedDate: 2009-03-20 16:44:59 +0100 (Fr, 20 Mrz 2009) $
|
||||
// $LastChangedRevision: 5736 $
|
||||
// $LastChangedBy: borg-0300 $
|
||||
//
|
||||
// LICENSE
|
||||
//
|
||||
|
@ -24,7 +24,7 @@
|
|||
// along with this program; if not, write to the Free Software
|
||||
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
||||
|
||||
package de.anomic.kelondro.text;
|
||||
package de.anomic.kelondro.text.referencePrototype;
|
||||
|
||||
import de.anomic.kelondro.index.Column;
|
||||
import de.anomic.kelondro.index.Row;
|
||||
|
@ -32,9 +32,10 @@ import de.anomic.kelondro.index.Row.Entry;
|
|||
import de.anomic.kelondro.order.Base64Order;
|
||||
import de.anomic.kelondro.order.Bitfield;
|
||||
import de.anomic.kelondro.order.MicroDate;
|
||||
import de.anomic.kelondro.text.Reference;
|
||||
import de.anomic.yacy.yacySeedDB;
|
||||
|
||||
public final class ReferenceRow implements Reference, Cloneable {
|
||||
public final class WordReferenceRow implements Reference, Cloneable {
|
||||
|
||||
// this object stores attributes to URL references inside RWI collections
|
||||
|
||||
|
@ -88,9 +89,19 @@ public final class ReferenceRow implements Reference, Cloneable {
|
|||
private static final int col_reserve1 = 18; // i 1 reserve1
|
||||
private static final int col_reserve2 = 19; // k 1 reserve2
|
||||
|
||||
// appearance flags, used in RWI entry
|
||||
// some names are derived from the Dublin Core Metadata tag set
|
||||
// the flags 0..23 are identical to the category flags in plasmaCondenser
|
||||
public static final int flag_app_dc_description= 24; // word appears in anchor description text (the reference to an url), or any alternative text field of a link
|
||||
public static final int flag_app_dc_title = 25; // word appears in title or headline or any description part
|
||||
public static final int flag_app_dc_creator = 26; // word appears in author
|
||||
public static final int flag_app_dc_subject = 27; // word appears in header tags or other descriptive part
|
||||
public static final int flag_app_dc_identifier = 28; // word appears in url or document identifier
|
||||
public static final int flag_app_emphasized = 29; // word is emphasized in text (i.e. bold, italics, special size)
|
||||
|
||||
private final Row.Entry entry;
|
||||
|
||||
public ReferenceRow(final String urlHash,
|
||||
public WordReferenceRow(final String urlHash,
|
||||
final int urlLength, // byte-length of complete URL
|
||||
final int urlComps, // number of path components
|
||||
final int titleLength, // length of description/length (longer are better?)
|
||||
|
@ -135,32 +146,32 @@ public final class ReferenceRow implements Reference, Cloneable {
|
|||
this.entry.setCol(col_reserve2, 0);
|
||||
}
|
||||
|
||||
public ReferenceRow(final String urlHash, final String code) {
|
||||
public WordReferenceRow(final String urlHash, final String code) {
|
||||
// the code is the external form of the row minus the leading urlHash entry
|
||||
this.entry = urlEntryRow.newEntry((urlHash + code).getBytes());
|
||||
}
|
||||
|
||||
public ReferenceRow(final String external) {
|
||||
public WordReferenceRow(final String external) {
|
||||
this.entry = urlEntryRow.newEntry(external, true);
|
||||
}
|
||||
|
||||
public ReferenceRow(final byte[] row) {
|
||||
public WordReferenceRow(final byte[] row) {
|
||||
this.entry = urlEntryRow.newEntry(row);
|
||||
}
|
||||
|
||||
public ReferenceRow(final byte[] row, final int offset, final boolean clone) {
|
||||
public WordReferenceRow(final byte[] row, final int offset, final boolean clone) {
|
||||
this.entry = urlEntryRow.newEntry(row, offset, clone);
|
||||
}
|
||||
|
||||
public ReferenceRow(final Row.Entry rentry) {
|
||||
public WordReferenceRow(final Row.Entry rentry) {
|
||||
// FIXME: see if cloning is necessary
|
||||
this.entry = rentry;
|
||||
}
|
||||
|
||||
public ReferenceRow clone() {
|
||||
public WordReferenceRow clone() {
|
||||
final byte[] b = new byte[urlEntryRow.objectsize];
|
||||
System.arraycopy(entry.bytes(), 0, b, 0, urlEntryRow.objectsize);
|
||||
return new ReferenceRow(b);
|
||||
return new WordReferenceRow(b);
|
||||
}
|
||||
|
||||
public String toPropertyForm() {
|
|
@ -4,9 +4,9 @@
|
|||
//
|
||||
// This is a part of YaCy, a peer-to-peer based web search engine
|
||||
//
|
||||
// $LastChangedDate$
|
||||
// $LastChangedRevision$
|
||||
// $LastChangedBy$
|
||||
// $LastChangedDate: 2009-03-20 16:44:59 +0100 (Fr, 20 Mrz 2009) $
|
||||
// $LastChangedRevision: 5736 $
|
||||
// $LastChangedBy: borg-0300 $
|
||||
//
|
||||
// LICENSE
|
||||
//
|
||||
|
@ -24,12 +24,13 @@
|
|||
// along with this program; if not, write to the Free Software
|
||||
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
||||
|
||||
package de.anomic.kelondro.text;
|
||||
package de.anomic.kelondro.text.referencePrototype;
|
||||
|
||||
import de.anomic.kelondro.order.Bitfield;
|
||||
import de.anomic.kelondro.order.MicroDate;
|
||||
import de.anomic.kelondro.text.Reference;
|
||||
|
||||
public class ReferenceVars implements Reference, Cloneable {
|
||||
public class WordReferenceVars implements Reference, Cloneable {
|
||||
|
||||
public Bitfield flags;
|
||||
public long freshUntil, lastModified;
|
||||
|
@ -41,7 +42,7 @@ public class ReferenceVars implements Reference, Cloneable {
|
|||
worddistance, wordsintext, wordsintitle;
|
||||
public double termFrequency;
|
||||
|
||||
public ReferenceVars(final String urlHash,
|
||||
public WordReferenceVars(final String urlHash,
|
||||
final int urlLength, // byte-length of complete URL
|
||||
final int urlComps, // number of path components
|
||||
final int titleLength, // length of description/length (longer are better?)
|
||||
|
@ -86,7 +87,7 @@ public class ReferenceVars implements Reference, Cloneable {
|
|||
this.termFrequency = termfrequency;
|
||||
}
|
||||
|
||||
public ReferenceVars(final ReferenceRow e) {
|
||||
public WordReferenceVars(final WordReferenceRow e) {
|
||||
this.flags = e.flags();
|
||||
this.freshUntil = e.freshUntil();
|
||||
this.lastModified = e.lastModified();
|
||||
|
@ -109,8 +110,8 @@ public class ReferenceVars implements Reference, Cloneable {
|
|||
this.termFrequency = e.termFrequency();
|
||||
}
|
||||
|
||||
public ReferenceVars clone() {
|
||||
final ReferenceVars c = new ReferenceVars(
|
||||
public WordReferenceVars clone() {
|
||||
final WordReferenceVars c = new WordReferenceVars(
|
||||
this.urlHash,
|
||||
this.urllength,
|
||||
this.urlcomps,
|
||||
|
@ -133,7 +134,7 @@ public class ReferenceVars implements Reference, Cloneable {
|
|||
return c;
|
||||
}
|
||||
|
||||
public void join(final ReferenceVars oe) {
|
||||
public void join(final WordReferenceVars oe) {
|
||||
// combine the distance
|
||||
this.worddistance = this.worddistance + oe.worddistance + Math.abs(this.posintext - oe.posintext);
|
||||
this.posintext = Math.min(this.posintext, oe.posintext);
|
||||
|
@ -203,8 +204,8 @@ public class ReferenceVars implements Reference, Cloneable {
|
|||
return posofphrase;
|
||||
}
|
||||
|
||||
public ReferenceRow toRowEntry() {
|
||||
return new ReferenceRow(
|
||||
public WordReferenceRow toRowEntry() {
|
||||
return new WordReferenceRow(
|
||||
urlHash,
|
||||
urllength, // byte-length of complete URL
|
||||
urlcomps, // number of path components
|
||||
|
@ -262,7 +263,7 @@ public class ReferenceVars implements Reference, Cloneable {
|
|||
return this.termFrequency;
|
||||
}
|
||||
|
||||
public final void min(final ReferenceVars other) {
|
||||
public final void min(final WordReferenceVars other) {
|
||||
int v;
|
||||
long w;
|
||||
double d;
|
||||
|
@ -284,7 +285,7 @@ public class ReferenceVars implements Reference, Cloneable {
|
|||
if (this.termFrequency > (d = other.termFrequency)) this.termFrequency = d;
|
||||
}
|
||||
|
||||
public final void max(final ReferenceVars other) {
|
||||
public final void max(final WordReferenceVars other) {
|
||||
int v;
|
||||
long w;
|
||||
double d;
|
|
@ -23,7 +23,7 @@
|
|||
// compile with javac -sourcepath source source/de/anomic/plasma/plasmaCondenser.java
|
||||
// execute with java -cp source de.anomic.plasma.plasmaCondenser
|
||||
|
||||
package de.anomic.plasma;
|
||||
package de.anomic.plasma.parser;
|
||||
|
||||
import java.io.BufferedReader;
|
||||
import java.io.ByteArrayInputStream;
|
||||
|
@ -49,14 +49,13 @@ import java.util.TreeSet;
|
|||
import de.anomic.htmlFilter.htmlFilterContentScraper;
|
||||
import de.anomic.htmlFilter.htmlFilterImageEntry;
|
||||
import de.anomic.kelondro.order.Bitfield;
|
||||
import de.anomic.kelondro.text.Phrase;
|
||||
import de.anomic.kelondro.text.Reference;
|
||||
import de.anomic.kelondro.text.Word;
|
||||
import de.anomic.kelondro.text.referencePrototype.WordReferenceRow;
|
||||
import de.anomic.kelondro.util.SetTools;
|
||||
import de.anomic.language.identification.Identificator;
|
||||
import de.anomic.plasma.plasmaParserDocument;
|
||||
import de.anomic.yacy.yacyURL;
|
||||
|
||||
public final class plasmaCondenser {
|
||||
public final class Condenser {
|
||||
|
||||
// this is the page analysis class
|
||||
|
||||
|
@ -101,7 +100,7 @@ public final class plasmaCondenser {
|
|||
public Bitfield RESULT_FLAGS = new Bitfield(4);
|
||||
Identificator languageIdentificator;
|
||||
|
||||
public plasmaCondenser(final plasmaParserDocument document, final boolean indexText, final boolean indexMedia) throws UnsupportedEncodingException {
|
||||
public Condenser(final plasmaParserDocument document, final boolean indexText, final boolean indexMedia) throws UnsupportedEncodingException {
|
||||
// if addMedia == true, then all the media links are also parsed and added to the words
|
||||
// added media words are flagged with the appropriate media flag
|
||||
this.wordminsize = 3;
|
||||
|
@ -133,13 +132,13 @@ public final class plasmaCondenser {
|
|||
// phrase 99 is taken from the media Link url and anchor description
|
||||
// phrase 100 and above are lines from the text
|
||||
|
||||
insertTextToWords(document.dc_title(), 1, Reference.flag_app_dc_title, RESULT_FLAGS, true);
|
||||
insertTextToWords(document.dc_description(), 3, Reference.flag_app_dc_description, RESULT_FLAGS, true);
|
||||
insertTextToWords(document.dc_creator(), 4, Reference.flag_app_dc_creator, RESULT_FLAGS, true);
|
||||
insertTextToWords(document.dc_title(), 1, WordReferenceRow.flag_app_dc_title, RESULT_FLAGS, true);
|
||||
insertTextToWords(document.dc_description(), 3, WordReferenceRow.flag_app_dc_description, RESULT_FLAGS, true);
|
||||
insertTextToWords(document.dc_creator(), 4, WordReferenceRow.flag_app_dc_creator, RESULT_FLAGS, true);
|
||||
// missing: tags!
|
||||
final String[] titles = document.getSectionTitles();
|
||||
for (int i = 0; i < titles.length; i++) {
|
||||
insertTextToWords(titles[i], i + 10, Reference.flag_app_emphasized, RESULT_FLAGS, true);
|
||||
insertTextToWords(titles[i], i + 10, WordReferenceRow.flag_app_emphasized, RESULT_FLAGS, true);
|
||||
}
|
||||
|
||||
// anchors: for text indexing we add only the anchor description
|
||||
|
@ -164,7 +163,7 @@ public final class plasmaCondenser {
|
|||
}
|
||||
|
||||
// add the URL components to the word list
|
||||
insertTextToWords(document.dc_source().toNormalform(false, true), 0, Reference.flag_app_dc_identifier, RESULT_FLAGS, false);
|
||||
insertTextToWords(document.dc_source().toNormalform(false, true), 0, WordReferenceRow.flag_app_dc_identifier, RESULT_FLAGS, false);
|
||||
|
||||
if (indexMedia) {
|
||||
// add anchor descriptions: here, we also add the url components
|
||||
|
@ -241,11 +240,11 @@ public final class plasmaCondenser {
|
|||
}
|
||||
}
|
||||
|
||||
public plasmaCondenser(final InputStream text, final String charset) throws UnsupportedEncodingException {
|
||||
public Condenser(final InputStream text, final String charset) throws UnsupportedEncodingException {
|
||||
this(text, charset, 3, 2);
|
||||
}
|
||||
|
||||
public plasmaCondenser(final InputStream text, final String charset, final int wordminsize, final int wordcut) throws UnsupportedEncodingException {
|
||||
public Condenser(final InputStream text, final String charset, final int wordminsize, final int wordcut) throws UnsupportedEncodingException {
|
||||
this.wordminsize = wordminsize;
|
||||
this.wordcut = wordcut;
|
||||
this.languageIdentificator = null; // we don't need that here
|
||||
|
@ -715,7 +714,7 @@ public final class plasmaCondenser {
|
|||
buffer = new ByteArrayInputStream(text.getBytes());
|
||||
}
|
||||
try {
|
||||
return new plasmaCondenser(buffer, "UTF-8", 2, 1).words();
|
||||
return new Condenser(buffer, "UTF-8", 2, 1).words();
|
||||
} catch (final UnsupportedEncodingException e) {
|
||||
return null;
|
||||
}
|
|
@ -24,7 +24,7 @@
|
|||
// along with this program; if not, write to the Free Software
|
||||
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
||||
|
||||
package de.anomic.kelondro.text;
|
||||
package de.anomic.plasma.parser;
|
||||
|
||||
import java.util.Date;
|
||||
|
|
@ -24,7 +24,7 @@
|
|||
// along with this program; if not, write to the Free Software
|
||||
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
||||
|
||||
package de.anomic.kelondro.text;
|
||||
package de.anomic.plasma.parser;
|
||||
|
||||
import java.util.HashSet;
|
||||
|
|
@ -24,7 +24,7 @@
|
|||
// along with this program; if not, write to the Free Software
|
||||
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
||||
|
||||
package de.anomic.kelondro.text;
|
||||
package de.anomic.plasma.parser;
|
||||
|
||||
import java.util.HashSet;
|
||||
import java.util.Iterator;
|
|
@ -6,10 +6,10 @@ import java.util.TreeSet;
|
|||
|
||||
import de.anomic.crawler.AbstractImporter;
|
||||
import de.anomic.crawler.Importer;
|
||||
import de.anomic.kelondro.text.MetadataRowContainer;
|
||||
import de.anomic.kelondro.text.Reference;
|
||||
import de.anomic.kelondro.text.ReferenceContainer;
|
||||
import de.anomic.kelondro.text.ReferenceRow;
|
||||
import de.anomic.kelondro.text.metadataPrototype.URLMetadataRow;
|
||||
import de.anomic.kelondro.text.referencePrototype.WordReferenceRow;
|
||||
import de.anomic.kelondro.util.DateFormatter;
|
||||
|
||||
public class plasmaDbImporter extends AbstractImporter implements Importer {
|
||||
|
@ -109,11 +109,11 @@ public class plasmaDbImporter extends AbstractImporter implements Importer {
|
|||
try {
|
||||
this.wordCounter++;
|
||||
newContainer = indexContainerIterator.next();
|
||||
this.wordHash = newContainer.getWordHash();
|
||||
this.wordHash = newContainer.getTermHash();
|
||||
|
||||
// loop throug the entities of the container and get the
|
||||
// urlhash
|
||||
final Iterator<ReferenceRow> importWordIdxEntries = newContainer.entries();
|
||||
final Iterator<WordReferenceRow> importWordIdxEntries = newContainer.entries();
|
||||
Reference importWordIdxEntry;
|
||||
while (importWordIdxEntries.hasNext()) {
|
||||
// testing if import process was aborted
|
||||
|
@ -141,7 +141,7 @@ public class plasmaDbImporter extends AbstractImporter implements Importer {
|
|||
// we need to import the url
|
||||
|
||||
// getting the url entry
|
||||
final MetadataRowContainer urlEntry = this.importWordIndex.metadata().load(urlHash, null, 0);
|
||||
final URLMetadataRow urlEntry = this.importWordIndex.metadata().load(urlHash, null, 0);
|
||||
if (urlEntry != null) {
|
||||
|
||||
/* write it into the home url db */
|
||||
|
@ -206,7 +206,7 @@ public class plasmaDbImporter extends AbstractImporter implements Importer {
|
|||
final TreeSet<ReferenceContainer> containers = this.importWordIndex.index().references(this.wordHash, false, 100, false);
|
||||
indexContainerIterator = containers.iterator();
|
||||
// Make sure we don't get the same wordhash twice, but don't skip a word
|
||||
if ((indexContainerIterator.hasNext())&&(!this.wordHash.equals((indexContainerIterator.next()).getWordHash()))) {
|
||||
if ((indexContainerIterator.hasNext())&&(!this.wordHash.equals((indexContainerIterator.next()).getTermHash()))) {
|
||||
indexContainerIterator = containers.iterator();
|
||||
}
|
||||
}
|
||||
|
|
|
@ -48,9 +48,9 @@ import de.anomic.kelondro.blob.BLOBCompressor;
|
|||
import de.anomic.kelondro.blob.BLOBHeap;
|
||||
import de.anomic.kelondro.blob.MapView;
|
||||
import de.anomic.kelondro.order.Base64Order;
|
||||
import de.anomic.kelondro.text.Document;
|
||||
import de.anomic.kelondro.util.Log;
|
||||
import de.anomic.kelondro.util.FileUtils;
|
||||
import de.anomic.plasma.parser.Document;
|
||||
import de.anomic.yacy.yacySeedDB;
|
||||
import de.anomic.yacy.yacyURL;
|
||||
|
||||
|
|
|
@ -43,6 +43,7 @@ import de.anomic.htmlFilter.htmlFilterContentScraper;
|
|||
import de.anomic.htmlFilter.htmlFilterImageEntry;
|
||||
import de.anomic.kelondro.util.FileUtils;
|
||||
import de.anomic.plasma.parser.Parser;
|
||||
import de.anomic.plasma.parser.Condenser;
|
||||
import de.anomic.server.serverCachedFileOutputStream;
|
||||
import de.anomic.yacy.yacyURL;
|
||||
|
||||
|
@ -282,7 +283,7 @@ dc_rights
|
|||
|
||||
public Iterator<StringBuilder> getSentences(final boolean pre) {
|
||||
if (this.text == null) return null;
|
||||
final plasmaCondenser.sentencesFromInputStreamEnum e = plasmaCondenser.sentencesFromInputStream(getText());
|
||||
final Condenser.sentencesFromInputStreamEnum e = Condenser.sentencesFromInputStream(getText());
|
||||
e.pre(pre);
|
||||
return e;
|
||||
}
|
||||
|
@ -439,7 +440,7 @@ dc_rights
|
|||
this.favicon = faviconURL;
|
||||
}
|
||||
|
||||
public void notifyWebStructure(final plasmaWebStructure webStructure, final plasmaCondenser condenser, final Date docDate) {
|
||||
public void notifyWebStructure(final plasmaWebStructure webStructure, final Condenser condenser, final Date docDate) {
|
||||
final Integer[] ioLinks = webStructure.generateCitationReference(this, condenser, docDate); // [outlinksSame, outlinksOther]
|
||||
this.inboundLinks = ioLinks[0].intValue();
|
||||
this.outboundLinks = ioLinks[1].intValue();
|
||||
|
|
|
@ -388,7 +388,7 @@ public class plasmaRankingCRProcess {
|
|||
CloneableIterator<Row.Entry> cr_entry;
|
||||
while (i.hasNext()) {
|
||||
keycollection = i.next();
|
||||
referee = keycollection.getWordHash();
|
||||
referee = keycollection.getTermHash();
|
||||
if (referee.length() == 6) refereeDom = referee; else refereeDom = referee.substring(6);
|
||||
cr_entry = keycollection.rows();
|
||||
|
||||
|
|
|
@ -32,12 +32,13 @@ import java.util.Date;
|
|||
import java.util.Iterator;
|
||||
import java.util.List;
|
||||
|
||||
import de.anomic.data.Blacklist;
|
||||
import de.anomic.data.listManager;
|
||||
import de.anomic.kelondro.order.Bitfield;
|
||||
import de.anomic.kelondro.text.MetadataRowContainer;
|
||||
import de.anomic.kelondro.text.Reference;
|
||||
import de.anomic.kelondro.text.Blacklist;
|
||||
import de.anomic.kelondro.text.metadataPrototype.URLMetadataRow;
|
||||
import de.anomic.kelondro.text.referencePrototype.WordReferenceRow;
|
||||
import de.anomic.kelondro.util.DateFormatter;
|
||||
import de.anomic.plasma.parser.Condenser;
|
||||
import de.anomic.server.serverObjects;
|
||||
import de.anomic.yacy.yacySeed;
|
||||
import de.anomic.yacy.yacyURL;
|
||||
|
@ -54,17 +55,17 @@ public class plasmaSearchAPI {
|
|||
if (post.get("flags","").length() == 0) return null;
|
||||
return new Bitfield(4, post.get("flags"));
|
||||
}
|
||||
if (post.get("description", "").equals("on")) b.set(Reference.flag_app_dc_description, true);
|
||||
if (post.get("title", "").equals("on")) b.set(Reference.flag_app_dc_title, true);
|
||||
if (post.get("creator", "").equals("on")) b.set(Reference.flag_app_dc_creator, true);
|
||||
if (post.get("subject", "").equals("on")) b.set(Reference.flag_app_dc_subject, true);
|
||||
if (post.get("url", "").equals("on")) b.set(Reference.flag_app_dc_identifier, true);
|
||||
if (post.get("emphasized", "").equals("on")) b.set(Reference.flag_app_emphasized, true);
|
||||
if (post.get("image", "").equals("on")) b.set(plasmaCondenser.flag_cat_hasimage, true);
|
||||
if (post.get("audio", "").equals("on")) b.set(plasmaCondenser.flag_cat_hasaudio, true);
|
||||
if (post.get("video", "").equals("on")) b.set(plasmaCondenser.flag_cat_hasvideo, true);
|
||||
if (post.get("app", "").equals("on")) b.set(plasmaCondenser.flag_cat_hasapp, true);
|
||||
if (post.get("indexof", "").equals("on")) b.set(plasmaCondenser.flag_cat_indexof, true);
|
||||
if (post.get("description", "").equals("on")) b.set(WordReferenceRow.flag_app_dc_description, true);
|
||||
if (post.get("title", "").equals("on")) b.set(WordReferenceRow.flag_app_dc_title, true);
|
||||
if (post.get("creator", "").equals("on")) b.set(WordReferenceRow.flag_app_dc_creator, true);
|
||||
if (post.get("subject", "").equals("on")) b.set(WordReferenceRow.flag_app_dc_subject, true);
|
||||
if (post.get("url", "").equals("on")) b.set(WordReferenceRow.flag_app_dc_identifier, true);
|
||||
if (post.get("emphasized", "").equals("on")) b.set(WordReferenceRow.flag_app_emphasized, true);
|
||||
if (post.get("image", "").equals("on")) b.set(Condenser.flag_cat_hasimage, true);
|
||||
if (post.get("audio", "").equals("on")) b.set(Condenser.flag_cat_hasaudio, true);
|
||||
if (post.get("video", "").equals("on")) b.set(Condenser.flag_cat_hasvideo, true);
|
||||
if (post.get("app", "").equals("on")) b.set(Condenser.flag_cat_hasapp, true);
|
||||
if (post.get("indexof", "").equals("on")) b.set(Condenser.flag_cat_indexof, true);
|
||||
return b;
|
||||
}
|
||||
|
||||
|
@ -96,17 +97,17 @@ public class plasmaSearchAPI {
|
|||
} else {
|
||||
prop.put("searchresult", 3);
|
||||
prop.put("searchresult_allurl", ranked.filteredCount());
|
||||
prop.put("searchresult_description", ranked.flagCount()[Reference.flag_app_dc_description]);
|
||||
prop.put("searchresult_title", ranked.flagCount()[Reference.flag_app_dc_title]);
|
||||
prop.put("searchresult_creator", ranked.flagCount()[Reference.flag_app_dc_creator]);
|
||||
prop.put("searchresult_subject", ranked.flagCount()[Reference.flag_app_dc_subject]);
|
||||
prop.put("searchresult_url", ranked.flagCount()[Reference.flag_app_dc_identifier]);
|
||||
prop.put("searchresult_emphasized", ranked.flagCount()[Reference.flag_app_emphasized]);
|
||||
prop.put("searchresult_image", ranked.flagCount()[plasmaCondenser.flag_cat_hasimage]);
|
||||
prop.put("searchresult_audio", ranked.flagCount()[plasmaCondenser.flag_cat_hasaudio]);
|
||||
prop.put("searchresult_video", ranked.flagCount()[plasmaCondenser.flag_cat_hasvideo]);
|
||||
prop.put("searchresult_app", ranked.flagCount()[plasmaCondenser.flag_cat_hasapp]);
|
||||
prop.put("searchresult_indexof", ranked.flagCount()[plasmaCondenser.flag_cat_indexof]);
|
||||
prop.put("searchresult_description", ranked.flagCount()[WordReferenceRow.flag_app_dc_description]);
|
||||
prop.put("searchresult_title", ranked.flagCount()[WordReferenceRow.flag_app_dc_title]);
|
||||
prop.put("searchresult_creator", ranked.flagCount()[WordReferenceRow.flag_app_dc_creator]);
|
||||
prop.put("searchresult_subject", ranked.flagCount()[WordReferenceRow.flag_app_dc_subject]);
|
||||
prop.put("searchresult_url", ranked.flagCount()[WordReferenceRow.flag_app_dc_identifier]);
|
||||
prop.put("searchresult_emphasized", ranked.flagCount()[WordReferenceRow.flag_app_emphasized]);
|
||||
prop.put("searchresult_image", ranked.flagCount()[Condenser.flag_cat_hasimage]);
|
||||
prop.put("searchresult_audio", ranked.flagCount()[Condenser.flag_cat_hasaudio]);
|
||||
prop.put("searchresult_video", ranked.flagCount()[Condenser.flag_cat_hasvideo]);
|
||||
prop.put("searchresult_app", ranked.flagCount()[Condenser.flag_cat_hasapp]);
|
||||
prop.put("searchresult_indexof", ranked.flagCount()[Condenser.flag_cat_indexof]);
|
||||
}
|
||||
return ranked;
|
||||
}
|
||||
|
@ -126,7 +127,7 @@ public class plasmaSearchAPI {
|
|||
prop.put("genUrlList_lines", maxlines);
|
||||
int i = 0;
|
||||
yacyURL url;
|
||||
MetadataRowContainer entry;
|
||||
URLMetadataRow entry;
|
||||
String us;
|
||||
long rn = -1;
|
||||
while ((ranked.size() > 0) && ((entry = ranked.bestURL(false)) != null)) {
|
||||
|
@ -161,17 +162,17 @@ public class plasmaSearchAPI {
|
|||
prop.putNum("genUrlList_urlList_"+i+"_urlExists_urlcomps", entry.word().urlcomps());
|
||||
prop.putNum("genUrlList_urlList_"+i+"_urlExists_urllength", entry.word().urllength());
|
||||
prop.put("genUrlList_urlList_"+i+"_urlExists_props",
|
||||
((entry.word().flags().get(plasmaCondenser.flag_cat_indexof)) ? "appears on index page, " : "") +
|
||||
((entry.word().flags().get(plasmaCondenser.flag_cat_hasimage)) ? "contains images, " : "") +
|
||||
((entry.word().flags().get(plasmaCondenser.flag_cat_hasaudio)) ? "contains audio, " : "") +
|
||||
((entry.word().flags().get(plasmaCondenser.flag_cat_hasvideo)) ? "contains video, " : "") +
|
||||
((entry.word().flags().get(plasmaCondenser.flag_cat_hasapp)) ? "contains applications, " : "") +
|
||||
((entry.word().flags().get(Reference.flag_app_dc_identifier)) ? "appears in url, " : "") +
|
||||
((entry.word().flags().get(Reference.flag_app_dc_title)) ? "appears in title, " : "") +
|
||||
((entry.word().flags().get(Reference.flag_app_dc_creator)) ? "appears in author, " : "") +
|
||||
((entry.word().flags().get(Reference.flag_app_dc_subject)) ? "appears in subject, " : "") +
|
||||
((entry.word().flags().get(Reference.flag_app_dc_description)) ? "appears in description, " : "") +
|
||||
((entry.word().flags().get(Reference.flag_app_emphasized)) ? "appears emphasized, " : "") +
|
||||
((entry.word().flags().get(Condenser.flag_cat_indexof)) ? "appears on index page, " : "") +
|
||||
((entry.word().flags().get(Condenser.flag_cat_hasimage)) ? "contains images, " : "") +
|
||||
((entry.word().flags().get(Condenser.flag_cat_hasaudio)) ? "contains audio, " : "") +
|
||||
((entry.word().flags().get(Condenser.flag_cat_hasvideo)) ? "contains video, " : "") +
|
||||
((entry.word().flags().get(Condenser.flag_cat_hasapp)) ? "contains applications, " : "") +
|
||||
((entry.word().flags().get(WordReferenceRow.flag_app_dc_identifier)) ? "appears in url, " : "") +
|
||||
((entry.word().flags().get(WordReferenceRow.flag_app_dc_title)) ? "appears in title, " : "") +
|
||||
((entry.word().flags().get(WordReferenceRow.flag_app_dc_creator)) ? "appears in author, " : "") +
|
||||
((entry.word().flags().get(WordReferenceRow.flag_app_dc_subject)) ? "appears in subject, " : "") +
|
||||
((entry.word().flags().get(WordReferenceRow.flag_app_dc_description)) ? "appears in description, " : "") +
|
||||
((entry.word().flags().get(WordReferenceRow.flag_app_emphasized)) ? "appears emphasized, " : "") +
|
||||
((yacyURL.probablyRootURL(entry.word().urlHash())) ? "probably root url" : "")
|
||||
);
|
||||
if (plasmaSwitchboard.urlBlacklist.isListed(Blacklist.BLACKLIST_DHT, url)) {
|
||||
|
|
|
@ -39,17 +39,17 @@ import java.util.concurrent.ConcurrentHashMap;
|
|||
|
||||
import de.anomic.crawler.ResultURLs;
|
||||
import de.anomic.kelondro.order.Bitfield;
|
||||
import de.anomic.kelondro.text.MetadataRowContainer;
|
||||
import de.anomic.kelondro.text.Reference;
|
||||
import de.anomic.kelondro.text.ReferenceContainer;
|
||||
import de.anomic.kelondro.text.ReferenceVars;
|
||||
import de.anomic.kelondro.text.URLMetadata;
|
||||
import de.anomic.kelondro.text.Word;
|
||||
import de.anomic.kelondro.text.metadataPrototype.URLMetadataRow;
|
||||
import de.anomic.kelondro.text.referencePrototype.WordReferenceVars;
|
||||
import de.anomic.kelondro.util.MemoryControl;
|
||||
import de.anomic.kelondro.util.SetTools;
|
||||
import de.anomic.kelondro.util.SortStack;
|
||||
import de.anomic.kelondro.util.SortStore;
|
||||
import de.anomic.kelondro.util.Log;
|
||||
import de.anomic.plasma.parser.Word;
|
||||
import de.anomic.plasma.parser.Condenser;
|
||||
import de.anomic.plasma.plasmaSnippetCache.MediaSnippet;
|
||||
import de.anomic.server.serverProfiling;
|
||||
import de.anomic.yacy.yacySearch;
|
||||
|
@ -180,7 +180,7 @@ public final class plasmaSearchEvent {
|
|||
for (Map.Entry<String, ReferenceContainer> entry : this.rankedCache.searchContainerMaps()[0].entrySet()) {
|
||||
wordhash = entry.getKey();
|
||||
final ReferenceContainer container = entry.getValue();
|
||||
assert (container.getWordHash().equals(wordhash));
|
||||
assert (container.getTermHash().equals(wordhash));
|
||||
if (container.size() > maxcount) {
|
||||
IAmaxcounthash = wordhash;
|
||||
maxcount = container.size();
|
||||
|
@ -264,7 +264,7 @@ public final class plasmaSearchEvent {
|
|||
}
|
||||
}
|
||||
|
||||
ResultEntry obtainResultEntry(final MetadataRowContainer page, final int snippetFetchMode) {
|
||||
ResultEntry obtainResultEntry(final URLMetadataRow page, final int snippetFetchMode) {
|
||||
|
||||
// a search result entry needs some work to produce a result Entry:
|
||||
// - check if url entry exists in LURL-db
|
||||
|
@ -280,7 +280,7 @@ public final class plasmaSearchEvent {
|
|||
// find the url entry
|
||||
|
||||
long startTime = System.currentTimeMillis();
|
||||
final URLMetadata metadata = page.metadata();
|
||||
final URLMetadataRow.Components metadata = page.metadata();
|
||||
final String pagetitle = metadata.dc_title().toLowerCase();
|
||||
if (metadata.url() == null) {
|
||||
registerFailure(page.hash(), "url corrupted (null)");
|
||||
|
@ -304,7 +304,7 @@ public final class plasmaSearchEvent {
|
|||
|
||||
// check constraints
|
||||
if ((query.constraint != null) &&
|
||||
(query.constraint.get(plasmaCondenser.flag_cat_indexof)) &&
|
||||
(query.constraint.get(Condenser.flag_cat_indexof)) &&
|
||||
(!(metadata.dc_title().startsWith("Index of")))) {
|
||||
final Iterator<String> wi = query.queryHashes.iterator();
|
||||
while (wi.hasNext()) try { wordIndex.index().remove(wi.next(), page.hash()); } catch (IOException e) {}
|
||||
|
@ -337,7 +337,7 @@ public final class plasmaSearchEvent {
|
|||
if (query.contentdom == plasmaSearchQuery.CONTENTDOM_TEXT) {
|
||||
// attach text snippet
|
||||
startTime = System.currentTimeMillis();
|
||||
final plasmaSnippetCache.TextSnippet snippet = plasmaSnippetCache.retrieveTextSnippet(metadata, snippetFetchWordHashes, (snippetFetchMode == 2), ((query.constraint != null) && (query.constraint.get(plasmaCondenser.flag_cat_indexof))), 180, 3000, (snippetFetchMode == 2) ? Integer.MAX_VALUE : 30000, query.isGlobal());
|
||||
final plasmaSnippetCache.TextSnippet snippet = plasmaSnippetCache.retrieveTextSnippet(metadata, snippetFetchWordHashes, (snippetFetchMode == 2), ((query.constraint != null) && (query.constraint.get(Condenser.flag_cat_indexof))), 180, 3000, (snippetFetchMode == 2) ? Integer.MAX_VALUE : 30000, query.isGlobal());
|
||||
final long snippetComputationTime = System.currentTimeMillis() - startTime;
|
||||
Log.logInfo("SEARCH_EVENT", "text snippet load time for " + metadata.url() + ": " + snippetComputationTime + ", " + ((snippet.getErrorCode() < 11) ? "snippet found" : ("no snippet found (" + snippet.getError() + ")")));
|
||||
|
||||
|
@ -512,7 +512,7 @@ public final class plasmaSearchEvent {
|
|||
public void run() {
|
||||
|
||||
// start fetching urls and snippets
|
||||
MetadataRowContainer page;
|
||||
URLMetadataRow page;
|
||||
final int fetchAhead = snippetMode == 0 ? 0 : 10;
|
||||
while (System.currentTimeMillis() < this.timeout) {
|
||||
this.lastLifeSign = System.currentTimeMillis();
|
||||
|
@ -803,8 +803,8 @@ public final class plasmaSearchEvent {
|
|||
|
||||
public static class ResultEntry {
|
||||
// payload objects
|
||||
private final MetadataRowContainer urlentry;
|
||||
private final URLMetadata urlcomps; // buffer for components
|
||||
private final URLMetadataRow urlentry;
|
||||
private final URLMetadataRow.Components urlcomps; // buffer for components
|
||||
private String alternative_urlstring;
|
||||
private String alternative_urlname;
|
||||
private final plasmaSnippetCache.TextSnippet textSnippet;
|
||||
|
@ -813,7 +813,7 @@ public final class plasmaSearchEvent {
|
|||
// statistic objects
|
||||
public long dbRetrievalTime, snippetComputationTime;
|
||||
|
||||
public ResultEntry(final MetadataRowContainer urlentry, final plasmaWordIndex wordIndex,
|
||||
public ResultEntry(final URLMetadataRow urlentry, final plasmaWordIndex wordIndex,
|
||||
final plasmaSnippetCache.TextSnippet textSnippet,
|
||||
final ArrayList<plasmaSnippetCache.MediaSnippet> mediaSnippets,
|
||||
final long dbRetrievalTime, final long snippetComputationTime) {
|
||||
|
@ -837,7 +837,7 @@ public final class plasmaSearchEvent {
|
|||
// seed is not known from here
|
||||
try {
|
||||
wordIndex.index().remove(
|
||||
Word.words2hashes(plasmaCondenser.getWords(
|
||||
Word.words2hashes(Condenser.getWords(
|
||||
("yacyshare " +
|
||||
filename.replace('?', ' ') +
|
||||
" " +
|
||||
|
@ -899,10 +899,10 @@ public final class plasmaSearchEvent {
|
|||
public int lapp() {
|
||||
return urlentry.lapp();
|
||||
}
|
||||
public ReferenceVars word() {
|
||||
public WordReferenceVars word() {
|
||||
final Reference word = urlentry.word();
|
||||
assert word instanceof ReferenceVars;
|
||||
return (ReferenceVars) word;
|
||||
assert word instanceof WordReferenceVars;
|
||||
return (WordReferenceVars) word;
|
||||
}
|
||||
public boolean hasTextSnippet() {
|
||||
return (this.textSnippet != null) && (this.textSnippet.getErrorCode() < 11);
|
||||
|
|
|
@ -31,8 +31,9 @@ import de.anomic.htmlFilter.htmlFilterCharacterCoding;
|
|||
import de.anomic.kelondro.order.Base64Order;
|
||||
import de.anomic.kelondro.order.Bitfield;
|
||||
import de.anomic.kelondro.order.NaturalOrder;
|
||||
import de.anomic.kelondro.text.Word;
|
||||
import de.anomic.kelondro.util.SetTools;
|
||||
import de.anomic.plasma.parser.Word;
|
||||
import de.anomic.plasma.parser.Condenser;
|
||||
import de.anomic.yacy.yacySeed;
|
||||
import de.anomic.yacy.yacySeedDB;
|
||||
import de.anomic.yacy.yacyURL;
|
||||
|
@ -234,7 +235,7 @@ public final class plasmaSearchQuery {
|
|||
public static final boolean matches(final String text, final TreeSet<String> keyhashes) {
|
||||
// returns true if any of the word hashes in keyhashes appear in the String text
|
||||
// to do this, all words in the string must be recognized and transcoded to word hashes
|
||||
final TreeSet<String> wordhashes = Word.words2hashes(plasmaCondenser.getWords(text).keySet());
|
||||
final TreeSet<String> wordhashes = Word.words2hashes(Condenser.getWords(text).keySet());
|
||||
return SetTools.anymatch(wordhashes, keyhashes);
|
||||
}
|
||||
|
||||
|
|
|
@ -39,16 +39,16 @@ import java.util.concurrent.ConcurrentHashMap;
|
|||
import de.anomic.htmlFilter.htmlFilterContentScraper;
|
||||
import de.anomic.kelondro.index.BinSearch;
|
||||
import de.anomic.kelondro.order.Digest;
|
||||
import de.anomic.kelondro.text.MetadataRowContainer;
|
||||
import de.anomic.kelondro.text.Reference;
|
||||
import de.anomic.kelondro.text.ReferenceContainer;
|
||||
import de.anomic.kelondro.text.ReferenceOrder;
|
||||
import de.anomic.kelondro.text.ReferenceVars;
|
||||
import de.anomic.kelondro.text.URLMetadata;
|
||||
import de.anomic.kelondro.text.Word;
|
||||
import de.anomic.kelondro.text.metadataPrototype.URLMetadataRow;
|
||||
import de.anomic.kelondro.text.referencePrototype.WordReferenceVars;
|
||||
import de.anomic.kelondro.util.ScoreCluster;
|
||||
import de.anomic.kelondro.util.SortStack;
|
||||
import de.anomic.kelondro.util.FileUtils;
|
||||
import de.anomic.plasma.parser.Word;
|
||||
import de.anomic.plasma.parser.Condenser;
|
||||
import de.anomic.server.serverProfiling;
|
||||
import de.anomic.yacy.yacyURL;
|
||||
|
||||
|
@ -59,8 +59,8 @@ public final class plasmaSearchRankingProcess {
|
|||
private static boolean useYBR = true;
|
||||
private static final int maxDoubleDomAll = 20, maxDoubleDomSpecial = 10000;
|
||||
|
||||
private final SortStack<ReferenceVars> stack;
|
||||
private final HashMap<String, SortStack<ReferenceVars>> doubleDomCache; // key = domhash (6 bytes); value = like stack
|
||||
private final SortStack<WordReferenceVars> stack;
|
||||
private final HashMap<String, SortStack<WordReferenceVars>> doubleDomCache; // key = domhash (6 bytes); value = like stack
|
||||
private final HashMap<String, String> handover; // key = urlhash, value = urlstring; used for double-check of urls that had been handed over to search process
|
||||
private final plasmaSearchQuery query;
|
||||
private final int maxentries;
|
||||
|
@ -83,8 +83,8 @@ public final class plasmaSearchRankingProcess {
|
|||
// attention: if minEntries is too high, this method will not terminate within the maxTime
|
||||
// sortorder: 0 = hash, 1 = url, 2 = ranking
|
||||
this.localSearchContainerMaps = null;
|
||||
this.stack = new SortStack<ReferenceVars>(maxentries);
|
||||
this.doubleDomCache = new HashMap<String, SortStack<ReferenceVars>>();
|
||||
this.stack = new SortStack<WordReferenceVars>(maxentries);
|
||||
this.doubleDomCache = new HashMap<String, SortStack<WordReferenceVars>>();
|
||||
this.handover = new HashMap<String, String>();
|
||||
this.order = (query == null) ? null : new ReferenceOrder(query.ranking, query.targetlang);
|
||||
this.query = query;
|
||||
|
@ -103,7 +103,7 @@ public final class plasmaSearchRankingProcess {
|
|||
for (int i = 0; i < 8; i++) {this.domZones[i] = 0;}
|
||||
}
|
||||
|
||||
public long ranking(final ReferenceVars word) {
|
||||
public long ranking(final WordReferenceVars word) {
|
||||
return order.cardinal(word);
|
||||
}
|
||||
|
||||
|
@ -148,13 +148,13 @@ public final class plasmaSearchRankingProcess {
|
|||
long timer = System.currentTimeMillis();
|
||||
|
||||
// normalize entries
|
||||
final ArrayList<ReferenceVars> decodedEntries = this.order.normalizeWith(index);
|
||||
final ArrayList<WordReferenceVars> decodedEntries = this.order.normalizeWith(index);
|
||||
serverProfiling.update("SEARCH", new plasmaProfiling.searchEvent(query.id(true), plasmaSearchEvent.NORMALIZING, index.size(), System.currentTimeMillis() - timer), false);
|
||||
|
||||
// iterate over normalized entries and select some that are better than currently stored
|
||||
timer = System.currentTimeMillis();
|
||||
final Iterator<ReferenceVars> i = decodedEntries.iterator();
|
||||
ReferenceVars iEntry;
|
||||
final Iterator<WordReferenceVars> i = decodedEntries.iterator();
|
||||
WordReferenceVars iEntry;
|
||||
Long r;
|
||||
while (i.hasNext()) {
|
||||
iEntry = i.next();
|
||||
|
@ -175,10 +175,10 @@ public final class plasmaSearchRankingProcess {
|
|||
|
||||
// check document domain
|
||||
if (query.contentdom != plasmaSearchQuery.CONTENTDOM_TEXT) {
|
||||
if ((query.contentdom == plasmaSearchQuery.CONTENTDOM_AUDIO) && (!(iEntry.flags().get(plasmaCondenser.flag_cat_hasaudio)))) continue;
|
||||
if ((query.contentdom == plasmaSearchQuery.CONTENTDOM_VIDEO) && (!(iEntry.flags().get(plasmaCondenser.flag_cat_hasvideo)))) continue;
|
||||
if ((query.contentdom == plasmaSearchQuery.CONTENTDOM_IMAGE) && (!(iEntry.flags().get(plasmaCondenser.flag_cat_hasimage)))) continue;
|
||||
if ((query.contentdom == plasmaSearchQuery.CONTENTDOM_APP ) && (!(iEntry.flags().get(plasmaCondenser.flag_cat_hasapp )))) continue;
|
||||
if ((query.contentdom == plasmaSearchQuery.CONTENTDOM_AUDIO) && (!(iEntry.flags().get(Condenser.flag_cat_hasaudio)))) continue;
|
||||
if ((query.contentdom == plasmaSearchQuery.CONTENTDOM_VIDEO) && (!(iEntry.flags().get(Condenser.flag_cat_hasvideo)))) continue;
|
||||
if ((query.contentdom == plasmaSearchQuery.CONTENTDOM_IMAGE) && (!(iEntry.flags().get(Condenser.flag_cat_hasimage)))) continue;
|
||||
if ((query.contentdom == plasmaSearchQuery.CONTENTDOM_APP ) && (!(iEntry.flags().get(Condenser.flag_cat_hasapp )))) continue;
|
||||
}
|
||||
|
||||
// check tld domain
|
||||
|
@ -252,10 +252,10 @@ public final class plasmaSearchRankingProcess {
|
|||
// - root-domain guessing to prefer the root domain over other urls if search word appears in domain name
|
||||
|
||||
|
||||
private SortStack<ReferenceVars>.stackElement bestRWI(final boolean skipDoubleDom) {
|
||||
private SortStack<WordReferenceVars>.stackElement bestRWI(final boolean skipDoubleDom) {
|
||||
// returns from the current RWI list the best entry and removes this entry from the list
|
||||
SortStack<ReferenceVars> m;
|
||||
SortStack<ReferenceVars>.stackElement rwi;
|
||||
SortStack<WordReferenceVars> m;
|
||||
SortStack<WordReferenceVars>.stackElement rwi;
|
||||
while (stack.size() > 0) {
|
||||
rwi = stack.pop();
|
||||
if (rwi == null) continue; // in case that a synchronization problem occurred just go lazy over it
|
||||
|
@ -265,7 +265,7 @@ public final class plasmaSearchRankingProcess {
|
|||
m = this.doubleDomCache.get(domhash);
|
||||
if (m == null) {
|
||||
// first appearance of dom
|
||||
m = new SortStack<ReferenceVars>((query.specialRights) ? maxDoubleDomSpecial : maxDoubleDomAll);
|
||||
m = new SortStack<WordReferenceVars>((query.specialRights) ? maxDoubleDomSpecial : maxDoubleDomAll);
|
||||
this.doubleDomCache.put(domhash, m);
|
||||
return rwi;
|
||||
}
|
||||
|
@ -274,9 +274,9 @@ public final class plasmaSearchRankingProcess {
|
|||
}
|
||||
// no more entries in sorted RWI entries. Now take Elements from the doubleDomCache
|
||||
// find best entry from all caches
|
||||
final Iterator<SortStack<ReferenceVars>> i = this.doubleDomCache.values().iterator();
|
||||
SortStack<ReferenceVars>.stackElement bestEntry = null;
|
||||
SortStack<ReferenceVars>.stackElement o;
|
||||
final Iterator<SortStack<WordReferenceVars>> i = this.doubleDomCache.values().iterator();
|
||||
SortStack<WordReferenceVars>.stackElement bestEntry = null;
|
||||
SortStack<WordReferenceVars>.stackElement o;
|
||||
while (i.hasNext()) {
|
||||
m = i.next();
|
||||
if (m == null) continue;
|
||||
|
@ -298,15 +298,15 @@ public final class plasmaSearchRankingProcess {
|
|||
return bestEntry;
|
||||
}
|
||||
|
||||
public MetadataRowContainer bestURL(final boolean skipDoubleDom) {
|
||||
public URLMetadataRow bestURL(final boolean skipDoubleDom) {
|
||||
// returns from the current RWI list the best URL entry and removes this entry from the list
|
||||
while ((stack.size() > 0) || (size() > 0)) {
|
||||
if (((stack.size() == 0) && (size() == 0))) break;
|
||||
final SortStack<ReferenceVars>.stackElement obrwi = bestRWI(skipDoubleDom);
|
||||
final SortStack<WordReferenceVars>.stackElement obrwi = bestRWI(skipDoubleDom);
|
||||
if (obrwi == null) continue; // *** ? this happened and the thread was suspended silently. cause?
|
||||
final MetadataRowContainer u = wordIndex.metadata().load(obrwi.element.urlHash(), obrwi.element, obrwi.weight.longValue());
|
||||
final URLMetadataRow u = wordIndex.metadata().load(obrwi.element.urlHash(), obrwi.element, obrwi.weight.longValue());
|
||||
if (u != null) {
|
||||
final URLMetadata metadata = u.metadata();
|
||||
final URLMetadataRow.Components metadata = u.metadata();
|
||||
if (metadata.url() != null) this.handover.put(u.hash(), metadata.url().toNormalform(true, false)); // remember that we handed over this url
|
||||
return u;
|
||||
}
|
||||
|
@ -318,7 +318,7 @@ public final class plasmaSearchRankingProcess {
|
|||
public int size() {
|
||||
//assert sortedRWIEntries.size() == urlhashes.size() : "sortedRWIEntries.size() = " + sortedRWIEntries.size() + ", urlhashes.size() = " + urlhashes.size();
|
||||
int c = stack.size();
|
||||
final Iterator<SortStack<ReferenceVars>> i = this.doubleDomCache.values().iterator();
|
||||
final Iterator<SortStack<WordReferenceVars>> i = this.doubleDomCache.values().iterator();
|
||||
while (i.hasNext()) c += i.next().size();
|
||||
return c;
|
||||
}
|
||||
|
@ -355,7 +355,7 @@ public final class plasmaSearchRankingProcess {
|
|||
}
|
||||
|
||||
public Reference remove(final String urlHash) {
|
||||
final SortStack<ReferenceVars>.stackElement se = stack.remove(urlHash.hashCode());
|
||||
final SortStack<WordReferenceVars>.stackElement se = stack.remove(urlHash.hashCode());
|
||||
if (se == null) return null;
|
||||
urlhashes.remove(urlHash);
|
||||
return se.element;
|
||||
|
|
|
@ -43,13 +43,14 @@ import de.anomic.htmlFilter.htmlFilterCharacterCoding;
|
|||
import de.anomic.htmlFilter.htmlFilterImageEntry;
|
||||
import de.anomic.http.httpClient;
|
||||
import de.anomic.http.httpResponseHeader;
|
||||
import de.anomic.kelondro.text.Document;
|
||||
import de.anomic.kelondro.text.URLMetadata;
|
||||
import de.anomic.kelondro.text.Word;
|
||||
import de.anomic.kelondro.text.metadataPrototype.URLMetadataRow;
|
||||
import de.anomic.kelondro.util.ScoreCluster;
|
||||
import de.anomic.kelondro.util.SetTools;
|
||||
import de.anomic.kelondro.util.Log;
|
||||
import de.anomic.plasma.parser.Document;
|
||||
import de.anomic.plasma.parser.ParserException;
|
||||
import de.anomic.plasma.parser.Word;
|
||||
import de.anomic.plasma.parser.Condenser;
|
||||
import de.anomic.yacy.yacySearch;
|
||||
import de.anomic.yacy.yacyURL;
|
||||
|
||||
|
@ -302,7 +303,7 @@ public class plasmaSnippetCache {
|
|||
}
|
||||
|
||||
@SuppressWarnings("unchecked")
|
||||
public static TextSnippet retrieveTextSnippet(final URLMetadata comp, final Set<String> queryhashes, final boolean fetchOnline, final boolean pre, final int snippetMaxLength, final int timeout, final int maxDocLen, final boolean reindexing) {
|
||||
public static TextSnippet retrieveTextSnippet(final URLMetadataRow.Components comp, final Set<String> queryhashes, final boolean fetchOnline, final boolean pre, final int snippetMaxLength, final int timeout, final int maxDocLen, final boolean reindexing) {
|
||||
// heise = "0OQUNU3JSs05"
|
||||
final yacyURL url = comp.url();
|
||||
if (queryhashes.size() == 0) {
|
||||
|
@ -796,7 +797,7 @@ public class plasmaSnippetCache {
|
|||
private static HashMap<String, Integer> hashSentence(final String sentence) {
|
||||
// generates a word-wordPos mapping
|
||||
final HashMap<String, Integer> map = new HashMap<String, Integer>();
|
||||
final Enumeration<StringBuilder> words = plasmaCondenser.wordTokenizer(sentence, "UTF-8");
|
||||
final Enumeration<StringBuilder> words = Condenser.wordTokenizer(sentence, "UTF-8");
|
||||
int pos = 0;
|
||||
StringBuilder word;
|
||||
String hash;
|
||||
|
|
|
@ -122,6 +122,7 @@ import de.anomic.crawler.ResultURLs;
|
|||
import de.anomic.crawler.RobotsTxt;
|
||||
import de.anomic.crawler.ZURL;
|
||||
import de.anomic.crawler.CrawlProfile.entry;
|
||||
import de.anomic.data.Blacklist;
|
||||
import de.anomic.data.URLLicense;
|
||||
import de.anomic.data.blogBoard;
|
||||
import de.anomic.data.blogBoardComments;
|
||||
|
@ -139,18 +140,17 @@ import de.anomic.http.httpd;
|
|||
import de.anomic.http.httpdRobotsTxtConfig;
|
||||
import de.anomic.kelondro.order.Digest;
|
||||
import de.anomic.kelondro.order.NaturalOrder;
|
||||
import de.anomic.kelondro.text.Document;
|
||||
import de.anomic.kelondro.text.MetadataRowContainer;
|
||||
import de.anomic.kelondro.text.URLMetadata;
|
||||
import de.anomic.kelondro.text.Blacklist;
|
||||
import de.anomic.kelondro.text.Word;
|
||||
import de.anomic.kelondro.text.metadataPrototype.URLMetadataRow;
|
||||
import de.anomic.kelondro.util.DateFormatter;
|
||||
import de.anomic.kelondro.util.FileUtils;
|
||||
import de.anomic.kelondro.util.Log;
|
||||
import de.anomic.kelondro.util.MemoryControl;
|
||||
import de.anomic.kelondro.util.SetTools;
|
||||
import de.anomic.net.UPnP;
|
||||
import de.anomic.plasma.parser.Document;
|
||||
import de.anomic.plasma.parser.ParserException;
|
||||
import de.anomic.plasma.parser.Word;
|
||||
import de.anomic.plasma.parser.Condenser;
|
||||
import de.anomic.server.serverAbstractSwitch;
|
||||
import de.anomic.server.serverBusyThread;
|
||||
import de.anomic.server.serverCore;
|
||||
|
@ -930,7 +930,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch<IndexingStack.
|
|||
if (urlhash.length() == 0) return null;
|
||||
final yacyURL ne = crawlQueues.getURL(urlhash);
|
||||
if (ne != null) return ne;
|
||||
final MetadataRowContainer le = webIndex.metadata().load(urlhash, null, 0);
|
||||
final URLMetadataRow le = webIndex.metadata().load(urlhash, null, 0);
|
||||
if (le != null) return le.metadata().url();
|
||||
return null;
|
||||
}
|
||||
|
@ -1242,11 +1242,11 @@ public final class plasmaSwitchboard extends serverAbstractSwitch<IndexingStack.
|
|||
public static class indexingQueueEntry extends serverProcessorJob {
|
||||
public IndexingStack.QueueEntry queueEntry;
|
||||
public plasmaParserDocument document;
|
||||
public plasmaCondenser condenser;
|
||||
public Condenser condenser;
|
||||
public indexingQueueEntry(
|
||||
final IndexingStack.QueueEntry queueEntry,
|
||||
final plasmaParserDocument document,
|
||||
final plasmaCondenser condenser) {
|
||||
final Condenser condenser) {
|
||||
super();
|
||||
this.queueEntry = queueEntry;
|
||||
this.document = document;
|
||||
|
@ -1595,7 +1595,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch<IndexingStack.
|
|||
// strip out words and generate statistics
|
||||
if (this.log.isFine()) log.logFine("Condensing for '" + in.queueEntry.url().toNormalform(false, true) + "'");
|
||||
try {
|
||||
plasmaCondenser condenser = new plasmaCondenser(in.document, in.queueEntry.profile().indexText(), in.queueEntry.profile().indexMedia());
|
||||
Condenser condenser = new Condenser(in.document, in.queueEntry.profile().indexText(), in.queueEntry.profile().indexMedia());
|
||||
|
||||
// update image result list statistics
|
||||
// it's good to do this concurrently here, because it needs a DNS lookup
|
||||
|
@ -1623,7 +1623,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch<IndexingStack.
|
|||
in.queueEntry.close();
|
||||
}
|
||||
|
||||
private void storeDocumentIndex(final IndexingStack.QueueEntry queueEntry, final plasmaParserDocument document, final plasmaCondenser condenser) {
|
||||
private void storeDocumentIndex(final IndexingStack.QueueEntry queueEntry, final plasmaParserDocument document, final Condenser condenser) {
|
||||
|
||||
// CREATE INDEX
|
||||
final String dc_title = document.dc_title();
|
||||
|
@ -1634,7 +1634,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch<IndexingStack.
|
|||
log.logInfo("Excluded " + condenser.excludeWords(stopwords) + " words in URL " + queueEntry.url());
|
||||
|
||||
// STORE URL TO LOADED-URL-DB
|
||||
MetadataRowContainer newEntry = null;
|
||||
URLMetadataRow newEntry = null;
|
||||
try {
|
||||
newEntry = webIndex.storeDocument(queueEntry, document, condenser);
|
||||
} catch (final IOException e) {
|
||||
|
@ -1682,9 +1682,9 @@ public final class plasmaSwitchboard extends serverAbstractSwitch<IndexingStack.
|
|||
|
||||
public class receiptSending implements Runnable {
|
||||
yacySeed initiatorPeer;
|
||||
MetadataRowContainer reference;
|
||||
URLMetadataRow reference;
|
||||
|
||||
public receiptSending(final yacySeed initiatorPeer, final MetadataRowContainer reference) {
|
||||
public receiptSending(final yacySeed initiatorPeer, final URLMetadataRow reference) {
|
||||
this.initiatorPeer = initiatorPeer;
|
||||
this.reference = reference;
|
||||
}
|
||||
|
@ -1729,9 +1729,9 @@ public final class plasmaSwitchboard extends serverAbstractSwitch<IndexingStack.
|
|||
|
||||
if (urlhash == null) return 0;
|
||||
// determine the url string
|
||||
final MetadataRowContainer entry = webIndex.metadata().load(urlhash, null, 0);
|
||||
final URLMetadataRow entry = webIndex.metadata().load(urlhash, null, 0);
|
||||
if (entry == null) return 0;
|
||||
final URLMetadata metadata = entry.metadata();
|
||||
final URLMetadataRow.Components metadata = entry.metadata();
|
||||
if (metadata.url() == null) return 0;
|
||||
|
||||
InputStream resourceContent = null;
|
||||
|
@ -1757,7 +1757,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch<IndexingStack.
|
|||
// get the word set
|
||||
Set<String> words = null;
|
||||
try {
|
||||
words = new plasmaCondenser(document, true, true).words().keySet();
|
||||
words = new Condenser(document, true, true).words().keySet();
|
||||
} catch (final UnsupportedEncodingException e) {
|
||||
e.printStackTrace();
|
||||
}
|
||||
|
|
|
@ -317,7 +317,7 @@ public final class plasmaSwitchboardConstants {
|
|||
*
|
||||
* @see DefaultBlacklist for a detailed overview about the syntax of the default implementation
|
||||
*/
|
||||
public static final String BLACKLIST_CLASS_DEFAULT = "de.anomic.kelondro.text.DefaultBlacklist";
|
||||
public static final String BLACKLIST_CLASS_DEFAULT = "de.anomic.data.DefaultBlacklist";
|
||||
public static final String LIST_BLUE = "plasmaBlueList";
|
||||
public static final String LIST_BLUE_DEFAULT = null;
|
||||
public static final String LIST_BADWORDS_DEFAULT = "yacy.badwords";
|
||||
|
|
|
@ -42,6 +42,7 @@ import de.anomic.kelondro.order.MicroDate;
|
|||
import de.anomic.kelondro.util.DateFormatter;
|
||||
import de.anomic.kelondro.util.Log;
|
||||
import de.anomic.kelondro.util.FileUtils;
|
||||
import de.anomic.plasma.parser.Condenser;
|
||||
import de.anomic.yacy.yacyURL;
|
||||
|
||||
public class plasmaWebStructure {
|
||||
|
@ -90,7 +91,7 @@ public class plasmaWebStructure {
|
|||
}
|
||||
}
|
||||
|
||||
public Integer[] /*(outlinksSame, outlinksOther)*/ generateCitationReference(final plasmaParserDocument document, final plasmaCondenser condenser, final Date docDate) {
|
||||
public Integer[] /*(outlinksSame, outlinksOther)*/ generateCitationReference(final plasmaParserDocument document, final Condenser condenser, final Date docDate) {
|
||||
final yacyURL url = document.dc_source();
|
||||
|
||||
// generate citation reference
|
||||
|
|
|
@ -38,6 +38,7 @@ import java.util.TreeSet;
|
|||
|
||||
import de.anomic.crawler.CrawlProfile;
|
||||
import de.anomic.crawler.IndexingStack;
|
||||
import de.anomic.data.Blacklist;
|
||||
import de.anomic.htmlFilter.htmlFilterContentScraper;
|
||||
import de.anomic.http.httpdProxyCacheEntry;
|
||||
import de.anomic.kelondro.blob.BLOBArray;
|
||||
|
@ -47,16 +48,16 @@ import de.anomic.kelondro.text.BufferedIndex;
|
|||
import de.anomic.kelondro.text.BufferedIndexCollection;
|
||||
import de.anomic.kelondro.text.IndexCell;
|
||||
import de.anomic.kelondro.text.IndexCollectionMigration;
|
||||
import de.anomic.kelondro.text.MetadataRowContainer;
|
||||
import de.anomic.kelondro.text.ReferenceContainer;
|
||||
import de.anomic.kelondro.text.IODispatcher;
|
||||
import de.anomic.kelondro.text.ReferenceRow;
|
||||
import de.anomic.kelondro.text.MetadataRepository;
|
||||
import de.anomic.kelondro.text.Word;
|
||||
import de.anomic.kelondro.text.Blacklist;
|
||||
import de.anomic.kelondro.text.metadataPrototype.URLMetadataRow;
|
||||
import de.anomic.kelondro.text.referencePrototype.WordReferenceRow;
|
||||
import de.anomic.kelondro.util.FileUtils;
|
||||
import de.anomic.kelondro.util.kelondroException;
|
||||
import de.anomic.kelondro.util.Log;
|
||||
import de.anomic.plasma.parser.Word;
|
||||
import de.anomic.plasma.parser.Condenser;
|
||||
import de.anomic.tools.iso639;
|
||||
import de.anomic.xml.RSSFeed;
|
||||
import de.anomic.xml.RSSMessage;
|
||||
|
@ -146,7 +147,7 @@ public final class plasmaWordIndex {
|
|||
new IndexCollectionMigration(
|
||||
indexPrimaryTextLocation,
|
||||
wordOrder,
|
||||
ReferenceRow.urlEntryRow,
|
||||
WordReferenceRow.urlEntryRow,
|
||||
entityCacheMaxSize,
|
||||
targetFileSize,
|
||||
maxFileSize,
|
||||
|
@ -156,7 +157,7 @@ public final class plasmaWordIndex {
|
|||
new BufferedIndexCollection(
|
||||
indexPrimaryTextLocation,
|
||||
wordOrder,
|
||||
ReferenceRow.urlEntryRow,
|
||||
WordReferenceRow.urlEntryRow,
|
||||
entityCacheMaxSize,
|
||||
useCommons,
|
||||
redundancy,
|
||||
|
@ -167,7 +168,7 @@ public final class plasmaWordIndex {
|
|||
this.index = new IndexCell(
|
||||
new File(indexPrimaryTextLocation, "RICELL"),
|
||||
wordOrder,
|
||||
ReferenceRow.urlEntryRow,
|
||||
WordReferenceRow.urlEntryRow,
|
||||
entityCacheMaxSize,
|
||||
targetFileSize,
|
||||
maxFileSize,
|
||||
|
@ -408,7 +409,7 @@ public final class plasmaWordIndex {
|
|||
* @param outlinksOther
|
||||
* @return
|
||||
*/
|
||||
public int addPageIndex(final yacyURL url, final Date urlModified, final plasmaParserDocument document, final plasmaCondenser condenser, final String language, final char doctype, final int outlinksSame, final int outlinksOther) {
|
||||
public int addPageIndex(final yacyURL url, final Date urlModified, final plasmaParserDocument document, final Condenser condenser, final String language, final char doctype, final int outlinksSame, final int outlinksOther) {
|
||||
int wordCount = 0;
|
||||
final int urlLength = url.toNormalform(true, true).length();
|
||||
final int urlComps = htmlFilterContentScraper.urlComps(url.toString()).length;
|
||||
|
@ -417,14 +418,14 @@ public final class plasmaWordIndex {
|
|||
final Iterator<Map.Entry<String, Word>> i = condenser.words().entrySet().iterator();
|
||||
Map.Entry<String, Word> wentry;
|
||||
String word;
|
||||
ReferenceRow ientry;
|
||||
WordReferenceRow ientry;
|
||||
Word wprop;
|
||||
while (i.hasNext()) {
|
||||
wentry = i.next();
|
||||
word = wentry.getKey();
|
||||
wprop = wentry.getValue();
|
||||
assert (wprop.flags != null);
|
||||
ientry = new ReferenceRow(url.hash(),
|
||||
ientry = new WordReferenceRow(url.hash(),
|
||||
urlLength, urlComps, (document == null) ? urlLength : document.dc_title().length(),
|
||||
wprop.count,
|
||||
condenser.RESULT_NUMB_WORDS,
|
||||
|
@ -458,7 +459,7 @@ public final class plasmaWordIndex {
|
|||
queuePreStack.close();
|
||||
}
|
||||
|
||||
public MetadataRowContainer storeDocument(final IndexingStack.QueueEntry entry, final plasmaParserDocument document, final plasmaCondenser condenser) throws IOException {
|
||||
public URLMetadataRow storeDocument(final IndexingStack.QueueEntry entry, final plasmaParserDocument document, final Condenser condenser) throws IOException {
|
||||
final long startTime = System.currentTimeMillis();
|
||||
|
||||
// CREATE INDEX
|
||||
|
@ -511,7 +512,7 @@ public final class plasmaWordIndex {
|
|||
|
||||
// create a new loaded URL db entry
|
||||
final long ldate = System.currentTimeMillis();
|
||||
final MetadataRowContainer newEntry = new MetadataRowContainer(
|
||||
final URLMetadataRow newEntry = new URLMetadataRow(
|
||||
entry.url(), // URL
|
||||
dc_title, // document description
|
||||
document.dc_creator(), // author
|
||||
|
@ -649,7 +650,7 @@ public final class plasmaWordIndex {
|
|||
public void run() {
|
||||
Log.logInfo("INDEXCLEANER", "IndexCleaner-Thread started");
|
||||
ReferenceContainer container = null;
|
||||
ReferenceRow entry = null;
|
||||
WordReferenceRow entry = null;
|
||||
yacyURL url = null;
|
||||
final HashSet<String> urlHashs = new HashSet<String>();
|
||||
try {
|
||||
|
@ -657,14 +658,14 @@ public final class plasmaWordIndex {
|
|||
while (indexContainerIterator.hasNext() && run) {
|
||||
waiter();
|
||||
container = indexContainerIterator.next();
|
||||
final Iterator<ReferenceRow> containerIterator = container.entries();
|
||||
wordHashNow = container.getWordHash();
|
||||
final Iterator<WordReferenceRow> containerIterator = container.entries();
|
||||
wordHashNow = container.getTermHash();
|
||||
while (containerIterator.hasNext() && run) {
|
||||
waiter();
|
||||
entry = containerIterator.next();
|
||||
// System.out.println("Wordhash: "+wordHash+" UrlHash:
|
||||
// "+entry.getUrlHash());
|
||||
final MetadataRowContainer ue = metadata.load(entry.urlHash(), entry, 0);
|
||||
final URLMetadataRow ue = metadata.load(entry.urlHash(), entry, 0);
|
||||
if (ue == null) {
|
||||
urlHashs.add(entry.urlHash());
|
||||
} else {
|
||||
|
@ -675,9 +676,9 @@ public final class plasmaWordIndex {
|
|||
}
|
||||
}
|
||||
if (urlHashs.size() > 0) try {
|
||||
final int removed = index.remove(container.getWordHash(), urlHashs);
|
||||
Log.logFine("INDEXCLEANER", container.getWordHash() + ": " + removed + " of " + container.size() + " URL-entries deleted");
|
||||
lastWordHash = container.getWordHash();
|
||||
final int removed = index.remove(container.getTermHash(), urlHashs);
|
||||
Log.logFine("INDEXCLEANER", container.getTermHash() + ": " + removed + " of " + container.size() + " URL-entries deleted");
|
||||
lastWordHash = container.getTermHash();
|
||||
lastDeletionCounter = urlHashs.size();
|
||||
urlHashs.clear();
|
||||
} catch (IOException e) {
|
||||
|
@ -686,10 +687,10 @@ public final class plasmaWordIndex {
|
|||
|
||||
if (!containerIterator.hasNext()) {
|
||||
// We may not be finished yet, try to get the next chunk of wordHashes
|
||||
final TreeSet<ReferenceContainer> containers = index.references(container.getWordHash(), false, 100, false);
|
||||
final TreeSet<ReferenceContainer> containers = index.references(container.getTermHash(), false, 100, false);
|
||||
indexContainerIterator = containers.iterator();
|
||||
// Make sure we don't get the same wordhash twice, but don't skip a word
|
||||
if ((indexContainerIterator.hasNext()) && (!container.getWordHash().equals(indexContainerIterator.next().getWordHash()))) {
|
||||
if ((indexContainerIterator.hasNext()) && (!container.getTermHash().equals(indexContainerIterator.next().getTermHash()))) {
|
||||
indexContainerIterator = containers.iterator();
|
||||
}
|
||||
}
|
||||
|
|
|
@ -35,8 +35,8 @@ import java.util.Map;
|
|||
import de.anomic.kelondro.order.Base64Order;
|
||||
import de.anomic.kelondro.text.BufferedIndex;
|
||||
import de.anomic.kelondro.text.ReferenceContainer;
|
||||
import de.anomic.kelondro.text.ReferenceRow;
|
||||
import de.anomic.kelondro.text.MetadataRepository;
|
||||
import de.anomic.kelondro.text.referencePrototype.WordReferenceRow;
|
||||
import de.anomic.kelondro.util.Log;
|
||||
import de.anomic.server.serverProcessor;
|
||||
import de.anomic.yacy.yacySeed;
|
||||
|
@ -181,7 +181,7 @@ public class Dispatcher {
|
|||
(System.currentTimeMillis() < timeout) &&
|
||||
((container = indexContainerIterator.next()) != null) &&
|
||||
((containers.size() == 0) ||
|
||||
(Base64Order.enhancedComparator.compare(container.getWordHash(), limitHash) < 0))
|
||||
(Base64Order.enhancedComparator.compare(container.getTermHash(), limitHash) < 0))
|
||||
|
||||
) {
|
||||
if (container.size() == 0) continue;
|
||||
|
@ -190,15 +190,15 @@ public class Dispatcher {
|
|||
}
|
||||
// then remove the container from the backend
|
||||
HashSet<String> urlHashes = new HashSet<String>();
|
||||
Iterator<ReferenceRow> it;
|
||||
Iterator<WordReferenceRow> it;
|
||||
for (ReferenceContainer c: containers) {
|
||||
urlHashes.clear();
|
||||
it = c.entries();
|
||||
while (it.hasNext()) {
|
||||
urlHashes.add(it.next().urlHash());
|
||||
}
|
||||
if (this.log.isFine()) this.log.logFine("selected " + urlHashes.size() + " urls for word '" + c.getWordHash() + "'");
|
||||
if (urlHashes.size() > 0) this.backend.remove(c.getWordHash(), urlHashes);
|
||||
if (this.log.isFine()) this.log.logFine("selected " + urlHashes.size() + " urls for word '" + c.getTermHash() + "'");
|
||||
if (urlHashes.size() > 0) this.backend.remove(c.getTermHash(), urlHashes);
|
||||
}
|
||||
|
||||
// finished. The caller must take care of the containers and must put them back if not needed
|
||||
|
@ -222,15 +222,15 @@ public class Dispatcher {
|
|||
|
||||
// check all entries and split them to the partitions
|
||||
ReferenceContainer[] partitionBuffer = new ReferenceContainer[partitionCount];
|
||||
ReferenceRow re;
|
||||
WordReferenceRow re;
|
||||
for (ReferenceContainer container: containers) {
|
||||
// init the new partitions
|
||||
for (int j = 0; j < partitionBuffer.length; j++) {
|
||||
partitionBuffer[j] = new ReferenceContainer(container.getWordHash(), container.row(), container.size() / partitionCount);
|
||||
partitionBuffer[j] = new ReferenceContainer(container.getTermHash(), container.row(), container.size() / partitionCount);
|
||||
}
|
||||
|
||||
// split the container
|
||||
Iterator<ReferenceRow> i = container.entries();
|
||||
Iterator<WordReferenceRow> i = container.entries();
|
||||
while (i.hasNext()) {
|
||||
re = i.next();
|
||||
if (re == null) continue;
|
||||
|
@ -263,7 +263,7 @@ public class Dispatcher {
|
|||
for (int vertical = 0; vertical < containers.length; vertical++) {
|
||||
// the 'new' primary target is the word hash of the last container
|
||||
lastContainer = containers[vertical].get(containers[vertical].size() - 1);
|
||||
primaryTarget = FlatWordPartitionScheme.positionToHash(this.seeds.scheme.dhtPosition(lastContainer.getWordHash(), vertical));
|
||||
primaryTarget = FlatWordPartitionScheme.positionToHash(this.seeds.scheme.dhtPosition(lastContainer.getTermHash(), vertical));
|
||||
|
||||
// get or make an entry object
|
||||
entry = this.transmissionCloud.get(primaryTarget); // if this is not null, the entry is extended here
|
||||
|
|
|
@ -32,11 +32,11 @@ import java.util.Iterator;
|
|||
|
||||
import de.anomic.kelondro.index.Row;
|
||||
import de.anomic.kelondro.text.Index;
|
||||
import de.anomic.kelondro.text.MetadataRowContainer;
|
||||
import de.anomic.kelondro.text.ReferenceContainer;
|
||||
import de.anomic.kelondro.text.ReferenceContainerCache;
|
||||
import de.anomic.kelondro.text.ReferenceRow;
|
||||
import de.anomic.kelondro.text.MetadataRepository;
|
||||
import de.anomic.kelondro.text.metadataPrototype.URLMetadataRow;
|
||||
import de.anomic.kelondro.text.referencePrototype.WordReferenceRow;
|
||||
import de.anomic.kelondro.util.Log;
|
||||
import de.anomic.plasma.plasmaWordIndex;
|
||||
import de.anomic.server.serverProcessorJob;
|
||||
|
@ -88,7 +88,7 @@ public class Transmission {
|
|||
*/
|
||||
private String primaryTarget;
|
||||
private ReferenceContainerCache containers;
|
||||
private HashMap<String, MetadataRowContainer> references;
|
||||
private HashMap<String, URLMetadataRow> references;
|
||||
private HashSet<String> badReferences;
|
||||
private ArrayList<yacySeed> targets;
|
||||
private int hit, miss;
|
||||
|
@ -109,7 +109,7 @@ public class Transmission {
|
|||
this.primaryTarget = primaryTarget;
|
||||
this.containers = new ReferenceContainerCache(payloadrow, plasmaWordIndex.wordOrder);
|
||||
this.containers.initWriteMode();
|
||||
this.references = new HashMap<String, MetadataRowContainer>();
|
||||
this.references = new HashMap<String, URLMetadataRow>();
|
||||
this.badReferences = new HashSet<String>();
|
||||
this.targets = targets;
|
||||
this.hit = 0;
|
||||
|
@ -123,12 +123,12 @@ public class Transmission {
|
|||
*/
|
||||
public void add(ReferenceContainer container) {
|
||||
// iterate through the entries in the container and check if the reference is in the repository
|
||||
Iterator<ReferenceRow> i = container.entries();
|
||||
Iterator<WordReferenceRow> i = container.entries();
|
||||
ArrayList<String> notFound = new ArrayList<String>();
|
||||
while (i.hasNext()) {
|
||||
ReferenceRow e = i.next();
|
||||
WordReferenceRow e = i.next();
|
||||
if (references.containsKey(e.urlHash()) || badReferences.contains(e.urlHash())) continue;
|
||||
MetadataRowContainer r = repository.load(e.urlHash(), null, 0);
|
||||
URLMetadataRow r = repository.load(e.urlHash(), null, 0);
|
||||
if (r == null) {
|
||||
notFound.add(e.urlHash());
|
||||
badReferences.add(e.urlHash());
|
||||
|
@ -204,7 +204,7 @@ public class Transmission {
|
|||
Iterator<ReferenceContainer> i = this.containers.iterator();
|
||||
ReferenceContainer firstContainer = (i == null) ? null : i.next();
|
||||
log.logInfo("Index transfer of " + this.containers.size() +
|
||||
" words [" + ((firstContainer == null) ? null : firstContainer.getWordHash()) + " .. " + this.primaryTarget + "]" +
|
||||
" words [" + ((firstContainer == null) ? null : firstContainer.getTermHash()) + " .. " + this.primaryTarget + "]" +
|
||||
" and " + this.references.size() + " URLs" +
|
||||
" to peer " + target.getName() + ":" + target.hash +
|
||||
" in " + (transferTime / 1000) +
|
||||
|
|
|
@ -60,6 +60,7 @@ import org.apache.commons.httpclient.methods.multipart.Part;
|
|||
|
||||
import de.anomic.crawler.HTTPLoader;
|
||||
import de.anomic.crawler.ResultURLs;
|
||||
import de.anomic.data.Blacklist;
|
||||
import de.anomic.http.DefaultCharsetFilePart;
|
||||
import de.anomic.http.DefaultCharsetStringPart;
|
||||
import de.anomic.http.httpClient;
|
||||
|
@ -69,14 +70,11 @@ import de.anomic.http.httpRequestHeader;
|
|||
import de.anomic.kelondro.order.Base64Order;
|
||||
import de.anomic.kelondro.order.Bitfield;
|
||||
import de.anomic.kelondro.order.Digest;
|
||||
import de.anomic.kelondro.text.MetadataRowContainer;
|
||||
import de.anomic.kelondro.text.Reference;
|
||||
import de.anomic.kelondro.text.ReferenceContainer;
|
||||
import de.anomic.kelondro.text.ReferenceContainerCache;
|
||||
import de.anomic.kelondro.text.ReferenceRow;
|
||||
import de.anomic.kelondro.text.URLMetadata;
|
||||
import de.anomic.kelondro.text.Word;
|
||||
import de.anomic.kelondro.text.Blacklist;
|
||||
import de.anomic.kelondro.text.metadataPrototype.URLMetadataRow;
|
||||
import de.anomic.kelondro.text.referencePrototype.WordReferenceRow;
|
||||
import de.anomic.kelondro.util.ByteBuffer;
|
||||
import de.anomic.kelondro.util.FileUtils;
|
||||
import de.anomic.plasma.plasmaSearchRankingProcess;
|
||||
|
@ -85,6 +83,7 @@ import de.anomic.plasma.plasmaSnippetCache;
|
|||
import de.anomic.plasma.plasmaSwitchboard;
|
||||
import de.anomic.plasma.plasmaSwitchboardConstants;
|
||||
import de.anomic.plasma.plasmaWordIndex;
|
||||
import de.anomic.plasma.parser.Word;
|
||||
import de.anomic.server.serverCore;
|
||||
import de.anomic.server.serverDomains;
|
||||
import de.anomic.tools.crypt;
|
||||
|
@ -533,15 +532,15 @@ public final class yacyClient {
|
|||
}
|
||||
|
||||
// insert results to containers
|
||||
MetadataRowContainer urlEntry;
|
||||
URLMetadataRow urlEntry;
|
||||
final String[] urls = new String[results];
|
||||
for (int n = 0; n < results; n++) {
|
||||
// get one single search result
|
||||
urlEntry = MetadataRowContainer.importEntry(result.get("resource" + n));
|
||||
urlEntry = URLMetadataRow.importEntry(result.get("resource" + n));
|
||||
if (urlEntry == null) continue;
|
||||
assert (urlEntry.hash().length() == 12) : "urlEntry.hash() = " + urlEntry.hash();
|
||||
if (urlEntry.hash().length() != 12) continue; // bad url hash
|
||||
final URLMetadata metadata = urlEntry.metadata();
|
||||
final URLMetadataRow.Components metadata = urlEntry.metadata();
|
||||
if (blacklist.isListed(Blacklist.BLACKLIST_SEARCH, metadata.url())) {
|
||||
yacyCore.log.logInfo("remote search (client): filtered blacklisted url " + metadata.url() + " from peer " + target.getName());
|
||||
continue; // block with backlist
|
||||
|
@ -796,7 +795,7 @@ public final class yacyClient {
|
|||
return "wrong protocol: " + protocol;
|
||||
}
|
||||
|
||||
public static HashMap<String, String> crawlReceipt(final yacySeed mySeed, final yacySeed target, final String process, final String result, final String reason, final MetadataRowContainer entry, final String wordhashes) {
|
||||
public static HashMap<String, String> crawlReceipt(final yacySeed mySeed, final yacySeed target, final String process, final String result, final String reason, final URLMetadataRow entry, final String wordhashes) {
|
||||
assert (target != null);
|
||||
assert (mySeed != null);
|
||||
assert (mySeed != target);
|
||||
|
@ -859,7 +858,7 @@ public final class yacyClient {
|
|||
public static String transferIndex(
|
||||
final yacySeed targetSeed,
|
||||
final ReferenceContainerCache indexes,
|
||||
final HashMap<String, MetadataRowContainer> urlCache,
|
||||
final HashMap<String, URLMetadataRow> urlCache,
|
||||
final boolean gzipBody,
|
||||
final int timeout) {
|
||||
|
||||
|
@ -868,7 +867,7 @@ public final class yacyClient {
|
|||
try {
|
||||
|
||||
// check if we got all necessary urls in the urlCache (only for debugging)
|
||||
Iterator<ReferenceRow> eenum;
|
||||
Iterator<WordReferenceRow> eenum;
|
||||
Reference entry;
|
||||
for (ReferenceContainer ic: indexes) {
|
||||
eenum = ic.entries();
|
||||
|
@ -911,7 +910,7 @@ public final class yacyClient {
|
|||
if (uhs.length == 0) { return null; } // all URLs are known
|
||||
|
||||
// extract the urlCache from the result
|
||||
final MetadataRowContainer[] urls = new MetadataRowContainer[uhs.length];
|
||||
final URLMetadataRow[] urls = new URLMetadataRow[uhs.length];
|
||||
for (int i = 0; i < uhs.length; i++) {
|
||||
urls[i] = urlCache.get(uhs[i]);
|
||||
if (urls[i] == null) {
|
||||
|
@ -963,13 +962,13 @@ public final class yacyClient {
|
|||
|
||||
int indexcount = 0;
|
||||
final StringBuilder entrypost = new StringBuilder(indexes.size() * 73);
|
||||
Iterator<ReferenceRow> eenum;
|
||||
Iterator<WordReferenceRow> eenum;
|
||||
Reference entry;
|
||||
for (ReferenceContainer ic: indexes) {
|
||||
eenum = ic.entries();
|
||||
while (eenum.hasNext()) {
|
||||
entry = eenum.next();
|
||||
entrypost.append(ic.getWordHash())
|
||||
entrypost.append(ic.getTermHash())
|
||||
.append(entry.toPropertyForm())
|
||||
.append(serverCore.CRLF_STRING);
|
||||
indexcount++;
|
||||
|
@ -1001,7 +1000,7 @@ public final class yacyClient {
|
|||
}
|
||||
}
|
||||
|
||||
private static HashMap<String, String> transferURL(final yacySeed targetSeed, final MetadataRowContainer[] urls, boolean gzipBody, final int timeout) {
|
||||
private static HashMap<String, String> transferURL(final yacySeed targetSeed, final URLMetadataRow[] urls, boolean gzipBody, final int timeout) {
|
||||
// this posts a message to the remote message board
|
||||
final String address = targetSeed.getPublicAddress();
|
||||
if (address == null) { return null; }
|
||||
|
|
|
@ -50,7 +50,7 @@ import java.util.HashSet;
|
|||
import java.util.Iterator;
|
||||
import java.util.Map;
|
||||
|
||||
import de.anomic.kelondro.text.Blacklist;
|
||||
import de.anomic.data.Blacklist;
|
||||
import de.anomic.plasma.plasmaSwitchboard;
|
||||
|
||||
public class yacyNewsPool {
|
||||
|
|
|
@ -51,8 +51,8 @@ import java.util.Set;
|
|||
import java.util.TreeMap;
|
||||
|
||||
import de.anomic.crawler.ResultURLs;
|
||||
import de.anomic.data.Blacklist;
|
||||
import de.anomic.kelondro.order.Bitfield;
|
||||
import de.anomic.kelondro.text.Blacklist;
|
||||
import de.anomic.kelondro.util.ScoreCluster;
|
||||
import de.anomic.kelondro.util.Log;
|
||||
import de.anomic.plasma.plasmaSearchQuery;
|
||||
|
|
|
@ -57,9 +57,9 @@ import java.util.TreeMap;
|
|||
|
||||
import de.anomic.kelondro.order.Base64Order;
|
||||
import de.anomic.kelondro.order.Digest;
|
||||
import de.anomic.kelondro.text.Word;
|
||||
import de.anomic.kelondro.util.DateFormatter;
|
||||
import de.anomic.net.natLib;
|
||||
import de.anomic.plasma.parser.Word;
|
||||
import de.anomic.server.serverCodings;
|
||||
import de.anomic.server.serverDomains;
|
||||
import de.anomic.server.serverSystem;
|
||||
|
|
|
@ -35,10 +35,10 @@ import java.net.MalformedURLException;
|
|||
|
||||
import javax.imageio.ImageIO;
|
||||
|
||||
import de.anomic.kelondro.text.Document;
|
||||
import de.anomic.kelondro.util.Log;
|
||||
import de.anomic.plasma.plasmaHTCache;
|
||||
import de.anomic.plasma.plasmaSwitchboard;
|
||||
import de.anomic.plasma.parser.Document;
|
||||
import de.anomic.yacy.yacyURL;
|
||||
|
||||
public class ymageOSM {
|
||||
|
|
|
@ -56,12 +56,11 @@ import de.anomic.kelondro.blob.BLOBHeap;
|
|||
import de.anomic.kelondro.blob.MapDataMining;
|
||||
import de.anomic.kelondro.index.RowCollection;
|
||||
import de.anomic.kelondro.order.Base64Order;
|
||||
import de.anomic.kelondro.text.MetadataRowContainer;
|
||||
import de.anomic.kelondro.text.Reference;
|
||||
import de.anomic.kelondro.text.ReferenceContainer;
|
||||
import de.anomic.kelondro.text.ReferenceRow;
|
||||
import de.anomic.kelondro.text.MetadataRepository;
|
||||
import de.anomic.kelondro.text.Word;
|
||||
import de.anomic.kelondro.text.metadataPrototype.URLMetadataRow;
|
||||
import de.anomic.kelondro.text.referencePrototype.WordReferenceRow;
|
||||
import de.anomic.kelondro.util.DateFormatter;
|
||||
import de.anomic.kelondro.util.MemoryControl;
|
||||
import de.anomic.kelondro.util.ScoreCluster;
|
||||
|
@ -70,6 +69,7 @@ import de.anomic.kelondro.util.FileUtils;
|
|||
import de.anomic.plasma.plasmaSwitchboard;
|
||||
import de.anomic.plasma.plasmaSwitchboardConstants;
|
||||
import de.anomic.plasma.plasmaWordIndex;
|
||||
import de.anomic.plasma.parser.Word;
|
||||
import de.anomic.server.serverCore;
|
||||
import de.anomic.server.serverSemaphore;
|
||||
import de.anomic.server.serverSystem;
|
||||
|
@ -689,13 +689,13 @@ public final class yacy {
|
|||
wordIdxContainer = indexContainerIterator.next();
|
||||
|
||||
// the combined container will fit, read the container
|
||||
final Iterator<ReferenceRow> wordIdxEntries = wordIdxContainer.entries();
|
||||
final Iterator<WordReferenceRow> wordIdxEntries = wordIdxContainer.entries();
|
||||
Reference iEntry;
|
||||
while (wordIdxEntries.hasNext()) {
|
||||
iEntry = wordIdxEntries.next();
|
||||
final String urlHash = iEntry.urlHash();
|
||||
if ((currentUrlDB.exists(urlHash)) && (!minimizedUrlDB.exists(urlHash))) try {
|
||||
final MetadataRowContainer urlEntry = currentUrlDB.load(urlHash, null, 0);
|
||||
final URLMetadataRow urlEntry = currentUrlDB.load(urlHash, null, 0);
|
||||
urlCounter++;
|
||||
minimizedUrlDB.store(urlEntry);
|
||||
if (urlCounter % 500 == 0) {
|
||||
|
@ -705,7 +705,7 @@ public final class yacy {
|
|||
}
|
||||
|
||||
if (wordCounter%500 == 0) {
|
||||
wordChunkEndHash = wordIdxContainer.getWordHash();
|
||||
wordChunkEndHash = wordIdxContainer.getTermHash();
|
||||
wordChunkEnd = System.currentTimeMillis();
|
||||
final long duration = wordChunkEnd - wordChunkStart;
|
||||
log.logInfo(wordCounter + " words scanned " +
|
||||
|
@ -881,10 +881,10 @@ public final class yacy {
|
|||
while (indexContainerIterator.hasNext()) {
|
||||
counter++;
|
||||
container = indexContainerIterator.next();
|
||||
bos.write((container.getWordHash()).getBytes());
|
||||
bos.write((container.getTermHash()).getBytes());
|
||||
bos.write(serverCore.CRLF);
|
||||
if (counter % 500 == 0) {
|
||||
log.logInfo("Found " + counter + " Hashs until now. Last found Hash: " + container.getWordHash());
|
||||
log.logInfo("Found " + counter + " Hashs until now. Last found Hash: " + container.getTermHash());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -898,17 +898,17 @@ public final class yacy {
|
|||
while (indexContainerIterator.hasNext()) {
|
||||
counter++;
|
||||
container = indexContainerIterator.next();
|
||||
bos.write((container.getWordHash()).getBytes());
|
||||
bos.write((container.getTermHash()).getBytes());
|
||||
bos.write(serverCore.CRLF);
|
||||
if (counter % 500 == 0) {
|
||||
log.logInfo("Found " + counter + " Hashs until now. Last found Hash: " + container.getWordHash());
|
||||
log.logInfo("Found " + counter + " Hashs until now. Last found Hash: " + container.getTermHash());
|
||||
}
|
||||
}
|
||||
}
|
||||
bos.flush();
|
||||
bos.close();
|
||||
}
|
||||
log.logInfo("Total number of Hashs: " + counter + ". Last found Hash: " + (container == null ? "null" : container.getWordHash()));
|
||||
log.logInfo("Total number of Hashs: " + counter + ". Last found Hash: " + (container == null ? "null" : container.getTermHash()));
|
||||
} catch (final IOException e) {
|
||||
log.logSevere("IOException", e);
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue
Block a user