refactoring: better abstraction of reference and metadata prototypes.

This is a preparation for introducing further index tables of the kind that is currently used only for the reverse text index. The next application of the reverse index structure will be a citation index.
Moved to version 0.74

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@5777 6c8d7289-2bf4-0310-a012-ef5d649a1542
orbiter 2009-04-03 13:23:45 +00:00
parent ab656687d7
commit c2359f20dd
82 changed files with 709 additions and 648 deletions
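For orientation, here is the rename/relocation mapping behind this refactoring, reconstructed from the import and package changes in the hunks below (everything listed is taken from this diff):

    MetadataRowContainer             -> de.anomic.kelondro.text.metadataPrototype.URLMetadataRow
    URLMetadata                      -> URLMetadataRow.Components
    ReferenceRow                     -> de.anomic.kelondro.text.referencePrototype.WordReferenceRow
    ReferenceContainer.getWordHash() -> ReferenceContainer.getTermHash()
    Blacklist, AbstractBlacklist, DefaultBlacklist: de.anomic.kelondro.text -> de.anomic.data
    Document, Word:                     de.anomic.kelondro.text -> de.anomic.plasma.parser
    plasmaCondenser                  -> de.anomic.plasma.parser.Condenser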

View File

@ -3,7 +3,7 @@ javacSource=1.5
javacTarget=1.5
# Release Configuration
releaseVersion=0.73
releaseVersion=0.74
stdReleaseFile=yacy_v${releaseVersion}_${DSTAMP}_${releaseNr}.tar.gz
embReleaseFile=yacy_emb_v${releaseVersion}_${DSTAMP}_${releaseNr}.tar.gz
proReleaseFile=yacy_pro_v${releaseVersion}_${DSTAMP}_${releaseNr}.tar.gz

View File

@ -45,11 +45,11 @@ import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.regex.PatternSyntaxException;
import de.anomic.data.AbstractBlacklist;
import de.anomic.data.Blacklist;
import de.anomic.data.DefaultBlacklist;
import de.anomic.data.listManager;
import de.anomic.http.httpRequestHeader;
import de.anomic.kelondro.text.Blacklist;
import de.anomic.kelondro.text.AbstractBlacklist;
import de.anomic.kelondro.text.DefaultBlacklist;
import de.anomic.kelondro.util.Log;
import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.server.serverObjects;

View File

@ -32,9 +32,9 @@
import java.io.File;
import java.net.MalformedURLException;
import de.anomic.data.Blacklist;
import de.anomic.data.listManager;
import de.anomic.http.httpRequestHeader;
import de.anomic.kelondro.text.Blacklist;
import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.server.serverObjects;
import de.anomic.server.serverSwitch;

View File

@ -38,10 +38,10 @@ import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import de.anomic.data.AbstractBlacklist;
import de.anomic.data.Blacklist;
import de.anomic.data.listManager;
import de.anomic.http.httpRequestHeader;
import de.anomic.kelondro.text.Blacklist;
import de.anomic.kelondro.text.AbstractBlacklist;
import de.anomic.kelondro.util.Log;
import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.server.serverObjects;

View File

@ -41,8 +41,7 @@ import de.anomic.data.listManager;
import de.anomic.data.userDB;
import de.anomic.data.bookmarksDB.Tag;
import de.anomic.http.httpRequestHeader;
import de.anomic.kelondro.text.MetadataRowContainer;
import de.anomic.kelondro.text.URLMetadata;
import de.anomic.kelondro.text.metadataPrototype.URLMetadataRow;
import de.anomic.kelondro.util.DateFormatter;
import de.anomic.kelondro.util.Log;
import de.anomic.plasma.plasmaParserDocument;
@ -184,10 +183,10 @@ public class Bookmarks {
final bookmarksDB.Bookmark bookmark = sb.bookmarksDB.getBookmark(urlHash);
if (bookmark == null) {
// try to get the bookmark from the LURL database
final MetadataRowContainer urlentry = sb.webIndex.metadata().load(urlHash, null, 0);
final URLMetadataRow urlentry = sb.webIndex.metadata().load(urlHash, null, 0);
plasmaParserDocument document = null;
if (urlentry != null) {
final URLMetadata metadata = urlentry.metadata();
final URLMetadataRow.Components metadata = urlentry.metadata();
document = plasmaSnippetCache.retrieveDocument(metadata.url(), true, 5000, true, false);
prop.put("mode_edit", "0"); // create mode
prop.put("mode_url", metadata.url().toNormalform(false, true));

View File

@ -31,8 +31,7 @@ import java.util.Iterator;
import java.util.Locale;
import de.anomic.http.httpRequestHeader;
import de.anomic.kelondro.text.MetadataRowContainer;
import de.anomic.kelondro.text.URLMetadata;
import de.anomic.kelondro.text.metadataPrototype.URLMetadataRow;
import de.anomic.kelondro.util.Log;
import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.server.serverObjects;
@ -170,8 +169,8 @@ public class CrawlResults {
String urlHash, initiatorHash, executorHash;
String urlstr, urltxt;
yacySeed initiatorSeed, executorSeed;
MetadataRowContainer urle;
URLMetadata metadata;
URLMetadataRow urle;
URLMetadataRow.Components metadata;
int i, cnt = 0;
for (i = sb.crawlResults.getStackSize(tabletype) - 1; i >= (sb.crawlResults.getStackSize(tabletype) - lines); i--) {

View File

@ -34,21 +34,21 @@ import java.util.HashSet;
import java.util.Iterator;
import java.util.Set;
import de.anomic.data.AbstractBlacklist;
import de.anomic.data.listManager;
import de.anomic.http.httpRequestHeader;
import de.anomic.kelondro.order.Bitfield;
import de.anomic.kelondro.text.MetadataRowContainer;
import de.anomic.kelondro.text.Reference;
import de.anomic.kelondro.text.ReferenceContainer;
import de.anomic.kelondro.text.ReferenceContainerCache;
import de.anomic.kelondro.text.ReferenceRow;
import de.anomic.kelondro.text.Word;
import de.anomic.kelondro.text.AbstractBlacklist;
import de.anomic.kelondro.text.metadataPrototype.URLMetadataRow;
import de.anomic.kelondro.text.referencePrototype.WordReferenceRow;
import de.anomic.plasma.plasmaSearchAPI;
import de.anomic.plasma.plasmaSearchEvent;
import de.anomic.plasma.plasmaSearchRankingProcess;
import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.plasma.plasmaWordIndex;
import de.anomic.plasma.parser.Word;
import de.anomic.server.serverObjects;
import de.anomic.server.serverSwitch;
import de.anomic.yacy.yacyClient;
@ -126,7 +126,7 @@ public class IndexControlRWIs_p {
// generate an urlx array
ReferenceContainer index = null;
index = sb.webIndex.index().get(keyhash, null);
final Iterator<ReferenceRow> en = index.entries();
final Iterator<WordReferenceRow> en = index.entries();
int i = 0;
urlx = new String[index.size()];
while (en.hasNext()) {
@ -207,11 +207,11 @@ public class IndexControlRWIs_p {
final long starttime = System.currentTimeMillis();
index = sb.webIndex.index().get(keyhash, null);
// built urlCache
final Iterator<ReferenceRow> urlIter = index.entries();
final HashMap<String, MetadataRowContainer> knownURLs = new HashMap<String, MetadataRowContainer>();
final Iterator<WordReferenceRow> urlIter = index.entries();
final HashMap<String, URLMetadataRow> knownURLs = new HashMap<String, URLMetadataRow>();
final HashSet<String> unknownURLEntries = new HashSet<String>();
Reference iEntry;
MetadataRowContainer lurl;
URLMetadataRow lurl;
while (urlIter.hasNext()) {
iEntry = urlIter.next();
lurl = sb.webIndex.metadata().load(iEntry.urlHash(), null, 0);
@ -251,7 +251,7 @@ public class IndexControlRWIs_p {
prop.put("keyhashsimilar", "1");
while (containerIt.hasNext() && i < 256) {
container = containerIt.next();
prop.put("keyhashsimilar_rows_"+rows+"_cols_"+cols+"_wordHash", container.getWordHash());
prop.put("keyhashsimilar_rows_"+rows+"_cols_"+cols+"_wordHash", container.getTermHash());
cols++;
if (cols==8) {
prop.put("keyhashsimilar_rows_"+rows+"_cols", cols);
@ -278,7 +278,7 @@ public class IndexControlRWIs_p {
yacyURL url;
for (int i=0; i<urlx.length; i++) {
urlHashes.add(urlx[i]);
final MetadataRowContainer e = sb.webIndex.metadata().load(urlx[i], null, 0);
final URLMetadataRow e = sb.webIndex.metadata().load(urlx[i], null, 0);
sb.webIndex.metadata().remove(urlx[i]);
if (e != null) {
url = e.metadata().url();
@ -306,7 +306,7 @@ public class IndexControlRWIs_p {
yacyURL url;
for (int i=0; i<urlx.length; i++) {
urlHashes.add(urlx[i]);
final MetadataRowContainer e = sb.webIndex.metadata().load(urlx[i], null, 0);
final URLMetadataRow e = sb.webIndex.metadata().load(urlx[i], null, 0);
sb.webIndex.metadata().remove(urlx[i]);
if (e != null) {
url = e.metadata().url();

View File

@ -33,9 +33,8 @@ import java.util.Iterator;
import de.anomic.http.httpRequestHeader;
import de.anomic.kelondro.order.Base64Order;
import de.anomic.kelondro.order.RotateIterator;
import de.anomic.kelondro.text.MetadataRowContainer;
import de.anomic.kelondro.text.URLMetadata;
import de.anomic.kelondro.text.MetadataRepository;
import de.anomic.kelondro.text.metadataPrototype.URLMetadataRow;
import de.anomic.kelondro.util.DateFormatter;
import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.server.serverObjects;
@ -116,7 +115,7 @@ public class IndexControlURLs_p {
}
if (post.containsKey("urlhashdelete")) {
final MetadataRowContainer entry = sb.webIndex.metadata().load(urlhash, null, 0);
final URLMetadataRow entry = sb.webIndex.metadata().load(urlhash, null, 0);
if (entry == null) {
prop.putHTML("result", "No Entry for URL hash " + urlhash + "; nothing deleted.");
} else {
@ -150,7 +149,7 @@ public class IndexControlURLs_p {
final yacyURL url = new yacyURL(urlstring, null);
urlhash = url.hash();
prop.put("urlhash", urlhash);
final MetadataRowContainer entry = sb.webIndex.metadata().load(urlhash, null, 0);
final URLMetadataRow entry = sb.webIndex.metadata().load(urlhash, null, 0);
if (entry == null) {
prop.putHTML("urlstring", "unknown url: " + urlstring);
prop.put("urlhash", "");
@ -167,7 +166,7 @@ public class IndexControlURLs_p {
}
if (post.containsKey("urlhashsearch")) {
final MetadataRowContainer entry = sb.webIndex.metadata().load(urlhash, null, 0);
final URLMetadataRow entry = sb.webIndex.metadata().load(urlhash, null, 0);
if (entry == null) {
prop.putHTML("result", "No Entry for URL hash " + urlhash);
} else {
@ -182,9 +181,9 @@ public class IndexControlURLs_p {
// generate list
if (post.containsKey("urlhashsimilar")) {
try {
final Iterator<MetadataRowContainer> entryIt = new RotateIterator<MetadataRowContainer>(sb.webIndex.metadata().entries(true, urlhash), new String(Base64Order.zero((urlhash == null ? 0 : urlhash.length()))), sb.webIndex.index().size());
final Iterator<URLMetadataRow> entryIt = new RotateIterator<URLMetadataRow>(sb.webIndex.metadata().entries(true, urlhash), new String(Base64Order.zero((urlhash == null ? 0 : urlhash.length()))), sb.webIndex.index().size());
final StringBuilder result = new StringBuilder("Sequential List of URL-Hashes:<br />");
MetadataRowContainer entry;
URLMetadataRow entry;
int i = 0;
int rows = 0, cols = 0;
prop.put("urlhashsimilar", "1");
@ -286,15 +285,15 @@ public class IndexControlURLs_p {
return prop;
}
private static serverObjects genUrlProfile(final plasmaSwitchboard switchboard, final MetadataRowContainer entry, final String urlhash) {
private static serverObjects genUrlProfile(final plasmaSwitchboard switchboard, final URLMetadataRow entry, final String urlhash) {
final serverObjects prop = new serverObjects();
if (entry == null) {
prop.put("genUrlProfile", "1");
prop.put("genUrlProfile_urlhash", urlhash);
return prop;
}
final URLMetadata metadata = entry.metadata();
final MetadataRowContainer le = ((entry.referrerHash() == null) || (entry.referrerHash().length() != yacySeedDB.commonHashLength)) ? null : switchboard.webIndex.metadata().load(entry.referrerHash(), null, 0);
final URLMetadataRow.Components metadata = entry.metadata();
final URLMetadataRow le = ((entry.referrerHash() == null) || (entry.referrerHash().length() != yacySeedDB.commonHashLength)) ? null : switchboard.webIndex.metadata().load(entry.referrerHash(), null, 0);
if (metadata.url() == null) {
prop.put("genUrlProfile", "1");
prop.put("genUrlProfile_urlhash", urlhash);

View File

@ -31,11 +31,11 @@ import java.util.Date;
import java.util.HashMap;
import java.util.Iterator;
import de.anomic.data.Blacklist;
import de.anomic.http.httpRequestHeader;
import de.anomic.kelondro.index.Row;
import de.anomic.kelondro.index.Row.Entry;
import de.anomic.kelondro.order.NaturalOrder;
import de.anomic.kelondro.text.Blacklist;
import de.anomic.kelondro.util.DateFormatter;
import de.anomic.kelondro.util.ScoreCluster;
import de.anomic.plasma.plasmaSwitchboard;

View File

@ -31,11 +31,11 @@ import java.util.Date;
import java.util.HashMap;
import java.util.Iterator;
import de.anomic.data.Blacklist;
import de.anomic.http.httpRequestHeader;
import de.anomic.kelondro.index.Row;
import de.anomic.kelondro.index.Row.Entry;
import de.anomic.kelondro.order.NaturalOrder;
import de.anomic.kelondro.text.Blacklist;
import de.anomic.kelondro.util.DateFormatter;
import de.anomic.kelondro.util.ScoreCluster;
import de.anomic.plasma.plasmaSwitchboard;

View File

@ -39,16 +39,15 @@ import de.anomic.htmlFilter.htmlFilterCharacterCoding;
import de.anomic.http.httpClient;
import de.anomic.http.httpRequestHeader;
import de.anomic.http.httpResponseHeader;
import de.anomic.kelondro.text.Document;
import de.anomic.kelondro.text.MetadataRowContainer;
import de.anomic.kelondro.text.URLMetadata;
import de.anomic.kelondro.text.metadataPrototype.URLMetadataRow;
import de.anomic.kelondro.util.FileUtils;
import de.anomic.plasma.plasmaCondenser;
import de.anomic.plasma.plasmaHTCache;
import de.anomic.plasma.plasmaParserDocument;
import de.anomic.plasma.plasmaSnippetCache;
import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.plasma.parser.Document;
import de.anomic.plasma.parser.ParserException;
import de.anomic.plasma.parser.Condenser;
import de.anomic.server.serverObjects;
import de.anomic.server.serverSwitch;
import de.anomic.yacy.yacyURL;
@ -95,7 +94,7 @@ public class ViewFile {
final String urlHash = post.get("urlHash","");
if (urlHash.length() > 0) {
// getting the urlEntry that belongs to the url hash
MetadataRowContainer urlEntry = null;
URLMetadataRow urlEntry = null;
urlEntry = sb.webIndex.metadata().load(urlHash, null, 0);
if (urlEntry == null) {
prop.put("error", "2");
@ -104,7 +103,7 @@ public class ViewFile {
}
// getting the url that belongs to the entry
final URLMetadata metadata = urlEntry.metadata();
final URLMetadataRow.Components metadata = urlEntry.metadata();
if ((metadata == null) || (metadata.url() == null)) {
prop.put("error", "3");
prop.put("viewMode", VIEW_MODE_NO_TEXT);
@ -114,7 +113,7 @@ public class ViewFile {
descr = metadata.dc_title();
urlEntry.wordCount();
size = urlEntry.size();
pre = urlEntry.flags().get(plasmaCondenser.flag_cat_indexof);
pre = urlEntry.flags().get(Condenser.flag_cat_indexof);
}
// alternatively, get the url simply from a url String
@ -312,7 +311,7 @@ public class ViewFile {
// Search word highlighting
while (sentences.hasNext()) {
sentence = sentences.next().toString();
Enumeration<StringBuilder> tokens = plasmaCondenser.wordTokenizer(sentence, "UTF-8");
Enumeration<StringBuilder> tokens = Condenser.wordTokenizer(sentence, "UTF-8");
while (tokens.hasMoreElements()) {
token = tokens.nextElement().toString();
if (token.length() > 0) {

View File

@ -2,9 +2,9 @@
import java.io.File;
import java.util.List;
import de.anomic.data.AbstractBlacklist;
import de.anomic.data.listManager;
import de.anomic.http.httpRequestHeader;
import de.anomic.kelondro.text.AbstractBlacklist;
import de.anomic.server.serverObjects;
import de.anomic.server.serverSwitch;

View File

@ -31,7 +31,7 @@ import java.util.TreeSet;
import de.anomic.http.httpRequestHeader;
import de.anomic.kelondro.text.ReferenceContainer;
import de.anomic.kelondro.text.ReferenceRow;
import de.anomic.kelondro.text.referencePrototype.WordReferenceRow;
import de.anomic.kelondro.util.DateFormatter;
import de.anomic.plasma.plasmaSearchQuery;
import de.anomic.plasma.plasmaSwitchboard;
@ -85,8 +85,8 @@ public final class timeline {
localSearchContainerMaps[1].values(),
maxdist);
Iterator<ReferenceRow> i = index.entries();
ReferenceRow entry;
Iterator<WordReferenceRow> i = index.entries();
WordReferenceRow entry;
int c = 0;
Date lm;
String lms;

View File

@ -28,8 +28,7 @@
import java.net.MalformedURLException;
import de.anomic.http.httpRequestHeader;
import de.anomic.kelondro.text.MetadataRowContainer;
import de.anomic.kelondro.text.URLMetadata;
import de.anomic.kelondro.text.metadataPrototype.URLMetadataRow;
import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.server.serverObjects;
import de.anomic.server.serverSwitch;
@ -69,14 +68,14 @@ public class yacydoc {
}
if (urlhash == null || urlhash.length() == 0) return prop;
final MetadataRowContainer entry = sb.webIndex.metadata().load(urlhash, null, 0);
final URLMetadataRow entry = sb.webIndex.metadata().load(urlhash, null, 0);
if (entry == null) return prop;
final URLMetadata metadata = entry.metadata();
final URLMetadataRow.Components metadata = entry.metadata();
if (metadata.url() == null) {
return prop;
}
final MetadataRowContainer le = ((entry.referrerHash() == null) || (entry.referrerHash().length() != yacySeedDB.commonHashLength)) ? null : sb.webIndex.metadata().load(entry.referrerHash(), null, 0);
final URLMetadataRow le = ((entry.referrerHash() == null) || (entry.referrerHash().length() != yacySeedDB.commonHashLength)) ? null : sb.webIndex.metadata().load(entry.referrerHash(), null, 0);
prop.putXML("dc_title", metadata.dc_title());
prop.putXML("dc_creator", metadata.dc_creator());

View File

@ -38,11 +38,11 @@ import java.util.HashSet;
import java.util.List;
import de.anomic.crawler.HTTPLoader;
import de.anomic.data.AbstractBlacklist;
import de.anomic.data.listManager;
import de.anomic.htmlFilter.htmlFilterCharacterCoding;
import de.anomic.http.httpClient;
import de.anomic.http.httpRequestHeader;
import de.anomic.kelondro.text.AbstractBlacklist;
import de.anomic.kelondro.util.FileUtils;
import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.server.serverObjects;

View File

@ -31,8 +31,7 @@ import java.io.IOException;
import de.anomic.crawler.ZURL;
import de.anomic.http.httpRequestHeader;
import de.anomic.kelondro.text.MetadataRowContainer;
import de.anomic.kelondro.text.URLMetadata;
import de.anomic.kelondro.text.metadataPrototype.URLMetadataRow;
import de.anomic.kelondro.util.Log;
import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.server.serverObjects;
@ -113,14 +112,14 @@ public final class crawlReceipt {
}
// generating a new loaded URL entry
final MetadataRowContainer entry = MetadataRowContainer.importEntry(propStr);
final URLMetadataRow entry = URLMetadataRow.importEntry(propStr);
if (entry == null) {
log.logWarning("crawlReceipt: RECEIVED wrong RECEIPT (entry null) from peer " + iam + "\n\tURL properties: "+ propStr);
prop.put("delay", "3600");
return prop;
}
final URLMetadata metadata = entry.metadata();
final URLMetadataRow.Components metadata = entry.metadata();
if (metadata.url() == null) {
log.logWarning("crawlReceipt: RECEIVED wrong RECEIPT (url null) for hash " + entry.hash() + " from peer " + iam + "\n\tURL properties: "+ propStr);
prop.put("delay", "3600");

View File

@ -32,9 +32,9 @@ import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import de.anomic.data.Blacklist;
import de.anomic.http.httpRequestHeader;
import de.anomic.kelondro.text.ReferenceRow;
import de.anomic.kelondro.text.Blacklist;
import de.anomic.kelondro.text.referencePrototype.WordReferenceRow;
import de.anomic.kelondro.util.FileUtils;
import de.anomic.kelondro.util.Log;
import de.anomic.plasma.plasmaSwitchboard;
@ -127,7 +127,7 @@ public final class transferRWI {
int p;
String wordHash;
String urlHash;
ReferenceRow iEntry;
WordReferenceRow iEntry;
final HashSet<String> unknownURL = new HashSet<String>();
final HashSet<String> knownURL = new HashSet<String>();
final String[] wordhashes = new String[v.size()];
@ -147,7 +147,7 @@ public final class transferRWI {
}
wordHash = estring.substring(0, p);
wordhashes[received] = wordHash;
iEntry = new ReferenceRow(estring.substring(p));
iEntry = new WordReferenceRow(estring.substring(p));
urlHash = iEntry.urlHash();
// block blacklisted entries

View File

@ -29,10 +29,9 @@
import java.io.IOException;
import java.text.ParseException;
import de.anomic.data.Blacklist;
import de.anomic.http.httpRequestHeader;
import de.anomic.kelondro.text.MetadataRowContainer;
import de.anomic.kelondro.text.URLMetadata;
import de.anomic.kelondro.text.Blacklist;
import de.anomic.kelondro.text.metadataPrototype.URLMetadataRow;
import de.anomic.kelondro.util.DateFormatter;
import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.server.serverCore;
@ -85,7 +84,7 @@ public final class transferURL {
final int sizeBefore = sb.webIndex.metadata().size();
// read the urls from the other properties and store
String urls;
MetadataRowContainer lEntry;
URLMetadataRow lEntry;
for (int i = 0; i < urlc; i++) {
serverCore.checkInterruption();
@ -98,7 +97,7 @@ public final class transferURL {
}
// parse new lurl-entry
lEntry = MetadataRowContainer.importEntry(urls);
lEntry = URLMetadataRow.importEntry(urls);
if (lEntry == null) {
yacyCore.log.logWarning("transferURL: received invalid URL (entry null) from peer " + otherPeerName + "\n\tURL Property: " + urls);
blocked++;
@ -106,7 +105,7 @@ public final class transferURL {
}
// check if entry is well-formed
final URLMetadata metadata = lEntry.metadata();
final URLMetadataRow.Components metadata = lEntry.metadata();
if (metadata.url() == null) {
yacyCore.log.logWarning("transferURL: received invalid URL from peer " + otherPeerName + "\n\tURL Property: " + urls);
blocked++;

View File

@ -30,8 +30,7 @@ import java.util.Date;
import de.anomic.crawler.CrawlEntry;
import de.anomic.crawler.NoticedURL;
import de.anomic.http.httpRequestHeader;
import de.anomic.kelondro.text.MetadataRowContainer;
import de.anomic.kelondro.text.URLMetadata;
import de.anomic.kelondro.text.metadataPrototype.URLMetadataRow;
import de.anomic.kelondro.util.DateFormatter;
import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.server.serverObjects;
@ -109,8 +108,8 @@ public class urls {
if (urlhashes.length() % 12 != 0) return prop;
final int count = urlhashes.length() / 12;
int c = 0;
MetadataRowContainer entry;
URLMetadata metadata;
URLMetadataRow entry;
URLMetadataRow.Components metadata;
yacyURL referrer;
for (int i = 0; i < count; i++) {
entry = sb.webIndex.metadata().load(urlhashes.substring(12 * i, 12 * (i + 1)), null, 0);

View File

@ -33,13 +33,10 @@ import java.util.TreeSet;
import de.anomic.http.httpRequestHeader;
import de.anomic.kelondro.order.Bitfield;
import de.anomic.kelondro.text.MetadataRowContainer;
import de.anomic.kelondro.text.URLMetadata;
import de.anomic.kelondro.text.Word;
import de.anomic.kelondro.text.metadataPrototype.URLMetadataRow;
import de.anomic.kelondro.util.MemoryControl;
import de.anomic.kelondro.util.SetTools;
import de.anomic.kelondro.util.Log;
import de.anomic.plasma.plasmaCondenser;
import de.anomic.plasma.plasmaParserDocument;
import de.anomic.plasma.plasmaProfiling;
import de.anomic.plasma.plasmaSearchEvent;
@ -48,6 +45,8 @@ import de.anomic.plasma.plasmaSearchRankingProfile;
import de.anomic.plasma.plasmaSnippetCache;
import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.plasma.plasmaSwitchboardConstants;
import de.anomic.plasma.parser.Word;
import de.anomic.plasma.parser.Condenser;
import de.anomic.server.serverCore;
import de.anomic.server.serverDomains;
import de.anomic.server.serverObjects;
@ -164,7 +163,7 @@ public class yacysearch {
Bitfield constraint = (post != null && post.containsKey("constraint") && post.get("constraint", "").length() > 0) ? new Bitfield(4, post.get("constraint", "______")) : null;
if (indexof) {
constraint = new Bitfield(4);
constraint.set(plasmaCondenser.flag_cat_indexof, true);
constraint.set(Condenser.flag_cat_indexof, true);
}
// SEARCH
@ -342,9 +341,9 @@ public class yacysearch {
return prop;
}
final String recommendHash = post.get("recommendref", ""); // urlhash
final MetadataRowContainer urlentry = sb.webIndex.metadata().load(recommendHash, null, 0);
final URLMetadataRow urlentry = sb.webIndex.metadata().load(recommendHash, null, 0);
if (urlentry != null) {
final URLMetadata metadata = urlentry.metadata();
final URLMetadataRow.Components metadata = urlentry.metadata();
plasmaParserDocument document;
document = plasmaSnippetCache.retrieveDocument(metadata.url(), true, 5000, true, false);
if (document != null) {

View File

@ -38,13 +38,13 @@ import java.util.concurrent.ConcurrentHashMap;
import de.anomic.http.httpClient;
import de.anomic.kelondro.table.FlexWidthArray;
import de.anomic.kelondro.text.Document;
import de.anomic.kelondro.util.DateFormatter;
import de.anomic.kelondro.util.FileUtils;
import de.anomic.kelondro.util.Log;
import de.anomic.plasma.plasmaParser;
import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.plasma.plasmaSwitchboardConstants;
import de.anomic.plasma.parser.Document;
import de.anomic.server.serverProcessorJob;
import de.anomic.xml.RSSFeed;
import de.anomic.xml.RSSMessage;

View File

@ -31,8 +31,8 @@ package de.anomic.crawler;
import java.net.UnknownHostException;
import java.util.Date;
import de.anomic.kelondro.text.MetadataRowContainer;
import de.anomic.kelondro.text.Blacklist;
import de.anomic.data.Blacklist;
import de.anomic.kelondro.text.metadataPrototype.URLMetadataRow;
import de.anomic.kelondro.util.Log;
import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.plasma.plasmaWordIndex;
@ -244,7 +244,7 @@ public final class CrawlStacker {
// check if the url is double registered
final String dbocc = nextQueue.urlExists(entry.url().hash());
if (dbocc != null || wordIndex.metadata().exists(entry.url().hash())) {
final MetadataRowContainer oldEntry = wordIndex.metadata().load(entry.url().hash(), null, 0);
final URLMetadataRow oldEntry = wordIndex.metadata().load(entry.url().hash(), null, 0);
final boolean recrawl = (oldEntry != null) && (profile.recrawlIfOlder() > oldEntry.loaddate().getTime());
// do double-check
if ((dbocc != null) && (!recrawl)) {

View File

@ -35,13 +35,13 @@ import java.util.Date;
import de.anomic.http.httpRequestHeader;
import de.anomic.http.httpResponseHeader;
import de.anomic.http.httpdProxyCacheEntry;
import de.anomic.kelondro.text.Document;
import de.anomic.kelondro.util.DateFormatter;
import de.anomic.kelondro.util.Log;
import de.anomic.net.ftpc;
import de.anomic.plasma.plasmaHTCache;
import de.anomic.plasma.plasmaParser;
import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.plasma.parser.Document;
import de.anomic.yacy.yacyURL;
public class FTPLoader {

View File

@ -28,17 +28,17 @@ package de.anomic.crawler;
import java.io.IOException;
import java.util.Date;
import de.anomic.data.Blacklist;
import de.anomic.http.httpClient;
import de.anomic.http.httpResponse;
import de.anomic.http.httpRequestHeader;
import de.anomic.http.httpResponseHeader;
import de.anomic.http.httpdProxyCacheEntry;
import de.anomic.kelondro.text.Blacklist;
import de.anomic.kelondro.text.Document;
import de.anomic.kelondro.util.Log;
import de.anomic.plasma.plasmaHTCache;
import de.anomic.plasma.plasmaParser;
import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.plasma.parser.Document;
import de.anomic.yacy.yacyURL;
public final class HTTPLoader {

View File

@ -39,7 +39,7 @@ import de.anomic.kelondro.index.Row;
import de.anomic.kelondro.order.Base64Order;
import de.anomic.kelondro.order.NaturalOrder;
import de.anomic.kelondro.table.Stack;
import de.anomic.kelondro.text.MetadataRowContainer;
import de.anomic.kelondro.text.metadataPrototype.URLMetadataRow;
import de.anomic.kelondro.util.DateFormatter;
import de.anomic.kelondro.util.Log;
import de.anomic.plasma.plasmaHTCache;
@ -352,7 +352,7 @@ public class IndexingStack {
if (referrerURL == null) {
// FIXME the equals seems to be incorrect: String.equals(boolean)
if ((referrerHash == null) || ((initiator != null) && (referrerHash.equals(initiator.length() == 0)))) return null;
final MetadataRowContainer entry = wordIndex.metadata().load(referrerHash, null, 0);
final URLMetadataRow entry = wordIndex.metadata().load(referrerHash, null, 0);
if (entry == null) referrerURL = null; else referrerURL = entry.metadata().url();
}
return referrerURL;

View File

@ -23,7 +23,7 @@
package de.anomic.crawler;
import de.anomic.kelondro.text.Document;
import de.anomic.plasma.parser.Document;
import de.anomic.server.serverSemaphore;
import de.anomic.yacy.yacyURL;

View File

@ -33,9 +33,9 @@ import java.util.Iterator;
import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;
import de.anomic.kelondro.text.Document;
import de.anomic.kelondro.util.Log;
import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.plasma.parser.Document;
import de.anomic.server.serverCore;
import de.anomic.server.serverProcessorJob;

View File

@ -40,7 +40,7 @@ import java.util.LinkedList;
import java.util.List;
import de.anomic.kelondro.order.Bitfield;
import de.anomic.kelondro.text.MetadataRowContainer;
import de.anomic.kelondro.text.metadataPrototype.URLMetadataRow;
import de.anomic.kelondro.util.ScoreCluster;
import de.anomic.kelondro.util.Log;
import de.anomic.yacy.yacySeedDB;
@ -82,7 +82,7 @@ public final class ResultURLs {
gcrawlResultDomains = new ScoreCluster<String>();
}
public synchronized void stack(final MetadataRowContainer e, final String initiatorHash, final String executorHash, final int stackType) {
public synchronized void stack(final URLMetadataRow e, final String initiatorHash, final String executorHash, final int stackType) {
assert initiatorHash != null;
assert executorHash != null;
if (e == null) { return; }
@ -305,7 +305,7 @@ public final class ResultURLs {
final ResultURLs results = new ResultURLs();
try {
final yacyURL url = new yacyURL("http", "www.yacy.net", 80, "/");
final MetadataRowContainer urlRef = new MetadataRowContainer(url, "YaCy Homepage", "", "", "", new Date(), new Date(), new Date(), "", new byte[] {}, 123, 42, '?', new Bitfield(), "de", 0, 0, 0, 0, 0, 0);
final URLMetadataRow urlRef = new URLMetadataRow(url, "YaCy Homepage", "", "", "", new Date(), new Date(), new Date(), "", new byte[] {}, 123, 42, '?', new Bitfield(), "de", 0, 0, 0, 0, 0, 0);
int stackNo = 1;
System.out.println("valid test:\n=======");
// add

View File

@ -25,7 +25,7 @@
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
package de.anomic.kelondro.text;
package de.anomic.data;
import java.io.File;
import java.io.IOException;

View File

@ -24,7 +24,7 @@
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
package de.anomic.kelondro.text;
package de.anomic.data;
import java.io.File;
import java.util.Arrays;

View File

@ -24,7 +24,7 @@
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
package de.anomic.kelondro.text;
package de.anomic.data;
import java.io.File;
import java.util.ArrayList;
@ -35,6 +35,7 @@ import java.util.regex.PatternSyntaxException;
public class DefaultBlacklist extends AbstractBlacklist implements Blacklist {
public DefaultBlacklist(final File rootPath) {

View File

@ -45,7 +45,7 @@ import de.anomic.http.httpClient;
import de.anomic.http.httpResponse;
import de.anomic.http.httpRequestHeader;
import de.anomic.http.httpdByteCountInputStream;
import de.anomic.kelondro.text.MetadataRowContainer;
import de.anomic.kelondro.text.metadataPrototype.URLMetadataRow;
import de.anomic.kelondro.util.DateFormatter;
import de.anomic.kelondro.util.Log;
import de.anomic.plasma.plasmaSwitchboard;
@ -260,7 +260,7 @@ public class SitemapParser extends DefaultHandler {
final String dbocc = this.sb.urlExists(nexturlhash);
if ((dbocc != null) && (dbocc.equalsIgnoreCase("loaded"))) {
// the url was already loaded. we need to check the date
final MetadataRowContainer oldEntry = this.sb.webIndex.metadata().load(nexturlhash, null, 0);
final URLMetadataRow oldEntry = this.sb.webIndex.metadata().load(nexturlhash, null, 0);
if (oldEntry != null) {
final Date modDate = oldEntry.moddate();
// check if modDate is null

View File

@ -55,9 +55,9 @@ import de.anomic.kelondro.index.IntegerHandleIndex;
import de.anomic.kelondro.order.Base64Order;
import de.anomic.kelondro.text.IndexCollection;
import de.anomic.kelondro.text.MetadataRepository;
import de.anomic.kelondro.text.MetadataRowContainer;
import de.anomic.kelondro.text.ReferenceRow;
import de.anomic.kelondro.text.MetadataRepository.Export;
import de.anomic.kelondro.text.metadataPrototype.URLMetadataRow;
import de.anomic.kelondro.text.referencePrototype.WordReferenceRow;
import de.anomic.kelondro.util.MemoryControl;
import de.anomic.yacy.yacyURL;
@ -396,7 +396,7 @@ public class URLAnalysis {
"collection",
12,
Base64Order.enhancedCoder,
ReferenceRow.urlEntryRow);
WordReferenceRow.urlEntryRow);
System.out.println("COLLECTION INDEX REFERENCE COLLECTION starting dump of statistics");
idx.dump(new File(statisticPath));
System.out.println("COLLECTION INDEX REFERENCE COLLECTION finished dump, wrote " + idx.size() + " entries to " + statisticPath);
@ -407,9 +407,9 @@ public class URLAnalysis {
public static int diffurlcol(String metadataPath, String statisticFile, String diffFile) throws IOException {
System.out.println("COLLECTION INDEX DIFF URL-COL startup");
IntegerHandleIndex idx = new IntegerHandleIndex(MetadataRowContainer.rowdef.primaryKeyLength, MetadataRowContainer.rowdef.objectOrder, new File(statisticFile), 0);
IntegerHandleIndex idx = new IntegerHandleIndex(URLMetadataRow.rowdef.primaryKeyLength, URLMetadataRow.rowdef.objectOrder, new File(statisticFile), 0);
MetadataRepository mr = new MetadataRepository(new File(metadataPath));
HandleSet hs = new HandleSet(MetadataRowContainer.rowdef.primaryKeyLength, MetadataRowContainer.rowdef.objectOrder, 0, 1000000);
HandleSet hs = new HandleSet(URLMetadataRow.rowdef.primaryKeyLength, URLMetadataRow.rowdef.objectOrder, 0, 1000000);
System.out.println("COLLECTION INDEX DIFF URL-COL loaded dump, starting diff");
long start = System.currentTimeMillis();
long update = start - 7000;
@ -436,7 +436,7 @@ public class URLAnalysis {
// format: 0=text, 1=html, 2=rss/xml
System.out.println("URL EXPORT startup");
MetadataRepository mr = new MetadataRepository(new File(metadataPath));
HandleSet hs = (diffFile == null) ? null : new HandleSet(MetadataRowContainer.rowdef.primaryKeyLength, MetadataRowContainer.rowdef.objectOrder, new File(diffFile), 0);
HandleSet hs = (diffFile == null) ? null : new HandleSet(URLMetadataRow.rowdef.primaryKeyLength, URLMetadataRow.rowdef.objectOrder, new File(diffFile), 0);
System.out.println("URL EXPORT loaded dump, starting export");
Export e = mr.export(new File(export), ".*", hs, format, false);
try {
@ -451,7 +451,7 @@ public class URLAnalysis {
System.out.println("URL DELETE startup");
MetadataRepository mr = new MetadataRepository(new File(metadataPath));
int mrSize = mr.size();
HandleSet hs = new HandleSet(MetadataRowContainer.rowdef.primaryKeyLength, MetadataRowContainer.rowdef.objectOrder, new File(diffFile), 0);
HandleSet hs = new HandleSet(URLMetadataRow.rowdef.primaryKeyLength, URLMetadataRow.rowdef.objectOrder, new File(diffFile), 0);
System.out.println("URL DELETE loaded dump, starting deletion of " + hs.size() + " entries from " + mrSize);
for (byte[] refhash: hs) {
mr.remove(new String(refhash));

View File

@ -68,12 +68,12 @@ import de.anomic.kelondro.blob.BLOBTree;
import de.anomic.kelondro.blob.MapView;
import de.anomic.kelondro.order.CloneableIterator;
import de.anomic.kelondro.order.NaturalOrder;
import de.anomic.kelondro.text.Word;
import de.anomic.kelondro.util.DateFormatter;
import de.anomic.kelondro.util.kelondroException;
import de.anomic.kelondro.util.Log;
import de.anomic.kelondro.util.FileUtils;
import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.plasma.parser.Word;
import de.anomic.server.serverBusyThread;
import de.anomic.server.serverInstantBusyThread;
import de.anomic.yacy.yacyNewsPool;

View File

@ -42,8 +42,7 @@ import java.util.List;
import java.util.Set;
import java.util.Vector;
import de.anomic.kelondro.text.AbstractBlacklist;
import de.anomic.kelondro.text.Blacklist.blacklistFile;
import de.anomic.data.Blacklist.blacklistFile;
import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.server.serverCore;

View File

@ -29,9 +29,9 @@ package de.anomic.http;
import java.util.Date;
import de.anomic.crawler.CrawlProfile;
import de.anomic.kelondro.text.Document;
import de.anomic.kelondro.util.DateFormatter;
import de.anomic.plasma.plasmaHTCache;
import de.anomic.plasma.parser.Document;
import de.anomic.yacy.yacyURL;
public class httpdProxyCacheEntry implements Document {

View File

@ -72,10 +72,9 @@ import java.util.logging.Logger;
import java.util.zip.GZIPOutputStream;
import de.anomic.crawler.HTTPLoader;
import de.anomic.data.Blacklist;
import de.anomic.htmlFilter.htmlFilterContentTransformer;
import de.anomic.htmlFilter.htmlFilterTransformer;
import de.anomic.kelondro.text.Blacklist;
import de.anomic.kelondro.text.Document;
import de.anomic.kelondro.util.DateFormatter;
import de.anomic.kelondro.util.Log;
import de.anomic.kelondro.util.FileUtils;
@ -83,6 +82,7 @@ import de.anomic.plasma.plasmaHTCache;
import de.anomic.plasma.plasmaParser;
import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.plasma.plasmaSwitchboardConstants;
import de.anomic.plasma.parser.Document;
import de.anomic.server.serverCore;
import de.anomic.server.serverDomains;
import de.anomic.server.serverObjects;

View File

@ -40,13 +40,13 @@ import de.anomic.http.httpChunkedInputStream;
import de.anomic.http.httpRequestHeader;
import de.anomic.http.httpResponseHeader;
import de.anomic.http.httpdProxyCacheEntry;
import de.anomic.kelondro.text.Document;
import de.anomic.kelondro.util.DateFormatter;
import de.anomic.kelondro.util.Log;
import de.anomic.kelondro.util.FileUtils;
import de.anomic.plasma.plasmaHTCache;
import de.anomic.plasma.plasmaParser;
import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.plasma.parser.Document;
import de.anomic.server.serverCore;
import de.anomic.server.serverHandler;
import de.anomic.server.serverCore.Session;

View File

@ -609,30 +609,30 @@ public class BLOBArray implements BLOB {
while (true) {
assert c1 != null;
assert c2 != null;
e = ordering.compare(c1.getWordHash().getBytes(), c2.getWordHash().getBytes());
e = ordering.compare(c1.getTermHash().getBytes(), c2.getTermHash().getBytes());
if (e < 0) {
writer.add(c1.getWordHash().getBytes(), c1.exportCollection());
writer.add(c1.getTermHash().getBytes(), c1.exportCollection());
if (i1.hasNext()) {
c1o = c1;
c1 = i1.next();
assert ordering.compare(c1.getWordHash().getBytes(), c1o.getWordHash().getBytes()) > 0;
assert ordering.compare(c1.getTermHash().getBytes(), c1o.getTermHash().getBytes()) > 0;
continue;
}
break;
}
if (e > 0) {
writer.add(c2.getWordHash().getBytes(), c2.exportCollection());
writer.add(c2.getTermHash().getBytes(), c2.exportCollection());
if (i2.hasNext()) {
c2o = c2;
c2 = i2.next();
assert ordering.compare(c2.getWordHash().getBytes(), c2o.getWordHash().getBytes()) > 0;
assert ordering.compare(c2.getTermHash().getBytes(), c2o.getTermHash().getBytes()) > 0;
continue;
}
break;
}
assert e == 0;
// merge the entries
writer.add(c1.getWordHash().getBytes(), (c1.merge(c2)).exportCollection());
writer.add(c1.getTermHash().getBytes(), (c1.merge(c2)).exportCollection());
if (i1.hasNext() && i2.hasNext()) {
c1 = i1.next();
c2 = i2.next();
@ -647,22 +647,22 @@ public class BLOBArray implements BLOB {
assert !(i1.hasNext() && i2.hasNext());
while (i1.hasNext()) {
//System.out.println("FLUSH REMAINING 1: " + c1.getWordHash());
writer.add(c1.getWordHash().getBytes(), c1.exportCollection());
writer.add(c1.getTermHash().getBytes(), c1.exportCollection());
if (i1.hasNext()) {
c1o = c1;
c1 = i1.next();
assert ordering.compare(c1.getWordHash().getBytes(), c1o.getWordHash().getBytes()) > 0;
assert ordering.compare(c1.getTermHash().getBytes(), c1o.getTermHash().getBytes()) > 0;
continue;
}
break;
}
while (i2.hasNext()) {
//System.out.println("FLUSH REMAINING 2: " + c2.getWordHash());
writer.add(c2.getWordHash().getBytes(), c2.exportCollection());
writer.add(c2.getTermHash().getBytes(), c2.exportCollection());
if (i2.hasNext()) {
c2o = c2;
c2 = i2.next();
assert ordering.compare(c2.getWordHash().getBytes(), c2o.getWordHash().getBytes()) > 0;
assert ordering.compare(c2.getTermHash().getBytes(), c2o.getTermHash().getBytes()) > 0;
continue;
}
break;

View File

@ -44,7 +44,7 @@ import de.anomic.kelondro.text.IndexBuffer;
import de.anomic.kelondro.text.IndexCollection;
import de.anomic.kelondro.text.ReferenceContainer;
import de.anomic.kelondro.text.ReferenceContainerOrder;
import de.anomic.kelondro.text.ReferenceRow;
import de.anomic.kelondro.text.referencePrototype.WordReferenceRow;
import de.anomic.kelondro.util.FileUtils;
import de.anomic.kelondro.util.MemoryControl;
import de.anomic.kelondro.util.Log;
@ -94,21 +94,21 @@ public final class BufferedIndexCollection extends AbstractBufferedIndex impleme
12,
Base64Order.enhancedCoder,
maxCollectionPartition,
ReferenceRow.urlEntryRow,
WordReferenceRow.urlEntryRow,
useCommons);
}
/* methods for interface Index */
public void add(final ReferenceContainer entries) {
assert (entries.row().objectsize == ReferenceRow.urlEntryRow.objectsize);
assert (entries.row().objectsize == WordReferenceRow.urlEntryRow.objectsize);
// add the entry
buffer.add(entries);
cacheFlushControl();
}
public void add(final String wordHash, final ReferenceRow entry) throws IOException {
public void add(final String wordHash, final WordReferenceRow entry) throws IOException {
// add the entry
buffer.add(wordHash, entry);
cacheFlushControl();
@ -151,10 +151,10 @@ public final class BufferedIndexCollection extends AbstractBufferedIndex impleme
for (int i = 0; i < d.size(); i++) {
// for each element in the double-set, take that one that is the most recent one
set = d.get(i);
ReferenceRow e, elm = null;
WordReferenceRow e, elm = null;
long lm = 0;
for (int j = 0; j < set.size(); j++) {
e = new ReferenceRow(set.get(j, true));
e = new WordReferenceRow(set.get(j, true));
if ((elm == null) || (e.lastModified() > lm)) {
elm = e;
lm = e.lastModified();
@ -164,7 +164,7 @@ public final class BufferedIndexCollection extends AbstractBufferedIndex impleme
container.addUnique(elm.toKelondroEntry());
}
}
if (container.size() < beforeDouble) System.out.println("*** DEBUG DOUBLECHECK - removed " + (beforeDouble - container.size()) + " index entries from word container " + container.getWordHash());
if (container.size() < beforeDouble) System.out.println("*** DEBUG DOUBLECHECK - removed " + (beforeDouble - container.size()) + " index entries from word container " + container.getTermHash());
return container;
}
@ -172,7 +172,7 @@ public final class BufferedIndexCollection extends AbstractBufferedIndex impleme
public ReferenceContainer delete(final String wordHash) {
final ReferenceContainer c = new ReferenceContainer(
wordHash,
ReferenceRow.urlEntryRow,
WordReferenceRow.urlEntryRow,
buffer.count(wordHash));
c.addAllUnique(buffer.delete(wordHash));
c.addAllUnique(collections.delete(wordHash));

View File

@ -34,6 +34,7 @@ import java.util.TreeSet;
import de.anomic.kelondro.order.ByteOrder;
import de.anomic.kelondro.order.CloneableIterator;
import de.anomic.kelondro.text.referencePrototype.WordReferenceRow;
public interface Index {
@ -52,72 +53,72 @@ public interface Index {
* if no references to the word are stored, then a new entry is added,
* if there are already references to the word hash stored,
* then the old and the new references are merged
* @param wordHash
* @param termHash
* @param entry
* @throws IOException
*/
public void add(final String wordHash, final ReferenceRow entry) throws IOException;
public void add(final String termHash, final WordReferenceRow entry) throws IOException;
/**
* check if there are references stored to the given word hash
* @param wordHash
* @param termHash
* @return true if references exist, false if not
*/
public boolean has(String wordHash); // use only when a true result will NOT be followed by a getContainer call
public boolean has(String termHash); // use only when a true result will NOT be followed by a getContainer call
/**
* count the number of references for the given word
* do not use this method to check the existence of a reference by comparing
* the result with zero, use hasReferences instead.
* @param wordHash
* @param termHash
* @return the number of references to the given word
*/
public int count(final String wordHash);
public int count(final String termHash);
/**
* get the references to a given word.
* if referenceselection is not null, then all url references which are not
* in referenceselection are removed from the container
* @param wordHash
* @param termHash
* @param referenceselection
* @return the references
* @throws IOException
*/
public ReferenceContainer get(String wordHash, Set<String> referenceselection) throws IOException;
public ReferenceContainer get(String termHash, Set<String> referenceselection) throws IOException;
/**
* delete all references for a word
* @param wordHash
* @param termHash
* @return the deleted references
* @throws IOException
*/
public ReferenceContainer delete(String wordHash) throws IOException;
public ReferenceContainer delete(String termHash) throws IOException;
/**
* remove a specific reference entry
* @param wordHash
* @param termHash
* @param referenceHash the key for the reference entry to be removed
* @return
* @throws IOException
*/
public boolean remove(String wordHash, String referenceHash) throws IOException;
public boolean remove(String termHash, String referenceHash) throws IOException;
/**
* remove a set of reference entries for a given word
* @param wordHash the key for the references
* @param termHash the key for the references
* @param referenceHash the reference entry keys
* @return
* @throws IOException
*/
public int remove(String wordHash, Set<String> referenceHashes) throws IOException;
public int remove(String termHash, Set<String> referenceHashes) throws IOException;
public int remove(final Set<String> wordHashes, final String urlHash) throws IOException;
public int remove(final Set<String> termHashes, final String urlHash) throws IOException;
public void remove(final Set<String> wordHashes, final Set<String> urlHashes) throws IOException;
public void remove(final Set<String> termHashes, final Set<String> urlHashes) throws IOException;
/**
* iterate all references from the beginning of a specific word hash
* @param startWordHash
* @param startHash
* @param rot if true, then rotate at the end to the beginning
* @param ram
* @return
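As a reading aid, a minimal caller-side sketch of the renamed Index interface; this is not part of the commit. The names idx, termHash and entry are hypothetical, the signatures are the ones declared in the hunk above:

    void exampleUsage(final Index idx, final String termHash, final WordReferenceRow entry) throws IOException {
        idx.add(termHash, entry);                             // merges into existing references for termHash, if any
        final boolean known = idx.has(termHash);              // cheap check for cases where get() is not needed
        final int n = idx.count(termHash);                    // number of stored references, not an existence test
        final ReferenceContainer all = idx.get(termHash, null);   // null: no reference pre-selection
        final ReferenceContainer gone = idx.delete(termHash);     // removes and returns all references
    }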

View File

@ -35,6 +35,7 @@ import java.util.Set;
import de.anomic.kelondro.index.Row;
import de.anomic.kelondro.order.ByteOrder;
import de.anomic.kelondro.order.CloneableIterator;
import de.anomic.kelondro.text.referencePrototype.WordReferenceRow;
import de.anomic.kelondro.util.MemoryControl;
import de.anomic.kelondro.util.ScoreCluster;
import de.anomic.kelondro.util.Log;
@ -94,8 +95,8 @@ public final class IndexBuffer extends AbstractIndex implements Index, IndexRead
} else if (dumpFile.exists()) {
// initialize scores for cache organization
for (final ReferenceContainer ic : (Iterable<ReferenceContainer>) heap.references(null, false)) {
this.hashDate.setScore(ic.getWordHash(), intTime(ic.lastWrote()));
this.hashScore.setScore(ic.getWordHash(), ic.size());
this.hashDate.setScore(ic.getTermHash(), intTime(ic.lastWrote()));
this.hashScore.setScore(ic.getTermHash(), ic.size());
}
} else {
heap.initWriteMode();
@ -197,7 +198,7 @@ public final class IndexBuffer extends AbstractIndex implements Index, IndexRead
}
if (hash == null) {
final ReferenceContainer ic = heap.references(null, false).next();
if (ic != null) hash = ic.getWordHash();
if (ic != null) hash = ic.getTermHash();
}
return hash;
@ -304,11 +305,11 @@ public final class IndexBuffer extends AbstractIndex implements Index, IndexRead
// put new words into cache
heap.add(container);
hashScore.setScore(container.getWordHash(), heap.count(container.getWordHash()));
hashDate.setScore(container.getWordHash(), intTime(System.currentTimeMillis()));
hashScore.setScore(container.getTermHash(), heap.count(container.getTermHash()));
hashDate.setScore(container.getTermHash(), intTime(System.currentTimeMillis()));
}
public void add(final String wordHash, final ReferenceRow entry) throws IOException {
public void add(final String wordHash, final WordReferenceRow entry) throws IOException {
if (entry == null || heap == null) return;
// put new words into cache
@ -335,7 +336,7 @@ public final class IndexBuffer extends AbstractIndex implements Index, IndexRead
public synchronized long getBufferSizeBytes() {
// calculate the real size in bytes of the index cache
long cacheBytes = 0;
final long entryBytes = ReferenceRow.urlEntryRow.objectsize;
final long entryBytes = WordReferenceRow.urlEntryRow.objectsize;
final Iterator<ReferenceContainer> it = references(null, false);
while (it.hasNext()) cacheBytes += it.next().size() * entryBytes;
return cacheBytes;

View File

@ -36,6 +36,7 @@ import de.anomic.kelondro.order.ByteOrder;
import de.anomic.kelondro.order.CloneableIterator;
import de.anomic.kelondro.order.MergeIterator;
import de.anomic.kelondro.order.Order;
import de.anomic.kelondro.text.referencePrototype.WordReferenceRow;
import de.anomic.kelondro.util.MemoryControl;
import de.anomic.server.serverProfiling;
@ -65,15 +66,15 @@ public final class IndexCell extends AbstractBufferedIndex implements BufferedIn
public IndexCell(
final File cellPath,
final ByteOrder wordOrder,
final ByteOrder termOrder,
final Row payloadrow,
final int maxRamEntries,
final long targetFileSize,
final long maxFileSize,
IODispatcher merger
) throws IOException {
this.array = new ReferenceContainerArray(cellPath, wordOrder, payloadrow, merger);
this.ram = new ReferenceContainerCache(payloadrow, wordOrder);
this.array = new ReferenceContainerArray(cellPath, termOrder, payloadrow, merger);
this.ram = new ReferenceContainerCache(payloadrow, termOrder);
this.ram.initWriteMode();
this.maxRamEntries = maxRamEntries;
this.merger = merger;
@ -99,25 +100,25 @@ public final class IndexCell extends AbstractBufferedIndex implements BufferedIn
cleanCache();
}
public synchronized void add(String hash, ReferenceRow entry) throws IOException {
public synchronized void add(String hash, WordReferenceRow entry) throws IOException {
this.ram.add(hash, entry);
serverProfiling.update("wordcache", Long.valueOf(this.ram.size()), true);
cleanCache();
}
/**
* checks if there is any container for this wordHash, either in RAM or any BLOB
* checks if there is any container for this termHash, either in RAM or any BLOB
*/
public boolean has(String wordHash) {
if (this.ram.has(wordHash)) return true;
return this.array.has(wordHash);
public boolean has(String termHash) {
if (this.ram.has(termHash)) return true;
return this.array.has(termHash);
}
public int count(String wordHash) {
ReferenceContainer c0 = this.ram.get(wordHash, null);
public int count(String termHash) {
ReferenceContainer c0 = this.ram.get(termHash, null);
ReferenceContainer c1;
try {
c1 = this.array.get(wordHash);
c1 = this.array.get(termHash);
} catch (IOException e) {
c1 = null;
}
@ -133,9 +134,9 @@ public final class IndexCell extends AbstractBufferedIndex implements BufferedIn
* all containers in the BLOBs and the RAM are merged and returned
* @throws IOException
*/
public ReferenceContainer get(String wordHash, Set<String> urlselection) throws IOException {
ReferenceContainer c0 = this.ram.get(wordHash, null);
ReferenceContainer c1 = this.array.get(wordHash);
public ReferenceContainer get(String termHash, Set<String> urlselection) throws IOException {
ReferenceContainer c0 = this.ram.get(termHash, null);
ReferenceContainer c1 = this.array.get(termHash);
if (c1 == null) {
if (c0 == null) return null;
return c0;
@ -149,14 +150,14 @@ public final class IndexCell extends AbstractBufferedIndex implements BufferedIn
* the deleted containers are merged and returned as result of the method
* @throws IOException
*/
public ReferenceContainer delete(String wordHash) throws IOException {
ReferenceContainer c0 = this.ram.delete(wordHash);
ReferenceContainer c1 = this.array.get(wordHash);
public ReferenceContainer delete(String termHash) throws IOException {
ReferenceContainer c0 = this.ram.delete(termHash);
ReferenceContainer c1 = this.array.get(termHash);
if (c1 == null) {
if (c0 == null) return null;
return c0;
}
this.array.delete(wordHash);
this.array.delete(termHash);
cleanCache();
if (c0 == null) return c1;
return c1.merge(c0);
@ -169,13 +170,13 @@ public final class IndexCell extends AbstractBufferedIndex implements BufferedIn
* new BLOBs. This returns the sum of all url references that have been removed
* @throws IOException
*/
public int remove(String wordHash, Set<String> urlHashes) throws IOException {
int reduced = this.array.replace(wordHash, new RemoveRewriter(urlHashes));
public int remove(String termHash, Set<String> urlHashes) throws IOException {
int reduced = this.array.replace(termHash, new RemoveRewriter(urlHashes));
return reduced / this.array.rowdef().objectsize;
}
public boolean remove(String wordHash, String urlHash) throws IOException {
int reduced = this.array.replace(wordHash, new RemoveRewriter(urlHash));
public boolean remove(String termHash, String urlHash) throws IOException {
int reduced = this.array.replace(termHash, new RemoveRewriter(urlHash));
return reduced > 0;
}
@ -199,14 +200,14 @@ public final class IndexCell extends AbstractBufferedIndex implements BufferedIn
}
public CloneableIterator<ReferenceContainer> references(String startWordHash, boolean rot) {
public CloneableIterator<ReferenceContainer> references(String starttermHash, boolean rot) {
final Order<ReferenceContainer> containerOrder = new ReferenceContainerOrder(this.ram.rowdef().getOrdering().clone());
containerOrder.rotate(new ReferenceContainer(startWordHash, this.ram.rowdef(), 0));
containerOrder.rotate(new ReferenceContainer(starttermHash, this.ram.rowdef(), 0));
return new MergeIterator<ReferenceContainer>(
this.ram.references(startWordHash, rot),
this.ram.references(starttermHash, rot),
new MergeIterator<ReferenceContainer>(
this.ram.references(startWordHash, false),
this.array.wordContainerIterator(startWordHash, false, false),
this.ram.references(starttermHash, false),
this.array.wordContainerIterator(starttermHash, false, false),
containerOrder,
ReferenceContainer.containerMergeMethod,
true),
@ -215,15 +216,15 @@ public final class IndexCell extends AbstractBufferedIndex implements BufferedIn
true);
}
public CloneableIterator<ReferenceContainer> references(String startWordHash, boolean rot, boolean ram) {
public CloneableIterator<ReferenceContainer> references(String startTermHash, boolean rot, boolean ram) {
final Order<ReferenceContainer> containerOrder = new ReferenceContainerOrder(this.ram.rowdef().getOrdering().clone());
containerOrder.rotate(new ReferenceContainer(startWordHash, this.ram.rowdef(), 0));
containerOrder.rotate(new ReferenceContainer(startTermHash, this.ram.rowdef(), 0));
if (ram) {
return this.ram.references(startWordHash, rot);
return this.ram.references(startTermHash, rot);
}
return new MergeIterator<ReferenceContainer>(
this.ram.references(startWordHash, false),
this.array.wordContainerIterator(startWordHash, false, false),
this.ram.references(startTermHash, false),
this.array.wordContainerIterator(startTermHash, false, false),
containerOrder,
ReferenceContainer.containerMergeMethod,
true);
@ -317,27 +318,22 @@ public final class IndexCell extends AbstractBufferedIndex implements BufferedIn
return System.currentTimeMillis();
}
public int getBufferMaxReferences() {
return this.ram.maxReferences();
}
public long getBufferMinAge() {
return System.currentTimeMillis();
}
public int getBufferSize() {
return this.ram.size();
}
public long getBufferSizeBytes() {
return 10000 * this.ram.size(); // guessed; we don't know that exactly because there is no statistics here (expensive, not necessary)
}
public void setBufferMaxWordCount(int maxWords) {
this.maxRamEntries = maxWords;
}
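Condensed, the RAM/BLOB interplay described in the comments above follows this pattern; a sketch of the get() path from the hunk above, with the two tiers queried independently and their containers merged:

    public ReferenceContainer get(final String termHash, final Set<String> urlselection) throws IOException {
        final ReferenceContainer c0 = this.ram.get(termHash, null); // in-memory buffer
        final ReferenceContainer c1 = this.array.get(termHash);     // BLOB-backed array on disk
        if (c1 == null) return c0;                                  // may itself be null: no references at all
        if (c0 == null) return c1;
        return c1.merge(c0);                                        // both tiers hit: merge the containers
    }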

View File

@ -54,6 +54,7 @@ import de.anomic.kelondro.order.RotateIterator;
import de.anomic.kelondro.table.EcoTable;
import de.anomic.kelondro.table.FixedWidthArray;
import de.anomic.kelondro.table.FlexTable;
import de.anomic.kelondro.text.referencePrototype.WordReferenceRow;
import de.anomic.kelondro.util.FileUtils;
import de.anomic.kelondro.util.MemoryControl;
import de.anomic.kelondro.util.kelondroException;
@ -250,10 +251,12 @@ public class IndexCollection extends AbstractIndex implements Index {
}
}
public void add(String wordhash, ReferenceRow entry) {
public void add(String wordhash, WordReferenceRow entry) {
if (entry == null) return;
try {
this.merge(new ReferenceContainer(wordhash, entry));
ReferenceContainer container = new ReferenceContainer(wordhash, this.payloadrow, 1);
container.add(entry);
this.merge(container);
} catch (final kelondroOutOfLimitsException e) {
e.printStackTrace();
} catch (final IOException e) {
@ -704,7 +707,7 @@ public class IndexCollection extends AbstractIndex implements Index {
private synchronized void merge(final ReferenceContainer container) throws IOException, kelondroOutOfLimitsException {
if ((container == null) || (container.size() == 0)) return;
final byte[] key = container.getWordHash().getBytes();
final byte[] key = container.getTermHash().getBytes();
// first find an old entry, if one exists
Row.Entry indexrow = index.get(key);


@ -41,7 +41,7 @@ import de.anomic.kelondro.text.Index;
import de.anomic.kelondro.text.IndexCollection;
import de.anomic.kelondro.text.ReferenceContainer;
import de.anomic.kelondro.text.ReferenceContainerOrder;
import de.anomic.kelondro.text.ReferenceRow;
import de.anomic.kelondro.text.referencePrototype.WordReferenceRow;
import de.anomic.kelondro.util.FileUtils;
import de.anomic.kelondro.util.Log;
@ -66,7 +66,7 @@ public final class IndexCollectionMigration extends AbstractBufferedIndex implem
this.cell = new IndexCell(
celldir,
wordOrdering,
ReferenceRow.urlEntryRow,
WordReferenceRow.urlEntryRow,
entityCacheMaxSize,
targetFileSize,
maxFileSize,
@ -104,7 +104,7 @@ public final class IndexCollectionMigration extends AbstractBufferedIndex implem
12,
Base64Order.enhancedCoder,
BufferedIndexCollection.maxCollectionPartition,
ReferenceRow.urlEntryRow,
WordReferenceRow.urlEntryRow,
false);
if (this.collections.size() == 0) {
// delete everything here
@ -126,10 +126,10 @@ public final class IndexCollectionMigration extends AbstractBufferedIndex implem
/* methods for interface Index */
public void add(final ReferenceContainer entries) throws IOException {
assert (entries.row().objectsize == ReferenceRow.urlEntryRow.objectsize);
assert (entries.row().objectsize == WordReferenceRow.urlEntryRow.objectsize);
if (this.collections != null) {
ReferenceContainer e = this.collections.delete(entries.getWordHash());
ReferenceContainer e = this.collections.delete(entries.getTermHash());
if (e != null) {
e.merge(entries);
cell.add(e);
@ -141,7 +141,7 @@ public final class IndexCollectionMigration extends AbstractBufferedIndex implem
}
}
public void add(final String wordHash, final ReferenceRow entry) throws IOException {
public void add(final String wordHash, final WordReferenceRow entry) throws IOException {
if (this.collections != null) {
ReferenceContainer e = this.collections.delete(wordHash);
if (e != null) {


@ -0,0 +1,89 @@
// Metadata.java
// (C) 2009 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany
// first published 03.04.2009 on http://yacy.net
//
// This is a part of YaCy, a peer-to-peer based web search engine
//
// $LastChangedDate: 2009-03-20 16:44:59 +0100 (Fr, 20 Mrz 2009) $
// $LastChangedRevision: 5736 $
// $LastChangedBy: borg-0300 $
//
// LICENSE
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
package de.anomic.kelondro.text;
import java.util.Date;
import de.anomic.crawler.CrawlEntry;
import de.anomic.kelondro.index.Row;
import de.anomic.kelondro.order.Bitfield;
import de.anomic.kelondro.text.Reference;
public interface Metadata {
public Row.Entry toRowEntry();
public String hash();
public long ranking();
public Date moddate();
public Date loaddate();
public Date freshdate();
public String referrerHash();
public String md5();
public char doctype();
public String language();
public int size();
public Bitfield flags();
public int wordCount();
public int llocal();
public int lother();
public int limage();
public int laudio();
public int lvideo();
public int lapp();
public String snippet();
public Reference word();
public boolean isOlder(final Metadata other);
public String toString(final String snippet);
public CrawlEntry toBalancerEntry(final String initiatorHash);
public String toString();
}
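
The interface gathers everything callers previously read from the concrete row container, so further index tables can be written against the abstraction alone. A minimal sketch of such a caller, using only methods declared above; the helper class and its staleness policy are illustrative:

    import java.util.Date;

    final class RecrawlPolicy {
        /** decide, from metadata alone, whether a document should be fetched again */
        static boolean shouldRecrawl(final Metadata m, final Date now) {
            final Date fresh = m.freshdate();
            if (fresh == null) return true;   // never judged fresh: fetch it
            return fresh.before(now);         // freshness window has expired
        }
    }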


@ -38,6 +38,7 @@ import java.util.Iterator;
import java.util.Map;
import java.util.TreeSet;
import de.anomic.data.Blacklist;
import de.anomic.htmlFilter.htmlFilterCharacterCoding;
import de.anomic.http.httpClient;
import de.anomic.http.httpResponse;
@ -48,6 +49,7 @@ import de.anomic.kelondro.index.Row;
import de.anomic.kelondro.index.ObjectIndex;
import de.anomic.kelondro.order.CloneableIterator;
import de.anomic.kelondro.table.SplitTable;
import de.anomic.kelondro.text.metadataPrototype.URLMetadataRow;
import de.anomic.kelondro.util.ScoreCluster;
import de.anomic.kelondro.util.Log;
import de.anomic.yacy.yacyURL;
@ -62,7 +64,7 @@ public final class MetadataRepository implements Iterable<byte[]> {
public MetadataRepository(final File path) {
this.location = path;
this.urlIndexFile = new Cache(new SplitTable(this.location, "urls", MetadataRowContainer.rowdef, false));
this.urlIndexFile = new Cache(new SplitTable(this.location, "urls", URLMetadataRow.rowdef, false));
this.exportthread = null; // will have a export thread assigned if exporter is running
this.statsDump = null;
@ -97,7 +99,7 @@ public final class MetadataRepository implements Iterable<byte[]> {
return 0;
}
public synchronized MetadataRowContainer load(final String urlHash, final Reference searchedWord, final long ranking) {
public synchronized URLMetadataRow load(final String urlHash, final Reference searchedWord, final long ranking) {
// generates a URLMetadataRow using the url hash
// if the url cannot be found, this returns null
if (urlHash == null) return null;
@ -105,15 +107,15 @@ public final class MetadataRepository implements Iterable<byte[]> {
try {
final Row.Entry entry = urlIndexFile.get(urlHash.getBytes());
if (entry == null) return null;
return new MetadataRowContainer(entry, searchedWord, ranking);
return new URLMetadataRow(entry, searchedWord, ranking);
} catch (final IOException e) {
return null;
}
}
public synchronized void store(final MetadataRowContainer entry) throws IOException {
public synchronized void store(final URLMetadataRow entry) throws IOException {
// Check if there is a more recent Entry already in the DB
MetadataRowContainer oldEntry;
URLMetadataRow oldEntry;
try {
if (exists(entry.hash())) {
oldEntry = load(entry.hash(), null, 0);
@ -166,17 +168,17 @@ public final class MetadataRepository implements Iterable<byte[]> {
return keys(true, null);
}
public CloneableIterator<MetadataRowContainer> entries() throws IOException {
public CloneableIterator<URLMetadataRow> entries() throws IOException {
// enumerates entry elements
return new kiter();
}
public CloneableIterator<MetadataRowContainer> entries(final boolean up, final String firstHash) throws IOException {
public CloneableIterator<URLMetadataRow> entries(final boolean up, final String firstHash) throws IOException {
// enumerates entry elements
return new kiter(up, firstHash);
}
public class kiter implements CloneableIterator<MetadataRowContainer> {
public class kiter implements CloneableIterator<URLMetadataRow> {
// enumerates entry elements
private final Iterator<Row.Entry> iter;
private final boolean error;
@ -208,12 +210,12 @@ public final class MetadataRepository implements Iterable<byte[]> {
return this.iter.hasNext();
}
public final MetadataRowContainer next() {
public final URLMetadataRow next() {
Row.Entry e = null;
if (this.iter == null) { return null; }
if (this.iter.hasNext()) { e = this.iter.next(); }
if (e == null) { return null; }
return new MetadataRowContainer(e, null, 0);
return new URLMetadataRow(e, null, 0);
}
public final void remove() {
@ -232,7 +234,7 @@ public final class MetadataRepository implements Iterable<byte[]> {
final Log log = new Log("URLDBCLEANUP");
final HashSet<String> damagedURLS = new HashSet<String>();
try {
final Iterator<MetadataRowContainer> eiter = entries(true, null);
final Iterator<URLMetadataRow> eiter = entries(true, null);
int iteratorCount = 0;
while (eiter.hasNext()) try {
eiter.next();
@ -325,7 +327,7 @@ public final class MetadataRepository implements Iterable<byte[]> {
public void run() {
try {
Log.logInfo("URLDBCLEANER", "UrldbCleaner-Thread startet");
final Iterator<MetadataRowContainer> eiter = entries(true, null);
final Iterator<URLMetadataRow> eiter = entries(true, null);
while (eiter.hasNext() && run) {
synchronized (this) {
if (this.pause) {
@ -338,13 +340,13 @@ public final class MetadataRepository implements Iterable<byte[]> {
}
}
}
final MetadataRowContainer entry = eiter.next();
final URLMetadataRow entry = eiter.next();
if (entry == null) {
if (Log.isFine("URLDBCLEANER")) Log.logFine("URLDBCLEANER", "entry == null");
} else if (entry.hash() == null) {
if (Log.isFine("URLDBCLEANER")) Log.logFine("URLDBCLEANER", ++blacklistedUrls + " blacklisted (" + ((double) blacklistedUrls / totalSearchedUrls) * 100 + "%): " + "hash == null");
} else {
final URLMetadata metadata = entry.metadata();
final URLMetadataRow.Components metadata = entry.metadata();
totalSearchedUrls++;
if (metadata.url() == null) {
if (Log.isFine("URLDBCLEANER")) Log.logFine("URLDBCLEANER", ++blacklistedUrls + " blacklisted (" + ((double) blacklistedUrls / totalSearchedUrls) * 100 + "%): " + entry.hash() + " URL == null");
@ -468,9 +470,9 @@ public final class MetadataRepository implements Iterable<byte[]> {
count++;
}
} else {
final Iterator<MetadataRowContainer> i = entries(); // iterates indexURLEntry objects
MetadataRowContainer entry;
URLMetadata metadata;
final Iterator<URLMetadataRow> i = entries(); // iterates URLMetadataRow objects
URLMetadataRow entry;
URLMetadataRow.Components metadata;
String url;
while (i.hasNext()) {
entry = i.next();
@ -552,7 +554,7 @@ public final class MetadataRepository implements Iterable<byte[]> {
HashMap<String, hashStat> map = domainSampleCollector();
// fetch urls from the database to determine the host in clear text
MetadataRowContainer urlref;
URLMetadataRow urlref;
if (count < 0 || count > map.size()) count = map.size();
statsDump = new ArrayList<hostStat>();
TreeSet<String> set = new TreeSet<String>();
@ -582,12 +584,12 @@ public final class MetadataRepository implements Iterable<byte[]> {
// fetch urls from the database to determine the host in clear text
Iterator<String> j = s.scores(false); // iterate urlhash-examples in reverse order (biggest first)
MetadataRowContainer urlref;
URLMetadataRow urlref;
String urlhash;
count += 10; // fetch some extra to avoid having to do this again too soon after deletions.
if (count < 0 || count > s.size()) count = s.size();
statsDump = new ArrayList<hostStat>();
URLMetadata comps;
URLMetadataRow.Components comps;
yacyURL url;
while (j.hasNext()) {
urlhash = j.next();
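
load() wraps a raw Row.Entry from the url index into a URLMetadataRow, and store() first checks whether a more recent entry is already present. A sketch of the round trip, assuming the constructor and method signatures shown above; the repository path and url hash are invented values:

    import java.io.File;
    import java.io.IOException;

    final class RepositoryRoundTrip {
        static void demo() throws IOException {
            final MetadataRepository repo = new MetadataRepository(new File("DATA/INDEX/TEXT"));
            final URLMetadataRow row = repo.load("0123456789ab", null, 0); // no searched word, no ranking
            if (row == null) return;                                      // unknown url hash
            final URLMetadataRow.Components comps = row.metadata();
            System.out.println(comps.dc_title() + " <" + comps.url().toNormalform(false, true) + ">");
            repo.store(row); // store() keeps the more recent of old and new entry
        }
    }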


@ -30,16 +30,6 @@ import de.anomic.kelondro.order.Bitfield;
public interface Reference {
// appearance flags, used in RWI entry
// some names are derived from the Dublin Core Metadata tag set
// the flags 0..23 are identical to the category flags in plasmaCondenser
public static final int flag_app_dc_description= 24; // word appears in anchor description text (the reference to an url), or any alternative text field of a link
public static final int flag_app_dc_title = 25; // word appears in title or headline or any description part
public static final int flag_app_dc_creator = 26; // word appears in author
public static final int flag_app_dc_subject = 27; // word appears in header tags or other descriptive part
public static final int flag_app_dc_identifier = 28; // word appears in url or document identifier
public static final int flag_app_emphasized = 29; // word is emphasized in text (i.e. bold, italics, special size)
public String toPropertyForm();
public String urlHash();


@ -37,57 +37,55 @@ import java.util.TreeMap;
import de.anomic.kelondro.index.Row;
import de.anomic.kelondro.index.RowSet;
import de.anomic.kelondro.order.Base64Order;
import de.anomic.kelondro.text.referencePrototype.WordReferenceRow;
import de.anomic.kelondro.text.referencePrototype.WordReferenceVars;
import de.anomic.kelondro.util.ByteBuffer;
/**
* A ReferenceContainer is a set of ReferenceRow entries. Since ReferenceRow entries are special
* Row entries, a collection of ReferenceRows can be contained in a RowSet. This class extends
* the RowSet with methods for the handling of special ReferenceRow Row entry objects.
* A ReferenceContainer is a set of ReferenceRow entries for a specific term.
* Since ReferenceRow entries are special Row entries, a collection of ReferenceRows
* can be contained in a RowSet.
* This class extends the RowSet with methods for the handling of
* special ReferenceRow Row entry objects.
*/
public class ReferenceContainer extends RowSet {
private String wordHash;
private String termHash;
public ReferenceContainer(final String wordHash, final RowSet collection) {
public ReferenceContainer(final String termHash, final RowSet collection) {
super(collection);
this.wordHash = wordHash;
this.termHash = termHash;
}
public ReferenceContainer(String wordHash, ReferenceRow entry) {
super(ReferenceRow.urlEntryRow, 1);
this.add(entry);
this.wordHash = wordHash;
}
public ReferenceContainer(final String wordHash, final Row rowdef, final int objectCount) {
public ReferenceContainer(final String termHash, final Row rowdef, final int objectCount) {
super(rowdef, objectCount);
this.wordHash = wordHash;
this.termHash = termHash;
this.lastTimeWrote = 0;
}
public ReferenceContainer topLevelClone() {
final ReferenceContainer newContainer = new ReferenceContainer(this.wordHash, this.rowdef, this.size());
final ReferenceContainer newContainer = new ReferenceContainer(this.termHash, this.rowdef, this.size());
newContainer.addAllUnique(this);
return newContainer;
}
public static ReferenceContainer emptyContainer(final String wordHash, final int elementCount) {
return new ReferenceContainer(wordHash, ReferenceRow.urlEntryRow, elementCount);
return new ReferenceContainer(wordHash, WordReferenceRow.urlEntryRow, elementCount);
}
public void setWordHash(final String newWordHash) {
this.wordHash = newWordHash;
this.termHash = newWordHash;
}
public long updated() {
return super.lastWrote();
}
public String getWordHash() {
return wordHash;
public String getTermHash() {
return termHash;
}
public void add(final ReferenceRow entry) {
public void add(final WordReferenceRow entry) {
// add without double-occurrence test
assert entry.toKelondroEntry().objectsize() == super.rowdef.objectsize;
this.addUnique(entry.toKelondroEntry());
@ -95,11 +93,11 @@ public class ReferenceContainer extends RowSet {
public void add(final Reference entry, final long updateTime) {
// add without double-occurrence test
if (entry instanceof ReferenceRow) {
assert ((ReferenceRow) entry).toKelondroEntry().objectsize() == super.rowdef.objectsize;
this.add((ReferenceRow) entry);
if (entry instanceof WordReferenceRow) {
assert ((WordReferenceRow) entry).toKelondroEntry().objectsize() == super.rowdef.objectsize;
this.add((WordReferenceRow) entry);
} else {
this.add(((ReferenceVars) entry).toRowEntry());
this.add(((WordReferenceVars) entry).toRowEntry());
}
this.lastTimeWrote = updateTime;
}
@ -120,24 +118,24 @@ public class ReferenceContainer extends RowSet {
}
public ReferenceContainer merge(final ReferenceContainer c) {
return new ReferenceContainer(this.wordHash, super.merge(c));
return new ReferenceContainer(this.termHash, super.merge(c));
}
public Reference put(final ReferenceRow entry) {
public Reference put(final WordReferenceRow entry) {
assert entry.toKelondroEntry().objectsize() == super.rowdef.objectsize;
final Row.Entry r = super.replace(entry.toKelondroEntry());
if (r == null) return null;
return new ReferenceRow(r);
return new WordReferenceRow(r);
}
public boolean putRecent(final ReferenceRow entry) {
public boolean putRecent(final WordReferenceRow entry) {
assert entry.toKelondroEntry().objectsize() == super.rowdef.objectsize;
// returns true if the new entry was added, false if it already existed
final Row.Entry oldEntryRow = this.replace(entry.toKelondroEntry());
if (oldEntryRow == null) {
return true;
}
final ReferenceRow oldEntry = new ReferenceRow(oldEntryRow);
final WordReferenceRow oldEntry = new WordReferenceRow(oldEntryRow);
if (entry.isOlder(oldEntry)) { // A more recent Entry is already in this container
this.replace(oldEntry.toKelondroEntry()); // put it back
return false;
@ -151,7 +149,7 @@ public class ReferenceContainer extends RowSet {
if (c == null) return 0;
int x = 0;
synchronized (c) {
final Iterator<ReferenceRow> i = c.entries();
final Iterator<WordReferenceRow> i = c.entries();
while (i.hasNext()) {
try {
if (putRecent(i.next())) x++;
@ -167,7 +165,7 @@ public class ReferenceContainer extends RowSet {
public Reference get(final String urlHash) {
final Row.Entry entry = this.get(urlHash.getBytes());
if (entry == null) return null;
return new ReferenceRow(entry);
return new WordReferenceRow(entry);
}
/**
@ -178,7 +176,7 @@ public class ReferenceContainer extends RowSet {
public Reference remove(final String urlHash) {
final Row.Entry entry = remove(urlHash.getBytes());
if (entry == null) return null;
return new ReferenceRow(entry);
return new WordReferenceRow(entry);
}
public int removeEntries(final Set<String> urlHashes) {
@ -188,12 +186,12 @@ public class ReferenceContainer extends RowSet {
return count;
}
public Iterator<ReferenceRow> entries() {
public Iterator<WordReferenceRow> entries() {
// returns an iterator of WordReferenceRow objects
return new entryIterator();
}
public class entryIterator implements Iterator<ReferenceRow> {
public class entryIterator implements Iterator<WordReferenceRow> {
Iterator<Row.Entry> rowEntryIterator;
@ -205,10 +203,10 @@ public class ReferenceContainer extends RowSet {
return rowEntryIterator.hasNext();
}
public ReferenceRow next() {
public WordReferenceRow next() {
final Row.Entry rentry = rowEntryIterator.next();
if (rentry == null) return null;
return new ReferenceRow(rentry);
return new WordReferenceRow(rentry);
}
public void remove() {
@ -342,11 +340,11 @@ public class ReferenceContainer extends RowSet {
final int keylength = small.rowdef.width(0);
assert (keylength == large.rowdef.width(0));
final ReferenceContainer conj = new ReferenceContainer(null, small.rowdef, 0); // start with empty search result
final Iterator<ReferenceRow> se = small.entries();
ReferenceVars ie0;
final Iterator<WordReferenceRow> se = small.entries();
WordReferenceVars ie0;
Reference ie1;
while (se.hasNext()) {
ie0 = new ReferenceVars(se.next());
ie0 = new WordReferenceVars(se.next());
ie1 = large.get(ie0.urlHash());
if ((ie0 != null) && (ie1 != null)) {
assert (ie0.urlHash().length() == keylength) : "ie0.urlHash() = " + ie0.urlHash();
@ -366,13 +364,13 @@ public class ReferenceContainer extends RowSet {
assert (keylength == i2.rowdef.width(0));
final ReferenceContainer conj = new ReferenceContainer(null, i1.rowdef, 0); // start with empty search result
if (!((i1.rowdef.getOrdering().signature().equals(i2.rowdef.getOrdering().signature())))) return conj; // ordering must be equal
final Iterator<ReferenceRow> e1 = i1.entries();
final Iterator<ReferenceRow> e2 = i2.entries();
final Iterator<WordReferenceRow> e1 = i1.entries();
final Iterator<WordReferenceRow> e2 = i2.entries();
int c;
if ((e1.hasNext()) && (e2.hasNext())) {
ReferenceVars ie1;
WordReferenceVars ie1;
Reference ie2;
ie1 = new ReferenceVars(e1.next());
ie1 = new WordReferenceVars(e1.next());
ie2 = e2.next();
while (true) {
@ -381,14 +379,14 @@ public class ReferenceContainer extends RowSet {
c = i1.rowdef.getOrdering().compare(ie1.urlHash().getBytes(), ie2.urlHash().getBytes());
//System.out.println("** '" + ie1.getUrlHash() + "'.compareTo('" + ie2.getUrlHash() + "')="+c);
if (c < 0) {
if (e1.hasNext()) ie1 = new ReferenceVars(e1.next()); else break;
if (e1.hasNext()) ie1 = new WordReferenceVars(e1.next()); else break;
} else if (c > 0) {
if (e2.hasNext()) ie2 = e2.next(); else break;
} else {
// we have found the same urls in different searches!
ie1.join(ie2);
if (ie1.worddistance() <= maxDistance) conj.add(ie1.toRowEntry());
if (e1.hasNext()) ie1 = new ReferenceVars(e1.next()); else break;
if (e1.hasNext()) ie1 = new WordReferenceVars(e1.next()); else break;
if (e2.hasNext()) ie2 = e2.next(); else break;
}
}
@ -420,7 +418,7 @@ public class ReferenceContainer extends RowSet {
final int keylength = pivot.rowdef.width(0);
assert (keylength == excl.rowdef.width(0));
final boolean iterate_pivot = pivot.size() < excl.size();
final Iterator<ReferenceRow> se = (iterate_pivot) ? pivot.entries() : excl.entries();
final Iterator<WordReferenceRow> se = (iterate_pivot) ? pivot.entries() : excl.entries();
Reference ie0, ie1;
while (se.hasNext()) {
ie0 = se.next();
@ -439,13 +437,13 @@ public class ReferenceContainer extends RowSet {
final int keylength = pivot.rowdef.width(0);
assert (keylength == excl.rowdef.width(0));
if (!((pivot.rowdef.getOrdering().signature().equals(excl.rowdef.getOrdering().signature())))) return pivot; // ordering must be equal
final Iterator<ReferenceRow> e1 = pivot.entries();
final Iterator<ReferenceRow> e2 = excl.entries();
final Iterator<WordReferenceRow> e1 = pivot.entries();
final Iterator<WordReferenceRow> e2 = excl.entries();
int c;
if ((e1.hasNext()) && (e2.hasNext())) {
ReferenceVars ie1;
WordReferenceVars ie1;
Reference ie2;
ie1 = new ReferenceVars(e1.next());
ie1 = new WordReferenceVars(e1.next());
ie2 = e2.next();
while (true) {
@ -454,14 +452,14 @@ public class ReferenceContainer extends RowSet {
c = pivot.rowdef.getOrdering().compare(ie1.urlHash().getBytes(), ie2.urlHash().getBytes());
//System.out.println("** '" + ie1.getUrlHash() + "'.compareTo('" + ie2.getUrlHash() + "')="+c);
if (c < 0) {
if (e1.hasNext()) ie1 = new ReferenceVars(e1.next()); else break;
if (e1.hasNext()) ie1 = new WordReferenceVars(e1.next()); else break;
} else if (c > 0) {
if (e2.hasNext()) ie2 = e2.next(); else break;
} else {
// we have found the same urls in different searches!
ie1.join(ie2);
e1.remove();
if (e1.hasNext()) ie1 = new ReferenceVars(e1.next()); else break;
if (e1.hasNext()) ie1 = new WordReferenceVars(e1.next()); else break;
if (e2.hasNext()) ie2 = e2.next(); else break;
}
}
@ -470,11 +468,11 @@ public class ReferenceContainer extends RowSet {
}
public String toString() {
return "C[" + wordHash + "] has " + this.size() + " entries";
return "C[" + termHash + "] has " + this.size() + " entries";
}
public int hashCode() {
return (int) Base64Order.enhancedCoder.decodeLong(this.wordHash.substring(0, 4));
return (int) Base64Order.enhancedCoder.decodeLong(this.termHash.substring(0, 4));
}
@ -483,7 +481,7 @@ public class ReferenceContainer extends RowSet {
final long timeout = (maxtime < 0) ? Long.MAX_VALUE : System.currentTimeMillis() + maxtime;
final TreeMap<String, String> doms = new TreeMap<String, String>();
synchronized (inputContainer) {
final Iterator<ReferenceRow> i = inputContainer.entries();
final Iterator<WordReferenceRow> i = inputContainer.entries();
Reference iEntry;
String dom, paths;
while (i.hasNext()) {
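
Because a container is just a RowSet keyed by one term, the renamed accessors read naturally at the call site. A small usage sketch, restricted to methods visible above; the twelve-character term hash and the two row parameters are illustrative:

    final class ContainerDemo {
        static void demo(final WordReferenceRow r1, final WordReferenceRow r2) {
            final ReferenceContainer a = ReferenceContainer.emptyContainer("AAAAAAAAAAAA", 2);
            a.add(r1);                                    // add without double-occurrence test
            final ReferenceContainer b = ReferenceContainer.emptyContainer("AAAAAAAAAAAA", 2);
            b.add(r2);
            final ReferenceContainer joined = a.merge(b); // union of references for the same term
            System.out.println(joined.getTermHash() + " -> " + joined.size() + " references");
        }
    }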


@ -56,7 +56,7 @@ public final class ReferenceContainerArray {
*/
public ReferenceContainerArray(
final File heapLocation,
final ByteOrder wordOrder,
final ByteOrder termOrder,
final Row payloadrow,
IODispatcher merger) throws IOException {
this.payloadrow = payloadrow;
@ -64,7 +64,7 @@ public final class ReferenceContainerArray {
heapLocation,
"index",
payloadrow.primaryKeyLength,
wordOrder,
termOrder,
0);
assert merger != null;
this.merger = merger;
@ -182,8 +182,8 @@ public final class ReferenceContainerArray {
* @return true, if the key is used in the heap; false otherwise
* @throws IOException
*/
public synchronized boolean has(final String key) {
return this.array.has(key.getBytes());
public synchronized boolean has(final String termHash) {
return this.array.has(termHash.getBytes());
}
/**
@ -192,13 +192,13 @@ public final class ReferenceContainerArray {
* @return the indexContainer if one exists, null otherwise
* @throws IOException
*/
public synchronized ReferenceContainer get(final String key) throws IOException {
List<byte[]> entries = this.array.getAll(key.getBytes());
public synchronized ReferenceContainer get(final String termHash) throws IOException {
List<byte[]> entries = this.array.getAll(termHash.getBytes());
if (entries == null || entries.size() == 0) return null;
byte[] a = entries.remove(0);
ReferenceContainer c = new ReferenceContainer(key, RowSet.importRowSet(a, payloadrow));
ReferenceContainer c = new ReferenceContainer(termHash, RowSet.importRowSet(a, payloadrow));
while (entries.size() > 0) {
c = c.merge(new ReferenceContainer(key, RowSet.importRowSet(entries.remove(0), payloadrow)));
c = c.merge(new ReferenceContainer(termHash, RowSet.importRowSet(entries.remove(0), payloadrow)));
}
return c;
}
@ -209,13 +209,13 @@ public final class ReferenceContainerArray {
* @return the indexContainer if the cache contained the container, null otherwise
* @throws IOException
*/
public synchronized void delete(final String wordHash) throws IOException {
public synchronized void delete(final String termHash) throws IOException {
// returns the index that had been deleted
array.remove(wordHash.getBytes());
array.remove(termHash.getBytes());
}
public synchronized int replace(final String wordHash, ContainerRewriter rewriter) throws IOException {
return array.replace(wordHash.getBytes(), new BLOBRewriter(wordHash, rewriter));
public synchronized int replace(final String termHash, ContainerRewriter rewriter) throws IOException {
return array.replace(termHash.getBytes(), new BLOBRewriter(termHash, rewriter));
}
public class BLOBRewriter implements BLOB.Rewriter {


@ -41,6 +41,7 @@ import de.anomic.kelondro.blob.HeapWriter;
import de.anomic.kelondro.order.CloneableIterator;
import de.anomic.kelondro.order.Base64Order;
import de.anomic.kelondro.order.ByteOrder;
import de.anomic.kelondro.text.referencePrototype.WordReferenceRow;
import de.anomic.kelondro.util.FileUtils;
import de.anomic.kelondro.util.Log;
import de.anomic.kelondro.index.Row;
@ -49,7 +50,7 @@ import de.anomic.kelondro.index.RowSet;
public final class ReferenceContainerCache extends AbstractIndex implements Index, IndexReader, Iterable<ReferenceContainer> {
private final Row payloadrow;
private final ByteOrder wordOrder;
private final ByteOrder termOrder;
private SortedMap<String, ReferenceContainer> cache;
/**
@ -59,9 +60,9 @@ public final class ReferenceContainerCache extends AbstractIndex implements Inde
* @param payloadrow
* @param log
*/
public ReferenceContainerCache(final Row payloadrow, ByteOrder wordOrder) {
public ReferenceContainerCache(final Row payloadrow, ByteOrder termOrder) {
this.payloadrow = payloadrow;
this.wordOrder = wordOrder;
this.termOrder = termOrder;
this.cache = null;
}
@ -83,7 +84,7 @@ public final class ReferenceContainerCache extends AbstractIndex implements Inde
* another dump reading afterwards is not possible
*/
public void initWriteMode() {
this.cache = Collections.synchronizedSortedMap(new TreeMap<String, ReferenceContainer>(new ByteOrder.StringOrder(this.wordOrder)));
this.cache = Collections.synchronizedSortedMap(new TreeMap<String, ReferenceContainer>(new ByteOrder.StringOrder(this.termOrder)));
}
/**
@ -94,14 +95,14 @@ public final class ReferenceContainerCache extends AbstractIndex implements Inde
public void initWriteModeFromBLOB(final File blobFile) throws IOException {
Log.logInfo("indexContainerRAMHeap", "restoring rwi blob dump '" + blobFile.getName() + "'");
final long start = System.currentTimeMillis();
this.cache = Collections.synchronizedSortedMap(new TreeMap<String, ReferenceContainer>(new ByteOrder.StringOrder(this.wordOrder)));
this.cache = Collections.synchronizedSortedMap(new TreeMap<String, ReferenceContainer>(new ByteOrder.StringOrder(this.termOrder)));
int urlCount = 0;
synchronized (cache) {
for (final ReferenceContainer container : new blobFileEntries(blobFile, this.payloadrow)) {
// TODO: in this loop a lot of memory may be allocated. A check whether memory is getting low is necessary. But what to do when memory is low?
if (container == null) break;
//System.out.println("***DEBUG indexContainerHeap.initwriteModeFromBLOB*** container.size = " + container.size() + ", container.sorted = " + container.sorted());
cache.put(container.getWordHash(), container);
cache.put(container.getTermHash(), container);
urlCount += container.size();
}
}
@ -242,7 +243,7 @@ public final class ReferenceContainerCache extends AbstractIndex implements Inde
for (ReferenceContainer container : cache.values()) {
if (container.size() > max) {
max = container.size();
hash = container.getWordHash();
hash = container.getTermHash();
}
}
return hash;
@ -253,7 +254,7 @@ public final class ReferenceContainerCache extends AbstractIndex implements Inde
ArrayList<String> hashes = new ArrayList<String>();
for (ReferenceContainer container : cache.values()) {
if (container.size() >= bound) {
hashes.add(container.getWordHash());
hashes.add(container.getTermHash());
}
}
return hashes;
@ -281,7 +282,7 @@ public final class ReferenceContainerCache extends AbstractIndex implements Inde
ArrayList<String> hashes = new ArrayList<String>();
long limit = System.currentTimeMillis() - maxage;
for (ReferenceContainer container : cache.values()) {
if (container.lastWrote() < limit) hashes.add(container.getWordHash());
if (container.lastWrote() < limit) hashes.add(container.getTermHash());
}
return hashes;
}
@ -372,9 +373,9 @@ public final class ReferenceContainerCache extends AbstractIndex implements Inde
ReferenceContainer c = this.cache.get(key);
if (c == null) return null;
// because this is all in RAM, we must clone the entries (flat)
ReferenceContainer c1 = new ReferenceContainer(c.getWordHash(), c.row(), c.size());
Iterator<ReferenceRow> e = c.entries();
ReferenceRow ee;
ReferenceContainer c1 = new ReferenceContainer(c.getTermHash(), c.row(), c.size());
Iterator<WordReferenceRow> e = c.entries();
WordReferenceRow ee;
while (e.hasNext()) {
ee = e.next();
if (urlselection.contains(ee.urlHash())) c1.add(ee);
@ -441,7 +442,7 @@ public final class ReferenceContainerCache extends AbstractIndex implements Inde
if (this.cache == null || container == null || container.size() == 0) return;
// put new words into cache
final String wordHash = container.getWordHash();
final String wordHash = container.getTermHash();
ReferenceContainer entries = cache.get(wordHash); // null pointer exception? wordhash != null! must be cache==null
int added = 0;
if (entries == null) {
@ -457,7 +458,7 @@ public final class ReferenceContainerCache extends AbstractIndex implements Inde
return;
}
public synchronized void add(final String wordHash, final ReferenceRow newEntry) {
public synchronized void add(final String wordHash, final WordReferenceRow newEntry) {
assert this.cache != null;
ReferenceContainer container = cache.get(wordHash);
if (container == null) container = new ReferenceContainer(wordHash, this.payloadrow, 1);
@ -470,7 +471,7 @@ public final class ReferenceContainerCache extends AbstractIndex implements Inde
}
public ByteOrder ordering() {
return this.wordOrder;
return this.termOrder;
}
}
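
The cache behind this class is a synchronized TreeMap from term hash to container, sorted by the injected termOrder, and initWriteMode() must run before the first add. A minimal setup sketch; the payload row and byte order follow combinations that appear elsewhere in this commit, and wordHash/entry are assumed to come from document parsing:

    final class CacheSetup {
        static void fill(final String wordHash, final WordReferenceRow entry) {
            final ReferenceContainerCache heap =
                    new ReferenceContainerCache(WordReferenceRow.urlEntryRow, Base64Order.enhancedCoder);
            heap.initWriteMode();      // allocates the sorted term -> container map
            heap.add(wordHash, entry); // creates or extends the per-term container
        }
    }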


@ -38,7 +38,7 @@ public class ReferenceContainerOrder extends AbstractOrder<ReferenceContainer> i
}
public boolean wellformed(final ReferenceContainer a) {
return embeddedOrder.wellformed(a.getWordHash().getBytes());
return embeddedOrder.wellformed(a.getTermHash().getBytes());
}
public void direction(final boolean ascending) {
@ -50,15 +50,15 @@ public class ReferenceContainerOrder extends AbstractOrder<ReferenceContainer> i
}
public int compare(final ReferenceContainer a, final ReferenceContainer b) {
return this.embeddedOrder.compare(a.getWordHash().getBytes(), b.getWordHash().getBytes());
return this.embeddedOrder.compare(a.getTermHash().getBytes(), b.getTermHash().getBytes());
}
public boolean equal(ReferenceContainer a, ReferenceContainer b) {
return this.embeddedOrder.equal(a.getWordHash().getBytes(), b.getWordHash().getBytes());
return this.embeddedOrder.equal(a.getTermHash().getBytes(), b.getTermHash().getBytes());
}
public void rotate(final ReferenceContainer zero) {
this.embeddedOrder.rotate(zero.getWordHash().getBytes());
this.embeddedOrder.rotate(zero.getTermHash().getBytes());
this.zero = new ReferenceContainer(new String(this.embeddedOrder.zero()), zero);
}
@ -80,7 +80,7 @@ public class ReferenceContainerOrder extends AbstractOrder<ReferenceContainer> i
}
public long cardinal(final ReferenceContainer key) {
return this.embeddedOrder.cardinal(key.getWordHash().getBytes());
return this.embeddedOrder.cardinal(key.getTermHash().getBytes());
}
}


@ -32,15 +32,17 @@ import java.util.Iterator;
import java.util.Map;
import de.anomic.kelondro.order.Bitfield;
import de.anomic.kelondro.text.referencePrototype.WordReferenceRow;
import de.anomic.kelondro.text.referencePrototype.WordReferenceVars;
import de.anomic.kelondro.util.ScoreCluster;
import de.anomic.plasma.plasmaCondenser;
import de.anomic.plasma.plasmaSearchRankingProcess;
import de.anomic.plasma.plasmaSearchRankingProfile;
import de.anomic.plasma.parser.Condenser;
import de.anomic.server.serverProcessor;
import de.anomic.yacy.yacyURL;
public class ReferenceOrder {
private ReferenceVars min, max;
private WordReferenceVars min, max;
private final plasmaSearchRankingProfile ranking;
private final ScoreCluster<String> doms; // collected for "authority" heuristic
private int maxdomcount;
@ -55,10 +57,10 @@ public class ReferenceOrder {
this.language = language;
}
public ArrayList<ReferenceVars> normalizeWith(final ReferenceContainer container) {
public ArrayList<WordReferenceVars> normalizeWith(final ReferenceContainer container) {
// normalize ranking: find minimum and maximum of separate ranking criteria
assert (container != null);
ArrayList<ReferenceVars> result = null;
ArrayList<WordReferenceVars> result = null;
//long s0 = System.currentTimeMillis();
if ((serverProcessor.useCPU > 1) && (container.size() > 600)) {
@ -112,7 +114,7 @@ public class ReferenceOrder {
return (doms.getScore(urlHash.substring(6)) << 8) / (1 + this.maxdomcount);
}
public long cardinal(final ReferenceVars t) {
public long cardinal(final WordReferenceVars t) {
//return Long.MAX_VALUE - preRanking(ranking, iEntry, this.entryMin, this.entryMax, this.searchWords);
// the normalizedEntry must be a normalized indexEntry
final Bitfield flags = t.flags();
@ -136,17 +138,17 @@ public class ReferenceOrder {
+ ((max.hitcount() == min.hitcount()) ? 0 : (((t.hitcount() - min.hitcount() ) << 8) / (max.hitcount() - min.hitcount()) ) << ranking.coeff_hitcount)
+ tf
+ ((ranking.coeff_authority > 12) ? (authority(t.urlHash()) << ranking.coeff_authority) : 0)
+ ((flags.get(Reference.flag_app_dc_identifier)) ? 255 << ranking.coeff_appurl : 0)
+ ((flags.get(Reference.flag_app_dc_title)) ? 255 << ranking.coeff_app_dc_title : 0)
+ ((flags.get(Reference.flag_app_dc_creator)) ? 255 << ranking.coeff_app_dc_creator : 0)
+ ((flags.get(Reference.flag_app_dc_subject)) ? 255 << ranking.coeff_app_dc_subject : 0)
+ ((flags.get(Reference.flag_app_dc_description)) ? 255 << ranking.coeff_app_dc_description : 0)
+ ((flags.get(Reference.flag_app_emphasized)) ? 255 << ranking.coeff_appemph : 0)
+ ((flags.get(plasmaCondenser.flag_cat_indexof)) ? 255 << ranking.coeff_catindexof : 0)
+ ((flags.get(plasmaCondenser.flag_cat_hasimage)) ? 255 << ranking.coeff_cathasimage : 0)
+ ((flags.get(plasmaCondenser.flag_cat_hasaudio)) ? 255 << ranking.coeff_cathasaudio : 0)
+ ((flags.get(plasmaCondenser.flag_cat_hasvideo)) ? 255 << ranking.coeff_cathasvideo : 0)
+ ((flags.get(plasmaCondenser.flag_cat_hasapp)) ? 255 << ranking.coeff_cathasapp : 0)
+ ((flags.get(WordReferenceRow.flag_app_dc_identifier)) ? 255 << ranking.coeff_appurl : 0)
+ ((flags.get(WordReferenceRow.flag_app_dc_title)) ? 255 << ranking.coeff_app_dc_title : 0)
+ ((flags.get(WordReferenceRow.flag_app_dc_creator)) ? 255 << ranking.coeff_app_dc_creator : 0)
+ ((flags.get(WordReferenceRow.flag_app_dc_subject)) ? 255 << ranking.coeff_app_dc_subject : 0)
+ ((flags.get(WordReferenceRow.flag_app_dc_description)) ? 255 << ranking.coeff_app_dc_description : 0)
+ ((flags.get(WordReferenceRow.flag_app_emphasized)) ? 255 << ranking.coeff_appemph : 0)
+ ((flags.get(Condenser.flag_cat_indexof)) ? 255 << ranking.coeff_catindexof : 0)
+ ((flags.get(Condenser.flag_cat_hasimage)) ? 255 << ranking.coeff_cathasimage : 0)
+ ((flags.get(Condenser.flag_cat_hasaudio)) ? 255 << ranking.coeff_cathasaudio : 0)
+ ((flags.get(Condenser.flag_cat_hasvideo)) ? 255 << ranking.coeff_cathasvideo : 0)
+ ((flags.get(Condenser.flag_cat_hasapp)) ? 255 << ranking.coeff_cathasapp : 0)
+ ((patchUK(t.language).equals(this.language)) ? 255 << ranking.coeff_language : 0)
+ ((yacyURL.probablyRootURL(t.urlHash())) ? 15 << ranking.coeff_urllength : 0);
//if (searchWords != null) r += (yacyURL.probablyWordURL(t.urlHash(), searchWords) != null) ? 256 << ranking.coeff_appurl : 0;
@ -161,13 +163,13 @@ public class ReferenceOrder {
public static class minmaxfinder extends Thread {
ReferenceVars entryMin;
ReferenceVars entryMax;
WordReferenceVars entryMin;
WordReferenceVars entryMax;
private final ReferenceContainer container;
private final int start, end;
private final HashMap<String, Integer> doms;
private final Integer int1;
ArrayList<ReferenceVars> decodedEntries;
ArrayList<WordReferenceVars> decodedEntries;
public minmaxfinder(final ReferenceContainer container, final int start /*including*/, final int end /*excluding*/) {
this.container = container;
@ -175,19 +177,19 @@ public class ReferenceOrder {
this.end = end;
this.doms = new HashMap<String, Integer>();
this.int1 = 1;
this.decodedEntries = new ArrayList<ReferenceVars>();
this.decodedEntries = new ArrayList<WordReferenceVars>();
}
public void run() {
// find min/max to obtain limits for normalization
this.entryMin = null;
this.entryMax = null;
ReferenceVars iEntry;
WordReferenceVars iEntry;
int p = this.start;
String dom;
Integer count;
while (p < this.end) {
iEntry = new ReferenceVars(new ReferenceRow(container.get(p++, false)));
iEntry = new WordReferenceVars(new WordReferenceRow(container.get(p++, false)));
this.decodedEntries.add(iEntry);
// find min/max
if (this.entryMin == null) this.entryMin = iEntry.clone(); else this.entryMin.min(iEntry);
@ -203,7 +205,7 @@ public class ReferenceOrder {
}
}
public ArrayList<ReferenceVars> decodedContainer() {
public ArrayList<WordReferenceVars> decodedContainer() {
return this.decodedEntries;
}
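
Each criterion in cardinal() above is first normalized into a 0..255 band, ((value - min) << 8) / (max - min), with min and max supplied by the minmaxfinder scan, and then shifted left by its ranking coefficient. A worked instance of that normalization with invented numbers:

    final class NormalizeDemo {
        // the same guard as in cardinal(): an empty range contributes nothing
        static long normalized(final int value, final int min, final int max, final int coeff) {
            if (max == min) return 0;
            return (((long) (value - min) << 8) / (max - min)) << coeff;
        }
        public static void main(final String[] args) {
            // hitcount 7 in an observed range [2, 12] with coefficient 10:
            // ((7 - 2) << 8) / 10 = 128, shifted by 10 gives 131072
            System.out.println(normalized(7, 2, 12, 10));
        }
    }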


@ -1,61 +0,0 @@
// URLMetadata.java
// (C) 2009 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany
// first published 02.03.2009 on http://yacy.net
//
// This is a part of YaCy, a peer-to-peer based web search engine
//
// $LastChangedDate$
// $LastChangedRevision$
// $LastChangedBy$
//
// LICENSE
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
package de.anomic.kelondro.text;
import java.net.MalformedURLException;
import de.anomic.yacy.yacyURL;
public class URLMetadata {
private yacyURL url;
private final String dc_title, dc_creator, dc_subject, ETag;
public URLMetadata(final String url, final String urlhash, final String title, final String author, final String tags, final String ETag) {
try {
this.url = new yacyURL(url, urlhash);
} catch (final MalformedURLException e) {
this.url = null;
}
this.dc_title = title;
this.dc_creator = author;
this.dc_subject = tags;
this.ETag = ETag;
}
public URLMetadata(final yacyURL url, final String descr, final String author, final String tags, final String ETag) {
this.url = url;
this.dc_title = descr;
this.dc_creator = author;
this.dc_subject = tags;
this.ETag = ETag;
}
public yacyURL url() { return this.url; }
public String dc_title() { return this.dc_title; }
public String dc_creator() { return this.dc_creator; }
public String dc_subject() { return this.dc_subject; }
public String ETag() { return this.ETag; }
}


@ -4,9 +4,9 @@
//
// This is a part of YaCy, a peer-to-peer based web search engine
//
// $LastChangedDate$
// $LastChangedRevision$
// $LastChangedBy$
// $LastChangedDate: 2009-03-20 16:44:59 +0100 (Fr, 20 Mrz 2009) $
// $LastChangedRevision: 5736 $
// $LastChangedBy: borg-0300 $
//
// LICENSE
//
@ -24,7 +24,7 @@
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
package de.anomic.kelondro.text;
package de.anomic.kelondro.text.metadataPrototype;
import java.io.UnsupportedEncodingException;
import java.net.MalformedURLException;
@ -39,6 +39,9 @@ import de.anomic.kelondro.order.Base64Order;
import de.anomic.kelondro.order.Bitfield;
import de.anomic.kelondro.order.Digest;
import de.anomic.kelondro.order.NaturalOrder;
import de.anomic.kelondro.text.Metadata;
import de.anomic.kelondro.text.Reference;
import de.anomic.kelondro.text.referencePrototype.WordReferenceRow;
import de.anomic.kelondro.util.DateFormatter;
import de.anomic.kelondro.util.FileUtils;
import de.anomic.kelondro.util.kelondroException;
@ -48,7 +51,7 @@ import de.anomic.server.serverCodings;
import de.anomic.tools.crypt;
import de.anomic.yacy.yacyURL;
public class MetadataRowContainer {
public class URLMetadataRow implements Metadata {
// this object stores attributes for URL entries
@ -119,7 +122,7 @@ public class MetadataRowContainer {
private Reference word; // this is only used if the url is transported via remote search requests
private final long ranking; // during generation of a search result this value is set
public MetadataRowContainer(
public URLMetadataRow(
final yacyURL url,
final String dc_title,
final String dc_creator,
@ -198,14 +201,14 @@ public class MetadataRowContainer {
}
}
public MetadataRowContainer(final Row.Entry entry, final Reference searchedWord, final long ranking) {
public URLMetadataRow(final Row.Entry entry, final Reference searchedWord, final long ranking) {
this.entry = entry;
this.snippet = null;
this.word = searchedWord;
this.ranking = ranking;
}
public MetadataRowContainer(final Properties prop) {
public URLMetadataRow(final Properties prop) {
// generates a URLMetadataRow using the properties from the argument
// the property names must correspond to the ones from toString
//System.out.println("DEBUG-ENTRY: prop=" + prop.toString());
@ -264,17 +267,17 @@ public class MetadataRowContainer {
this.word = null;
if (prop.containsKey("word")) throw new kelondroException("old database structure is not supported");
if (prop.containsKey("wi")) {
this.word = new ReferenceRow(Base64Order.enhancedCoder.decodeString(prop.getProperty("wi", ""), "de.anomic.index.indexURLEntry.indexURLEntry()"));
this.word = new WordReferenceRow(Base64Order.enhancedCoder.decodeString(prop.getProperty("wi", ""), "de.anomic.index.indexURLEntry.indexURLEntry()"));
}
this.ranking = 0;
}
public static MetadataRowContainer importEntry(final String propStr) {
public static URLMetadataRow importEntry(final String propStr) {
if (propStr == null || !propStr.startsWith("{") || !propStr.endsWith("}")) {
return null;
}
try {
return new MetadataRowContainer(serverCodings.s2p(propStr.substring(1, propStr.length() - 1)));
return new URLMetadataRow(serverCodings.s2p(propStr.substring(1, propStr.length() - 1)));
} catch (final kelondroException e) {
// wrong format
return null;
@ -283,7 +286,7 @@ public class MetadataRowContainer {
private StringBuilder corePropList() {
// generate a parseable string; this is a simple property-list
final URLMetadata metadata = this.metadata();
final Components metadata = this.metadata();
final StringBuilder s = new StringBuilder(300);
//System.out.println("author=" + comp.author());
try {
@ -341,9 +344,9 @@ public class MetadataRowContainer {
return this.ranking;
}
public URLMetadata metadata() {
public Components metadata() {
final ArrayList<String> cl = FileUtils.strings(this.entry.getCol("comp", null), "UTF-8");
return new URLMetadata(
return new Components(
(cl.size() > 0) ? (cl.get(0)).trim() : "",
hash(),
(cl.size() > 1) ? (cl.get(1)).trim() : "",
@ -428,7 +431,7 @@ public class MetadataRowContainer {
return word;
}
public boolean isOlder(final MetadataRowContainer other) {
public boolean isOlder(final Metadata other) {
if (other == null) return false;
final Date tmoddate = moddate();
final Date omoddate = other.moddate();
@ -487,4 +490,33 @@ public class MetadataRowContainer {
//return "{" + core + "}";
}
public class Components {
private yacyURL url;
private final String dc_title, dc_creator, dc_subject, ETag;
public Components(final String url, final String urlhash, final String title, final String author, final String tags, final String ETag) {
try {
this.url = new yacyURL(url, urlhash);
} catch (final MalformedURLException e) {
this.url = null;
}
this.dc_title = title;
this.dc_creator = author;
this.dc_subject = tags;
this.ETag = ETag;
}
public Components(final yacyURL url, final String descr, final String author, final String tags, final String ETag) {
this.url = url;
this.dc_title = descr;
this.dc_creator = author;
this.dc_subject = tags;
this.ETag = ETag;
}
public yacyURL url() { return this.url; }
public String dc_title() { return this.dc_title; }
public String dc_creator() { return this.dc_creator; }
public String dc_subject() { return this.dc_subject; }
public String ETag() { return this.ETag; }
}
}
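
The former URLMetadata class survives as the inner Components value object, and metadata() decodes the "comp" column on demand. Reading it looks like this sketch, which uses only accessors shown above:

    final class ComponentsDemo {
        static String describe(final URLMetadataRow row) {
            final URLMetadataRow.Components comps = row.metadata();
            if (comps.url() == null) return "(damaged entry: malformed URL)";
            return comps.dc_title() + " <" + comps.url().toNormalform(false, true) + ">";
        }
    }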


@ -4,9 +4,9 @@
//
// This is a part of YaCy, a peer-to-peer based web search engine
//
// $LastChangedDate$
// $LastChangedRevision$
// $LastChangedBy$
// $LastChangedDate: 2009-03-20 16:44:59 +0100 (Fr, 20 Mrz 2009) $
// $LastChangedRevision: 5736 $
// $LastChangedBy: borg-0300 $
//
// LICENSE
//
@ -24,7 +24,7 @@
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
package de.anomic.kelondro.text;
package de.anomic.kelondro.text.referencePrototype;
import de.anomic.kelondro.index.Column;
import de.anomic.kelondro.index.Row;
@ -32,9 +32,10 @@ import de.anomic.kelondro.index.Row.Entry;
import de.anomic.kelondro.order.Base64Order;
import de.anomic.kelondro.order.Bitfield;
import de.anomic.kelondro.order.MicroDate;
import de.anomic.kelondro.text.Reference;
import de.anomic.yacy.yacySeedDB;
public final class ReferenceRow implements Reference, Cloneable {
public final class WordReferenceRow implements Reference, Cloneable {
// this object stores attributes of URL references inside RWI collections
@ -88,9 +89,19 @@ public final class ReferenceRow implements Reference, Cloneable {
private static final int col_reserve1 = 18; // i 1 reserve1
private static final int col_reserve2 = 19; // k 1 reserve2
// appearance flags, used in RWI entry
// some names are derived from the Dublin Core Metadata tag set
// the flags 0..23 are identical to the category flags in plasmaCondenser
public static final int flag_app_dc_description= 24; // word appears in anchor description text (the reference to an url), or any alternative text field of a link
public static final int flag_app_dc_title = 25; // word appears in title or headline or any description part
public static final int flag_app_dc_creator = 26; // word appears in author
public static final int flag_app_dc_subject = 27; // word appears in header tags or other descriptive part
public static final int flag_app_dc_identifier = 28; // word appears in url or document identifier
public static final int flag_app_emphasized = 29; // word is emphasized in text (i.e. bold, italics, special size)
private final Row.Entry entry;
public ReferenceRow(final String urlHash,
public WordReferenceRow(final String urlHash,
final int urlLength, // byte-length of complete URL
final int urlComps, // number of path components
final int titleLength, // length of description/length (longer are better?)
@ -135,32 +146,32 @@ public final class ReferenceRow implements Reference, Cloneable {
this.entry.setCol(col_reserve2, 0);
}
public ReferenceRow(final String urlHash, final String code) {
public WordReferenceRow(final String urlHash, final String code) {
// the code is the external form of the row minus the leading urlHash entry
this.entry = urlEntryRow.newEntry((urlHash + code).getBytes());
}
public ReferenceRow(final String external) {
public WordReferenceRow(final String external) {
this.entry = urlEntryRow.newEntry(external, true);
}
public ReferenceRow(final byte[] row) {
public WordReferenceRow(final byte[] row) {
this.entry = urlEntryRow.newEntry(row);
}
public ReferenceRow(final byte[] row, final int offset, final boolean clone) {
public WordReferenceRow(final byte[] row, final int offset, final boolean clone) {
this.entry = urlEntryRow.newEntry(row, offset, clone);
}
public ReferenceRow(final Row.Entry rentry) {
public WordReferenceRow(final Row.Entry rentry) {
// FIXME: see if cloning is necessary
this.entry = rentry;
}
public ReferenceRow clone() {
public WordReferenceRow clone() {
final byte[] b = new byte[urlEntryRow.objectsize];
System.arraycopy(entry.bytes(), 0, b, 0, urlEntryRow.objectsize);
return new ReferenceRow(b);
return new WordReferenceRow(b);
}
public String toPropertyForm() {
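
The appearance flags 24..29 now sit on WordReferenceRow, next to the row layout they describe, while positions 0..23 remain the condenser's category flags. A sketch of testing them on a reference's Bitfield, in the same way ReferenceOrder.cardinal() does; the boost weights are invented:

    final class FlagDemo {
        static int appearanceBoost(final Bitfield flags) {
            int boost = 0;
            if (flags.get(WordReferenceRow.flag_app_dc_title))      boost += 4; // word in title
            if (flags.get(WordReferenceRow.flag_app_emphasized))    boost += 2; // bold, italics
            if (flags.get(WordReferenceRow.flag_app_dc_identifier)) boost += 1; // word in the URL
            return boost;
        }
    }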


@ -4,9 +4,9 @@
//
// This is a part of YaCy, a peer-to-peer based web search engine
//
// $LastChangedDate$
// $LastChangedRevision$
// $LastChangedBy$
// $LastChangedDate: 2009-03-20 16:44:59 +0100 (Fr, 20 Mrz 2009) $
// $LastChangedRevision: 5736 $
// $LastChangedBy: borg-0300 $
//
// LICENSE
//
@ -24,12 +24,13 @@
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
package de.anomic.kelondro.text;
package de.anomic.kelondro.text.referencePrototype;
import de.anomic.kelondro.order.Bitfield;
import de.anomic.kelondro.order.MicroDate;
import de.anomic.kelondro.text.Reference;
public class ReferenceVars implements Reference, Cloneable {
public class WordReferenceVars implements Reference, Cloneable {
public Bitfield flags;
public long freshUntil, lastModified;
@ -41,7 +42,7 @@ public class ReferenceVars implements Reference, Cloneable {
worddistance, wordsintext, wordsintitle;
public double termFrequency;
public ReferenceVars(final String urlHash,
public WordReferenceVars(final String urlHash,
final int urlLength, // byte-length of complete URL
final int urlComps, // number of path components
final int titleLength, // length of description/length (longer are better?)
@ -86,7 +87,7 @@ public class ReferenceVars implements Reference, Cloneable {
this.termFrequency = termfrequency;
}
public ReferenceVars(final ReferenceRow e) {
public WordReferenceVars(final WordReferenceRow e) {
this.flags = e.flags();
this.freshUntil = e.freshUntil();
this.lastModified = e.lastModified();
@ -109,8 +110,8 @@ public class ReferenceVars implements Reference, Cloneable {
this.termFrequency = e.termFrequency();
}
public ReferenceVars clone() {
final ReferenceVars c = new ReferenceVars(
public WordReferenceVars clone() {
final WordReferenceVars c = new WordReferenceVars(
this.urlHash,
this.urllength,
this.urlcomps,
@ -133,7 +134,7 @@ public class ReferenceVars implements Reference, Cloneable {
return c;
}
public void join(final ReferenceVars oe) {
public void join(final WordReferenceVars oe) {
// combine the distance
this.worddistance = this.worddistance + oe.worddistance + Math.abs(this.posintext - oe.posintext);
this.posintext = Math.min(this.posintext, oe.posintext);
@ -203,8 +204,8 @@ public class ReferenceVars implements Reference, Cloneable {
return posofphrase;
}
public ReferenceRow toRowEntry() {
return new ReferenceRow(
public WordReferenceRow toRowEntry() {
return new WordReferenceRow(
urlHash,
urllength, // byte-length of complete URL
urlcomps, // number of path components
@ -262,7 +263,7 @@ public class ReferenceVars implements Reference, Cloneable {
return this.termFrequency;
}
public final void min(final ReferenceVars other) {
public final void min(final WordReferenceVars other) {
int v;
long w;
double d;
@ -284,7 +285,7 @@ public class ReferenceVars implements Reference, Cloneable {
if (this.termFrequency > (d = other.termFrequency)) this.termFrequency = d;
}
public final void max(final ReferenceVars other) {
public final void max(final WordReferenceVars other) {
int v;
long w;
double d;
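
For orientation, join() above accumulates proximity when the same URL shows up for two query terms: with invented numbers, an entry at posintext 14 joined with one at posintext 17 (both starting with worddistance 0) yields worddistance 0 + 0 + |14 - 17| = 3 and posintext min(14, 17) = 14, so documents whose query terms sit close together keep a small distance value in the ranking.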


@ -23,7 +23,7 @@
// compile with javac -sourcepath source source/de/anomic/plasma/plasmaCondenser.java
// execute with java -cp source de.anomic.plasma.plasmaCondenser
package de.anomic.plasma;
package de.anomic.plasma.parser;
import java.io.BufferedReader;
import java.io.ByteArrayInputStream;
@ -49,14 +49,13 @@ import java.util.TreeSet;
import de.anomic.htmlFilter.htmlFilterContentScraper;
import de.anomic.htmlFilter.htmlFilterImageEntry;
import de.anomic.kelondro.order.Bitfield;
import de.anomic.kelondro.text.Phrase;
import de.anomic.kelondro.text.Reference;
import de.anomic.kelondro.text.Word;
import de.anomic.kelondro.text.referencePrototype.WordReferenceRow;
import de.anomic.kelondro.util.SetTools;
import de.anomic.language.identification.Identificator;
import de.anomic.plasma.plasmaParserDocument;
import de.anomic.yacy.yacyURL;
public final class plasmaCondenser {
public final class Condenser {
// this is the page analysis class
@ -101,7 +100,7 @@ public final class plasmaCondenser {
public Bitfield RESULT_FLAGS = new Bitfield(4);
Identificator languageIdentificator;
public plasmaCondenser(final plasmaParserDocument document, final boolean indexText, final boolean indexMedia) throws UnsupportedEncodingException {
public Condenser(final plasmaParserDocument document, final boolean indexText, final boolean indexMedia) throws UnsupportedEncodingException {
// if addMedia == true, then all the media links are also parsed and added to the words
// added media words are flagged with the appropriate media flag
this.wordminsize = 3;
@ -133,13 +132,13 @@ public final class plasmaCondenser {
// phrase 99 is taken from the media link URL and anchor description
// phrase 100 and above are lines from the text
insertTextToWords(document.dc_title(), 1, Reference.flag_app_dc_title, RESULT_FLAGS, true);
insertTextToWords(document.dc_description(), 3, Reference.flag_app_dc_description, RESULT_FLAGS, true);
insertTextToWords(document.dc_creator(), 4, Reference.flag_app_dc_creator, RESULT_FLAGS, true);
insertTextToWords(document.dc_title(), 1, WordReferenceRow.flag_app_dc_title, RESULT_FLAGS, true);
insertTextToWords(document.dc_description(), 3, WordReferenceRow.flag_app_dc_description, RESULT_FLAGS, true);
insertTextToWords(document.dc_creator(), 4, WordReferenceRow.flag_app_dc_creator, RESULT_FLAGS, true);
// missing: tags!
final String[] titles = document.getSectionTitles();
for (int i = 0; i < titles.length; i++) {
insertTextToWords(titles[i], i + 10, Reference.flag_app_emphasized, RESULT_FLAGS, true);
insertTextToWords(titles[i], i + 10, WordReferenceRow.flag_app_emphasized, RESULT_FLAGS, true);
}
// anchors: for text indexing we add only the anchor description
@ -164,7 +163,7 @@ public final class plasmaCondenser {
}
// add the URL components to the word list
insertTextToWords(document.dc_source().toNormalform(false, true), 0, Reference.flag_app_dc_identifier, RESULT_FLAGS, false);
insertTextToWords(document.dc_source().toNormalform(false, true), 0, WordReferenceRow.flag_app_dc_identifier, RESULT_FLAGS, false);
if (indexMedia) {
// add anchor descriptions: here, we also add the url components
@ -241,11 +240,11 @@ public final class plasmaCondenser {
}
}
public plasmaCondenser(final InputStream text, final String charset) throws UnsupportedEncodingException {
public Condenser(final InputStream text, final String charset) throws UnsupportedEncodingException {
this(text, charset, 3, 2);
}
public plasmaCondenser(final InputStream text, final String charset, final int wordminsize, final int wordcut) throws UnsupportedEncodingException {
public Condenser(final InputStream text, final String charset, final int wordminsize, final int wordcut) throws UnsupportedEncodingException {
this.wordminsize = wordminsize;
this.wordcut = wordcut;
this.languageIdentificator = null; // we don't need that here
@ -715,7 +714,7 @@ public final class plasmaCondenser {
buffer = new ByteArrayInputStream(text.getBytes());
}
try {
return new plasmaCondenser(buffer, "UTF-8", 2, 1).words();
return new Condenser(buffer, "UTF-8", 2, 1).words();
} catch (final UnsupportedEncodingException e) {
return null;
}
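A minimal usage sketch of the renamed class, assuming only the constructor and the words() accessor visible in the hunks above (the sample text and class name are invented for illustration):

    import java.io.ByteArrayInputStream;
    import java.io.UnsupportedEncodingException;

    import de.anomic.plasma.parser.Condenser;

    final class CondenserSketch {
        public static void main(final String[] args) throws UnsupportedEncodingException {
            // same parameters as the words() helper above: wordminsize 2, wordcut 1
            final Condenser c = new Condenser(
                new ByteArrayInputStream("yacy indexes the web".getBytes("UTF-8")),
                "UTF-8", 2, 1);
            System.out.println(c.words().keySet()); // the distinct words found in the text
        }
    }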


@@ -24,7 +24,7 @@
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
package de.anomic.kelondro.text;
package de.anomic.plasma.parser;
import java.util.Date;


@@ -24,7 +24,7 @@
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
package de.anomic.kelondro.text;
package de.anomic.plasma.parser;
import java.util.HashSet;


@@ -24,7 +24,7 @@
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
package de.anomic.kelondro.text;
package de.anomic.plasma.parser;
import java.util.HashSet;
import java.util.Iterator;


@@ -6,10 +6,10 @@ import java.util.TreeSet;
import de.anomic.crawler.AbstractImporter;
import de.anomic.crawler.Importer;
import de.anomic.kelondro.text.MetadataRowContainer;
import de.anomic.kelondro.text.Reference;
import de.anomic.kelondro.text.ReferenceContainer;
import de.anomic.kelondro.text.ReferenceRow;
import de.anomic.kelondro.text.metadataPrototype.URLMetadataRow;
import de.anomic.kelondro.text.referencePrototype.WordReferenceRow;
import de.anomic.kelondro.util.DateFormatter;
public class plasmaDbImporter extends AbstractImporter implements Importer {
@@ -109,11 +109,11 @@ public class plasmaDbImporter extends AbstractImporter implements Importer {
try {
this.wordCounter++;
newContainer = indexContainerIterator.next();
this.wordHash = newContainer.getWordHash();
this.wordHash = newContainer.getTermHash();
// loop through the entities of the container and get the
// urlhash
final Iterator<ReferenceRow> importWordIdxEntries = newContainer.entries();
final Iterator<WordReferenceRow> importWordIdxEntries = newContainer.entries();
Reference importWordIdxEntry;
while (importWordIdxEntries.hasNext()) {
// testing if import process was aborted
@@ -141,7 +141,7 @@ public class plasmaDbImporter extends AbstractImporter implements Importer {
// we need to import the url
// getting the url entry
final MetadataRowContainer urlEntry = this.importWordIndex.metadata().load(urlHash, null, 0);
final URLMetadataRow urlEntry = this.importWordIndex.metadata().load(urlHash, null, 0);
if (urlEntry != null) {
/* write it into the home url db */
@@ -206,7 +206,7 @@ public class plasmaDbImporter extends AbstractImporter implements Importer {
final TreeSet<ReferenceContainer> containers = this.importWordIndex.index().references(this.wordHash, false, 100, false);
indexContainerIterator = containers.iterator();
// Make sure we don't get the same wordhash twice, but don't skip a word
if ((indexContainerIterator.hasNext())&&(!this.wordHash.equals((indexContainerIterator.next()).getWordHash()))) {
if ((indexContainerIterator.hasNext())&&(!this.wordHash.equals((indexContainerIterator.next()).getTermHash()))) {
indexContainerIterator = containers.iterator();
}
}
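A compact sketch of the renamed accessors used in this loop: getTermHash() supersedes getWordHash(), and entries() now yields WordReferenceRow objects (the wrapper class is hypothetical):

    import java.util.Iterator;

    import de.anomic.kelondro.text.ReferenceContainer;
    import de.anomic.kelondro.text.referencePrototype.WordReferenceRow;

    final class ContainerDumpSketch {
        static void dump(final ReferenceContainer container) {
            final Iterator<WordReferenceRow> refs = container.entries();
            while (refs.hasNext()) {
                // one line per posting: term hash -> url hash
                System.out.println(container.getTermHash() + " -> " + refs.next().urlHash());
            }
        }
    }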


@@ -48,9 +48,9 @@ import de.anomic.kelondro.blob.BLOBCompressor;
import de.anomic.kelondro.blob.BLOBHeap;
import de.anomic.kelondro.blob.MapView;
import de.anomic.kelondro.order.Base64Order;
import de.anomic.kelondro.text.Document;
import de.anomic.kelondro.util.Log;
import de.anomic.kelondro.util.FileUtils;
import de.anomic.plasma.parser.Document;
import de.anomic.yacy.yacySeedDB;
import de.anomic.yacy.yacyURL;


@@ -43,6 +43,7 @@ import de.anomic.htmlFilter.htmlFilterContentScraper;
import de.anomic.htmlFilter.htmlFilterImageEntry;
import de.anomic.kelondro.util.FileUtils;
import de.anomic.plasma.parser.Parser;
import de.anomic.plasma.parser.Condenser;
import de.anomic.server.serverCachedFileOutputStream;
import de.anomic.yacy.yacyURL;
@@ -282,7 +283,7 @@ dc_rights
public Iterator<StringBuilder> getSentences(final boolean pre) {
if (this.text == null) return null;
final plasmaCondenser.sentencesFromInputStreamEnum e = plasmaCondenser.sentencesFromInputStream(getText());
final Condenser.sentencesFromInputStreamEnum e = Condenser.sentencesFromInputStream(getText());
e.pre(pre);
return e;
}
@@ -439,7 +440,7 @@ dc_rights
this.favicon = faviconURL;
}
public void notifyWebStructure(final plasmaWebStructure webStructure, final plasmaCondenser condenser, final Date docDate) {
public void notifyWebStructure(final plasmaWebStructure webStructure, final Condenser condenser, final Date docDate) {
final Integer[] ioLinks = webStructure.generateCitationReference(this, condenser, docDate); // [outlinksSame, outlinksOther]
this.inboundLinks = ioLinks[0].intValue();
this.outboundLinks = ioLinks[1].intValue();


@@ -388,7 +388,7 @@ public class plasmaRankingCRProcess {
CloneableIterator<Row.Entry> cr_entry;
while (i.hasNext()) {
keycollection = i.next();
referee = keycollection.getWordHash();
referee = keycollection.getTermHash();
if (referee.length() == 6) refereeDom = referee; else refereeDom = referee.substring(6);
cr_entry = keycollection.rows();


@@ -32,12 +32,13 @@ import java.util.Date;
import java.util.Iterator;
import java.util.List;
import de.anomic.data.Blacklist;
import de.anomic.data.listManager;
import de.anomic.kelondro.order.Bitfield;
import de.anomic.kelondro.text.MetadataRowContainer;
import de.anomic.kelondro.text.Reference;
import de.anomic.kelondro.text.Blacklist;
import de.anomic.kelondro.text.metadataPrototype.URLMetadataRow;
import de.anomic.kelondro.text.referencePrototype.WordReferenceRow;
import de.anomic.kelondro.util.DateFormatter;
import de.anomic.plasma.parser.Condenser;
import de.anomic.server.serverObjects;
import de.anomic.yacy.yacySeed;
import de.anomic.yacy.yacyURL;
@@ -54,17 +55,17 @@ public class plasmaSearchAPI {
if (post.get("flags","").length() == 0) return null;
return new Bitfield(4, post.get("flags"));
}
if (post.get("description", "").equals("on")) b.set(Reference.flag_app_dc_description, true);
if (post.get("title", "").equals("on")) b.set(Reference.flag_app_dc_title, true);
if (post.get("creator", "").equals("on")) b.set(Reference.flag_app_dc_creator, true);
if (post.get("subject", "").equals("on")) b.set(Reference.flag_app_dc_subject, true);
if (post.get("url", "").equals("on")) b.set(Reference.flag_app_dc_identifier, true);
if (post.get("emphasized", "").equals("on")) b.set(Reference.flag_app_emphasized, true);
if (post.get("image", "").equals("on")) b.set(plasmaCondenser.flag_cat_hasimage, true);
if (post.get("audio", "").equals("on")) b.set(plasmaCondenser.flag_cat_hasaudio, true);
if (post.get("video", "").equals("on")) b.set(plasmaCondenser.flag_cat_hasvideo, true);
if (post.get("app", "").equals("on")) b.set(plasmaCondenser.flag_cat_hasapp, true);
if (post.get("indexof", "").equals("on")) b.set(plasmaCondenser.flag_cat_indexof, true);
if (post.get("description", "").equals("on")) b.set(WordReferenceRow.flag_app_dc_description, true);
if (post.get("title", "").equals("on")) b.set(WordReferenceRow.flag_app_dc_title, true);
if (post.get("creator", "").equals("on")) b.set(WordReferenceRow.flag_app_dc_creator, true);
if (post.get("subject", "").equals("on")) b.set(WordReferenceRow.flag_app_dc_subject, true);
if (post.get("url", "").equals("on")) b.set(WordReferenceRow.flag_app_dc_identifier, true);
if (post.get("emphasized", "").equals("on")) b.set(WordReferenceRow.flag_app_emphasized, true);
if (post.get("image", "").equals("on")) b.set(Condenser.flag_cat_hasimage, true);
if (post.get("audio", "").equals("on")) b.set(Condenser.flag_cat_hasaudio, true);
if (post.get("video", "").equals("on")) b.set(Condenser.flag_cat_hasvideo, true);
if (post.get("app", "").equals("on")) b.set(Condenser.flag_cat_hasapp, true);
if (post.get("indexof", "").equals("on")) b.set(Condenser.flag_cat_indexof, true);
return b;
}
@@ -96,17 +97,17 @@ public class plasmaSearchAPI {
} else {
prop.put("searchresult", 3);
prop.put("searchresult_allurl", ranked.filteredCount());
prop.put("searchresult_description", ranked.flagCount()[Reference.flag_app_dc_description]);
prop.put("searchresult_title", ranked.flagCount()[Reference.flag_app_dc_title]);
prop.put("searchresult_creator", ranked.flagCount()[Reference.flag_app_dc_creator]);
prop.put("searchresult_subject", ranked.flagCount()[Reference.flag_app_dc_subject]);
prop.put("searchresult_url", ranked.flagCount()[Reference.flag_app_dc_identifier]);
prop.put("searchresult_emphasized", ranked.flagCount()[Reference.flag_app_emphasized]);
prop.put("searchresult_image", ranked.flagCount()[plasmaCondenser.flag_cat_hasimage]);
prop.put("searchresult_audio", ranked.flagCount()[plasmaCondenser.flag_cat_hasaudio]);
prop.put("searchresult_video", ranked.flagCount()[plasmaCondenser.flag_cat_hasvideo]);
prop.put("searchresult_app", ranked.flagCount()[plasmaCondenser.flag_cat_hasapp]);
prop.put("searchresult_indexof", ranked.flagCount()[plasmaCondenser.flag_cat_indexof]);
prop.put("searchresult_description", ranked.flagCount()[WordReferenceRow.flag_app_dc_description]);
prop.put("searchresult_title", ranked.flagCount()[WordReferenceRow.flag_app_dc_title]);
prop.put("searchresult_creator", ranked.flagCount()[WordReferenceRow.flag_app_dc_creator]);
prop.put("searchresult_subject", ranked.flagCount()[WordReferenceRow.flag_app_dc_subject]);
prop.put("searchresult_url", ranked.flagCount()[WordReferenceRow.flag_app_dc_identifier]);
prop.put("searchresult_emphasized", ranked.flagCount()[WordReferenceRow.flag_app_emphasized]);
prop.put("searchresult_image", ranked.flagCount()[Condenser.flag_cat_hasimage]);
prop.put("searchresult_audio", ranked.flagCount()[Condenser.flag_cat_hasaudio]);
prop.put("searchresult_video", ranked.flagCount()[Condenser.flag_cat_hasvideo]);
prop.put("searchresult_app", ranked.flagCount()[Condenser.flag_cat_hasapp]);
prop.put("searchresult_indexof", ranked.flagCount()[Condenser.flag_cat_indexof]);
}
return ranked;
}
@@ -126,7 +127,7 @@ public class plasmaSearchAPI {
prop.put("genUrlList_lines", maxlines);
int i = 0;
yacyURL url;
MetadataRowContainer entry;
URLMetadataRow entry;
String us;
long rn = -1;
while ((ranked.size() > 0) && ((entry = ranked.bestURL(false)) != null)) {
@@ -161,17 +162,17 @@ public class plasmaSearchAPI {
prop.putNum("genUrlList_urlList_"+i+"_urlExists_urlcomps", entry.word().urlcomps());
prop.putNum("genUrlList_urlList_"+i+"_urlExists_urllength", entry.word().urllength());
prop.put("genUrlList_urlList_"+i+"_urlExists_props",
((entry.word().flags().get(plasmaCondenser.flag_cat_indexof)) ? "appears on index page, " : "") +
((entry.word().flags().get(plasmaCondenser.flag_cat_hasimage)) ? "contains images, " : "") +
((entry.word().flags().get(plasmaCondenser.flag_cat_hasaudio)) ? "contains audio, " : "") +
((entry.word().flags().get(plasmaCondenser.flag_cat_hasvideo)) ? "contains video, " : "") +
((entry.word().flags().get(plasmaCondenser.flag_cat_hasapp)) ? "contains applications, " : "") +
((entry.word().flags().get(Reference.flag_app_dc_identifier)) ? "appears in url, " : "") +
((entry.word().flags().get(Reference.flag_app_dc_title)) ? "appears in title, " : "") +
((entry.word().flags().get(Reference.flag_app_dc_creator)) ? "appears in author, " : "") +
((entry.word().flags().get(Reference.flag_app_dc_subject)) ? "appears in subject, " : "") +
((entry.word().flags().get(Reference.flag_app_dc_description)) ? "appears in description, " : "") +
((entry.word().flags().get(Reference.flag_app_emphasized)) ? "appears emphasized, " : "") +
((entry.word().flags().get(Condenser.flag_cat_indexof)) ? "appears on index page, " : "") +
((entry.word().flags().get(Condenser.flag_cat_hasimage)) ? "contains images, " : "") +
((entry.word().flags().get(Condenser.flag_cat_hasaudio)) ? "contains audio, " : "") +
((entry.word().flags().get(Condenser.flag_cat_hasvideo)) ? "contains video, " : "") +
((entry.word().flags().get(Condenser.flag_cat_hasapp)) ? "contains applications, " : "") +
((entry.word().flags().get(WordReferenceRow.flag_app_dc_identifier)) ? "appears in url, " : "") +
((entry.word().flags().get(WordReferenceRow.flag_app_dc_title)) ? "appears in title, " : "") +
((entry.word().flags().get(WordReferenceRow.flag_app_dc_creator)) ? "appears in author, " : "") +
((entry.word().flags().get(WordReferenceRow.flag_app_dc_subject)) ? "appears in subject, " : "") +
((entry.word().flags().get(WordReferenceRow.flag_app_dc_description)) ? "appears in description, " : "") +
((entry.word().flags().get(WordReferenceRow.flag_app_emphasized)) ? "appears emphasized, " : "") +
((yacyURL.probablyRootURL(entry.word().urlHash())) ? "probably root url" : "")
);
if (plasmaSwitchboard.urlBlacklist.isListed(Blacklist.BLACKLIST_DHT, url)) {
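After this refactoring the flag constants live in two homes, so a constraint Bitfield is composed from both; a sketch under that assumption (the helper class is invented):

    import de.anomic.kelondro.order.Bitfield;
    import de.anomic.kelondro.text.referencePrototype.WordReferenceRow;
    import de.anomic.plasma.parser.Condenser;

    final class ConstraintSketch {
        static Bitfield titleOnIndexPages() {
            final Bitfield b = new Bitfield(4); // 4 bytes, as in the code above
            b.set(WordReferenceRow.flag_app_dc_title, true); // word must appear in the title
            b.set(Condenser.flag_cat_indexof, true);         // page must be an "Index of" listing
            return b;
        }
    }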


@@ -39,17 +39,17 @@ import java.util.concurrent.ConcurrentHashMap;
import de.anomic.crawler.ResultURLs;
import de.anomic.kelondro.order.Bitfield;
import de.anomic.kelondro.text.MetadataRowContainer;
import de.anomic.kelondro.text.Reference;
import de.anomic.kelondro.text.ReferenceContainer;
import de.anomic.kelondro.text.ReferenceVars;
import de.anomic.kelondro.text.URLMetadata;
import de.anomic.kelondro.text.Word;
import de.anomic.kelondro.text.metadataPrototype.URLMetadataRow;
import de.anomic.kelondro.text.referencePrototype.WordReferenceVars;
import de.anomic.kelondro.util.MemoryControl;
import de.anomic.kelondro.util.SetTools;
import de.anomic.kelondro.util.SortStack;
import de.anomic.kelondro.util.SortStore;
import de.anomic.kelondro.util.Log;
import de.anomic.plasma.parser.Word;
import de.anomic.plasma.parser.Condenser;
import de.anomic.plasma.plasmaSnippetCache.MediaSnippet;
import de.anomic.server.serverProfiling;
import de.anomic.yacy.yacySearch;
@@ -180,7 +180,7 @@ public final class plasmaSearchEvent {
for (Map.Entry<String, ReferenceContainer> entry : this.rankedCache.searchContainerMaps()[0].entrySet()) {
wordhash = entry.getKey();
final ReferenceContainer container = entry.getValue();
assert (container.getWordHash().equals(wordhash));
assert (container.getTermHash().equals(wordhash));
if (container.size() > maxcount) {
IAmaxcounthash = wordhash;
maxcount = container.size();
@@ -264,7 +264,7 @@
}
}
ResultEntry obtainResultEntry(final MetadataRowContainer page, final int snippetFetchMode) {
ResultEntry obtainResultEntry(final URLMetadataRow page, final int snippetFetchMode) {
// a search result entry needs some work to produce a result Entry:
// - check if url entry exists in LURL-db
@@ -280,7 +280,7 @@
// find the url entry
long startTime = System.currentTimeMillis();
final URLMetadata metadata = page.metadata();
final URLMetadataRow.Components metadata = page.metadata();
final String pagetitle = metadata.dc_title().toLowerCase();
if (metadata.url() == null) {
registerFailure(page.hash(), "url corrupted (null)");
@@ -304,7 +304,7 @@
// check constraints
if ((query.constraint != null) &&
(query.constraint.get(plasmaCondenser.flag_cat_indexof)) &&
(query.constraint.get(Condenser.flag_cat_indexof)) &&
(!(metadata.dc_title().startsWith("Index of")))) {
final Iterator<String> wi = query.queryHashes.iterator();
while (wi.hasNext()) try { wordIndex.index().remove(wi.next(), page.hash()); } catch (IOException e) {}
@@ -337,7 +337,7 @@
if (query.contentdom == plasmaSearchQuery.CONTENTDOM_TEXT) {
// attach text snippet
startTime = System.currentTimeMillis();
final plasmaSnippetCache.TextSnippet snippet = plasmaSnippetCache.retrieveTextSnippet(metadata, snippetFetchWordHashes, (snippetFetchMode == 2), ((query.constraint != null) && (query.constraint.get(plasmaCondenser.flag_cat_indexof))), 180, 3000, (snippetFetchMode == 2) ? Integer.MAX_VALUE : 30000, query.isGlobal());
final plasmaSnippetCache.TextSnippet snippet = plasmaSnippetCache.retrieveTextSnippet(metadata, snippetFetchWordHashes, (snippetFetchMode == 2), ((query.constraint != null) && (query.constraint.get(Condenser.flag_cat_indexof))), 180, 3000, (snippetFetchMode == 2) ? Integer.MAX_VALUE : 30000, query.isGlobal());
final long snippetComputationTime = System.currentTimeMillis() - startTime;
Log.logInfo("SEARCH_EVENT", "text snippet load time for " + metadata.url() + ": " + snippetComputationTime + ", " + ((snippet.getErrorCode() < 11) ? "snippet found" : ("no snippet found (" + snippet.getError() + ")")));
@@ -512,7 +512,7 @@
public void run() {
// start fetching urls and snippets
MetadataRowContainer page;
URLMetadataRow page;
final int fetchAhead = snippetMode == 0 ? 0 : 10;
while (System.currentTimeMillis() < this.timeout) {
this.lastLifeSign = System.currentTimeMillis();
@@ -803,8 +803,8 @@
public static class ResultEntry {
// payload objects
private final MetadataRowContainer urlentry;
private final URLMetadata urlcomps; // buffer for components
private final URLMetadataRow urlentry;
private final URLMetadataRow.Components urlcomps; // buffer for components
private String alternative_urlstring;
private String alternative_urlname;
private final plasmaSnippetCache.TextSnippet textSnippet;
@@ -813,7 +813,7 @@
// statistic objects
public long dbRetrievalTime, snippetComputationTime;
public ResultEntry(final MetadataRowContainer urlentry, final plasmaWordIndex wordIndex,
public ResultEntry(final URLMetadataRow urlentry, final plasmaWordIndex wordIndex,
final plasmaSnippetCache.TextSnippet textSnippet,
final ArrayList<plasmaSnippetCache.MediaSnippet> mediaSnippets,
final long dbRetrievalTime, final long snippetComputationTime) {
@@ -837,7 +837,7 @@
// seed is not known from here
try {
wordIndex.index().remove(
Word.words2hashes(plasmaCondenser.getWords(
Word.words2hashes(Condenser.getWords(
("yacyshare " +
filename.replace('?', ' ') +
" " +
@@ -899,10 +899,10 @@
public int lapp() {
return urlentry.lapp();
}
public ReferenceVars word() {
public WordReferenceVars word() {
final Reference word = urlentry.word();
assert word instanceof ReferenceVars;
return (ReferenceVars) word;
assert word instanceof WordReferenceVars;
return (WordReferenceVars) word;
}
public boolean hasTextSnippet() {
return (this.textSnippet != null) && (this.textSnippet.getErrorCode() < 11);
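word() above narrows the generic Reference carried by the row to the word-specific variant, so callers reach the per-word ranking attributes through one asserted downcast. A sketch of the calling side (class and method names of the sketch are illustrative):

    import de.anomic.kelondro.text.referencePrototype.WordReferenceVars;
    import de.anomic.plasma.parser.Condenser;
    import de.anomic.plasma.plasmaSearchEvent;

    final class ResultEntryCallerSketch {
        static boolean isIndexOfHit(final plasmaSearchEvent.ResultEntry r) {
            final WordReferenceVars w = r.word(); // asserted downcast from Reference, as above
            return w.flags().get(Condenser.flag_cat_indexof); // per-word flags now reachable
        }
    }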


@@ -31,8 +31,9 @@ import de.anomic.htmlFilter.htmlFilterCharacterCoding;
import de.anomic.kelondro.order.Base64Order;
import de.anomic.kelondro.order.Bitfield;
import de.anomic.kelondro.order.NaturalOrder;
import de.anomic.kelondro.text.Word;
import de.anomic.kelondro.util.SetTools;
import de.anomic.plasma.parser.Word;
import de.anomic.plasma.parser.Condenser;
import de.anomic.yacy.yacySeed;
import de.anomic.yacy.yacySeedDB;
import de.anomic.yacy.yacyURL;
@@ -234,7 +235,7 @@ public final class plasmaSearchQuery {
public static final boolean matches(final String text, final TreeSet<String> keyhashes) {
// returns true if any of the word hashes in keyhashes appear in the String text
// to do this, all words in the string must be recognized and transcoded to word hashes
final TreeSet<String> wordhashes = Word.words2hashes(plasmaCondenser.getWords(text).keySet());
final TreeSet<String> wordhashes = Word.words2hashes(Condenser.getWords(text).keySet());
return SetTools.anymatch(wordhashes, keyhashes);
}
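Driving matches() means reducing both sides to word hashes first; a sketch (the wrapper class is invented):

    import java.util.TreeSet;

    import de.anomic.plasma.parser.Condenser;
    import de.anomic.plasma.parser.Word;
    import de.anomic.plasma.plasmaSearchQuery;

    final class MatchSketch {
        static boolean containsAny(final String text, final String keywords) {
            // hash the keywords once; matches() hashes the text side internally
            final TreeSet<String> keyhashes =
                Word.words2hashes(Condenser.getWords(keywords).keySet());
            return plasmaSearchQuery.matches(text, keyhashes);
        }
    }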


@@ -39,16 +39,16 @@ import java.util.concurrent.ConcurrentHashMap;
import de.anomic.htmlFilter.htmlFilterContentScraper;
import de.anomic.kelondro.index.BinSearch;
import de.anomic.kelondro.order.Digest;
import de.anomic.kelondro.text.MetadataRowContainer;
import de.anomic.kelondro.text.Reference;
import de.anomic.kelondro.text.ReferenceContainer;
import de.anomic.kelondro.text.ReferenceOrder;
import de.anomic.kelondro.text.ReferenceVars;
import de.anomic.kelondro.text.URLMetadata;
import de.anomic.kelondro.text.Word;
import de.anomic.kelondro.text.metadataPrototype.URLMetadataRow;
import de.anomic.kelondro.text.referencePrototype.WordReferenceVars;
import de.anomic.kelondro.util.ScoreCluster;
import de.anomic.kelondro.util.SortStack;
import de.anomic.kelondro.util.FileUtils;
import de.anomic.plasma.parser.Word;
import de.anomic.plasma.parser.Condenser;
import de.anomic.server.serverProfiling;
import de.anomic.yacy.yacyURL;
@@ -59,8 +59,8 @@ public final class plasmaSearchRankingProcess {
private static boolean useYBR = true;
private static final int maxDoubleDomAll = 20, maxDoubleDomSpecial = 10000;
private final SortStack<ReferenceVars> stack;
private final HashMap<String, SortStack<ReferenceVars>> doubleDomCache; // key = domhash (6 bytes); value = like stack
private final SortStack<WordReferenceVars> stack;
private final HashMap<String, SortStack<WordReferenceVars>> doubleDomCache; // key = domhash (6 bytes); value = like stack
private final HashMap<String, String> handover; // key = urlhash, value = urlstring; used for double-check of urls that had been handed over to search process
private final plasmaSearchQuery query;
private final int maxentries;
@@ -83,8 +83,8 @@
// attention: if minEntries is too high, this method will not terminate within the maxTime
// sortorder: 0 = hash, 1 = url, 2 = ranking
this.localSearchContainerMaps = null;
this.stack = new SortStack<ReferenceVars>(maxentries);
this.doubleDomCache = new HashMap<String, SortStack<ReferenceVars>>();
this.stack = new SortStack<WordReferenceVars>(maxentries);
this.doubleDomCache = new HashMap<String, SortStack<WordReferenceVars>>();
this.handover = new HashMap<String, String>();
this.order = (query == null) ? null : new ReferenceOrder(query.ranking, query.targetlang);
this.query = query;
@@ -103,7 +103,7 @@
for (int i = 0; i < 8; i++) {this.domZones[i] = 0;}
}
public long ranking(final ReferenceVars word) {
public long ranking(final WordReferenceVars word) {
return order.cardinal(word);
}
@@ -148,13 +148,13 @@
long timer = System.currentTimeMillis();
// normalize entries
final ArrayList<ReferenceVars> decodedEntries = this.order.normalizeWith(index);
final ArrayList<WordReferenceVars> decodedEntries = this.order.normalizeWith(index);
serverProfiling.update("SEARCH", new plasmaProfiling.searchEvent(query.id(true), plasmaSearchEvent.NORMALIZING, index.size(), System.currentTimeMillis() - timer), false);
// iterate over normalized entries and select some that are better than currently stored
timer = System.currentTimeMillis();
final Iterator<ReferenceVars> i = decodedEntries.iterator();
ReferenceVars iEntry;
final Iterator<WordReferenceVars> i = decodedEntries.iterator();
WordReferenceVars iEntry;
Long r;
while (i.hasNext()) {
iEntry = i.next();
@@ -175,10 +175,10 @@
// check document domain
if (query.contentdom != plasmaSearchQuery.CONTENTDOM_TEXT) {
if ((query.contentdom == plasmaSearchQuery.CONTENTDOM_AUDIO) && (!(iEntry.flags().get(plasmaCondenser.flag_cat_hasaudio)))) continue;
if ((query.contentdom == plasmaSearchQuery.CONTENTDOM_VIDEO) && (!(iEntry.flags().get(plasmaCondenser.flag_cat_hasvideo)))) continue;
if ((query.contentdom == plasmaSearchQuery.CONTENTDOM_IMAGE) && (!(iEntry.flags().get(plasmaCondenser.flag_cat_hasimage)))) continue;
if ((query.contentdom == plasmaSearchQuery.CONTENTDOM_APP ) && (!(iEntry.flags().get(plasmaCondenser.flag_cat_hasapp )))) continue;
if ((query.contentdom == plasmaSearchQuery.CONTENTDOM_AUDIO) && (!(iEntry.flags().get(Condenser.flag_cat_hasaudio)))) continue;
if ((query.contentdom == plasmaSearchQuery.CONTENTDOM_VIDEO) && (!(iEntry.flags().get(Condenser.flag_cat_hasvideo)))) continue;
if ((query.contentdom == plasmaSearchQuery.CONTENTDOM_IMAGE) && (!(iEntry.flags().get(Condenser.flag_cat_hasimage)))) continue;
if ((query.contentdom == plasmaSearchQuery.CONTENTDOM_APP ) && (!(iEntry.flags().get(Condenser.flag_cat_hasapp )))) continue;
}
// check tld domain
@@ -252,10 +252,10 @@
// - root-domain guessing to prefer the root domain over other urls if search word appears in domain name
private SortStack<ReferenceVars>.stackElement bestRWI(final boolean skipDoubleDom) {
private SortStack<WordReferenceVars>.stackElement bestRWI(final boolean skipDoubleDom) {
// returns from the current RWI list the best entry and removes this entry from the list
SortStack<ReferenceVars> m;
SortStack<ReferenceVars>.stackElement rwi;
SortStack<WordReferenceVars> m;
SortStack<WordReferenceVars>.stackElement rwi;
while (stack.size() > 0) {
rwi = stack.pop();
if (rwi == null) continue; // in case that a synchronization problem occurred just go lazy over it
@@ -265,7 +265,7 @@
m = this.doubleDomCache.get(domhash);
if (m == null) {
// first appearance of dom
m = new SortStack<ReferenceVars>((query.specialRights) ? maxDoubleDomSpecial : maxDoubleDomAll);
m = new SortStack<WordReferenceVars>((query.specialRights) ? maxDoubleDomSpecial : maxDoubleDomAll);
this.doubleDomCache.put(domhash, m);
return rwi;
}
@@ -274,9 +274,9 @@
}
// no more entries in sorted RWI entries. Now take Elements from the doubleDomCache
// find best entry from all caches
final Iterator<SortStack<ReferenceVars>> i = this.doubleDomCache.values().iterator();
SortStack<ReferenceVars>.stackElement bestEntry = null;
SortStack<ReferenceVars>.stackElement o;
final Iterator<SortStack<WordReferenceVars>> i = this.doubleDomCache.values().iterator();
SortStack<WordReferenceVars>.stackElement bestEntry = null;
SortStack<WordReferenceVars>.stackElement o;
while (i.hasNext()) {
m = i.next();
if (m == null) continue;
@@ -298,15 +298,15 @@
return bestEntry;
}
public MetadataRowContainer bestURL(final boolean skipDoubleDom) {
public URLMetadataRow bestURL(final boolean skipDoubleDom) {
// returns from the current RWI list the best URL entry and removes this entry from the list
while ((stack.size() > 0) || (size() > 0)) {
if (((stack.size() == 0) && (size() == 0))) break;
final SortStack<ReferenceVars>.stackElement obrwi = bestRWI(skipDoubleDom);
final SortStack<WordReferenceVars>.stackElement obrwi = bestRWI(skipDoubleDom);
if (obrwi == null) continue; // *** ? this happened and the thread was suspended silently. cause?
final MetadataRowContainer u = wordIndex.metadata().load(obrwi.element.urlHash(), obrwi.element, obrwi.weight.longValue());
final URLMetadataRow u = wordIndex.metadata().load(obrwi.element.urlHash(), obrwi.element, obrwi.weight.longValue());
if (u != null) {
final URLMetadata metadata = u.metadata();
final URLMetadataRow.Components metadata = u.metadata();
if (metadata.url() != null) this.handover.put(u.hash(), metadata.url().toNormalform(true, false)); // remember that we handed over this url
return u;
}
@@ -318,7 +318,7 @@
public int size() {
//assert sortedRWIEntries.size() == urlhashes.size() : "sortedRWIEntries.size() = " + sortedRWIEntries.size() + ", urlhashes.size() = " + urlhashes.size();
int c = stack.size();
final Iterator<SortStack<ReferenceVars>> i = this.doubleDomCache.values().iterator();
final Iterator<SortStack<WordReferenceVars>> i = this.doubleDomCache.values().iterator();
while (i.hasNext()) c += i.next().size();
return c;
}
@@ -355,7 +355,7 @@
}
public Reference remove(final String urlHash) {
final SortStack<ReferenceVars>.stackElement se = stack.remove(urlHash.hashCode());
final SortStack<WordReferenceVars>.stackElement se = stack.remove(urlHash.hashCode());
if (se == null) return null;
urlhashes.remove(urlHash);
return se.element;
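The doubleDomCache above caps how many results a single domain can contribute before the rest are consulted. The idea, reduced to plain JDK types (all names hypothetical; smaller ranking value is better, as with cardinal()):

    import java.util.HashMap;
    import java.util.Map;
    import java.util.PriorityQueue;

    final class DoubleDomCacheSketch {
        private final Map<String, PriorityQueue<Long>> perDomain =
            new HashMap<String, PriorityQueue<Long>>();

        void push(final String domainHash, final long ranking) {
            PriorityQueue<Long> q = perDomain.get(domainHash);
            if (q == null) { q = new PriorityQueue<Long>(); perDomain.put(domainHash, q); }
            q.add(ranking); // each domain keeps its own best-first queue
        }

        Long peekBestAcrossDomains() {
            Long best = null;
            for (final PriorityQueue<Long> q : perDomain.values()) {
                final Long head = q.peek();
                if (head != null && (best == null || head.longValue() < best.longValue())) best = head;
            }
            return best; // the caller would then remove it from its queue
        }
    }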


@@ -43,13 +43,14 @@ import de.anomic.htmlFilter.htmlFilterCharacterCoding;
import de.anomic.htmlFilter.htmlFilterImageEntry;
import de.anomic.http.httpClient;
import de.anomic.http.httpResponseHeader;
import de.anomic.kelondro.text.Document;
import de.anomic.kelondro.text.URLMetadata;
import de.anomic.kelondro.text.Word;
import de.anomic.kelondro.text.metadataPrototype.URLMetadataRow;
import de.anomic.kelondro.util.ScoreCluster;
import de.anomic.kelondro.util.SetTools;
import de.anomic.kelondro.util.Log;
import de.anomic.plasma.parser.Document;
import de.anomic.plasma.parser.ParserException;
import de.anomic.plasma.parser.Word;
import de.anomic.plasma.parser.Condenser;
import de.anomic.yacy.yacySearch;
import de.anomic.yacy.yacyURL;
@@ -302,7 +303,7 @@ public class plasmaSnippetCache {
}
@SuppressWarnings("unchecked")
public static TextSnippet retrieveTextSnippet(final URLMetadata comp, final Set<String> queryhashes, final boolean fetchOnline, final boolean pre, final int snippetMaxLength, final int timeout, final int maxDocLen, final boolean reindexing) {
public static TextSnippet retrieveTextSnippet(final URLMetadataRow.Components comp, final Set<String> queryhashes, final boolean fetchOnline, final boolean pre, final int snippetMaxLength, final int timeout, final int maxDocLen, final boolean reindexing) {
// heise = "0OQUNU3JSs05"
final yacyURL url = comp.url();
if (queryhashes.size() == 0) {
@@ -796,7 +797,7 @@ public class plasmaSnippetCache {
private static HashMap<String, Integer> hashSentence(final String sentence) {
// generates a word-wordPos mapping
final HashMap<String, Integer> map = new HashMap<String, Integer>();
final Enumeration<StringBuilder> words = plasmaCondenser.wordTokenizer(sentence, "UTF-8");
final Enumeration<StringBuilder> words = Condenser.wordTokenizer(sentence, "UTF-8");
int pos = 0;
StringBuilder word;
String hash;
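hashSentence() builds a word-to-position map over the tokenized sentence; stripped of the YaCy tokenizer and word hashing, the bookkeeping amounts to the following (first-occurrence handling is an assumption, the hunk ends before it):

    import java.util.HashMap;

    final class HashSentenceSketch {
        static HashMap<String, Integer> wordPositions(final String sentence) {
            final HashMap<String, Integer> map = new HashMap<String, Integer>();
            int pos = 0;
            for (final String word : sentence.split("\\s+")) {
                if (!map.containsKey(word)) map.put(word, pos); // keep the first position
                pos++;
            }
            return map;
        }
    }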


@@ -122,6 +122,7 @@ import de.anomic.crawler.ResultURLs;
import de.anomic.crawler.RobotsTxt;
import de.anomic.crawler.ZURL;
import de.anomic.crawler.CrawlProfile.entry;
import de.anomic.data.Blacklist;
import de.anomic.data.URLLicense;
import de.anomic.data.blogBoard;
import de.anomic.data.blogBoardComments;
@@ -139,18 +140,17 @@ import de.anomic.http.httpd;
import de.anomic.http.httpdRobotsTxtConfig;
import de.anomic.kelondro.order.Digest;
import de.anomic.kelondro.order.NaturalOrder;
import de.anomic.kelondro.text.Document;
import de.anomic.kelondro.text.MetadataRowContainer;
import de.anomic.kelondro.text.URLMetadata;
import de.anomic.kelondro.text.Blacklist;
import de.anomic.kelondro.text.Word;
import de.anomic.kelondro.text.metadataPrototype.URLMetadataRow;
import de.anomic.kelondro.util.DateFormatter;
import de.anomic.kelondro.util.FileUtils;
import de.anomic.kelondro.util.Log;
import de.anomic.kelondro.util.MemoryControl;
import de.anomic.kelondro.util.SetTools;
import de.anomic.net.UPnP;
import de.anomic.plasma.parser.Document;
import de.anomic.plasma.parser.ParserException;
import de.anomic.plasma.parser.Word;
import de.anomic.plasma.parser.Condenser;
import de.anomic.server.serverAbstractSwitch;
import de.anomic.server.serverBusyThread;
import de.anomic.server.serverCore;
@@ -930,7 +930,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch<IndexingStack.
if (urlhash.length() == 0) return null;
final yacyURL ne = crawlQueues.getURL(urlhash);
if (ne != null) return ne;
final MetadataRowContainer le = webIndex.metadata().load(urlhash, null, 0);
final URLMetadataRow le = webIndex.metadata().load(urlhash, null, 0);
if (le != null) return le.metadata().url();
return null;
}
@@ -1242,11 +1242,11 @@ public final class plasmaSwitchboard extends serverAbstractSwitch<IndexingStack.
public static class indexingQueueEntry extends serverProcessorJob {
public IndexingStack.QueueEntry queueEntry;
public plasmaParserDocument document;
public plasmaCondenser condenser;
public Condenser condenser;
public indexingQueueEntry(
final IndexingStack.QueueEntry queueEntry,
final plasmaParserDocument document,
final plasmaCondenser condenser) {
final Condenser condenser) {
super();
this.queueEntry = queueEntry;
this.document = document;
@@ -1595,7 +1595,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch<IndexingStack.
// strip out words and generate statistics
if (this.log.isFine()) log.logFine("Condensing for '" + in.queueEntry.url().toNormalform(false, true) + "'");
try {
plasmaCondenser condenser = new plasmaCondenser(in.document, in.queueEntry.profile().indexText(), in.queueEntry.profile().indexMedia());
Condenser condenser = new Condenser(in.document, in.queueEntry.profile().indexText(), in.queueEntry.profile().indexMedia());
// update image result list statistics
// its good to do this concurrently here, because it needs a DNS lookup
@@ -1623,7 +1623,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch<IndexingStack.
in.queueEntry.close();
}
private void storeDocumentIndex(final IndexingStack.QueueEntry queueEntry, final plasmaParserDocument document, final plasmaCondenser condenser) {
private void storeDocumentIndex(final IndexingStack.QueueEntry queueEntry, final plasmaParserDocument document, final Condenser condenser) {
// CREATE INDEX
final String dc_title = document.dc_title();
@@ -1634,7 +1634,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch<IndexingStack.
log.logInfo("Excluded " + condenser.excludeWords(stopwords) + " words in URL " + queueEntry.url());
// STORE URL TO LOADED-URL-DB
MetadataRowContainer newEntry = null;
URLMetadataRow newEntry = null;
try {
newEntry = webIndex.storeDocument(queueEntry, document, condenser);
} catch (final IOException e) {
@@ -1682,9 +1682,9 @@ public final class plasmaSwitchboard extends serverAbstractSwitch<IndexingStack.
public class receiptSending implements Runnable {
yacySeed initiatorPeer;
MetadataRowContainer reference;
URLMetadataRow reference;
public receiptSending(final yacySeed initiatorPeer, final MetadataRowContainer reference) {
public receiptSending(final yacySeed initiatorPeer, final URLMetadataRow reference) {
this.initiatorPeer = initiatorPeer;
this.reference = reference;
}
@@ -1729,9 +1729,9 @@ public final class plasmaSwitchboard extends serverAbstractSwitch<IndexingStack.
if (urlhash == null) return 0;
// determine the url string
final MetadataRowContainer entry = webIndex.metadata().load(urlhash, null, 0);
final URLMetadataRow entry = webIndex.metadata().load(urlhash, null, 0);
if (entry == null) return 0;
final URLMetadata metadata = entry.metadata();
final URLMetadataRow.Components metadata = entry.metadata();
if (metadata.url() == null) return 0;
InputStream resourceContent = null;
@@ -1757,7 +1757,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch<IndexingStack.
// get the word set
Set<String> words = null;
try {
words = new plasmaCondenser(document, true, true).words().keySet();
words = new Condenser(document, true, true).words().keySet();
} catch (final UnsupportedEncodingException e) {
e.printStackTrace();
}


@@ -317,7 +317,7 @@ public final class plasmaSwitchboardConstants {
*
* @see DefaultBlacklist for a detailed overview about the syntax of the default implementation
*/
public static final String BLACKLIST_CLASS_DEFAULT = "de.anomic.kelondro.text.DefaultBlacklist";
public static final String BLACKLIST_CLASS_DEFAULT = "de.anomic.data.DefaultBlacklist";
public static final String LIST_BLUE = "plasmaBlueList";
public static final String LIST_BLUE_DEFAULT = null;
public static final String LIST_BADWORDS_DEFAULT = "yacy.badwords";


@@ -42,6 +42,7 @@ import de.anomic.kelondro.order.MicroDate;
import de.anomic.kelondro.util.DateFormatter;
import de.anomic.kelondro.util.Log;
import de.anomic.kelondro.util.FileUtils;
import de.anomic.plasma.parser.Condenser;
import de.anomic.yacy.yacyURL;
public class plasmaWebStructure {
@@ -90,7 +91,7 @@ public class plasmaWebStructure {
}
}
public Integer[] /*(outlinksSame, outlinksOther)*/ generateCitationReference(final plasmaParserDocument document, final plasmaCondenser condenser, final Date docDate) {
public Integer[] /*(outlinksSame, outlinksOther)*/ generateCitationReference(final plasmaParserDocument document, final Condenser condenser, final Date docDate) {
final yacyURL url = document.dc_source();
// generate citation reference


@@ -38,6 +38,7 @@ import java.util.TreeSet;
import de.anomic.crawler.CrawlProfile;
import de.anomic.crawler.IndexingStack;
import de.anomic.data.Blacklist;
import de.anomic.htmlFilter.htmlFilterContentScraper;
import de.anomic.http.httpdProxyCacheEntry;
import de.anomic.kelondro.blob.BLOBArray;
@@ -47,16 +48,16 @@ import de.anomic.kelondro.text.BufferedIndex;
import de.anomic.kelondro.text.BufferedIndexCollection;
import de.anomic.kelondro.text.IndexCell;
import de.anomic.kelondro.text.IndexCollectionMigration;
import de.anomic.kelondro.text.MetadataRowContainer;
import de.anomic.kelondro.text.ReferenceContainer;
import de.anomic.kelondro.text.IODispatcher;
import de.anomic.kelondro.text.ReferenceRow;
import de.anomic.kelondro.text.MetadataRepository;
import de.anomic.kelondro.text.Word;
import de.anomic.kelondro.text.Blacklist;
import de.anomic.kelondro.text.metadataPrototype.URLMetadataRow;
import de.anomic.kelondro.text.referencePrototype.WordReferenceRow;
import de.anomic.kelondro.util.FileUtils;
import de.anomic.kelondro.util.kelondroException;
import de.anomic.kelondro.util.Log;
import de.anomic.plasma.parser.Word;
import de.anomic.plasma.parser.Condenser;
import de.anomic.tools.iso639;
import de.anomic.xml.RSSFeed;
import de.anomic.xml.RSSMessage;
@@ -146,7 +147,7 @@ public final class plasmaWordIndex {
new IndexCollectionMigration(
indexPrimaryTextLocation,
wordOrder,
ReferenceRow.urlEntryRow,
WordReferenceRow.urlEntryRow,
entityCacheMaxSize,
targetFileSize,
maxFileSize,
@@ -156,7 +157,7 @@
new BufferedIndexCollection(
indexPrimaryTextLocation,
wordOrder,
ReferenceRow.urlEntryRow,
WordReferenceRow.urlEntryRow,
entityCacheMaxSize,
useCommons,
redundancy,
@@ -167,7 +168,7 @@
this.index = new IndexCell(
new File(indexPrimaryTextLocation, "RICELL"),
wordOrder,
ReferenceRow.urlEntryRow,
WordReferenceRow.urlEntryRow,
entityCacheMaxSize,
targetFileSize,
maxFileSize,
@@ -408,7 +409,7 @@
* @param outlinksOther
* @return
*/
public int addPageIndex(final yacyURL url, final Date urlModified, final plasmaParserDocument document, final plasmaCondenser condenser, final String language, final char doctype, final int outlinksSame, final int outlinksOther) {
public int addPageIndex(final yacyURL url, final Date urlModified, final plasmaParserDocument document, final Condenser condenser, final String language, final char doctype, final int outlinksSame, final int outlinksOther) {
int wordCount = 0;
final int urlLength = url.toNormalform(true, true).length();
final int urlComps = htmlFilterContentScraper.urlComps(url.toString()).length;
@@ -417,14 +418,14 @@
final Iterator<Map.Entry<String, Word>> i = condenser.words().entrySet().iterator();
Map.Entry<String, Word> wentry;
String word;
ReferenceRow ientry;
WordReferenceRow ientry;
Word wprop;
while (i.hasNext()) {
wentry = i.next();
word = wentry.getKey();
wprop = wentry.getValue();
assert (wprop.flags != null);
ientry = new ReferenceRow(url.hash(),
ientry = new WordReferenceRow(url.hash(),
urlLength, urlComps, (document == null) ? urlLength : document.dc_title().length(),
wprop.count,
condenser.RESULT_NUMB_WORDS,
@@ -458,7 +459,7 @@
queuePreStack.close();
}
public MetadataRowContainer storeDocument(final IndexingStack.QueueEntry entry, final plasmaParserDocument document, final plasmaCondenser condenser) throws IOException {
public URLMetadataRow storeDocument(final IndexingStack.QueueEntry entry, final plasmaParserDocument document, final Condenser condenser) throws IOException {
final long startTime = System.currentTimeMillis();
// CREATE INDEX
@@ -511,7 +512,7 @@
// create a new loaded URL db entry
final long ldate = System.currentTimeMillis();
final MetadataRowContainer newEntry = new MetadataRowContainer(
final URLMetadataRow newEntry = new URLMetadataRow(
entry.url(), // URL
dc_title, // document description
document.dc_creator(), // author
@@ -649,7 +650,7 @@
public void run() {
Log.logInfo("INDEXCLEANER", "IndexCleaner-Thread started");
ReferenceContainer container = null;
ReferenceRow entry = null;
WordReferenceRow entry = null;
yacyURL url = null;
final HashSet<String> urlHashs = new HashSet<String>();
try {
@@ -657,14 +658,14 @@
while (indexContainerIterator.hasNext() && run) {
waiter();
container = indexContainerIterator.next();
final Iterator<ReferenceRow> containerIterator = container.entries();
wordHashNow = container.getWordHash();
final Iterator<WordReferenceRow> containerIterator = container.entries();
wordHashNow = container.getTermHash();
while (containerIterator.hasNext() && run) {
waiter();
entry = containerIterator.next();
// System.out.println("Wordhash: "+wordHash+" UrlHash:
// "+entry.getUrlHash());
final MetadataRowContainer ue = metadata.load(entry.urlHash(), entry, 0);
final URLMetadataRow ue = metadata.load(entry.urlHash(), entry, 0);
if (ue == null) {
urlHashs.add(entry.urlHash());
} else {
@@ -675,9 +676,9 @@
}
}
if (urlHashs.size() > 0) try {
final int removed = index.remove(container.getWordHash(), urlHashs);
Log.logFine("INDEXCLEANER", container.getWordHash() + ": " + removed + " of " + container.size() + " URL-entries deleted");
lastWordHash = container.getWordHash();
final int removed = index.remove(container.getTermHash(), urlHashs);
Log.logFine("INDEXCLEANER", container.getTermHash() + ": " + removed + " of " + container.size() + " URL-entries deleted");
lastWordHash = container.getTermHash();
lastDeletionCounter = urlHashs.size();
urlHashs.clear();
} catch (IOException e) {
@@ -686,10 +687,10 @@
if (!containerIterator.hasNext()) {
// We may not be finished yet, try to get the next chunk of wordHashes
final TreeSet<ReferenceContainer> containers = index.references(container.getWordHash(), false, 100, false);
final TreeSet<ReferenceContainer> containers = index.references(container.getTermHash(), false, 100, false);
indexContainerIterator = containers.iterator();
// Make sure we don't get the same wordhash twice, but don't skip a word
if ((indexContainerIterator.hasNext()) && (!container.getWordHash().equals(indexContainerIterator.next().getWordHash()))) {
if ((indexContainerIterator.hasNext()) && (!container.getTermHash().equals(indexContainerIterator.next().getTermHash()))) {
indexContainerIterator = containers.iterator();
}
}
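The chunked iteration above restarts each query at the last processed term hash, so the next chunk may lead with a repeat; the guard, distilled into a sketch (the wrapper class is invented):

    import java.util.Iterator;
    import java.util.TreeSet;

    import de.anomic.kelondro.text.ReferenceContainer;

    final class ChunkGuardSketch {
        static Iterator<ReferenceContainer> restart(final TreeSet<ReferenceContainer> chunk,
                                                    final String lastTermHash) {
            Iterator<ReferenceContainer> it = chunk.iterator();
            // drop the first container only when it repeats the hash already processed;
            // otherwise rewind so that no term is skipped
            if (it.hasNext() && !lastTermHash.equals(it.next().getTermHash())) {
                it = chunk.iterator();
            }
            return it;
        }
    }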


@@ -35,8 +35,8 @@ import java.util.Map;
import de.anomic.kelondro.order.Base64Order;
import de.anomic.kelondro.text.BufferedIndex;
import de.anomic.kelondro.text.ReferenceContainer;
import de.anomic.kelondro.text.ReferenceRow;
import de.anomic.kelondro.text.MetadataRepository;
import de.anomic.kelondro.text.referencePrototype.WordReferenceRow;
import de.anomic.kelondro.util.Log;
import de.anomic.server.serverProcessor;
import de.anomic.yacy.yacySeed;
@@ -181,7 +181,7 @@ public class Dispatcher {
(System.currentTimeMillis() < timeout) &&
((container = indexContainerIterator.next()) != null) &&
((containers.size() == 0) ||
(Base64Order.enhancedComparator.compare(container.getWordHash(), limitHash) < 0))
(Base64Order.enhancedComparator.compare(container.getTermHash(), limitHash) < 0))
) {
if (container.size() == 0) continue;
@@ -190,15 +190,15 @@
}
// then remove the container from the backend
HashSet<String> urlHashes = new HashSet<String>();
Iterator<ReferenceRow> it;
Iterator<WordReferenceRow> it;
for (ReferenceContainer c: containers) {
urlHashes.clear();
it = c.entries();
while (it.hasNext()) {
urlHashes.add(it.next().urlHash());
}
if (this.log.isFine()) this.log.logFine("selected " + urlHashes.size() + " urls for word '" + c.getWordHash() + "'");
if (urlHashes.size() > 0) this.backend.remove(c.getWordHash(), urlHashes);
if (this.log.isFine()) this.log.logFine("selected " + urlHashes.size() + " urls for word '" + c.getTermHash() + "'");
if (urlHashes.size() > 0) this.backend.remove(c.getTermHash(), urlHashes);
}
// finished. The caller must take care of the containers and must put them back if not needed
@@ -222,15 +222,15 @@
// check all entries and split them to the partitions
ReferenceContainer[] partitionBuffer = new ReferenceContainer[partitionCount];
ReferenceRow re;
WordReferenceRow re;
for (ReferenceContainer container: containers) {
// init the new partitions
for (int j = 0; j < partitionBuffer.length; j++) {
partitionBuffer[j] = new ReferenceContainer(container.getWordHash(), container.row(), container.size() / partitionCount);
partitionBuffer[j] = new ReferenceContainer(container.getTermHash(), container.row(), container.size() / partitionCount);
}
// split the container
Iterator<ReferenceRow> i = container.entries();
Iterator<WordReferenceRow> i = container.entries();
while (i.hasNext()) {
re = i.next();
if (re == null) continue;
@@ -263,7 +263,7 @@
for (int vertical = 0; vertical < containers.length; vertical++) {
// the 'new' primary target is the word hash of the last container
lastContainer = containers[vertical].get(containers[vertical].size() - 1);
primaryTarget = FlatWordPartitionScheme.positionToHash(this.seeds.scheme.dhtPosition(lastContainer.getWordHash(), vertical));
primaryTarget = FlatWordPartitionScheme.positionToHash(this.seeds.scheme.dhtPosition(lastContainer.getTermHash(), vertical));
// get or make a entry object
entry = this.transmissionCloud.get(primaryTarget); // if this is not null, the entry is extended here
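The splitting step above fans one container out into partitionCount buffers of equal expected size. A simplified sketch; the real code routes each reference by its DHT position, and the add(...) call is an assumption since the hunk ends before the insertion:

    import java.util.Iterator;

    import de.anomic.kelondro.text.ReferenceContainer;
    import de.anomic.kelondro.text.referencePrototype.WordReferenceRow;

    final class SplitSketch {
        static ReferenceContainer[] split(final ReferenceContainer container, final int partitionCount) {
            final ReferenceContainer[] buffer = new ReferenceContainer[partitionCount];
            for (int j = 0; j < partitionCount; j++) {
                buffer[j] = new ReferenceContainer(
                    container.getTermHash(), container.row(), container.size() / partitionCount);
            }
            final Iterator<WordReferenceRow> i = container.entries();
            int n = 0;
            while (i.hasNext()) {
                final WordReferenceRow re = i.next();
                if (re == null) continue;
                buffer[n++ % partitionCount].add(re); // round-robin here; DHT routing in the original
            }
            return buffer;
        }
    }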


@@ -32,11 +32,11 @@ import java.util.Iterator;
import de.anomic.kelondro.index.Row;
import de.anomic.kelondro.text.Index;
import de.anomic.kelondro.text.MetadataRowContainer;
import de.anomic.kelondro.text.ReferenceContainer;
import de.anomic.kelondro.text.ReferenceContainerCache;
import de.anomic.kelondro.text.ReferenceRow;
import de.anomic.kelondro.text.MetadataRepository;
import de.anomic.kelondro.text.metadataPrototype.URLMetadataRow;
import de.anomic.kelondro.text.referencePrototype.WordReferenceRow;
import de.anomic.kelondro.util.Log;
import de.anomic.plasma.plasmaWordIndex;
import de.anomic.server.serverProcessorJob;
@@ -88,7 +88,7 @@ public class Transmission {
*/
private String primaryTarget;
private ReferenceContainerCache containers;
private HashMap<String, MetadataRowContainer> references;
private HashMap<String, URLMetadataRow> references;
private HashSet<String> badReferences;
private ArrayList<yacySeed> targets;
private int hit, miss;
@@ -109,7 +109,7 @@
this.primaryTarget = primaryTarget;
this.containers = new ReferenceContainerCache(payloadrow, plasmaWordIndex.wordOrder);
this.containers.initWriteMode();
this.references = new HashMap<String, MetadataRowContainer>();
this.references = new HashMap<String, URLMetadataRow>();
this.badReferences = new HashSet<String>();
this.targets = targets;
this.hit = 0;
@@ -123,12 +123,12 @@
*/
public void add(ReferenceContainer container) {
// iterate through the entries in the container and check if the reference is in the repository
Iterator<ReferenceRow> i = container.entries();
Iterator<WordReferenceRow> i = container.entries();
ArrayList<String> notFound = new ArrayList<String>();
while (i.hasNext()) {
ReferenceRow e = i.next();
WordReferenceRow e = i.next();
if (references.containsKey(e.urlHash()) || badReferences.contains(e.urlHash())) continue;
MetadataRowContainer r = repository.load(e.urlHash(), null, 0);
URLMetadataRow r = repository.load(e.urlHash(), null, 0);
if (r == null) {
notFound.add(e.urlHash());
badReferences.add(e.urlHash());
@@ -204,7 +204,7 @@
Iterator<ReferenceContainer> i = this.containers.iterator();
ReferenceContainer firstContainer = (i == null) ? null : i.next();
log.logInfo("Index transfer of " + this.containers.size() +
" words [" + ((firstContainer == null) ? null : firstContainer.getWordHash()) + " .. " + this.primaryTarget + "]" +
" words [" + ((firstContainer == null) ? null : firstContainer.getTermHash()) + " .. " + this.primaryTarget + "]" +
" and " + this.references.size() + " URLs" +
" to peer " + target.getName() + ":" + target.hash +
" in " + (transferTime / 1000) +


@@ -60,6 +60,7 @@ import org.apache.commons.httpclient.methods.multipart.Part;
import de.anomic.crawler.HTTPLoader;
import de.anomic.crawler.ResultURLs;
import de.anomic.data.Blacklist;
import de.anomic.http.DefaultCharsetFilePart;
import de.anomic.http.DefaultCharsetStringPart;
import de.anomic.http.httpClient;
@@ -69,14 +70,11 @@ import de.anomic.http.httpRequestHeader;
import de.anomic.kelondro.order.Base64Order;
import de.anomic.kelondro.order.Bitfield;
import de.anomic.kelondro.order.Digest;
import de.anomic.kelondro.text.MetadataRowContainer;
import de.anomic.kelondro.text.Reference;
import de.anomic.kelondro.text.ReferenceContainer;
import de.anomic.kelondro.text.ReferenceContainerCache;
import de.anomic.kelondro.text.ReferenceRow;
import de.anomic.kelondro.text.URLMetadata;
import de.anomic.kelondro.text.Word;
import de.anomic.kelondro.text.Blacklist;
import de.anomic.kelondro.text.metadataPrototype.URLMetadataRow;
import de.anomic.kelondro.text.referencePrototype.WordReferenceRow;
import de.anomic.kelondro.util.ByteBuffer;
import de.anomic.kelondro.util.FileUtils;
import de.anomic.plasma.plasmaSearchRankingProcess;
@@ -85,6 +83,7 @@ import de.anomic.plasma.plasmaSnippetCache;
import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.plasma.plasmaSwitchboardConstants;
import de.anomic.plasma.plasmaWordIndex;
import de.anomic.plasma.parser.Word;
import de.anomic.server.serverCore;
import de.anomic.server.serverDomains;
import de.anomic.tools.crypt;
@@ -533,15 +532,15 @@
}
// insert results to containers
MetadataRowContainer urlEntry;
URLMetadataRow urlEntry;
final String[] urls = new String[results];
for (int n = 0; n < results; n++) {
// get one single search result
urlEntry = MetadataRowContainer.importEntry(result.get("resource" + n));
urlEntry = URLMetadataRow.importEntry(result.get("resource" + n));
if (urlEntry == null) continue;
assert (urlEntry.hash().length() == 12) : "urlEntry.hash() = " + urlEntry.hash();
if (urlEntry.hash().length() != 12) continue; // bad url hash
final URLMetadata metadata = urlEntry.metadata();
final URLMetadataRow.Components metadata = urlEntry.metadata();
if (blacklist.isListed(Blacklist.BLACKLIST_SEARCH, metadata.url())) {
yacyCore.log.logInfo("remote search (client): filtered blacklisted url " + metadata.url() + " from peer " + target.getName());
continue; // block with backlist
@@ -796,7 +795,7 @@
return "wrong protocol: " + protocol;
}
public static HashMap<String, String> crawlReceipt(final yacySeed mySeed, final yacySeed target, final String process, final String result, final String reason, final MetadataRowContainer entry, final String wordhashes) {
public static HashMap<String, String> crawlReceipt(final yacySeed mySeed, final yacySeed target, final String process, final String result, final String reason, final URLMetadataRow entry, final String wordhashes) {
assert (target != null);
assert (mySeed != null);
assert (mySeed != target);
@@ -859,7 +858,7 @@
public static String transferIndex(
final yacySeed targetSeed,
final ReferenceContainerCache indexes,
final HashMap<String, MetadataRowContainer> urlCache,
final HashMap<String, URLMetadataRow> urlCache,
final boolean gzipBody,
final int timeout) {
@@ -868,7 +867,7 @@
try {
// check if we got all necessary urls in the urlCache (only for debugging)
Iterator<ReferenceRow> eenum;
Iterator<WordReferenceRow> eenum;
Reference entry;
for (ReferenceContainer ic: indexes) {
eenum = ic.entries();
@@ -911,7 +910,7 @@
if (uhs.length == 0) { return null; } // all url's known
// extract the urlCache from the result
final MetadataRowContainer[] urls = new MetadataRowContainer[uhs.length];
final URLMetadataRow[] urls = new URLMetadataRow[uhs.length];
for (int i = 0; i < uhs.length; i++) {
urls[i] = urlCache.get(uhs[i]);
if (urls[i] == null) {
@@ -963,13 +962,13 @@
int indexcount = 0;
final StringBuilder entrypost = new StringBuilder(indexes.size() * 73);
Iterator<ReferenceRow> eenum;
Iterator<WordReferenceRow> eenum;
Reference entry;
for (ReferenceContainer ic: indexes) {
eenum = ic.entries();
while (eenum.hasNext()) {
entry = eenum.next();
entrypost.append(ic.getWordHash())
entrypost.append(ic.getTermHash())
.append(entry.toPropertyForm())
.append(serverCore.CRLF_STRING);
indexcount++;
@@ -1001,7 +1000,7 @@
}
}
private static HashMap<String, String> transferURL(final yacySeed targetSeed, final MetadataRowContainer[] urls, boolean gzipBody, final int timeout) {
private static HashMap<String, String> transferURL(final yacySeed targetSeed, final URLMetadataRow[] urls, boolean gzipBody, final int timeout) {
// this post a message to the remote message board
final String address = targetSeed.getPublicAddress();
if (address == null) { return null; }
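The transfer payload built above is line-oriented: each posting is the term hash followed by the reference in property form, one CRLF-terminated line each. A sketch of the assembly (the wrapper class is invented):

    import java.util.Iterator;

    import de.anomic.kelondro.text.ReferenceContainer;
    import de.anomic.kelondro.text.ReferenceContainerCache;
    import de.anomic.kelondro.text.referencePrototype.WordReferenceRow;
    import de.anomic.server.serverCore;

    final class EntryPostSketch {
        static String assemble(final ReferenceContainerCache indexes) {
            final StringBuilder entrypost = new StringBuilder(indexes.size() * 73); // as sized above
            for (final ReferenceContainer ic : indexes) {
                final Iterator<WordReferenceRow> eenum = ic.entries();
                while (eenum.hasNext()) {
                    entrypost.append(ic.getTermHash())            // the term this posting belongs to
                             .append(eenum.next().toPropertyForm())
                             .append(serverCore.CRLF_STRING);     // one posting per line
                }
            }
            return entrypost.toString();
        }
    }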


@@ -50,7 +50,7 @@ import java.util.HashSet;
import java.util.Iterator;
import java.util.Map;
import de.anomic.kelondro.text.Blacklist;
import de.anomic.data.Blacklist;
import de.anomic.plasma.plasmaSwitchboard;
public class yacyNewsPool {


@@ -51,8 +51,8 @@ import java.util.Set;
import java.util.TreeMap;
import de.anomic.crawler.ResultURLs;
import de.anomic.data.Blacklist;
import de.anomic.kelondro.order.Bitfield;
import de.anomic.kelondro.text.Blacklist;
import de.anomic.kelondro.util.ScoreCluster;
import de.anomic.kelondro.util.Log;
import de.anomic.plasma.plasmaSearchQuery;


@@ -57,9 +57,9 @@ import java.util.TreeMap;
import de.anomic.kelondro.order.Base64Order;
import de.anomic.kelondro.order.Digest;
import de.anomic.kelondro.text.Word;
import de.anomic.kelondro.util.DateFormatter;
import de.anomic.net.natLib;
import de.anomic.plasma.parser.Word;
import de.anomic.server.serverCodings;
import de.anomic.server.serverDomains;
import de.anomic.server.serverSystem;


@@ -35,10 +35,10 @@ import java.net.MalformedURLException;
import javax.imageio.ImageIO;
import de.anomic.kelondro.text.Document;
import de.anomic.kelondro.util.Log;
import de.anomic.plasma.plasmaHTCache;
import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.plasma.parser.Document;
import de.anomic.yacy.yacyURL;
public class ymageOSM {


@@ -56,12 +56,11 @@ import de.anomic.kelondro.blob.BLOBHeap;
import de.anomic.kelondro.blob.MapDataMining;
import de.anomic.kelondro.index.RowCollection;
import de.anomic.kelondro.order.Base64Order;
import de.anomic.kelondro.text.MetadataRowContainer;
import de.anomic.kelondro.text.Reference;
import de.anomic.kelondro.text.ReferenceContainer;
import de.anomic.kelondro.text.ReferenceRow;
import de.anomic.kelondro.text.MetadataRepository;
import de.anomic.kelondro.text.Word;
import de.anomic.kelondro.text.metadataPrototype.URLMetadataRow;
import de.anomic.kelondro.text.referencePrototype.WordReferenceRow;
import de.anomic.kelondro.util.DateFormatter;
import de.anomic.kelondro.util.MemoryControl;
import de.anomic.kelondro.util.ScoreCluster;
@@ -70,6 +69,7 @@ import de.anomic.kelondro.util.FileUtils;
import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.plasma.plasmaSwitchboardConstants;
import de.anomic.plasma.plasmaWordIndex;
import de.anomic.plasma.parser.Word;
import de.anomic.server.serverCore;
import de.anomic.server.serverSemaphore;
import de.anomic.server.serverSystem;
@@ -689,13 +689,13 @@ public final class yacy {
wordIdxContainer = indexContainerIterator.next();
// the combined container will fit, read the container
final Iterator<ReferenceRow> wordIdxEntries = wordIdxContainer.entries();
final Iterator<WordReferenceRow> wordIdxEntries = wordIdxContainer.entries();
Reference iEntry;
while (wordIdxEntries.hasNext()) {
iEntry = wordIdxEntries.next();
final String urlHash = iEntry.urlHash();
if ((currentUrlDB.exists(urlHash)) && (!minimizedUrlDB.exists(urlHash))) try {
final MetadataRowContainer urlEntry = currentUrlDB.load(urlHash, null, 0);
final URLMetadataRow urlEntry = currentUrlDB.load(urlHash, null, 0);
urlCounter++;
minimizedUrlDB.store(urlEntry);
if (urlCounter % 500 == 0) {
@@ -705,7 +705,7 @@
}
if (wordCounter%500 == 0) {
wordChunkEndHash = wordIdxContainer.getWordHash();
wordChunkEndHash = wordIdxContainer.getTermHash();
wordChunkEnd = System.currentTimeMillis();
final long duration = wordChunkEnd - wordChunkStart;
log.logInfo(wordCounter + " words scanned " +
@@ -881,10 +881,10 @@
while (indexContainerIterator.hasNext()) {
counter++;
container = indexContainerIterator.next();
bos.write((container.getWordHash()).getBytes());
bos.write((container.getTermHash()).getBytes());
bos.write(serverCore.CRLF);
if (counter % 500 == 0) {
log.logInfo("Found " + counter + " Hashs until now. Last found Hash: " + container.getWordHash());
log.logInfo("Found " + counter + " Hashs until now. Last found Hash: " + container.getTermHash());
}
}
}
@@ -898,17 +898,17 @@
while (indexContainerIterator.hasNext()) {
counter++;
container = indexContainerIterator.next();
bos.write((container.getWordHash()).getBytes());
bos.write((container.getTermHash()).getBytes());
bos.write(serverCore.CRLF);
if (counter % 500 == 0) {
log.logInfo("Found " + counter + " Hashs until now. Last found Hash: " + container.getWordHash());
log.logInfo("Found " + counter + " Hashs until now. Last found Hash: " + container.getTermHash());
}
}
}
bos.flush();
bos.close();
}
log.logInfo("Total number of Hashs: " + counter + ". Last found Hash: " + (container == null ? "null" : container.getWordHash()));
log.logInfo("Total number of Hashs: " + counter + ". Last found Hash: " + (container == null ? "null" : container.getTermHash()));
} catch (final IOException e) {
log.logSevere("IOException", e);
}
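Both export branches above share the same output format, one term hash per CRLF-terminated line; distilled into a sketch with the progress logging elided (the wrapper class is invented):

    import java.io.BufferedOutputStream;
    import java.io.FileOutputStream;
    import java.io.IOException;
    import java.util.Iterator;

    import de.anomic.kelondro.text.ReferenceContainer;
    import de.anomic.server.serverCore;

    final class HashDumpSketch {
        static int dump(final Iterator<ReferenceContainer> containers, final String path) throws IOException {
            int counter = 0;
            final BufferedOutputStream bos = new BufferedOutputStream(new FileOutputStream(path));
            try {
                while (containers.hasNext()) {
                    bos.write(containers.next().getTermHash().getBytes()); // one term hash ...
                    bos.write(serverCore.CRLF);                            // ... per line
                    counter++;
                }
            } finally {
                bos.close(); // flushes as well
            }
            return counter;
        }
    }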