diff --git a/build.properties b/build.properties index 1e90a7963..2f1206b02 100644 --- a/build.properties +++ b/build.properties @@ -3,7 +3,7 @@ javacSource=1.5 javacTarget=1.5 # Release Configuration -releaseVersion=0.73 +releaseVersion=0.74 stdReleaseFile=yacy_v${releaseVersion}_${DSTAMP}_${releaseNr}.tar.gz embReleaseFile=yacy_emb_v${releaseVersion}_${DSTAMP}_${releaseNr}.tar.gz proReleaseFile=yacy_pro_v${releaseVersion}_${DSTAMP}_${releaseNr}.tar.gz diff --git a/htroot/BlacklistCleaner_p.java b/htroot/BlacklistCleaner_p.java index 5a77ce62d..5cfae65ae 100644 --- a/htroot/BlacklistCleaner_p.java +++ b/htroot/BlacklistCleaner_p.java @@ -45,11 +45,11 @@ import java.util.regex.Matcher; import java.util.regex.Pattern; import java.util.regex.PatternSyntaxException; +import de.anomic.data.AbstractBlacklist; +import de.anomic.data.Blacklist; +import de.anomic.data.DefaultBlacklist; import de.anomic.data.listManager; import de.anomic.http.httpRequestHeader; -import de.anomic.kelondro.text.Blacklist; -import de.anomic.kelondro.text.AbstractBlacklist; -import de.anomic.kelondro.text.DefaultBlacklist; import de.anomic.kelondro.util.Log; import de.anomic.plasma.plasmaSwitchboard; import de.anomic.server.serverObjects; diff --git a/htroot/BlacklistTest_p.java b/htroot/BlacklistTest_p.java index 22b25ac73..5a57baa88 100644 --- a/htroot/BlacklistTest_p.java +++ b/htroot/BlacklistTest_p.java @@ -32,9 +32,9 @@ import java.io.File; import java.net.MalformedURLException; +import de.anomic.data.Blacklist; import de.anomic.data.listManager; import de.anomic.http.httpRequestHeader; -import de.anomic.kelondro.text.Blacklist; import de.anomic.plasma.plasmaSwitchboard; import de.anomic.server.serverObjects; import de.anomic.server.serverSwitch; diff --git a/htroot/Blacklist_p.java b/htroot/Blacklist_p.java index 4f75fe610..d04d17f59 100644 --- a/htroot/Blacklist_p.java +++ b/htroot/Blacklist_p.java @@ -38,10 +38,10 @@ import java.util.ArrayList; import java.util.Arrays; import java.util.List; +import de.anomic.data.AbstractBlacklist; +import de.anomic.data.Blacklist; import de.anomic.data.listManager; import de.anomic.http.httpRequestHeader; -import de.anomic.kelondro.text.Blacklist; -import de.anomic.kelondro.text.AbstractBlacklist; import de.anomic.kelondro.util.Log; import de.anomic.plasma.plasmaSwitchboard; import de.anomic.server.serverObjects; diff --git a/htroot/Bookmarks.java b/htroot/Bookmarks.java index 3b178b5e6..d491ca3fd 100644 --- a/htroot/Bookmarks.java +++ b/htroot/Bookmarks.java @@ -41,8 +41,7 @@ import de.anomic.data.listManager; import de.anomic.data.userDB; import de.anomic.data.bookmarksDB.Tag; import de.anomic.http.httpRequestHeader; -import de.anomic.kelondro.text.MetadataRowContainer; -import de.anomic.kelondro.text.URLMetadata; +import de.anomic.kelondro.text.metadataPrototype.URLMetadataRow; import de.anomic.kelondro.util.DateFormatter; import de.anomic.kelondro.util.Log; import de.anomic.plasma.plasmaParserDocument; @@ -184,10 +183,10 @@ public class Bookmarks { final bookmarksDB.Bookmark bookmark = sb.bookmarksDB.getBookmark(urlHash); if (bookmark == null) { // try to get the bookmark from the LURL database - final MetadataRowContainer urlentry = sb.webIndex.metadata().load(urlHash, null, 0); + final URLMetadataRow urlentry = sb.webIndex.metadata().load(urlHash, null, 0); plasmaParserDocument document = null; if (urlentry != null) { - final URLMetadata metadata = urlentry.metadata(); + final URLMetadataRow.Components metadata = urlentry.metadata(); document = 
plasmaSnippetCache.retrieveDocument(metadata.url(), true, 5000, true, false); prop.put("mode_edit", "0"); // create mode prop.put("mode_url", metadata.url().toNormalform(false, true)); diff --git a/htroot/CrawlResults.java b/htroot/CrawlResults.java index 3b31ebc2d..ce8df5eb2 100644 --- a/htroot/CrawlResults.java +++ b/htroot/CrawlResults.java @@ -31,8 +31,7 @@ import java.util.Iterator; import java.util.Locale; import de.anomic.http.httpRequestHeader; -import de.anomic.kelondro.text.MetadataRowContainer; -import de.anomic.kelondro.text.URLMetadata; +import de.anomic.kelondro.text.metadataPrototype.URLMetadataRow; import de.anomic.kelondro.util.Log; import de.anomic.plasma.plasmaSwitchboard; import de.anomic.server.serverObjects; @@ -170,8 +169,8 @@ public class CrawlResults { String urlHash, initiatorHash, executorHash; String urlstr, urltxt; yacySeed initiatorSeed, executorSeed; - MetadataRowContainer urle; - URLMetadata metadata; + URLMetadataRow urle; + URLMetadataRow.Components metadata; int i, cnt = 0; for (i = sb.crawlResults.getStackSize(tabletype) - 1; i >= (sb.crawlResults.getStackSize(tabletype) - lines); i--) { diff --git a/htroot/IndexControlRWIs_p.java b/htroot/IndexControlRWIs_p.java index 1bb1e4d9b..d4841faaa 100644 --- a/htroot/IndexControlRWIs_p.java +++ b/htroot/IndexControlRWIs_p.java @@ -34,21 +34,21 @@ import java.util.HashSet; import java.util.Iterator; import java.util.Set; +import de.anomic.data.AbstractBlacklist; import de.anomic.data.listManager; import de.anomic.http.httpRequestHeader; import de.anomic.kelondro.order.Bitfield; -import de.anomic.kelondro.text.MetadataRowContainer; import de.anomic.kelondro.text.Reference; import de.anomic.kelondro.text.ReferenceContainer; import de.anomic.kelondro.text.ReferenceContainerCache; -import de.anomic.kelondro.text.ReferenceRow; -import de.anomic.kelondro.text.Word; -import de.anomic.kelondro.text.AbstractBlacklist; +import de.anomic.kelondro.text.metadataPrototype.URLMetadataRow; +import de.anomic.kelondro.text.referencePrototype.WordReferenceRow; import de.anomic.plasma.plasmaSearchAPI; import de.anomic.plasma.plasmaSearchEvent; import de.anomic.plasma.plasmaSearchRankingProcess; import de.anomic.plasma.plasmaSwitchboard; import de.anomic.plasma.plasmaWordIndex; +import de.anomic.plasma.parser.Word; import de.anomic.server.serverObjects; import de.anomic.server.serverSwitch; import de.anomic.yacy.yacyClient; @@ -126,7 +126,7 @@ public class IndexControlRWIs_p { // generate an urlx array ReferenceContainer index = null; index = sb.webIndex.index().get(keyhash, null); - final Iterator en = index.entries(); + final Iterator en = index.entries(); int i = 0; urlx = new String[index.size()]; while (en.hasNext()) { @@ -207,11 +207,11 @@ public class IndexControlRWIs_p { final long starttime = System.currentTimeMillis(); index = sb.webIndex.index().get(keyhash, null); // built urlCache - final Iterator urlIter = index.entries(); - final HashMap knownURLs = new HashMap(); + final Iterator urlIter = index.entries(); + final HashMap knownURLs = new HashMap(); final HashSet unknownURLEntries = new HashSet(); Reference iEntry; - MetadataRowContainer lurl; + URLMetadataRow lurl; while (urlIter.hasNext()) { iEntry = urlIter.next(); lurl = sb.webIndex.metadata().load(iEntry.urlHash(), null, 0); @@ -251,7 +251,7 @@ public class IndexControlRWIs_p { prop.put("keyhashsimilar", "1"); while (containerIt.hasNext() && i < 256) { container = containerIt.next(); - prop.put("keyhashsimilar_rows_"+rows+"_cols_"+cols+"_wordHash", 
container.getWordHash()); + prop.put("keyhashsimilar_rows_"+rows+"_cols_"+cols+"_wordHash", container.getTermHash()); cols++; if (cols==8) { prop.put("keyhashsimilar_rows_"+rows+"_cols", cols); @@ -278,7 +278,7 @@ public class IndexControlRWIs_p { yacyURL url; for (int i=0; i entryIt = new RotateIterator(sb.webIndex.metadata().entries(true, urlhash), new String(Base64Order.zero((urlhash == null ? 0 : urlhash.length()))), sb.webIndex.index().size()); + final Iterator entryIt = new RotateIterator(sb.webIndex.metadata().entries(true, urlhash), new String(Base64Order.zero((urlhash == null ? 0 : urlhash.length()))), sb.webIndex.index().size()); final StringBuilder result = new StringBuilder("Sequential List of URL-Hashes:
"); - MetadataRowContainer entry; + URLMetadataRow entry; int i = 0; int rows = 0, cols = 0; prop.put("urlhashsimilar", "1"); @@ -286,15 +285,15 @@ public class IndexControlURLs_p { return prop; } - private static serverObjects genUrlProfile(final plasmaSwitchboard switchboard, final MetadataRowContainer entry, final String urlhash) { + private static serverObjects genUrlProfile(final plasmaSwitchboard switchboard, final URLMetadataRow entry, final String urlhash) { final serverObjects prop = new serverObjects(); if (entry == null) { prop.put("genUrlProfile", "1"); prop.put("genUrlProfile_urlhash", urlhash); return prop; } - final URLMetadata metadata = entry.metadata(); - final MetadataRowContainer le = ((entry.referrerHash() == null) || (entry.referrerHash().length() != yacySeedDB.commonHashLength)) ? null : switchboard.webIndex.metadata().load(entry.referrerHash(), null, 0); + final URLMetadataRow.Components metadata = entry.metadata(); + final URLMetadataRow le = ((entry.referrerHash() == null) || (entry.referrerHash().length() != yacySeedDB.commonHashLength)) ? null : switchboard.webIndex.metadata().load(entry.referrerHash(), null, 0); if (metadata.url() == null) { prop.put("genUrlProfile", "1"); prop.put("genUrlProfile_urlhash", urlhash); diff --git a/htroot/Supporter.java b/htroot/Supporter.java index ce5a2136a..311b9c651 100644 --- a/htroot/Supporter.java +++ b/htroot/Supporter.java @@ -31,11 +31,11 @@ import java.util.Date; import java.util.HashMap; import java.util.Iterator; +import de.anomic.data.Blacklist; import de.anomic.http.httpRequestHeader; import de.anomic.kelondro.index.Row; import de.anomic.kelondro.index.Row.Entry; import de.anomic.kelondro.order.NaturalOrder; -import de.anomic.kelondro.text.Blacklist; import de.anomic.kelondro.util.DateFormatter; import de.anomic.kelondro.util.ScoreCluster; import de.anomic.plasma.plasmaSwitchboard; diff --git a/htroot/Surftips.java b/htroot/Surftips.java index ee95a25c6..ab02b9f0a 100644 --- a/htroot/Surftips.java +++ b/htroot/Surftips.java @@ -31,11 +31,11 @@ import java.util.Date; import java.util.HashMap; import java.util.Iterator; +import de.anomic.data.Blacklist; import de.anomic.http.httpRequestHeader; import de.anomic.kelondro.index.Row; import de.anomic.kelondro.index.Row.Entry; import de.anomic.kelondro.order.NaturalOrder; -import de.anomic.kelondro.text.Blacklist; import de.anomic.kelondro.util.DateFormatter; import de.anomic.kelondro.util.ScoreCluster; import de.anomic.plasma.plasmaSwitchboard; diff --git a/htroot/ViewFile.java b/htroot/ViewFile.java index 6a4b02517..4eec2a5db 100644 --- a/htroot/ViewFile.java +++ b/htroot/ViewFile.java @@ -39,16 +39,15 @@ import de.anomic.htmlFilter.htmlFilterCharacterCoding; import de.anomic.http.httpClient; import de.anomic.http.httpRequestHeader; import de.anomic.http.httpResponseHeader; -import de.anomic.kelondro.text.Document; -import de.anomic.kelondro.text.MetadataRowContainer; -import de.anomic.kelondro.text.URLMetadata; +import de.anomic.kelondro.text.metadataPrototype.URLMetadataRow; import de.anomic.kelondro.util.FileUtils; -import de.anomic.plasma.plasmaCondenser; import de.anomic.plasma.plasmaHTCache; import de.anomic.plasma.plasmaParserDocument; import de.anomic.plasma.plasmaSnippetCache; import de.anomic.plasma.plasmaSwitchboard; +import de.anomic.plasma.parser.Document; import de.anomic.plasma.parser.ParserException; +import de.anomic.plasma.parser.Condenser; import de.anomic.server.serverObjects; import de.anomic.server.serverSwitch; import de.anomic.yacy.yacyURL; @@ 
-95,7 +94,7 @@ public class ViewFile { final String urlHash = post.get("urlHash",""); if (urlHash.length() > 0) { // getting the urlEntry that belongs to the url hash - MetadataRowContainer urlEntry = null; + URLMetadataRow urlEntry = null; urlEntry = sb.webIndex.metadata().load(urlHash, null, 0); if (urlEntry == null) { prop.put("error", "2"); @@ -104,7 +103,7 @@ public class ViewFile { } // getting the url that belongs to the entry - final URLMetadata metadata = urlEntry.metadata(); + final URLMetadataRow.Components metadata = urlEntry.metadata(); if ((metadata == null) || (metadata.url() == null)) { prop.put("error", "3"); prop.put("viewMode", VIEW_MODE_NO_TEXT); @@ -114,7 +113,7 @@ public class ViewFile { descr = metadata.dc_title(); urlEntry.wordCount(); size = urlEntry.size(); - pre = urlEntry.flags().get(plasmaCondenser.flag_cat_indexof); + pre = urlEntry.flags().get(Condenser.flag_cat_indexof); } // alternatively, get the url simply from a url String @@ -312,7 +311,7 @@ public class ViewFile { // Search word highlighting while (sentences.hasNext()) { sentence = sentences.next().toString(); - Enumeration tokens = plasmaCondenser.wordTokenizer(sentence, "UTF-8"); + Enumeration tokens = Condenser.wordTokenizer(sentence, "UTF-8"); while (tokens.hasMoreElements()) { token = tokens.nextElement().toString(); if (token.length() > 0) { diff --git a/htroot/api/blacklists_p.java b/htroot/api/blacklists_p.java index 64f5c8058..ab591d010 100644 --- a/htroot/api/blacklists_p.java +++ b/htroot/api/blacklists_p.java @@ -2,9 +2,9 @@ import java.io.File; import java.util.List; +import de.anomic.data.AbstractBlacklist; import de.anomic.data.listManager; import de.anomic.http.httpRequestHeader; -import de.anomic.kelondro.text.AbstractBlacklist; import de.anomic.server.serverObjects; import de.anomic.server.serverSwitch; diff --git a/htroot/api/timeline.java b/htroot/api/timeline.java index 6dc39bd05..79017a91d 100644 --- a/htroot/api/timeline.java +++ b/htroot/api/timeline.java @@ -31,7 +31,7 @@ import java.util.TreeSet; import de.anomic.http.httpRequestHeader; import de.anomic.kelondro.text.ReferenceContainer; -import de.anomic.kelondro.text.ReferenceRow; +import de.anomic.kelondro.text.referencePrototype.WordReferenceRow; import de.anomic.kelondro.util.DateFormatter; import de.anomic.plasma.plasmaSearchQuery; import de.anomic.plasma.plasmaSwitchboard; @@ -85,8 +85,8 @@ public final class timeline { localSearchContainerMaps[1].values(), maxdist); - Iterator i = index.entries(); - ReferenceRow entry; + Iterator i = index.entries(); + WordReferenceRow entry; int c = 0; Date lm; String lms; diff --git a/htroot/api/yacydoc.java b/htroot/api/yacydoc.java index ce7e81003..2ad240e12 100644 --- a/htroot/api/yacydoc.java +++ b/htroot/api/yacydoc.java @@ -28,8 +28,7 @@ import java.net.MalformedURLException; import de.anomic.http.httpRequestHeader; -import de.anomic.kelondro.text.MetadataRowContainer; -import de.anomic.kelondro.text.URLMetadata; +import de.anomic.kelondro.text.metadataPrototype.URLMetadataRow; import de.anomic.plasma.plasmaSwitchboard; import de.anomic.server.serverObjects; import de.anomic.server.serverSwitch; @@ -69,14 +68,14 @@ public class yacydoc { } if (urlhash == null || urlhash.length() == 0) return prop; - final MetadataRowContainer entry = sb.webIndex.metadata().load(urlhash, null, 0); + final URLMetadataRow entry = sb.webIndex.metadata().load(urlhash, null, 0); if (entry == null) return prop; - final URLMetadata metadata = entry.metadata(); + final URLMetadataRow.Components metadata 
= entry.metadata(); if (metadata.url() == null) { return prop; } - final MetadataRowContainer le = ((entry.referrerHash() == null) || (entry.referrerHash().length() != yacySeedDB.commonHashLength)) ? null : sb.webIndex.metadata().load(entry.referrerHash(), null, 0); + final URLMetadataRow le = ((entry.referrerHash() == null) || (entry.referrerHash().length() != yacySeedDB.commonHashLength)) ? null : sb.webIndex.metadata().load(entry.referrerHash(), null, 0); prop.putXML("dc_title", metadata.dc_title()); prop.putXML("dc_creator", metadata.dc_creator()); diff --git a/htroot/sharedBlacklist_p.java b/htroot/sharedBlacklist_p.java index 601370b2d..7e474b4ec 100644 --- a/htroot/sharedBlacklist_p.java +++ b/htroot/sharedBlacklist_p.java @@ -38,11 +38,11 @@ import java.util.HashSet; import java.util.List; import de.anomic.crawler.HTTPLoader; +import de.anomic.data.AbstractBlacklist; import de.anomic.data.listManager; import de.anomic.htmlFilter.htmlFilterCharacterCoding; import de.anomic.http.httpClient; import de.anomic.http.httpRequestHeader; -import de.anomic.kelondro.text.AbstractBlacklist; import de.anomic.kelondro.util.FileUtils; import de.anomic.plasma.plasmaSwitchboard; import de.anomic.server.serverObjects; diff --git a/htroot/yacy/crawlReceipt.java b/htroot/yacy/crawlReceipt.java index d97c62669..4af85dc01 100644 --- a/htroot/yacy/crawlReceipt.java +++ b/htroot/yacy/crawlReceipt.java @@ -31,8 +31,7 @@ import java.io.IOException; import de.anomic.crawler.ZURL; import de.anomic.http.httpRequestHeader; -import de.anomic.kelondro.text.MetadataRowContainer; -import de.anomic.kelondro.text.URLMetadata; +import de.anomic.kelondro.text.metadataPrototype.URLMetadataRow; import de.anomic.kelondro.util.Log; import de.anomic.plasma.plasmaSwitchboard; import de.anomic.server.serverObjects; @@ -113,14 +112,14 @@ public final class crawlReceipt { } // generating a new loaded URL entry - final MetadataRowContainer entry = MetadataRowContainer.importEntry(propStr); + final URLMetadataRow entry = URLMetadataRow.importEntry(propStr); if (entry == null) { log.logWarning("crawlReceipt: RECEIVED wrong RECEIPT (entry null) from peer " + iam + "\n\tURL properties: "+ propStr); prop.put("delay", "3600"); return prop; } - final URLMetadata metadata = entry.metadata(); + final URLMetadataRow.Components metadata = entry.metadata(); if (metadata.url() == null) { log.logWarning("crawlReceipt: RECEIVED wrong RECEIPT (url null) for hash " + entry.hash() + " from peer " + iam + "\n\tURL properties: "+ propStr); prop.put("delay", "3600"); diff --git a/htroot/yacy/transferRWI.java b/htroot/yacy/transferRWI.java index 60b1c209b..095f3ff29 100644 --- a/htroot/yacy/transferRWI.java +++ b/htroot/yacy/transferRWI.java @@ -32,9 +32,9 @@ import java.util.HashSet; import java.util.Iterator; import java.util.List; +import de.anomic.data.Blacklist; import de.anomic.http.httpRequestHeader; -import de.anomic.kelondro.text.ReferenceRow; -import de.anomic.kelondro.text.Blacklist; +import de.anomic.kelondro.text.referencePrototype.WordReferenceRow; import de.anomic.kelondro.util.FileUtils; import de.anomic.kelondro.util.Log; import de.anomic.plasma.plasmaSwitchboard; @@ -127,7 +127,7 @@ public final class transferRWI { int p; String wordHash; String urlHash; - ReferenceRow iEntry; + WordReferenceRow iEntry; final HashSet unknownURL = new HashSet(); final HashSet knownURL = new HashSet(); final String[] wordhashes = new String[v.size()]; @@ -147,7 +147,7 @@ public final class transferRWI { } wordHash = estring.substring(0, p); 
wordhashes[received] = wordHash; - iEntry = new ReferenceRow(estring.substring(p)); + iEntry = new WordReferenceRow(estring.substring(p)); urlHash = iEntry.urlHash(); // block blacklisted entries diff --git a/htroot/yacy/transferURL.java b/htroot/yacy/transferURL.java index 6822e1c3c..b0a50f438 100644 --- a/htroot/yacy/transferURL.java +++ b/htroot/yacy/transferURL.java @@ -29,10 +29,9 @@ import java.io.IOException; import java.text.ParseException; +import de.anomic.data.Blacklist; import de.anomic.http.httpRequestHeader; -import de.anomic.kelondro.text.MetadataRowContainer; -import de.anomic.kelondro.text.URLMetadata; -import de.anomic.kelondro.text.Blacklist; +import de.anomic.kelondro.text.metadataPrototype.URLMetadataRow; import de.anomic.kelondro.util.DateFormatter; import de.anomic.plasma.plasmaSwitchboard; import de.anomic.server.serverCore; @@ -85,7 +84,7 @@ public final class transferURL { final int sizeBefore = sb.webIndex.metadata().size(); // read the urls from the other properties and store String urls; - MetadataRowContainer lEntry; + URLMetadataRow lEntry; for (int i = 0; i < urlc; i++) { serverCore.checkInterruption(); @@ -98,7 +97,7 @@ public final class transferURL { } // parse new lurl-entry - lEntry = MetadataRowContainer.importEntry(urls); + lEntry = URLMetadataRow.importEntry(urls); if (lEntry == null) { yacyCore.log.logWarning("transferURL: received invalid URL (entry null) from peer " + otherPeerName + "\n\tURL Property: " + urls); blocked++; @@ -106,7 +105,7 @@ public final class transferURL { } // check if entry is well-formed - final URLMetadata metadata = lEntry.metadata(); + final URLMetadataRow.Components metadata = lEntry.metadata(); if (metadata.url() == null) { yacyCore.log.logWarning("transferURL: received invalid URL from peer " + otherPeerName + "\n\tURL Property: " + urls); blocked++; diff --git a/htroot/yacy/urls.java b/htroot/yacy/urls.java index 2f6dec035..b40a3e667 100644 --- a/htroot/yacy/urls.java +++ b/htroot/yacy/urls.java @@ -30,8 +30,7 @@ import java.util.Date; import de.anomic.crawler.CrawlEntry; import de.anomic.crawler.NoticedURL; import de.anomic.http.httpRequestHeader; -import de.anomic.kelondro.text.MetadataRowContainer; -import de.anomic.kelondro.text.URLMetadata; +import de.anomic.kelondro.text.metadataPrototype.URLMetadataRow; import de.anomic.kelondro.util.DateFormatter; import de.anomic.plasma.plasmaSwitchboard; import de.anomic.server.serverObjects; @@ -109,8 +108,8 @@ public class urls { if (urlhashes.length() % 12 != 0) return prop; final int count = urlhashes.length() / 12; int c = 0; - MetadataRowContainer entry; - URLMetadata metadata; + URLMetadataRow entry; + URLMetadataRow.Components metadata; yacyURL referrer; for (int i = 0; i < count; i++) { entry = sb.webIndex.metadata().load(urlhashes.substring(12 * i, 12 * (i + 1)), null, 0); diff --git a/htroot/yacysearch.java b/htroot/yacysearch.java index cf33ef6fb..01645e1d1 100644 --- a/htroot/yacysearch.java +++ b/htroot/yacysearch.java @@ -33,13 +33,10 @@ import java.util.TreeSet; import de.anomic.http.httpRequestHeader; import de.anomic.kelondro.order.Bitfield; -import de.anomic.kelondro.text.MetadataRowContainer; -import de.anomic.kelondro.text.URLMetadata; -import de.anomic.kelondro.text.Word; +import de.anomic.kelondro.text.metadataPrototype.URLMetadataRow; import de.anomic.kelondro.util.MemoryControl; import de.anomic.kelondro.util.SetTools; import de.anomic.kelondro.util.Log; -import de.anomic.plasma.plasmaCondenser; import de.anomic.plasma.plasmaParserDocument; import 
de.anomic.plasma.plasmaProfiling; import de.anomic.plasma.plasmaSearchEvent; @@ -48,6 +45,8 @@ import de.anomic.plasma.plasmaSearchRankingProfile; import de.anomic.plasma.plasmaSnippetCache; import de.anomic.plasma.plasmaSwitchboard; import de.anomic.plasma.plasmaSwitchboardConstants; +import de.anomic.plasma.parser.Word; +import de.anomic.plasma.parser.Condenser; import de.anomic.server.serverCore; import de.anomic.server.serverDomains; import de.anomic.server.serverObjects; @@ -164,7 +163,7 @@ public class yacysearch { Bitfield constraint = (post != null && post.containsKey("constraint") && post.get("constraint", "").length() > 0) ? new Bitfield(4, post.get("constraint", "______")) : null; if (indexof) { constraint = new Bitfield(4); - constraint.set(plasmaCondenser.flag_cat_indexof, true); + constraint.set(Condenser.flag_cat_indexof, true); } // SEARCH @@ -342,9 +341,9 @@ public class yacysearch { return prop; } final String recommendHash = post.get("recommendref", ""); // urlhash - final MetadataRowContainer urlentry = sb.webIndex.metadata().load(recommendHash, null, 0); + final URLMetadataRow urlentry = sb.webIndex.metadata().load(recommendHash, null, 0); if (urlentry != null) { - final URLMetadata metadata = urlentry.metadata(); + final URLMetadataRow.Components metadata = urlentry.metadata(); plasmaParserDocument document; document = plasmaSnippetCache.retrieveDocument(metadata.url(), true, 5000, true, false); if (document != null) { diff --git a/source/de/anomic/crawler/CrawlQueues.java b/source/de/anomic/crawler/CrawlQueues.java index 310a6c726..5478d8f0b 100644 --- a/source/de/anomic/crawler/CrawlQueues.java +++ b/source/de/anomic/crawler/CrawlQueues.java @@ -38,13 +38,13 @@ import java.util.concurrent.ConcurrentHashMap; import de.anomic.http.httpClient; import de.anomic.kelondro.table.FlexWidthArray; -import de.anomic.kelondro.text.Document; import de.anomic.kelondro.util.DateFormatter; import de.anomic.kelondro.util.FileUtils; import de.anomic.kelondro.util.Log; import de.anomic.plasma.plasmaParser; import de.anomic.plasma.plasmaSwitchboard; import de.anomic.plasma.plasmaSwitchboardConstants; +import de.anomic.plasma.parser.Document; import de.anomic.server.serverProcessorJob; import de.anomic.xml.RSSFeed; import de.anomic.xml.RSSMessage; diff --git a/source/de/anomic/crawler/CrawlStacker.java b/source/de/anomic/crawler/CrawlStacker.java index 5a807d1ca..b057a41fe 100644 --- a/source/de/anomic/crawler/CrawlStacker.java +++ b/source/de/anomic/crawler/CrawlStacker.java @@ -31,8 +31,8 @@ package de.anomic.crawler; import java.net.UnknownHostException; import java.util.Date; -import de.anomic.kelondro.text.MetadataRowContainer; -import de.anomic.kelondro.text.Blacklist; +import de.anomic.data.Blacklist; +import de.anomic.kelondro.text.metadataPrototype.URLMetadataRow; import de.anomic.kelondro.util.Log; import de.anomic.plasma.plasmaSwitchboard; import de.anomic.plasma.plasmaWordIndex; @@ -244,7 +244,7 @@ public final class CrawlStacker { // check if the url is double registered final String dbocc = nextQueue.urlExists(entry.url().hash()); if (dbocc != null || wordIndex.metadata().exists(entry.url().hash())) { - final MetadataRowContainer oldEntry = wordIndex.metadata().load(entry.url().hash(), null, 0); + final URLMetadataRow oldEntry = wordIndex.metadata().load(entry.url().hash(), null, 0); final boolean recrawl = (oldEntry != null) && (profile.recrawlIfOlder() > oldEntry.loaddate().getTime()); // do double-check if ((dbocc != null) && (!recrawl)) { diff --git 
a/source/de/anomic/crawler/FTPLoader.java b/source/de/anomic/crawler/FTPLoader.java index 5f1e415ed..c2bf817a1 100644 --- a/source/de/anomic/crawler/FTPLoader.java +++ b/source/de/anomic/crawler/FTPLoader.java @@ -35,13 +35,13 @@ import java.util.Date; import de.anomic.http.httpRequestHeader; import de.anomic.http.httpResponseHeader; import de.anomic.http.httpdProxyCacheEntry; -import de.anomic.kelondro.text.Document; import de.anomic.kelondro.util.DateFormatter; import de.anomic.kelondro.util.Log; import de.anomic.net.ftpc; import de.anomic.plasma.plasmaHTCache; import de.anomic.plasma.plasmaParser; import de.anomic.plasma.plasmaSwitchboard; +import de.anomic.plasma.parser.Document; import de.anomic.yacy.yacyURL; public class FTPLoader { diff --git a/source/de/anomic/crawler/HTTPLoader.java b/source/de/anomic/crawler/HTTPLoader.java index f8c12495b..066a028b0 100644 --- a/source/de/anomic/crawler/HTTPLoader.java +++ b/source/de/anomic/crawler/HTTPLoader.java @@ -28,17 +28,17 @@ package de.anomic.crawler; import java.io.IOException; import java.util.Date; +import de.anomic.data.Blacklist; import de.anomic.http.httpClient; import de.anomic.http.httpResponse; import de.anomic.http.httpRequestHeader; import de.anomic.http.httpResponseHeader; import de.anomic.http.httpdProxyCacheEntry; -import de.anomic.kelondro.text.Blacklist; -import de.anomic.kelondro.text.Document; import de.anomic.kelondro.util.Log; import de.anomic.plasma.plasmaHTCache; import de.anomic.plasma.plasmaParser; import de.anomic.plasma.plasmaSwitchboard; +import de.anomic.plasma.parser.Document; import de.anomic.yacy.yacyURL; public final class HTTPLoader { diff --git a/source/de/anomic/crawler/IndexingStack.java b/source/de/anomic/crawler/IndexingStack.java index bd9fadec1..c28dc392a 100644 --- a/source/de/anomic/crawler/IndexingStack.java +++ b/source/de/anomic/crawler/IndexingStack.java @@ -39,7 +39,7 @@ import de.anomic.kelondro.index.Row; import de.anomic.kelondro.order.Base64Order; import de.anomic.kelondro.order.NaturalOrder; import de.anomic.kelondro.table.Stack; -import de.anomic.kelondro.text.MetadataRowContainer; +import de.anomic.kelondro.text.metadataPrototype.URLMetadataRow; import de.anomic.kelondro.util.DateFormatter; import de.anomic.kelondro.util.Log; import de.anomic.plasma.plasmaHTCache; @@ -352,7 +352,7 @@ public class IndexingStack { if (referrerURL == null) { // FIXME the equals seems to be incorrect: String.equals(boolean) if ((referrerHash == null) || ((initiator != null) && (referrerHash.equals(initiator.length() == 0)))) return null; - final MetadataRowContainer entry = wordIndex.metadata().load(referrerHash, null, 0); + final URLMetadataRow entry = wordIndex.metadata().load(referrerHash, null, 0); if (entry == null) referrerURL = null; else referrerURL = entry.metadata().url(); } return referrerURL; diff --git a/source/de/anomic/crawler/LoaderMessage.java b/source/de/anomic/crawler/LoaderMessage.java index 8e9482ba0..77ec51bdd 100644 --- a/source/de/anomic/crawler/LoaderMessage.java +++ b/source/de/anomic/crawler/LoaderMessage.java @@ -23,7 +23,7 @@ package de.anomic.crawler; -import de.anomic.kelondro.text.Document; +import de.anomic.plasma.parser.Document; import de.anomic.server.serverSemaphore; import de.anomic.yacy.yacyURL; diff --git a/source/de/anomic/crawler/ProtocolLoader.java b/source/de/anomic/crawler/ProtocolLoader.java index 450254a91..37213b575 100644 --- a/source/de/anomic/crawler/ProtocolLoader.java +++ b/source/de/anomic/crawler/ProtocolLoader.java @@ -33,9 +33,9 @@ import 
java.util.Iterator; import java.util.Map; import java.util.concurrent.ConcurrentHashMap; -import de.anomic.kelondro.text.Document; import de.anomic.kelondro.util.Log; import de.anomic.plasma.plasmaSwitchboard; +import de.anomic.plasma.parser.Document; import de.anomic.server.serverCore; import de.anomic.server.serverProcessorJob; diff --git a/source/de/anomic/crawler/ResultURLs.java b/source/de/anomic/crawler/ResultURLs.java index eba25090d..695f65a34 100644 --- a/source/de/anomic/crawler/ResultURLs.java +++ b/source/de/anomic/crawler/ResultURLs.java @@ -40,7 +40,7 @@ import java.util.LinkedList; import java.util.List; import de.anomic.kelondro.order.Bitfield; -import de.anomic.kelondro.text.MetadataRowContainer; +import de.anomic.kelondro.text.metadataPrototype.URLMetadataRow; import de.anomic.kelondro.util.ScoreCluster; import de.anomic.kelondro.util.Log; import de.anomic.yacy.yacySeedDB; @@ -82,7 +82,7 @@ public final class ResultURLs { gcrawlResultDomains = new ScoreCluster(); } - public synchronized void stack(final MetadataRowContainer e, final String initiatorHash, final String executorHash, final int stackType) { + public synchronized void stack(final URLMetadataRow e, final String initiatorHash, final String executorHash, final int stackType) { assert initiatorHash != null; assert executorHash != null; if (e == null) { return; } @@ -305,7 +305,7 @@ public final class ResultURLs { final ResultURLs results = new ResultURLs(); try { final yacyURL url = new yacyURL("http", "www.yacy.net", 80, "/"); - final MetadataRowContainer urlRef = new MetadataRowContainer(url, "YaCy Homepage", "", "", "", new Date(), new Date(), new Date(), "", new byte[] {}, 123, 42, '?', new Bitfield(), "de", 0, 0, 0, 0, 0, 0); + final URLMetadataRow urlRef = new URLMetadataRow(url, "YaCy Homepage", "", "", "", new Date(), new Date(), new Date(), "", new byte[] {}, 123, 42, '?', new Bitfield(), "de", 0, 0, 0, 0, 0, 0); int stackNo = 1; System.out.println("valid test:\n======="); // add diff --git a/source/de/anomic/kelondro/text/AbstractBlacklist.java b/source/de/anomic/data/AbstractBlacklist.java similarity index 97% rename from source/de/anomic/kelondro/text/AbstractBlacklist.java rename to source/de/anomic/data/AbstractBlacklist.java index 9f3983d12..519eb25db 100644 --- a/source/de/anomic/kelondro/text/AbstractBlacklist.java +++ b/source/de/anomic/data/AbstractBlacklist.java @@ -25,7 +25,7 @@ // along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA -package de.anomic.kelondro.text; +package de.anomic.data; import java.io.File; import java.io.IOException; diff --git a/source/de/anomic/kelondro/text/Blacklist.java b/source/de/anomic/data/Blacklist.java similarity index 98% rename from source/de/anomic/kelondro/text/Blacklist.java rename to source/de/anomic/data/Blacklist.java index 9b63b41b8..fb286b5f1 100644 --- a/source/de/anomic/kelondro/text/Blacklist.java +++ b/source/de/anomic/data/Blacklist.java @@ -24,7 +24,7 @@ // along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA -package de.anomic.kelondro.text; +package de.anomic.data; import java.io.File; import java.util.Arrays; diff --git a/source/de/anomic/kelondro/text/DefaultBlacklist.java b/source/de/anomic/data/DefaultBlacklist.java similarity index 96% rename from source/de/anomic/kelondro/text/DefaultBlacklist.java rename to source/de/anomic/data/DefaultBlacklist.java index 
2719c6151..95ea8560f 100644 --- a/source/de/anomic/kelondro/text/DefaultBlacklist.java +++ b/source/de/anomic/data/DefaultBlacklist.java @@ -24,7 +24,7 @@ // along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA -package de.anomic.kelondro.text; +package de.anomic.data; import java.io.File; import java.util.ArrayList; @@ -35,6 +35,7 @@ import java.util.regex.PatternSyntaxException; + public class DefaultBlacklist extends AbstractBlacklist implements Blacklist { public DefaultBlacklist(final File rootPath) { diff --git a/source/de/anomic/data/SitemapParser.java b/source/de/anomic/data/SitemapParser.java index e5df6a962..60a2d00bd 100644 --- a/source/de/anomic/data/SitemapParser.java +++ b/source/de/anomic/data/SitemapParser.java @@ -45,7 +45,7 @@ import de.anomic.http.httpClient; import de.anomic.http.httpResponse; import de.anomic.http.httpRequestHeader; import de.anomic.http.httpdByteCountInputStream; -import de.anomic.kelondro.text.MetadataRowContainer; +import de.anomic.kelondro.text.metadataPrototype.URLMetadataRow; import de.anomic.kelondro.util.DateFormatter; import de.anomic.kelondro.util.Log; import de.anomic.plasma.plasmaSwitchboard; @@ -260,7 +260,7 @@ public class SitemapParser extends DefaultHandler { final String dbocc = this.sb.urlExists(nexturlhash); if ((dbocc != null) && (dbocc.equalsIgnoreCase("loaded"))) { // the url was already loaded. we need to check the date - final MetadataRowContainer oldEntry = this.sb.webIndex.metadata().load(nexturlhash, null, 0); + final URLMetadataRow oldEntry = this.sb.webIndex.metadata().load(nexturlhash, null, 0); if (oldEntry != null) { final Date modDate = oldEntry.moddate(); // check if modDate is null diff --git a/source/de/anomic/data/URLAnalysis.java b/source/de/anomic/data/URLAnalysis.java index 110532618..fdca44116 100644 --- a/source/de/anomic/data/URLAnalysis.java +++ b/source/de/anomic/data/URLAnalysis.java @@ -55,9 +55,9 @@ import de.anomic.kelondro.index.IntegerHandleIndex; import de.anomic.kelondro.order.Base64Order; import de.anomic.kelondro.text.IndexCollection; import de.anomic.kelondro.text.MetadataRepository; -import de.anomic.kelondro.text.MetadataRowContainer; -import de.anomic.kelondro.text.ReferenceRow; import de.anomic.kelondro.text.MetadataRepository.Export; +import de.anomic.kelondro.text.metadataPrototype.URLMetadataRow; +import de.anomic.kelondro.text.referencePrototype.WordReferenceRow; import de.anomic.kelondro.util.MemoryControl; import de.anomic.yacy.yacyURL; @@ -396,7 +396,7 @@ public class URLAnalysis { "collection", 12, Base64Order.enhancedCoder, - ReferenceRow.urlEntryRow); + WordReferenceRow.urlEntryRow); System.out.println("COLLECTION INDEX REFERENCE COLLECTION starting dump of statistics"); idx.dump(new File(statisticPath)); System.out.println("COLLECTION INDEX REFERENCE COLLECTION finished dump, wrote " + idx.size() + " entries to " + statisticPath); @@ -407,9 +407,9 @@ public class URLAnalysis { public static int diffurlcol(String metadataPath, String statisticFile, String diffFile) throws IOException { System.out.println("COLLECTION INDEX DIFF URL-COL startup"); - IntegerHandleIndex idx = new IntegerHandleIndex(MetadataRowContainer.rowdef.primaryKeyLength, MetadataRowContainer.rowdef.objectOrder, new File(statisticFile), 0); + IntegerHandleIndex idx = new IntegerHandleIndex(URLMetadataRow.rowdef.primaryKeyLength, URLMetadataRow.rowdef.objectOrder, new File(statisticFile), 0); MetadataRepository mr = new 
MetadataRepository(new File(metadataPath)); - HandleSet hs = new HandleSet(MetadataRowContainer.rowdef.primaryKeyLength, MetadataRowContainer.rowdef.objectOrder, 0, 1000000); + HandleSet hs = new HandleSet(URLMetadataRow.rowdef.primaryKeyLength, URLMetadataRow.rowdef.objectOrder, 0, 1000000); System.out.println("COLLECTION INDEX DIFF URL-COL loaded dump, starting diff"); long start = System.currentTimeMillis(); long update = start - 7000; @@ -436,7 +436,7 @@ public class URLAnalysis { // format: 0=text, 1=html, 2=rss/xml System.out.println("URL EXPORT startup"); MetadataRepository mr = new MetadataRepository(new File(metadataPath)); - HandleSet hs = (diffFile == null) ? null : new HandleSet(MetadataRowContainer.rowdef.primaryKeyLength, MetadataRowContainer.rowdef.objectOrder, new File(diffFile), 0); + HandleSet hs = (diffFile == null) ? null : new HandleSet(URLMetadataRow.rowdef.primaryKeyLength, URLMetadataRow.rowdef.objectOrder, new File(diffFile), 0); System.out.println("URL EXPORT loaded dump, starting export"); Export e = mr.export(new File(export), ".*", hs, format, false); try { @@ -451,7 +451,7 @@ public class URLAnalysis { System.out.println("URL DELETE startup"); MetadataRepository mr = new MetadataRepository(new File(metadataPath)); int mrSize = mr.size(); - HandleSet hs = new HandleSet(MetadataRowContainer.rowdef.primaryKeyLength, MetadataRowContainer.rowdef.objectOrder, new File(diffFile), 0); + HandleSet hs = new HandleSet(URLMetadataRow.rowdef.primaryKeyLength, URLMetadataRow.rowdef.objectOrder, new File(diffFile), 0); System.out.println("URL DELETE loaded dump, starting deletion of " + hs.size() + " entries from " + mrSize); for (byte[] refhash: hs) { mr.remove(new String(refhash)); diff --git a/source/de/anomic/data/bookmarksDB.java b/source/de/anomic/data/bookmarksDB.java index 85bffea96..9eb3f722f 100644 --- a/source/de/anomic/data/bookmarksDB.java +++ b/source/de/anomic/data/bookmarksDB.java @@ -68,12 +68,12 @@ import de.anomic.kelondro.blob.BLOBTree; import de.anomic.kelondro.blob.MapView; import de.anomic.kelondro.order.CloneableIterator; import de.anomic.kelondro.order.NaturalOrder; -import de.anomic.kelondro.text.Word; import de.anomic.kelondro.util.DateFormatter; import de.anomic.kelondro.util.kelondroException; import de.anomic.kelondro.util.Log; import de.anomic.kelondro.util.FileUtils; import de.anomic.plasma.plasmaSwitchboard; +import de.anomic.plasma.parser.Word; import de.anomic.server.serverBusyThread; import de.anomic.server.serverInstantBusyThread; import de.anomic.yacy.yacyNewsPool; diff --git a/source/de/anomic/data/listManager.java b/source/de/anomic/data/listManager.java index a20a50039..d6d0776d9 100644 --- a/source/de/anomic/data/listManager.java +++ b/source/de/anomic/data/listManager.java @@ -42,8 +42,7 @@ import java.util.List; import java.util.Set; import java.util.Vector; -import de.anomic.kelondro.text.AbstractBlacklist; -import de.anomic.kelondro.text.Blacklist.blacklistFile; +import de.anomic.data.Blacklist.blacklistFile; import de.anomic.plasma.plasmaSwitchboard; import de.anomic.server.serverCore; diff --git a/source/de/anomic/http/httpdProxyCacheEntry.java b/source/de/anomic/http/httpdProxyCacheEntry.java index f40e3a112..941287b68 100755 --- a/source/de/anomic/http/httpdProxyCacheEntry.java +++ b/source/de/anomic/http/httpdProxyCacheEntry.java @@ -29,9 +29,9 @@ package de.anomic.http; import java.util.Date; import de.anomic.crawler.CrawlProfile; -import de.anomic.kelondro.text.Document; import de.anomic.kelondro.util.DateFormatter; import 
de.anomic.plasma.plasmaHTCache; +import de.anomic.plasma.parser.Document; import de.anomic.yacy.yacyURL; public class httpdProxyCacheEntry implements Document { diff --git a/source/de/anomic/http/httpdProxyHandler.java b/source/de/anomic/http/httpdProxyHandler.java index 3f2c27a67..1ce0d2442 100644 --- a/source/de/anomic/http/httpdProxyHandler.java +++ b/source/de/anomic/http/httpdProxyHandler.java @@ -72,10 +72,9 @@ import java.util.logging.Logger; import java.util.zip.GZIPOutputStream; import de.anomic.crawler.HTTPLoader; +import de.anomic.data.Blacklist; import de.anomic.htmlFilter.htmlFilterContentTransformer; import de.anomic.htmlFilter.htmlFilterTransformer; -import de.anomic.kelondro.text.Blacklist; -import de.anomic.kelondro.text.Document; import de.anomic.kelondro.util.DateFormatter; import de.anomic.kelondro.util.Log; import de.anomic.kelondro.util.FileUtils; @@ -83,6 +82,7 @@ import de.anomic.plasma.plasmaHTCache; import de.anomic.plasma.plasmaParser; import de.anomic.plasma.plasmaSwitchboard; import de.anomic.plasma.plasmaSwitchboardConstants; +import de.anomic.plasma.parser.Document; import de.anomic.server.serverCore; import de.anomic.server.serverDomains; import de.anomic.server.serverObjects; diff --git a/source/de/anomic/icap/icapd.java b/source/de/anomic/icap/icapd.java index 8c1f4548d..4f6fc105e 100644 --- a/source/de/anomic/icap/icapd.java +++ b/source/de/anomic/icap/icapd.java @@ -40,13 +40,13 @@ import de.anomic.http.httpChunkedInputStream; import de.anomic.http.httpRequestHeader; import de.anomic.http.httpResponseHeader; import de.anomic.http.httpdProxyCacheEntry; -import de.anomic.kelondro.text.Document; import de.anomic.kelondro.util.DateFormatter; import de.anomic.kelondro.util.Log; import de.anomic.kelondro.util.FileUtils; import de.anomic.plasma.plasmaHTCache; import de.anomic.plasma.plasmaParser; import de.anomic.plasma.plasmaSwitchboard; +import de.anomic.plasma.parser.Document; import de.anomic.server.serverCore; import de.anomic.server.serverHandler; import de.anomic.server.serverCore.Session; diff --git a/source/de/anomic/kelondro/blob/BLOBArray.java b/source/de/anomic/kelondro/blob/BLOBArray.java index 0c06f6c52..499ddda4d 100755 --- a/source/de/anomic/kelondro/blob/BLOBArray.java +++ b/source/de/anomic/kelondro/blob/BLOBArray.java @@ -609,30 +609,30 @@ public class BLOBArray implements BLOB { while (true) { assert c1 != null; assert c2 != null; - e = ordering.compare(c1.getWordHash().getBytes(), c2.getWordHash().getBytes()); + e = ordering.compare(c1.getTermHash().getBytes(), c2.getTermHash().getBytes()); if (e < 0) { - writer.add(c1.getWordHash().getBytes(), c1.exportCollection()); + writer.add(c1.getTermHash().getBytes(), c1.exportCollection()); if (i1.hasNext()) { c1o = c1; c1 = i1.next(); - assert ordering.compare(c1.getWordHash().getBytes(), c1o.getWordHash().getBytes()) > 0; + assert ordering.compare(c1.getTermHash().getBytes(), c1o.getTermHash().getBytes()) > 0; continue; } break; } if (e > 0) { - writer.add(c2.getWordHash().getBytes(), c2.exportCollection()); + writer.add(c2.getTermHash().getBytes(), c2.exportCollection()); if (i2.hasNext()) { c2o = c2; c2 = i2.next(); - assert ordering.compare(c2.getWordHash().getBytes(), c2o.getWordHash().getBytes()) > 0; + assert ordering.compare(c2.getTermHash().getBytes(), c2o.getTermHash().getBytes()) > 0; continue; } break; } assert e == 0; // merge the entries - writer.add(c1.getWordHash().getBytes(), (c1.merge(c2)).exportCollection()); + writer.add(c1.getTermHash().getBytes(), 
(c1.merge(c2)).exportCollection()); if (i1.hasNext() && i2.hasNext()) { c1 = i1.next(); c2 = i2.next(); @@ -647,22 +647,22 @@ public class BLOBArray implements BLOB { assert !(i1.hasNext() && i2.hasNext()); while (i1.hasNext()) { //System.out.println("FLUSH REMAINING 1: " + c1.getWordHash()); - writer.add(c1.getWordHash().getBytes(), c1.exportCollection()); + writer.add(c1.getTermHash().getBytes(), c1.exportCollection()); if (i1.hasNext()) { c1o = c1; c1 = i1.next(); - assert ordering.compare(c1.getWordHash().getBytes(), c1o.getWordHash().getBytes()) > 0; + assert ordering.compare(c1.getTermHash().getBytes(), c1o.getTermHash().getBytes()) > 0; continue; } break; } while (i2.hasNext()) { //System.out.println("FLUSH REMAINING 2: " + c2.getWordHash()); - writer.add(c2.getWordHash().getBytes(), c2.exportCollection()); + writer.add(c2.getTermHash().getBytes(), c2.exportCollection()); if (i2.hasNext()) { c2o = c2; c2 = i2.next(); - assert ordering.compare(c2.getWordHash().getBytes(), c2o.getWordHash().getBytes()) > 0; + assert ordering.compare(c2.getTermHash().getBytes(), c2o.getTermHash().getBytes()) > 0; continue; } break; diff --git a/source/de/anomic/kelondro/text/BufferedIndexCollection.java b/source/de/anomic/kelondro/text/BufferedIndexCollection.java index bc2ca4484..9858a2790 100644 --- a/source/de/anomic/kelondro/text/BufferedIndexCollection.java +++ b/source/de/anomic/kelondro/text/BufferedIndexCollection.java @@ -44,7 +44,7 @@ import de.anomic.kelondro.text.IndexBuffer; import de.anomic.kelondro.text.IndexCollection; import de.anomic.kelondro.text.ReferenceContainer; import de.anomic.kelondro.text.ReferenceContainerOrder; -import de.anomic.kelondro.text.ReferenceRow; +import de.anomic.kelondro.text.referencePrototype.WordReferenceRow; import de.anomic.kelondro.util.FileUtils; import de.anomic.kelondro.util.MemoryControl; import de.anomic.kelondro.util.Log; @@ -94,21 +94,21 @@ public final class BufferedIndexCollection extends AbstractBufferedIndex impleme 12, Base64Order.enhancedCoder, maxCollectionPartition, - ReferenceRow.urlEntryRow, + WordReferenceRow.urlEntryRow, useCommons); } /* methods for interface Index */ public void add(final ReferenceContainer entries) { - assert (entries.row().objectsize == ReferenceRow.urlEntryRow.objectsize); + assert (entries.row().objectsize == WordReferenceRow.urlEntryRow.objectsize); // add the entry buffer.add(entries); cacheFlushControl(); } - public void add(final String wordHash, final ReferenceRow entry) throws IOException { + public void add(final String wordHash, final WordReferenceRow entry) throws IOException { // add the entry buffer.add(wordHash, entry); cacheFlushControl(); @@ -151,10 +151,10 @@ public final class BufferedIndexCollection extends AbstractBufferedIndex impleme for (int i = 0; i < d.size(); i++) { // for each element in the double-set, take that one that is the most recent one set = d.get(i); - ReferenceRow e, elm = null; + WordReferenceRow e, elm = null; long lm = 0; for (int j = 0; j < set.size(); j++) { - e = new ReferenceRow(set.get(j, true)); + e = new WordReferenceRow(set.get(j, true)); if ((elm == null) || (e.lastModified() > lm)) { elm = e; lm = e.lastModified(); @@ -164,7 +164,7 @@ public final class BufferedIndexCollection extends AbstractBufferedIndex impleme container.addUnique(elm.toKelondroEntry()); } } - if (container.size() < beforeDouble) System.out.println("*** DEBUG DOUBLECHECK - removed " + (beforeDouble - container.size()) + " index entries from word container " + container.getWordHash()); + if 
(container.size() < beforeDouble) System.out.println("*** DEBUG DOUBLECHECK - removed " + (beforeDouble - container.size()) + " index entries from word container " + container.getTermHash()); return container; } @@ -172,7 +172,7 @@ public final class BufferedIndexCollection extends AbstractBufferedIndex impleme public ReferenceContainer delete(final String wordHash) { final ReferenceContainer c = new ReferenceContainer( wordHash, - ReferenceRow.urlEntryRow, + WordReferenceRow.urlEntryRow, buffer.count(wordHash)); c.addAllUnique(buffer.delete(wordHash)); c.addAllUnique(collections.delete(wordHash)); diff --git a/source/de/anomic/kelondro/text/Index.java b/source/de/anomic/kelondro/text/Index.java index 01ee02a52..f56dc13e0 100644 --- a/source/de/anomic/kelondro/text/Index.java +++ b/source/de/anomic/kelondro/text/Index.java @@ -34,6 +34,7 @@ import java.util.TreeSet; import de.anomic.kelondro.order.ByteOrder; import de.anomic.kelondro.order.CloneableIterator; +import de.anomic.kelondro.text.referencePrototype.WordReferenceRow; public interface Index { @@ -52,72 +53,72 @@ public interface Index { * if no references to the word are stored, the a new entry is added, * if there are already references to the word hash stored, * then the old and the new references are merged - * @param wordHash + * @param termHash * @param entry * @throws IOException */ - public void add(final String wordHash, final ReferenceRow entry) throws IOException; + public void add(final String termHash, final WordReferenceRow entry) throws IOException; /** * check if there are references stored to the given word hash - * @param wordHash + * @param termHash * @return true if references exist, false if not */ - public boolean has(String wordHash); // should only be used if in case that true is returned the getContainer is NOT called + public boolean has(String termHash); // should only be used if in case that true is returned the getContainer is NOT called /** * count the number of references for the given word * do not use this method to check the existence of a reference by comparing * the result with zero, use hasReferences instead. - * @param wordHash + * @param termHash * @return the number of references to the given word */ - public int count(final String wordHash); + public int count(final String termHash); /** * get the references to a given word. 
* if referenceselection is not null, then all url references which are not * in referenceselection are removed from the container - * @param wordHash + * @param termHash * @param referenceselection * @return the references * @throws IOException */ - public ReferenceContainer get(String wordHash, Set referenceselection) throws IOException; + public ReferenceContainer get(String termHash, Set referenceselection) throws IOException; /** * delete all references for a word - * @param wordHash + * @param termHash * @return the deleted references * @throws IOException */ - public ReferenceContainer delete(String wordHash) throws IOException; + public ReferenceContainer delete(String termHash) throws IOException; /** * remove a specific reference entry - * @param wordHash + * @param termHash * @param referenceHash the key for the reference entry to be removed * @return * @throws IOException */ - public boolean remove(String wordHash, String referenceHash) throws IOException; + public boolean remove(String termHash, String referenceHash) throws IOException; /** * remove a set of reference entries for a given word - * @param wordHash the key for the references + * @param termHash the key for the references * @param referenceHash the reference entry keys * @return * @throws IOException */ - public int remove(String wordHash, Set referenceHashes) throws IOException; + public int remove(String termHash, Set referenceHashes) throws IOException; - public int remove(final Set wordHashes, final String urlHash) throws IOException; + public int remove(final Set termHashes, final String urlHash) throws IOException; - public void remove(final Set wordHashes, final Set urlHashes) throws IOException; + public void remove(final Set termHashes, final Set urlHashes) throws IOException; /** * iterate all references from the beginning of a specific word hash - * @param startWordHash + * @param startHash * @param rot if true, then rotate at the end to the beginning * @param ram * @return diff --git a/source/de/anomic/kelondro/text/IndexBuffer.java b/source/de/anomic/kelondro/text/IndexBuffer.java index f34ef2a2e..32da6d50e 100644 --- a/source/de/anomic/kelondro/text/IndexBuffer.java +++ b/source/de/anomic/kelondro/text/IndexBuffer.java @@ -35,6 +35,7 @@ import java.util.Set; import de.anomic.kelondro.index.Row; import de.anomic.kelondro.order.ByteOrder; import de.anomic.kelondro.order.CloneableIterator; +import de.anomic.kelondro.text.referencePrototype.WordReferenceRow; import de.anomic.kelondro.util.MemoryControl; import de.anomic.kelondro.util.ScoreCluster; import de.anomic.kelondro.util.Log; @@ -94,8 +95,8 @@ public final class IndexBuffer extends AbstractIndex implements Index, IndexRead } else if (dumpFile.exists()) { // initialize scores for cache organization for (final ReferenceContainer ic : (Iterable) heap.references(null, false)) { - this.hashDate.setScore(ic.getWordHash(), intTime(ic.lastWrote())); - this.hashScore.setScore(ic.getWordHash(), ic.size()); + this.hashDate.setScore(ic.getTermHash(), intTime(ic.lastWrote())); + this.hashScore.setScore(ic.getTermHash(), ic.size()); } } else { heap.initWriteMode(); @@ -197,7 +198,7 @@ public final class IndexBuffer extends AbstractIndex implements Index, IndexRead } if (hash == null) { final ReferenceContainer ic = heap.references(null, false).next(); - if (ic != null) hash = ic.getWordHash(); + if (ic != null) hash = ic.getTermHash(); } return hash; @@ -304,11 +305,11 @@ public final class IndexBuffer extends AbstractIndex implements Index, IndexRead // put new 
words into cache heap.add(container); - hashScore.setScore(container.getWordHash(), heap.count(container.getWordHash())); - hashDate.setScore(container.getWordHash(), intTime(System.currentTimeMillis())); + hashScore.setScore(container.getTermHash(), heap.count(container.getTermHash())); + hashDate.setScore(container.getTermHash(), intTime(System.currentTimeMillis())); } - public void add(final String wordHash, final ReferenceRow entry) throws IOException { + public void add(final String wordHash, final WordReferenceRow entry) throws IOException { if (entry == null || heap == null) return; // put new words into cache @@ -335,7 +336,7 @@ public final class IndexBuffer extends AbstractIndex implements Index, IndexRead public synchronized long getBufferSizeBytes() { // calculate the real size in bytes of the index cache long cacheBytes = 0; - final long entryBytes = ReferenceRow.urlEntryRow.objectsize; + final long entryBytes = WordReferenceRow.urlEntryRow.objectsize; final Iterator it = references(null, false); while (it.hasNext()) cacheBytes += it.next().size() * entryBytes; return cacheBytes; diff --git a/source/de/anomic/kelondro/text/IndexCell.java b/source/de/anomic/kelondro/text/IndexCell.java index 9a5b9e4b5..11d397cca 100644 --- a/source/de/anomic/kelondro/text/IndexCell.java +++ b/source/de/anomic/kelondro/text/IndexCell.java @@ -36,6 +36,7 @@ import de.anomic.kelondro.order.ByteOrder; import de.anomic.kelondro.order.CloneableIterator; import de.anomic.kelondro.order.MergeIterator; import de.anomic.kelondro.order.Order; +import de.anomic.kelondro.text.referencePrototype.WordReferenceRow; import de.anomic.kelondro.util.MemoryControl; import de.anomic.server.serverProfiling; @@ -65,15 +66,15 @@ public final class IndexCell extends AbstractBufferedIndex implements BufferedIn public IndexCell( final File cellPath, - final ByteOrder wordOrder, + final ByteOrder termOrder, final Row payloadrow, final int maxRamEntries, final long targetFileSize, final long maxFileSize, IODispatcher merger ) throws IOException { - this.array = new ReferenceContainerArray(cellPath, wordOrder, payloadrow, merger); - this.ram = new ReferenceContainerCache(payloadrow, wordOrder); + this.array = new ReferenceContainerArray(cellPath, termOrder, payloadrow, merger); + this.ram = new ReferenceContainerCache(payloadrow, termOrder); this.ram.initWriteMode(); this.maxRamEntries = maxRamEntries; this.merger = merger; @@ -99,25 +100,25 @@ public final class IndexCell extends AbstractBufferedIndex implements BufferedIn cleanCache(); } - public synchronized void add(String hash, ReferenceRow entry) throws IOException { + public synchronized void add(String hash, WordReferenceRow entry) throws IOException { this.ram.add(hash, entry); serverProfiling.update("wordcache", Long.valueOf(this.ram.size()), true); cleanCache(); } /** - * checks if there is any container for this wordHash, either in RAM or any BLOB + * checks if there is any container for this termHash, either in RAM or any BLOB */ - public boolean has(String wordHash) { - if (this.ram.has(wordHash)) return true; - return this.array.has(wordHash); + public boolean has(String termHash) { + if (this.ram.has(termHash)) return true; + return this.array.has(termHash); } - public int count(String wordHash) { - ReferenceContainer c0 = this.ram.get(wordHash, null); + public int count(String termHash) { + ReferenceContainer c0 = this.ram.get(termHash, null); ReferenceContainer c1; try { - c1 = this.array.get(wordHash); + c1 = this.array.get(termHash); } catch (IOException 
e) { c1 = null; } @@ -133,9 +134,9 @@ public final class IndexCell extends AbstractBufferedIndex implements BufferedIn * all containers in the BLOBs and the RAM are merged and returned * @throws IOException */ - public ReferenceContainer get(String wordHash, Set urlselection) throws IOException { - ReferenceContainer c0 = this.ram.get(wordHash, null); - ReferenceContainer c1 = this.array.get(wordHash); + public ReferenceContainer get(String termHash, Set urlselection) throws IOException { + ReferenceContainer c0 = this.ram.get(termHash, null); + ReferenceContainer c1 = this.array.get(termHash); if (c1 == null) { if (c0 == null) return null; return c0; @@ -149,14 +150,14 @@ public final class IndexCell extends AbstractBufferedIndex implements BufferedIn * the deleted containers are merged and returned as result of the method * @throws IOException */ - public ReferenceContainer delete(String wordHash) throws IOException { - ReferenceContainer c0 = this.ram.delete(wordHash); - ReferenceContainer c1 = this.array.get(wordHash); + public ReferenceContainer delete(String termHash) throws IOException { + ReferenceContainer c0 = this.ram.delete(termHash); + ReferenceContainer c1 = this.array.get(termHash); if (c1 == null) { if (c0 == null) return null; return c0; } - this.array.delete(wordHash); + this.array.delete(termHash); cleanCache(); if (c0 == null) return c1; return c1.merge(c0); @@ -169,13 +170,13 @@ public final class IndexCell extends AbstractBufferedIndex implements BufferedIn * new BLOBs. This returns the sum of all url references that have been removed * @throws IOException */ - public int remove(String wordHash, Set urlHashes) throws IOException { - int reduced = this.array.replace(wordHash, new RemoveRewriter(urlHashes)); + public int remove(String termHash, Set urlHashes) throws IOException { + int reduced = this.array.replace(termHash, new RemoveRewriter(urlHashes)); return reduced / this.array.rowdef().objectsize; } - public boolean remove(String wordHash, String urlHash) throws IOException { - int reduced = this.array.replace(wordHash, new RemoveRewriter(urlHash)); + public boolean remove(String termHash, String urlHash) throws IOException { + int reduced = this.array.replace(termHash, new RemoveRewriter(urlHash)); return reduced > 0; } @@ -199,14 +200,14 @@ public final class IndexCell extends AbstractBufferedIndex implements BufferedIn } - public CloneableIterator references(String startWordHash, boolean rot) { + public CloneableIterator references(String starttermHash, boolean rot) { final Order containerOrder = new ReferenceContainerOrder(this.ram.rowdef().getOrdering().clone()); - containerOrder.rotate(new ReferenceContainer(startWordHash, this.ram.rowdef(), 0)); + containerOrder.rotate(new ReferenceContainer(starttermHash, this.ram.rowdef(), 0)); return new MergeIterator( - this.ram.references(startWordHash, rot), + this.ram.references(starttermHash, rot), new MergeIterator( - this.ram.references(startWordHash, false), - this.array.wordContainerIterator(startWordHash, false, false), + this.ram.references(starttermHash, false), + this.array.wordContainerIterator(starttermHash, false, false), containerOrder, ReferenceContainer.containerMergeMethod, true), @@ -215,15 +216,15 @@ public final class IndexCell extends AbstractBufferedIndex implements BufferedIn true); } - public CloneableIterator references(String startWordHash, boolean rot, boolean ram) { + public CloneableIterator references(String startTermHash, boolean rot, boolean ram) { final Order containerOrder = new 
ReferenceContainerOrder(this.ram.rowdef().getOrdering().clone()); - containerOrder.rotate(new ReferenceContainer(startWordHash, this.ram.rowdef(), 0)); + containerOrder.rotate(new ReferenceContainer(startTermHash, this.ram.rowdef(), 0)); if (ram) { - return this.ram.references(startWordHash, rot); + return this.ram.references(startTermHash, rot); } return new MergeIterator( - this.ram.references(startWordHash, false), - this.array.wordContainerIterator(startWordHash, false, false), + this.ram.references(startTermHash, false), + this.array.wordContainerIterator(startTermHash, false, false), containerOrder, ReferenceContainer.containerMergeMethod, true); @@ -317,27 +318,22 @@ public final class IndexCell extends AbstractBufferedIndex implements BufferedIn return System.currentTimeMillis(); } - public int getBufferMaxReferences() { return this.ram.maxReferences(); } - public long getBufferMinAge() { return System.currentTimeMillis(); } - public int getBufferSize() { return this.ram.size(); } - public long getBufferSizeBytes() { return 10000 * this.ram.size(); // guessed; we don't know that exactly because there is no statistics here (expensive, not necessary) } - public void setBufferMaxWordCount(int maxWords) { this.maxRamEntries = maxWords; } diff --git a/source/de/anomic/kelondro/text/IndexCollection.java b/source/de/anomic/kelondro/text/IndexCollection.java index 6321141c3..506d062e2 100644 --- a/source/de/anomic/kelondro/text/IndexCollection.java +++ b/source/de/anomic/kelondro/text/IndexCollection.java @@ -54,6 +54,7 @@ import de.anomic.kelondro.order.RotateIterator; import de.anomic.kelondro.table.EcoTable; import de.anomic.kelondro.table.FixedWidthArray; import de.anomic.kelondro.table.FlexTable; +import de.anomic.kelondro.text.referencePrototype.WordReferenceRow; import de.anomic.kelondro.util.FileUtils; import de.anomic.kelondro.util.MemoryControl; import de.anomic.kelondro.util.kelondroException; @@ -250,10 +251,12 @@ public class IndexCollection extends AbstractIndex implements Index { } } - public void add(String wordhash, ReferenceRow entry) { + public void add(String wordhash, WordReferenceRow entry) { if (entry == null) return; try { - this.merge(new ReferenceContainer(wordhash, entry)); + ReferenceContainer container = new ReferenceContainer(wordhash, this.payloadrow, 1); + container.add(entry); + this.merge(container); } catch (final kelondroOutOfLimitsException e) { e.printStackTrace(); } catch (final IOException e) { @@ -704,7 +707,7 @@ public class IndexCollection extends AbstractIndex implements Index { private synchronized void merge(final ReferenceContainer container) throws IOException, kelondroOutOfLimitsException { if ((container == null) || (container.size() == 0)) return; - final byte[] key = container.getWordHash().getBytes(); + final byte[] key = container.getTermHash().getBytes(); // first find an old entry, if one exists Row.Entry indexrow = index.get(key); diff --git a/source/de/anomic/kelondro/text/IndexCollectionMigration.java b/source/de/anomic/kelondro/text/IndexCollectionMigration.java index 602b26836..9e79e70f8 100644 --- a/source/de/anomic/kelondro/text/IndexCollectionMigration.java +++ b/source/de/anomic/kelondro/text/IndexCollectionMigration.java @@ -41,7 +41,7 @@ import de.anomic.kelondro.text.Index; import de.anomic.kelondro.text.IndexCollection; import de.anomic.kelondro.text.ReferenceContainer; import de.anomic.kelondro.text.ReferenceContainerOrder; -import de.anomic.kelondro.text.ReferenceRow; +import 
de.anomic.kelondro.text.referencePrototype.WordReferenceRow; import de.anomic.kelondro.util.FileUtils; import de.anomic.kelondro.util.Log; @@ -66,7 +66,7 @@ public final class IndexCollectionMigration extends AbstractBufferedIndex implem this.cell = new IndexCell( celldir, wordOrdering, - ReferenceRow.urlEntryRow, + WordReferenceRow.urlEntryRow, entityCacheMaxSize, targetFileSize, maxFileSize, @@ -104,7 +104,7 @@ public final class IndexCollectionMigration extends AbstractBufferedIndex implem 12, Base64Order.enhancedCoder, BufferedIndexCollection.maxCollectionPartition, - ReferenceRow.urlEntryRow, + WordReferenceRow.urlEntryRow, false); if (this.collections.size() == 0) { // delete everything here @@ -126,10 +126,10 @@ public final class IndexCollectionMigration extends AbstractBufferedIndex implem /* methods for interface Index */ public void add(final ReferenceContainer entries) throws IOException { - assert (entries.row().objectsize == ReferenceRow.urlEntryRow.objectsize); + assert (entries.row().objectsize == WordReferenceRow.urlEntryRow.objectsize); if (this.collections != null) { - ReferenceContainer e = this.collections.delete(entries.getWordHash()); + ReferenceContainer e = this.collections.delete(entries.getTermHash()); if (e != null) { e.merge(entries); cell.add(e); @@ -141,7 +141,7 @@ public final class IndexCollectionMigration extends AbstractBufferedIndex implem } } - public void add(final String wordHash, final ReferenceRow entry) throws IOException { + public void add(final String wordHash, final WordReferenceRow entry) throws IOException { if (this.collections != null) { ReferenceContainer e = this.collections.delete(wordHash); if (e != null) { diff --git a/source/de/anomic/kelondro/text/Metadata.java b/source/de/anomic/kelondro/text/Metadata.java new file mode 100644 index 000000000..4eccd83b1 --- /dev/null +++ b/source/de/anomic/kelondro/text/Metadata.java @@ -0,0 +1,89 @@ +// Metadata.java +// (C) 2009 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany +// first published 03.04.2009 on http://yacy.net +// +// This is a part of YaCy, a peer-to-peer based web search engine +// +// $LastChangedDate: 2009-03-20 16:44:59 +0100 (Fr, 20 Mrz 2009) $ +// $LastChangedRevision: 5736 $ +// $LastChangedBy: borg-0300 $ +// +// LICENSE +// +// This program is free software; you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation; either version 2 of the License, or +// (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. 
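
For orientation alongside the hunks above, which rename the per-term API from wordHash/ReferenceRow to termHash/WordReferenceRow: a minimal, hypothetical caller of the renamed IndexCell methods might look like the sketch below. Only the IndexCell signatures (add, has, get) and the import paths are taken from this patch; the wrapper class, its method name, and the add-then-fetch flow are illustrative assumptions, not code from the commit.

import java.io.IOException;

import de.anomic.kelondro.text.IndexCell;
import de.anomic.kelondro.text.ReferenceContainer;
import de.anomic.kelondro.text.referencePrototype.WordReferenceRow;

// Illustrative wrapper, hypothetical and not part of the patch.
final class CellUsageSketch {
    private final IndexCell cell;

    CellUsageSketch(final IndexCell cell) {
        this.cell = cell;
    }

    // Add one reference under its term hash, then read the merged container back.
    ReferenceContainer addAndFetch(final String termHash, final WordReferenceRow entry) throws IOException {
        this.cell.add(termHash, entry);        // lands in the RAM cache first
        if (!this.cell.has(termHash)) return null;
        return this.cell.get(termHash, null);  // merges RAM and BLOB containers; null = no URL preselection
    }
}
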
+// +// You should have received a copy of the GNU General Public License +// along with this program; if not, write to the Free Software +// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + +package de.anomic.kelondro.text; + +import java.util.Date; + +import de.anomic.crawler.CrawlEntry; +import de.anomic.kelondro.index.Row; +import de.anomic.kelondro.order.Bitfield; +import de.anomic.kelondro.text.Reference; + +public interface Metadata { + + + public Row.Entry toRowEntry(); + + public String hash(); + + public long ranking(); + + public Date moddate(); + + public Date loaddate(); + + public Date freshdate(); + + public String referrerHash(); + + public String md5(); + + public char doctype(); + + public String language(); + + public int size(); + + public Bitfield flags(); + + public int wordCount(); + + public int llocal(); + + public int lother(); + + public int limage(); + + public int laudio(); + + public int lvideo(); + + public int lapp(); + + public String snippet(); + + public Reference word(); + + public boolean isOlder(final Metadata other); + + public String toString(final String snippet); + + public CrawlEntry toBalancerEntry(final String initiatorHash); + + public String toString(); + +} diff --git a/source/de/anomic/kelondro/text/MetadataRepository.java b/source/de/anomic/kelondro/text/MetadataRepository.java index bf616dd0f..ed11f772d 100644 --- a/source/de/anomic/kelondro/text/MetadataRepository.java +++ b/source/de/anomic/kelondro/text/MetadataRepository.java @@ -38,6 +38,7 @@ import java.util.Iterator; import java.util.Map; import java.util.TreeSet; +import de.anomic.data.Blacklist; import de.anomic.htmlFilter.htmlFilterCharacterCoding; import de.anomic.http.httpClient; import de.anomic.http.httpResponse; @@ -48,6 +49,7 @@ import de.anomic.kelondro.index.Row; import de.anomic.kelondro.index.ObjectIndex; import de.anomic.kelondro.order.CloneableIterator; import de.anomic.kelondro.table.SplitTable; +import de.anomic.kelondro.text.metadataPrototype.URLMetadataRow; import de.anomic.kelondro.util.ScoreCluster; import de.anomic.kelondro.util.Log; import de.anomic.yacy.yacyURL; @@ -62,7 +64,7 @@ public final class MetadataRepository implements Iterable { public MetadataRepository(final File path) { this.location = path; - this.urlIndexFile = new Cache(new SplitTable(this.location, "urls", MetadataRowContainer.rowdef, false)); + this.urlIndexFile = new Cache(new SplitTable(this.location, "urls", URLMetadataRow.rowdef, false)); this.exportthread = null; // will have a export thread assigned if exporter is running this.statsDump = null; @@ -97,7 +99,7 @@ public final class MetadataRepository implements Iterable { return 0; } - public synchronized MetadataRowContainer load(final String urlHash, final Reference searchedWord, final long ranking) { + public synchronized URLMetadataRow load(final String urlHash, final Reference searchedWord, final long ranking) { // generates an plasmaLURLEntry using the url hash // if the url cannot be found, this returns null if (urlHash == null) return null; @@ -105,15 +107,15 @@ public final class MetadataRepository implements Iterable { try { final Row.Entry entry = urlIndexFile.get(urlHash.getBytes()); if (entry == null) return null; - return new MetadataRowContainer(entry, searchedWord, ranking); + return new URLMetadataRow(entry, searchedWord, ranking); } catch (final IOException e) { return null; } } - public synchronized void store(final MetadataRowContainer entry) throws IOException { + public synchronized void 
store(final URLMetadataRow entry) throws IOException { // Check if there is a more recent Entry already in the DB - MetadataRowContainer oldEntry; + URLMetadataRow oldEntry; try { if (exists(entry.hash())) { oldEntry = load(entry.hash(), null, 0); @@ -166,17 +168,17 @@ public final class MetadataRepository implements Iterable { return keys(true, null); } - public CloneableIterator entries() throws IOException { + public CloneableIterator entries() throws IOException { // enumerates entry elements return new kiter(); } - public CloneableIterator entries(final boolean up, final String firstHash) throws IOException { + public CloneableIterator entries(final boolean up, final String firstHash) throws IOException { // enumerates entry elements return new kiter(up, firstHash); } - public class kiter implements CloneableIterator { + public class kiter implements CloneableIterator { // enumerates entry elements private final Iterator iter; private final boolean error; @@ -208,12 +210,12 @@ public final class MetadataRepository implements Iterable { return this.iter.hasNext(); } - public final MetadataRowContainer next() { + public final URLMetadataRow next() { Row.Entry e = null; if (this.iter == null) { return null; } if (this.iter.hasNext()) { e = this.iter.next(); } if (e == null) { return null; } - return new MetadataRowContainer(e, null, 0); + return new URLMetadataRow(e, null, 0); } public final void remove() { @@ -232,7 +234,7 @@ public final class MetadataRepository implements Iterable { final Log log = new Log("URLDBCLEANUP"); final HashSet damagedURLS = new HashSet(); try { - final Iterator eiter = entries(true, null); + final Iterator eiter = entries(true, null); int iteratorCount = 0; while (eiter.hasNext()) try { eiter.next(); @@ -325,7 +327,7 @@ public final class MetadataRepository implements Iterable { public void run() { try { Log.logInfo("URLDBCLEANER", "UrldbCleaner-Thread startet"); - final Iterator eiter = entries(true, null); + final Iterator eiter = entries(true, null); while (eiter.hasNext() && run) { synchronized (this) { if (this.pause) { @@ -338,13 +340,13 @@ public final class MetadataRepository implements Iterable { } } } - final MetadataRowContainer entry = eiter.next(); + final URLMetadataRow entry = eiter.next(); if (entry == null) { if (Log.isFine("URLDBCLEANER")) Log.logFine("URLDBCLEANER", "entry == null"); } else if (entry.hash() == null) { if (Log.isFine("URLDBCLEANER")) Log.logFine("URLDBCLEANER", ++blacklistedUrls + " blacklisted (" + ((double) blacklistedUrls / totalSearchedUrls) * 100 + "%): " + "hash == null"); } else { - final URLMetadata metadata = entry.metadata(); + final URLMetadataRow.Components metadata = entry.metadata(); totalSearchedUrls++; if (metadata.url() == null) { if (Log.isFine("URLDBCLEANER")) Log.logFine("URLDBCLEANER", ++blacklistedUrls + " blacklisted (" + ((double) blacklistedUrls / totalSearchedUrls) * 100 + "%): " + entry.hash() + "URL == null"); @@ -468,9 +470,9 @@ public final class MetadataRepository implements Iterable { count++; } } else { - final Iterator i = entries(); // iterates indexURLEntry objects - MetadataRowContainer entry; - URLMetadata metadata; + final Iterator i = entries(); // iterates indexURLEntry objects + URLMetadataRow entry; + URLMetadataRow.Components metadata; String url; while (i.hasNext()) { entry = i.next(); @@ -552,7 +554,7 @@ public final class MetadataRepository implements Iterable { HashMap map = domainSampleCollector(); // fetch urls from the database to determine the host in clear text - 
MetadataRowContainer urlref; + URLMetadataRow urlref; if (count < 0 || count > map.size()) count = map.size(); statsDump = new ArrayList(); TreeSet set = new TreeSet(); @@ -582,12 +584,12 @@ public final class MetadataRepository implements Iterable { // fetch urls from the database to determine the host in clear text Iterator j = s.scores(false); // iterate urlhash-examples in reverse order (biggest first) - MetadataRowContainer urlref; + URLMetadataRow urlref; String urlhash; count += 10; // make some more to prevent that we have to do this again after deletions too soon. if (count < 0 || count > s.size()) count = s.size(); statsDump = new ArrayList(); - URLMetadata comps; + URLMetadataRow.Components comps; yacyURL url; while (j.hasNext()) { urlhash = j.next(); diff --git a/source/de/anomic/kelondro/text/Reference.java b/source/de/anomic/kelondro/text/Reference.java index bd8b3386e..e9540a9f2 100644 --- a/source/de/anomic/kelondro/text/Reference.java +++ b/source/de/anomic/kelondro/text/Reference.java @@ -30,16 +30,6 @@ import de.anomic.kelondro.order.Bitfield; public interface Reference { - // appearance flags, used in RWI entry - // some names are derived from the Dublin Core Metadata tag set - // the flags 0..23 are identical to the category flags in plasmaCondenser - public static final int flag_app_dc_description= 24; // word appears in anchor description text (the reference to an url), or any alternative text field of a link - public static final int flag_app_dc_title = 25; // word appears in title or headline or any description part - public static final int flag_app_dc_creator = 26; // word appears in author - public static final int flag_app_dc_subject = 27; // word appears in header tags or other descriptive part - public static final int flag_app_dc_identifier = 28; // word appears in url or document identifier - public static final int flag_app_emphasized = 29; // word is emphasized in text (i.e. bold, italics, special size) - public String toPropertyForm(); public String urlHash(); diff --git a/source/de/anomic/kelondro/text/ReferenceContainer.java b/source/de/anomic/kelondro/text/ReferenceContainer.java index db31052d7..637599bac 100644 --- a/source/de/anomic/kelondro/text/ReferenceContainer.java +++ b/source/de/anomic/kelondro/text/ReferenceContainer.java @@ -37,57 +37,55 @@ import java.util.TreeMap; import de.anomic.kelondro.index.Row; import de.anomic.kelondro.index.RowSet; import de.anomic.kelondro.order.Base64Order; +import de.anomic.kelondro.text.referencePrototype.WordReferenceRow; +import de.anomic.kelondro.text.referencePrototype.WordReferenceVars; import de.anomic.kelondro.util.ByteBuffer; /** - * A ReferenceContainer is a set of ReferenceRows entries. Since ReferenceRow entries are special - * Row entries, a collection of ReferenceRows can be contained in a RowSet. This class extends - * the RowSet with methods for the handling of special ReferenceRow Row entry objects. + * A ReferenceContainer is a set of ReferenceRows entries for a specific term. + * Since ReferenceRow entries are special Row entries, a collection of ReferenceRows + * can be contained in a RowSet. + * This class extends the RowSet with methods for the handling of + * special ReferenceRow Row entry objects. 
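
Alongside the ReferenceContainer changes in this file, a minimal, hypothetical consumer of the getTermHash()/entries() accessors is sketched below. The container and row types and their methods come from this patch; the helper class, its method name, and the generic type parameter (angle brackets are stripped in this rendering of the diff) are assumptions.

import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;

import de.anomic.kelondro.text.ReferenceContainer;
import de.anomic.kelondro.text.referencePrototype.WordReferenceRow;

// Illustrative helper, hypothetical and not part of the patch.
final class ContainerDumpSketch {
    // Collects the URL hashes referenced by the container of one term.
    static List<String> urlHashes(final ReferenceContainer container) {
        final List<String> hashes = new ArrayList<String>();
        if (container == null) return hashes;
        // container.getTermHash() names the term; entries() iterates its WordReferenceRow rows
        final Iterator<WordReferenceRow> i = container.entries();
        while (i.hasNext()) hashes.add(i.next().urlHash());
        return hashes;
    }
}
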
*/ public class ReferenceContainer extends RowSet { - private String wordHash; + private String termHash; - public ReferenceContainer(final String wordHash, final RowSet collection) { + public ReferenceContainer(final String termHash, final RowSet collection) { super(collection); - this.wordHash = wordHash; + this.termHash = termHash; } - public ReferenceContainer(String wordHash, ReferenceRow entry) { - super(ReferenceRow.urlEntryRow, 1); - this.add(entry); - this.wordHash = wordHash; - } - - public ReferenceContainer(final String wordHash, final Row rowdef, final int objectCount) { + public ReferenceContainer(final String termHash, final Row rowdef, final int objectCount) { super(rowdef, objectCount); - this.wordHash = wordHash; + this.termHash = termHash; this.lastTimeWrote = 0; } public ReferenceContainer topLevelClone() { - final ReferenceContainer newContainer = new ReferenceContainer(this.wordHash, this.rowdef, this.size()); + final ReferenceContainer newContainer = new ReferenceContainer(this.termHash, this.rowdef, this.size()); newContainer.addAllUnique(this); return newContainer; } public static ReferenceContainer emptyContainer(final String wordHash, final int elementCount) { - return new ReferenceContainer(wordHash, ReferenceRow.urlEntryRow, elementCount); + return new ReferenceContainer(wordHash, WordReferenceRow.urlEntryRow, elementCount); } public void setWordHash(final String newWordHash) { - this.wordHash = newWordHash; + this.termHash = newWordHash; } public long updated() { return super.lastWrote(); } - public String getWordHash() { - return wordHash; + public String getTermHash() { + return termHash; } - public void add(final ReferenceRow entry) { + public void add(final WordReferenceRow entry) { // add without double-occurrence test assert entry.toKelondroEntry().objectsize() == super.rowdef.objectsize; this.addUnique(entry.toKelondroEntry()); @@ -95,11 +93,11 @@ public class ReferenceContainer extends RowSet { public void add(final Reference entry, final long updateTime) { // add without double-occurrence test - if (entry instanceof ReferenceRow) { - assert ((ReferenceRow) entry).toKelondroEntry().objectsize() == super.rowdef.objectsize; - this.add((ReferenceRow) entry); + if (entry instanceof WordReferenceRow) { + assert ((WordReferenceRow) entry).toKelondroEntry().objectsize() == super.rowdef.objectsize; + this.add((WordReferenceRow) entry); } else { - this.add(((ReferenceVars) entry).toRowEntry()); + this.add(((WordReferenceVars) entry).toRowEntry()); } this.lastTimeWrote = updateTime; } @@ -120,24 +118,24 @@ public class ReferenceContainer extends RowSet { } public ReferenceContainer merge(final ReferenceContainer c) { - return new ReferenceContainer(this.wordHash, super.merge(c)); + return new ReferenceContainer(this.termHash, super.merge(c)); } - public Reference put(final ReferenceRow entry) { + public Reference put(final WordReferenceRow entry) { assert entry.toKelondroEntry().objectsize() == super.rowdef.objectsize; final Row.Entry r = super.replace(entry.toKelondroEntry()); if (r == null) return null; - return new ReferenceRow(r); + return new WordReferenceRow(r); } - public boolean putRecent(final ReferenceRow entry) { + public boolean putRecent(final WordReferenceRow entry) { assert entry.toKelondroEntry().objectsize() == super.rowdef.objectsize; // returns true if the new entry was added, false if it already existed final Row.Entry oldEntryRow = this.replace(entry.toKelondroEntry()); if (oldEntryRow == null) { return true; } - final ReferenceRow oldEntry 
= new ReferenceRow(oldEntryRow); + final WordReferenceRow oldEntry = new WordReferenceRow(oldEntryRow); if (entry.isOlder(oldEntry)) { // A more recent Entry is already in this container this.replace(oldEntry.toKelondroEntry()); // put it back return false; @@ -151,7 +149,7 @@ public class ReferenceContainer extends RowSet { if (c == null) return 0; int x = 0; synchronized (c) { - final Iterator i = c.entries(); + final Iterator i = c.entries(); while (i.hasNext()) { try { if (putRecent(i.next())) x++; @@ -167,7 +165,7 @@ public class ReferenceContainer extends RowSet { public Reference get(final String urlHash) { final Row.Entry entry = this.get(urlHash.getBytes()); if (entry == null) return null; - return new ReferenceRow(entry); + return new WordReferenceRow(entry); } /** @@ -178,7 +176,7 @@ public class ReferenceContainer extends RowSet { public Reference remove(final String urlHash) { final Row.Entry entry = remove(urlHash.getBytes()); if (entry == null) return null; - return new ReferenceRow(entry); + return new WordReferenceRow(entry); } public int removeEntries(final Set urlHashes) { @@ -188,12 +186,12 @@ public class ReferenceContainer extends RowSet { return count; } - public Iterator entries() { + public Iterator entries() { // returns an iterator of indexRWIEntry objects return new entryIterator(); } - public class entryIterator implements Iterator { + public class entryIterator implements Iterator { Iterator rowEntryIterator; @@ -205,10 +203,10 @@ public class ReferenceContainer extends RowSet { return rowEntryIterator.hasNext(); } - public ReferenceRow next() { + public WordReferenceRow next() { final Row.Entry rentry = rowEntryIterator.next(); if (rentry == null) return null; - return new ReferenceRow(rentry); + return new WordReferenceRow(rentry); } public void remove() { @@ -342,11 +340,11 @@ public class ReferenceContainer extends RowSet { final int keylength = small.rowdef.width(0); assert (keylength == large.rowdef.width(0)); final ReferenceContainer conj = new ReferenceContainer(null, small.rowdef, 0); // start with empty search result - final Iterator se = small.entries(); - ReferenceVars ie0; + final Iterator se = small.entries(); + WordReferenceVars ie0; Reference ie1; while (se.hasNext()) { - ie0 = new ReferenceVars(se.next()); + ie0 = new WordReferenceVars(se.next()); ie1 = large.get(ie0.urlHash()); if ((ie0 != null) && (ie1 != null)) { assert (ie0.urlHash().length() == keylength) : "ie0.urlHash() = " + ie0.urlHash(); @@ -366,13 +364,13 @@ public class ReferenceContainer extends RowSet { assert (keylength == i2.rowdef.width(0)); final ReferenceContainer conj = new ReferenceContainer(null, i1.rowdef, 0); // start with empty search result if (!((i1.rowdef.getOrdering().signature().equals(i2.rowdef.getOrdering().signature())))) return conj; // ordering must be equal - final Iterator e1 = i1.entries(); - final Iterator e2 = i2.entries(); + final Iterator e1 = i1.entries(); + final Iterator e2 = i2.entries(); int c; if ((e1.hasNext()) && (e2.hasNext())) { - ReferenceVars ie1; + WordReferenceVars ie1; Reference ie2; - ie1 = new ReferenceVars(e1.next()); + ie1 = new WordReferenceVars(e1.next()); ie2 = e2.next(); while (true) { @@ -381,14 +379,14 @@ public class ReferenceContainer extends RowSet { c = i1.rowdef.getOrdering().compare(ie1.urlHash().getBytes(), ie2.urlHash().getBytes()); //System.out.println("** '" + ie1.getUrlHash() + "'.compareTo('" + ie2.getUrlHash() + "')="+c); if (c < 0) { - if (e1.hasNext()) ie1 = new ReferenceVars(e1.next()); else break; + if 
(e1.hasNext()) ie1 = new WordReferenceVars(e1.next()); else break; } else if (c > 0) { if (e2.hasNext()) ie2 = e2.next(); else break; } else { // we have found the same urls in different searches! ie1.join(ie2); if (ie1.worddistance() <= maxDistance) conj.add(ie1.toRowEntry()); - if (e1.hasNext()) ie1 = new ReferenceVars(e1.next()); else break; + if (e1.hasNext()) ie1 = new WordReferenceVars(e1.next()); else break; if (e2.hasNext()) ie2 = e2.next(); else break; } } @@ -420,7 +418,7 @@ public class ReferenceContainer extends RowSet { final int keylength = pivot.rowdef.width(0); assert (keylength == excl.rowdef.width(0)); final boolean iterate_pivot = pivot.size() < excl.size(); - final Iterator se = (iterate_pivot) ? pivot.entries() : excl.entries(); + final Iterator se = (iterate_pivot) ? pivot.entries() : excl.entries(); Reference ie0, ie1; while (se.hasNext()) { ie0 = se.next(); @@ -439,13 +437,13 @@ public class ReferenceContainer extends RowSet { final int keylength = pivot.rowdef.width(0); assert (keylength == excl.rowdef.width(0)); if (!((pivot.rowdef.getOrdering().signature().equals(excl.rowdef.getOrdering().signature())))) return pivot; // ordering must be equal - final Iterator e1 = pivot.entries(); - final Iterator e2 = excl.entries(); + final Iterator e1 = pivot.entries(); + final Iterator e2 = excl.entries(); int c; if ((e1.hasNext()) && (e2.hasNext())) { - ReferenceVars ie1; + WordReferenceVars ie1; Reference ie2; - ie1 = new ReferenceVars(e1.next()); + ie1 = new WordReferenceVars(e1.next()); ie2 = e2.next(); while (true) { @@ -454,14 +452,14 @@ public class ReferenceContainer extends RowSet { c = pivot.rowdef.getOrdering().compare(ie1.urlHash().getBytes(), ie2.urlHash().getBytes()); //System.out.println("** '" + ie1.getUrlHash() + "'.compareTo('" + ie2.getUrlHash() + "')="+c); if (c < 0) { - if (e1.hasNext()) ie1 = new ReferenceVars(e1.next()); else break; + if (e1.hasNext()) ie1 = new WordReferenceVars(e1.next()); else break; } else if (c > 0) { if (e2.hasNext()) ie2 = e2.next(); else break; } else { // we have found the same urls in different searches! ie1.join(ie2); e1.remove(); - if (e1.hasNext()) ie1 = new ReferenceVars(e1.next()); else break; + if (e1.hasNext()) ie1 = new WordReferenceVars(e1.next()); else break; if (e2.hasNext()) ie2 = e2.next(); else break; } } @@ -470,11 +468,11 @@ public class ReferenceContainer extends RowSet { } public String toString() { - return "C[" + wordHash + "] has " + this.size() + " entries"; + return "C[" + termHash + "] has " + this.size() + " entries"; } public int hashCode() { - return (int) Base64Order.enhancedCoder.decodeLong(this.wordHash.substring(0, 4)); + return (int) Base64Order.enhancedCoder.decodeLong(this.termHash.substring(0, 4)); } @@ -483,7 +481,7 @@ public class ReferenceContainer extends RowSet { final long timeout = (maxtime < 0) ? 
Long.MAX_VALUE : System.currentTimeMillis() + maxtime; final TreeMap doms = new TreeMap(); synchronized (inputContainer) { - final Iterator i = inputContainer.entries(); + final Iterator i = inputContainer.entries(); Reference iEntry; String dom, paths; while (i.hasNext()) { diff --git a/source/de/anomic/kelondro/text/ReferenceContainerArray.java b/source/de/anomic/kelondro/text/ReferenceContainerArray.java index c158c1fd5..9271f65f9 100644 --- a/source/de/anomic/kelondro/text/ReferenceContainerArray.java +++ b/source/de/anomic/kelondro/text/ReferenceContainerArray.java @@ -56,7 +56,7 @@ public final class ReferenceContainerArray { */ public ReferenceContainerArray( final File heapLocation, - final ByteOrder wordOrder, + final ByteOrder termOrder, final Row payloadrow, IODispatcher merger) throws IOException { this.payloadrow = payloadrow; @@ -64,7 +64,7 @@ public final class ReferenceContainerArray { heapLocation, "index", payloadrow.primaryKeyLength, - wordOrder, + termOrder, 0); assert merger != null; this.merger = merger; @@ -182,8 +182,8 @@ public final class ReferenceContainerArray { * @return true, if the key is used in the heap; false othervise * @throws IOException */ - public synchronized boolean has(final String key) { - return this.array.has(key.getBytes()); + public synchronized boolean has(final String termHash) { + return this.array.has(termHash.getBytes()); } /** @@ -192,13 +192,13 @@ public final class ReferenceContainerArray { * @return the indexContainer if one exist, null otherwise * @throws IOException */ - public synchronized ReferenceContainer get(final String key) throws IOException { - List entries = this.array.getAll(key.getBytes()); + public synchronized ReferenceContainer get(final String termHash) throws IOException { + List entries = this.array.getAll(termHash.getBytes()); if (entries == null || entries.size() == 0) return null; byte[] a = entries.remove(0); - ReferenceContainer c = new ReferenceContainer(key, RowSet.importRowSet(a, payloadrow)); + ReferenceContainer c = new ReferenceContainer(termHash, RowSet.importRowSet(a, payloadrow)); while (entries.size() > 0) { - c = c.merge(new ReferenceContainer(key, RowSet.importRowSet(entries.remove(0), payloadrow))); + c = c.merge(new ReferenceContainer(termHash, RowSet.importRowSet(entries.remove(0), payloadrow))); } return c; } @@ -209,13 +209,13 @@ public final class ReferenceContainerArray { * @return the indexContainer if the cache contained the container, null othervise * @throws IOException */ - public synchronized void delete(final String wordHash) throws IOException { + public synchronized void delete(final String termHash) throws IOException { // returns the index that had been deleted - array.remove(wordHash.getBytes()); + array.remove(termHash.getBytes()); } - public synchronized int replace(final String wordHash, ContainerRewriter rewriter) throws IOException { - return array.replace(wordHash.getBytes(), new BLOBRewriter(wordHash, rewriter)); + public synchronized int replace(final String termHash, ContainerRewriter rewriter) throws IOException { + return array.replace(termHash.getBytes(), new BLOBRewriter(termHash, rewriter)); } public class BLOBRewriter implements BLOB.Rewriter { diff --git a/source/de/anomic/kelondro/text/ReferenceContainerCache.java b/source/de/anomic/kelondro/text/ReferenceContainerCache.java index 6ae676944..4bc551f05 100644 --- a/source/de/anomic/kelondro/text/ReferenceContainerCache.java +++ b/source/de/anomic/kelondro/text/ReferenceContainerCache.java @@ -41,6 +41,7 @@ import 
de.anomic.kelondro.blob.HeapWriter; import de.anomic.kelondro.order.CloneableIterator; import de.anomic.kelondro.order.Base64Order; import de.anomic.kelondro.order.ByteOrder; +import de.anomic.kelondro.text.referencePrototype.WordReferenceRow; import de.anomic.kelondro.util.FileUtils; import de.anomic.kelondro.util.Log; import de.anomic.kelondro.index.Row; @@ -49,7 +50,7 @@ import de.anomic.kelondro.index.RowSet; public final class ReferenceContainerCache extends AbstractIndex implements Index, IndexReader, Iterable { private final Row payloadrow; - private final ByteOrder wordOrder; + private final ByteOrder termOrder; private SortedMap cache; /** @@ -59,9 +60,9 @@ public final class ReferenceContainerCache extends AbstractIndex implements Inde * @param payloadrow * @param log */ - public ReferenceContainerCache(final Row payloadrow, ByteOrder wordOrder) { + public ReferenceContainerCache(final Row payloadrow, ByteOrder termOrder) { this.payloadrow = payloadrow; - this.wordOrder = wordOrder; + this.termOrder = termOrder; this.cache = null; } @@ -83,7 +84,7 @@ public final class ReferenceContainerCache extends AbstractIndex implements Inde * another dump reading afterwards is not possible */ public void initWriteMode() { - this.cache = Collections.synchronizedSortedMap(new TreeMap(new ByteOrder.StringOrder(this.wordOrder))); + this.cache = Collections.synchronizedSortedMap(new TreeMap(new ByteOrder.StringOrder(this.termOrder))); } /** @@ -94,14 +95,14 @@ public final class ReferenceContainerCache extends AbstractIndex implements Inde public void initWriteModeFromBLOB(final File blobFile) throws IOException { Log.logInfo("indexContainerRAMHeap", "restoring rwi blob dump '" + blobFile.getName() + "'"); final long start = System.currentTimeMillis(); - this.cache = Collections.synchronizedSortedMap(new TreeMap(new ByteOrder.StringOrder(this.wordOrder))); + this.cache = Collections.synchronizedSortedMap(new TreeMap(new ByteOrder.StringOrder(this.termOrder))); int urlCount = 0; synchronized (cache) { for (final ReferenceContainer container : new blobFileEntries(blobFile, this.payloadrow)) { // TODO: in this loop a lot of memory may be allocated. A check if the memory gets low is necessary. But what do when the memory is low? 
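
Alongside the ReferenceContainerCache changes, the expected call order is: construct the cache with a payload row and a term ordering, switch it to write mode, then add entries. The constructor, initWriteMode() and add() signatures are taken from this file; the factory class below and its parameter choices are assumptions.

import de.anomic.kelondro.index.Row;
import de.anomic.kelondro.order.ByteOrder;
import de.anomic.kelondro.text.ReferenceContainerCache;
import de.anomic.kelondro.text.referencePrototype.WordReferenceRow;

// Illustrative factory, hypothetical and not part of the patch.
final class RamCacheSketch {
    static ReferenceContainerCache newWritableCache(final ByteOrder termOrder) {
        final Row payloadrow = WordReferenceRow.urlEntryRow; // payload layout used throughout this patch
        final ReferenceContainerCache cache = new ReferenceContainerCache(payloadrow, termOrder);
        cache.initWriteMode(); // must run before add(), otherwise the internal cache map stays null
        return cache;
    }

    static void put(final ReferenceContainerCache cache, final String termHash, final WordReferenceRow entry) {
        cache.add(termHash, entry); // creates a fresh container for a previously unknown term hash
    }
}
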
if (container == null) break; //System.out.println("***DEBUG indexContainerHeap.initwriteModeFromBLOB*** container.size = " + container.size() + ", container.sorted = " + container.sorted()); - cache.put(container.getWordHash(), container); + cache.put(container.getTermHash(), container); urlCount += container.size(); } } @@ -242,7 +243,7 @@ public final class ReferenceContainerCache extends AbstractIndex implements Inde for (ReferenceContainer container : cache.values()) { if (container.size() > max) { max = container.size(); - hash = container.getWordHash(); + hash = container.getTermHash(); } } return hash; @@ -253,7 +254,7 @@ public final class ReferenceContainerCache extends AbstractIndex implements Inde ArrayList hashes = new ArrayList(); for (ReferenceContainer container : cache.values()) { if (container.size() >= bound) { - hashes.add(container.getWordHash()); + hashes.add(container.getTermHash()); } } return hashes; @@ -281,7 +282,7 @@ public final class ReferenceContainerCache extends AbstractIndex implements Inde ArrayList hashes = new ArrayList(); long limit = System.currentTimeMillis() - maxage; for (ReferenceContainer container : cache.values()) { - if (container.lastWrote() < limit) hashes.add(container.getWordHash()); + if (container.lastWrote() < limit) hashes.add(container.getTermHash()); } return hashes; } @@ -372,9 +373,9 @@ public final class ReferenceContainerCache extends AbstractIndex implements Inde ReferenceContainer c = this.cache.get(key); if (c == null) return null; // because this is all in RAM, we must clone the entries (flat) - ReferenceContainer c1 = new ReferenceContainer(c.getWordHash(), c.row(), c.size()); - Iterator e = c.entries(); - ReferenceRow ee; + ReferenceContainer c1 = new ReferenceContainer(c.getTermHash(), c.row(), c.size()); + Iterator e = c.entries(); + WordReferenceRow ee; while (e.hasNext()) { ee = e.next(); if (urlselection.contains(ee.urlHash())) c1.add(ee); @@ -441,7 +442,7 @@ public final class ReferenceContainerCache extends AbstractIndex implements Inde if (this.cache == null || container == null || container.size() == 0) return; // put new words into cache - final String wordHash = container.getWordHash(); + final String wordHash = container.getTermHash(); ReferenceContainer entries = cache.get(wordHash); // null pointer exception? wordhash != null! 
must be cache==null int added = 0; if (entries == null) { @@ -457,7 +458,7 @@ public final class ReferenceContainerCache extends AbstractIndex implements Inde return; } - public synchronized void add(final String wordHash, final ReferenceRow newEntry) { + public synchronized void add(final String wordHash, final WordReferenceRow newEntry) { assert this.cache != null; ReferenceContainer container = cache.get(wordHash); if (container == null) container = new ReferenceContainer(wordHash, this.payloadrow, 1); @@ -470,7 +471,7 @@ public final class ReferenceContainerCache extends AbstractIndex implements Inde } public ByteOrder ordering() { - return this.wordOrder; + return this.termOrder; } } diff --git a/source/de/anomic/kelondro/text/ReferenceContainerOrder.java b/source/de/anomic/kelondro/text/ReferenceContainerOrder.java index 0069ecbf6..fa565c662 100644 --- a/source/de/anomic/kelondro/text/ReferenceContainerOrder.java +++ b/source/de/anomic/kelondro/text/ReferenceContainerOrder.java @@ -38,7 +38,7 @@ public class ReferenceContainerOrder extends AbstractOrder i } public boolean wellformed(final ReferenceContainer a) { - return embeddedOrder.wellformed(a.getWordHash().getBytes()); + return embeddedOrder.wellformed(a.getTermHash().getBytes()); } public void direction(final boolean ascending) { @@ -50,15 +50,15 @@ public class ReferenceContainerOrder extends AbstractOrder i } public int compare(final ReferenceContainer a, final ReferenceContainer b) { - return this.embeddedOrder.compare(a.getWordHash().getBytes(), b.getWordHash().getBytes()); + return this.embeddedOrder.compare(a.getTermHash().getBytes(), b.getTermHash().getBytes()); } public boolean equal(ReferenceContainer a, ReferenceContainer b) { - return this.embeddedOrder.equal(a.getWordHash().getBytes(), b.getWordHash().getBytes()); + return this.embeddedOrder.equal(a.getTermHash().getBytes(), b.getTermHash().getBytes()); } public void rotate(final ReferenceContainer zero) { - this.embeddedOrder.rotate(zero.getWordHash().getBytes()); + this.embeddedOrder.rotate(zero.getTermHash().getBytes()); this.zero = new ReferenceContainer(new String(this.embeddedOrder.zero()), zero); } @@ -80,7 +80,7 @@ public class ReferenceContainerOrder extends AbstractOrder i } public long cardinal(final ReferenceContainer key) { - return this.embeddedOrder.cardinal(key.getWordHash().getBytes()); + return this.embeddedOrder.cardinal(key.getTermHash().getBytes()); } } diff --git a/source/de/anomic/kelondro/text/ReferenceOrder.java b/source/de/anomic/kelondro/text/ReferenceOrder.java index e3a108bb8..491a22a55 100644 --- a/source/de/anomic/kelondro/text/ReferenceOrder.java +++ b/source/de/anomic/kelondro/text/ReferenceOrder.java @@ -32,15 +32,17 @@ import java.util.Iterator; import java.util.Map; import de.anomic.kelondro.order.Bitfield; +import de.anomic.kelondro.text.referencePrototype.WordReferenceRow; +import de.anomic.kelondro.text.referencePrototype.WordReferenceVars; import de.anomic.kelondro.util.ScoreCluster; -import de.anomic.plasma.plasmaCondenser; import de.anomic.plasma.plasmaSearchRankingProcess; import de.anomic.plasma.plasmaSearchRankingProfile; +import de.anomic.plasma.parser.Condenser; import de.anomic.server.serverProcessor; import de.anomic.yacy.yacyURL; public class ReferenceOrder { - private ReferenceVars min, max; + private WordReferenceVars min, max; private final plasmaSearchRankingProfile ranking; private final ScoreCluster doms; // collected for "authority" heuristic private int maxdomcount; @@ -55,10 +57,10 @@ public class 
ReferenceOrder { this.language = language; } - public ArrayList normalizeWith(final ReferenceContainer container) { + public ArrayList normalizeWith(final ReferenceContainer container) { // normalize ranking: find minimum and maxiumum of separate ranking criteria assert (container != null); - ArrayList result = null; + ArrayList result = null; //long s0 = System.currentTimeMillis(); if ((serverProcessor.useCPU > 1) && (container.size() > 600)) { @@ -112,7 +114,7 @@ public class ReferenceOrder { return (doms.getScore(urlHash.substring(6)) << 8) / (1 + this.maxdomcount); } - public long cardinal(final ReferenceVars t) { + public long cardinal(final WordReferenceVars t) { //return Long.MAX_VALUE - preRanking(ranking, iEntry, this.entryMin, this.entryMax, this.searchWords); // the normalizedEntry must be a normalized indexEntry final Bitfield flags = t.flags(); @@ -136,17 +138,17 @@ public class ReferenceOrder { + ((max.hitcount() == min.hitcount()) ? 0 : (((t.hitcount() - min.hitcount() ) << 8) / (max.hitcount() - min.hitcount()) ) << ranking.coeff_hitcount) + tf + ((ranking.coeff_authority > 12) ? (authority(t.urlHash()) << ranking.coeff_authority) : 0) - + ((flags.get(Reference.flag_app_dc_identifier)) ? 255 << ranking.coeff_appurl : 0) - + ((flags.get(Reference.flag_app_dc_title)) ? 255 << ranking.coeff_app_dc_title : 0) - + ((flags.get(Reference.flag_app_dc_creator)) ? 255 << ranking.coeff_app_dc_creator : 0) - + ((flags.get(Reference.flag_app_dc_subject)) ? 255 << ranking.coeff_app_dc_subject : 0) - + ((flags.get(Reference.flag_app_dc_description)) ? 255 << ranking.coeff_app_dc_description : 0) - + ((flags.get(Reference.flag_app_emphasized)) ? 255 << ranking.coeff_appemph : 0) - + ((flags.get(plasmaCondenser.flag_cat_indexof)) ? 255 << ranking.coeff_catindexof : 0) - + ((flags.get(plasmaCondenser.flag_cat_hasimage)) ? 255 << ranking.coeff_cathasimage : 0) - + ((flags.get(plasmaCondenser.flag_cat_hasaudio)) ? 255 << ranking.coeff_cathasaudio : 0) - + ((flags.get(plasmaCondenser.flag_cat_hasvideo)) ? 255 << ranking.coeff_cathasvideo : 0) - + ((flags.get(plasmaCondenser.flag_cat_hasapp)) ? 255 << ranking.coeff_cathasapp : 0) + + ((flags.get(WordReferenceRow.flag_app_dc_identifier)) ? 255 << ranking.coeff_appurl : 0) + + ((flags.get(WordReferenceRow.flag_app_dc_title)) ? 255 << ranking.coeff_app_dc_title : 0) + + ((flags.get(WordReferenceRow.flag_app_dc_creator)) ? 255 << ranking.coeff_app_dc_creator : 0) + + ((flags.get(WordReferenceRow.flag_app_dc_subject)) ? 255 << ranking.coeff_app_dc_subject : 0) + + ((flags.get(WordReferenceRow.flag_app_dc_description)) ? 255 << ranking.coeff_app_dc_description : 0) + + ((flags.get(WordReferenceRow.flag_app_emphasized)) ? 255 << ranking.coeff_appemph : 0) + + ((flags.get(Condenser.flag_cat_indexof)) ? 255 << ranking.coeff_catindexof : 0) + + ((flags.get(Condenser.flag_cat_hasimage)) ? 255 << ranking.coeff_cathasimage : 0) + + ((flags.get(Condenser.flag_cat_hasaudio)) ? 255 << ranking.coeff_cathasaudio : 0) + + ((flags.get(Condenser.flag_cat_hasvideo)) ? 255 << ranking.coeff_cathasvideo : 0) + + ((flags.get(Condenser.flag_cat_hasapp)) ? 255 << ranking.coeff_cathasapp : 0) + ((patchUK(t.language).equals(this.language)) ? 255 << ranking.coeff_language : 0) + ((yacyURL.probablyRootURL(t.urlHash())) ? 15 << ranking.coeff_urllength : 0); //if (searchWords != null) r += (yacyURL.probablyWordURL(t.urlHash(), searchWords) != null) ? 
256 << ranking.coeff_appurl : 0; @@ -161,13 +163,13 @@ public class ReferenceOrder { public static class minmaxfinder extends Thread { - ReferenceVars entryMin; - ReferenceVars entryMax; + WordReferenceVars entryMin; + WordReferenceVars entryMax; private final ReferenceContainer container; private final int start, end; private final HashMap doms; private final Integer int1; - ArrayList decodedEntries; + ArrayList decodedEntries; public minmaxfinder(final ReferenceContainer container, final int start /*including*/, final int end /*excluding*/) { this.container = container; @@ -175,19 +177,19 @@ public class ReferenceOrder { this.end = end; this.doms = new HashMap(); this.int1 = 1; - this.decodedEntries = new ArrayList(); + this.decodedEntries = new ArrayList(); } public void run() { // find min/max to obtain limits for normalization this.entryMin = null; this.entryMax = null; - ReferenceVars iEntry; + WordReferenceVars iEntry; int p = this.start; String dom; Integer count; while (p < this.end) { - iEntry = new ReferenceVars(new ReferenceRow(container.get(p++, false))); + iEntry = new WordReferenceVars(new WordReferenceRow(container.get(p++, false))); this.decodedEntries.add(iEntry); // find min/max if (this.entryMin == null) this.entryMin = iEntry.clone(); else this.entryMin.min(iEntry); @@ -203,7 +205,7 @@ public class ReferenceOrder { } } - public ArrayList decodedContainer() { + public ArrayList decodedContainer() { return this.decodedEntries; } diff --git a/source/de/anomic/kelondro/text/URLMetadata.java b/source/de/anomic/kelondro/text/URLMetadata.java deleted file mode 100644 index fa2df81f8..000000000 --- a/source/de/anomic/kelondro/text/URLMetadata.java +++ /dev/null @@ -1,61 +0,0 @@ -// URLMetadata.java -// (C) 2009 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany -// first published 02.03.2009 on http://yacy.net -// -// This is a part of YaCy, a peer-to-peer based web search engine -// -// $LastChangedDate$ -// $LastChangedRevision$ -// $LastChangedBy$ -// -// LICENSE -// -// This program is free software; you can redistribute it and/or modify -// it under the terms of the GNU General Public License as published by -// the Free Software Foundation; either version 2 of the License, or -// (at your option) any later version. -// -// This program is distributed in the hope that it will be useful, -// but WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -// GNU General Public License for more details. 
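
The ranking hunk above now reads the Dublin Core appearance flags from WordReferenceRow and the category flags from Condenser, instead of Reference and plasmaCondenser. A small, hypothetical check against the relocated constants; the helper class is illustrative, while Bitfield.get and the flag names are taken from this patch.

import de.anomic.kelondro.order.Bitfield;
import de.anomic.kelondro.text.referencePrototype.WordReferenceRow;
import de.anomic.plasma.parser.Condenser;

// Illustrative flag checks, hypothetical and not part of the patch.
final class FlagCheckSketch {
    static boolean appearsInTitle(final Bitfield flags) {
        return flags != null && flags.get(WordReferenceRow.flag_app_dc_title);
    }

    static boolean isIndexOfPage(final Bitfield flags) {
        return flags != null && flags.get(Condenser.flag_cat_indexof);
    }
}
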
-// -// You should have received a copy of the GNU General Public License -// along with this program; if not, write to the Free Software -// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - -package de.anomic.kelondro.text; - -import java.net.MalformedURLException; - -import de.anomic.yacy.yacyURL; - -public class URLMetadata { - private yacyURL url; - private final String dc_title, dc_creator, dc_subject, ETag; - - public URLMetadata(final String url, final String urlhash, final String title, final String author, final String tags, final String ETag) { - try { - this.url = new yacyURL(url, urlhash); - } catch (final MalformedURLException e) { - this.url = null; - } - this.dc_title = title; - this.dc_creator = author; - this.dc_subject = tags; - this.ETag = ETag; - } - public URLMetadata(final yacyURL url, final String descr, final String author, final String tags, final String ETag) { - this.url = url; - this.dc_title = descr; - this.dc_creator = author; - this.dc_subject = tags; - this.ETag = ETag; - } - public yacyURL url() { return this.url; } - public String dc_title() { return this.dc_title; } - public String dc_creator() { return this.dc_creator; } - public String dc_subject() { return this.dc_subject; } - public String ETag() { return this.ETag; } - -} diff --git a/source/de/anomic/kelondro/text/MetadataRowContainer.java b/source/de/anomic/kelondro/text/metadataPrototype/URLMetadataRow.java similarity index 87% rename from source/de/anomic/kelondro/text/MetadataRowContainer.java rename to source/de/anomic/kelondro/text/metadataPrototype/URLMetadataRow.java index 0e035ee03..402044f92 100644 --- a/source/de/anomic/kelondro/text/MetadataRowContainer.java +++ b/source/de/anomic/kelondro/text/metadataPrototype/URLMetadataRow.java @@ -4,9 +4,9 @@ // // This is a part of YaCy, a peer-to-peer based web search engine // -// $LastChangedDate$ -// $LastChangedRevision$ -// $LastChangedBy$ +// $LastChangedDate: 2009-03-20 16:44:59 +0100 (Fr, 20 Mrz 2009) $ +// $LastChangedRevision: 5736 $ +// $LastChangedBy: borg-0300 $ // // LICENSE // @@ -24,7 +24,7 @@ // along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA -package de.anomic.kelondro.text; +package de.anomic.kelondro.text.metadataPrototype; import java.io.UnsupportedEncodingException; import java.net.MalformedURLException; @@ -39,6 +39,9 @@ import de.anomic.kelondro.order.Base64Order; import de.anomic.kelondro.order.Bitfield; import de.anomic.kelondro.order.Digest; import de.anomic.kelondro.order.NaturalOrder; +import de.anomic.kelondro.text.Metadata; +import de.anomic.kelondro.text.Reference; +import de.anomic.kelondro.text.referencePrototype.WordReferenceRow; import de.anomic.kelondro.util.DateFormatter; import de.anomic.kelondro.util.FileUtils; import de.anomic.kelondro.util.kelondroException; @@ -48,7 +51,7 @@ import de.anomic.server.serverCodings; import de.anomic.tools.crypt; import de.anomic.yacy.yacyURL; -public class MetadataRowContainer { +public class URLMetadataRow implements Metadata { // this object stores attributes for URL entries @@ -119,7 +122,7 @@ public class MetadataRowContainer { private Reference word; // this is only used if the url is transported via remote search requests private final long ranking; // during generation of a search result this value is set - public MetadataRowContainer( + public URLMetadataRow( final yacyURL url, final String dc_title, final String dc_creator, @@ -198,14 +201,14 @@ public 
class MetadataRowContainer { } } - public MetadataRowContainer(final Row.Entry entry, final Reference searchedWord, final long ranking) { + public URLMetadataRow(final Row.Entry entry, final Reference searchedWord, final long ranking) { this.entry = entry; this.snippet = null; this.word = searchedWord; this.ranking = ranking; } - public MetadataRowContainer(final Properties prop) { + public URLMetadataRow(final Properties prop) { // generates an plasmaLURLEntry using the properties from the argument // the property names must correspond to the one from toString //System.out.println("DEBUG-ENTRY: prop=" + prop.toString()); @@ -264,17 +267,17 @@ public class MetadataRowContainer { this.word = null; if (prop.containsKey("word")) throw new kelondroException("old database structure is not supported"); if (prop.containsKey("wi")) { - this.word = new ReferenceRow(Base64Order.enhancedCoder.decodeString(prop.getProperty("wi", ""), "de.anomic.index.indexURLEntry.indexURLEntry()")); + this.word = new WordReferenceRow(Base64Order.enhancedCoder.decodeString(prop.getProperty("wi", ""), "de.anomic.index.indexURLEntry.indexURLEntry()")); } this.ranking = 0; } - public static MetadataRowContainer importEntry(final String propStr) { + public static URLMetadataRow importEntry(final String propStr) { if (propStr == null || !propStr.startsWith("{") || !propStr.endsWith("}")) { return null; } try { - return new MetadataRowContainer(serverCodings.s2p(propStr.substring(1, propStr.length() - 1))); + return new URLMetadataRow(serverCodings.s2p(propStr.substring(1, propStr.length() - 1))); } catch (final kelondroException e) { // wrong format return null; @@ -283,7 +286,7 @@ public class MetadataRowContainer { private StringBuilder corePropList() { // generate a parseable string; this is a simple property-list - final URLMetadata metadata = this.metadata(); + final Components metadata = this.metadata(); final StringBuilder s = new StringBuilder(300); //System.out.println("author=" + comp.author()); try { @@ -341,9 +344,9 @@ public class MetadataRowContainer { return this.ranking; } - public URLMetadata metadata() { + public Components metadata() { final ArrayList cl = FileUtils.strings(this.entry.getCol("comp", null), "UTF-8"); - return new URLMetadata( + return new Components( (cl.size() > 0) ? (cl.get(0)).trim() : "", hash(), (cl.size() > 1) ? 
(cl.get(1)).trim() : "", @@ -428,7 +431,7 @@ public class MetadataRowContainer { return word; } - public boolean isOlder(final MetadataRowContainer other) { + public boolean isOlder(final Metadata other) { if (other == null) return false; final Date tmoddate = moddate(); final Date omoddate = other.moddate(); @@ -487,4 +490,33 @@ public class MetadataRowContainer { //return "{" + core + "}"; } + public class Components { + private yacyURL url; + private final String dc_title, dc_creator, dc_subject, ETag; + + public Components(final String url, final String urlhash, final String title, final String author, final String tags, final String ETag) { + try { + this.url = new yacyURL(url, urlhash); + } catch (final MalformedURLException e) { + this.url = null; + } + this.dc_title = title; + this.dc_creator = author; + this.dc_subject = tags; + this.ETag = ETag; + } + public Components(final yacyURL url, final String descr, final String author, final String tags, final String ETag) { + this.url = url; + this.dc_title = descr; + this.dc_creator = author; + this.dc_subject = tags; + this.ETag = ETag; + } + public yacyURL url() { return this.url; } + public String dc_title() { return this.dc_title; } + public String dc_creator() { return this.dc_creator; } + public String dc_subject() { return this.dc_subject; } + public String ETag() { return this.ETag; } + + } } diff --git a/source/de/anomic/kelondro/text/ReferenceRow.java b/source/de/anomic/kelondro/text/referencePrototype/WordReferenceRow.java similarity index 85% rename from source/de/anomic/kelondro/text/ReferenceRow.java rename to source/de/anomic/kelondro/text/referencePrototype/WordReferenceRow.java index 6d5be2679..7d6a246d9 100644 --- a/source/de/anomic/kelondro/text/ReferenceRow.java +++ b/source/de/anomic/kelondro/text/referencePrototype/WordReferenceRow.java @@ -4,9 +4,9 @@ // // This is a part of YaCy, a peer-to-peer based web search engine // -// $LastChangedDate$ -// $LastChangedRevision$ -// $LastChangedBy$ +// $LastChangedDate: 2009-03-20 16:44:59 +0100 (Fr, 20 Mrz 2009) $ +// $LastChangedRevision: 5736 $ +// $LastChangedBy: borg-0300 $ // // LICENSE // @@ -24,7 +24,7 @@ // along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA -package de.anomic.kelondro.text; +package de.anomic.kelondro.text.referencePrototype; import de.anomic.kelondro.index.Column; import de.anomic.kelondro.index.Row; @@ -32,9 +32,10 @@ import de.anomic.kelondro.index.Row.Entry; import de.anomic.kelondro.order.Base64Order; import de.anomic.kelondro.order.Bitfield; import de.anomic.kelondro.order.MicroDate; +import de.anomic.kelondro.text.Reference; import de.anomic.yacy.yacySeedDB; -public final class ReferenceRow implements Reference, Cloneable { +public final class WordReferenceRow implements Reference, Cloneable { // this object stores attributes to URL references inside RWI collections @@ -88,9 +89,19 @@ public final class ReferenceRow implements Reference, Cloneable { private static final int col_reserve1 = 18; // i 1 reserve1 private static final int col_reserve2 = 19; // k 1 reserve2 + // appearance flags, used in RWI entry + // some names are derived from the Dublin Core Metadata tag set + // the flags 0..23 are identical to the category flags in plasmaCondenser + public static final int flag_app_dc_description= 24; // word appears in anchor description text (the reference to an url), or any alternative text field of a link + public static final int flag_app_dc_title = 25; 
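
With the standalone URLMetadata class deleted, its accessors live on as the URLMetadataRow.Components inner class defined above. A minimal, hypothetical lookup that follows the load(hash, null, 0) and metadata() pattern used by the callers patched earlier in this commit; the helper class and its method name are illustrative.

import de.anomic.kelondro.text.MetadataRepository;
import de.anomic.kelondro.text.metadataPrototype.URLMetadataRow;

// Illustrative lookup, hypothetical and not part of the patch.
final class MetadataLookupSketch {
    static String titleOf(final MetadataRepository repository, final String urlHash) {
        final URLMetadataRow row = repository.load(urlHash, null, 0);
        if (row == null) return null;
        final URLMetadataRow.Components metadata = row.metadata(); // replaces the removed URLMetadata class
        return (metadata.url() == null) ? null : metadata.dc_title();
    }
}
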
// word appears in title or headline or any description part + public static final int flag_app_dc_creator = 26; // word appears in author + public static final int flag_app_dc_subject = 27; // word appears in header tags or other descriptive part + public static final int flag_app_dc_identifier = 28; // word appears in url or document identifier + public static final int flag_app_emphasized = 29; // word is emphasized in text (i.e. bold, italics, special size) + private final Row.Entry entry; - public ReferenceRow(final String urlHash, + public WordReferenceRow(final String urlHash, final int urlLength, // byte-length of complete URL final int urlComps, // number of path components final int titleLength, // length of description/length (longer are better?) @@ -135,32 +146,32 @@ public final class ReferenceRow implements Reference, Cloneable { this.entry.setCol(col_reserve2, 0); } - public ReferenceRow(final String urlHash, final String code) { + public WordReferenceRow(final String urlHash, final String code) { // the code is the external form of the row minus the leading urlHash entry this.entry = urlEntryRow.newEntry((urlHash + code).getBytes()); } - public ReferenceRow(final String external) { + public WordReferenceRow(final String external) { this.entry = urlEntryRow.newEntry(external, true); } - public ReferenceRow(final byte[] row) { + public WordReferenceRow(final byte[] row) { this.entry = urlEntryRow.newEntry(row); } - public ReferenceRow(final byte[] row, final int offset, final boolean clone) { + public WordReferenceRow(final byte[] row, final int offset, final boolean clone) { this.entry = urlEntryRow.newEntry(row, offset, clone); } - public ReferenceRow(final Row.Entry rentry) { + public WordReferenceRow(final Row.Entry rentry) { // FIXME: see if cloning is necessary this.entry = rentry; } - public ReferenceRow clone() { + public WordReferenceRow clone() { final byte[] b = new byte[urlEntryRow.objectsize]; System.arraycopy(entry.bytes(), 0, b, 0, urlEntryRow.objectsize); - return new ReferenceRow(b); + return new WordReferenceRow(b); } public String toPropertyForm() { diff --git a/source/de/anomic/kelondro/text/ReferenceVars.java b/source/de/anomic/kelondro/text/referencePrototype/WordReferenceVars.java similarity index 91% rename from source/de/anomic/kelondro/text/ReferenceVars.java rename to source/de/anomic/kelondro/text/referencePrototype/WordReferenceVars.java index f6c4be337..cc9039ae9 100644 --- a/source/de/anomic/kelondro/text/ReferenceVars.java +++ b/source/de/anomic/kelondro/text/referencePrototype/WordReferenceVars.java @@ -4,9 +4,9 @@ // // This is a part of YaCy, a peer-to-peer based web search engine // -// $LastChangedDate$ -// $LastChangedRevision$ -// $LastChangedBy$ +// $LastChangedDate: 2009-03-20 16:44:59 +0100 (Fr, 20 Mrz 2009) $ +// $LastChangedRevision: 5736 $ +// $LastChangedBy: borg-0300 $ // // LICENSE // @@ -24,12 +24,13 @@ // along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA -package de.anomic.kelondro.text; +package de.anomic.kelondro.text.referencePrototype; import de.anomic.kelondro.order.Bitfield; import de.anomic.kelondro.order.MicroDate; +import de.anomic.kelondro.text.Reference; -public class ReferenceVars implements Reference, Cloneable { +public class WordReferenceVars implements Reference, Cloneable { public Bitfield flags; public long freshUntil, lastModified; @@ -41,7 +42,7 @@ public class ReferenceVars implements Reference, Cloneable { worddistance, 
wordsintext, wordsintitle; public double termFrequency; - public ReferenceVars(final String urlHash, + public WordReferenceVars(final String urlHash, final int urlLength, // byte-length of complete URL final int urlComps, // number of path components final int titleLength, // length of description/length (longer are better?) @@ -86,7 +87,7 @@ public class ReferenceVars implements Reference, Cloneable { this.termFrequency = termfrequency; } - public ReferenceVars(final ReferenceRow e) { + public WordReferenceVars(final WordReferenceRow e) { this.flags = e.flags(); this.freshUntil = e.freshUntil(); this.lastModified = e.lastModified(); @@ -109,8 +110,8 @@ public class ReferenceVars implements Reference, Cloneable { this.termFrequency = e.termFrequency(); } - public ReferenceVars clone() { - final ReferenceVars c = new ReferenceVars( + public WordReferenceVars clone() { + final WordReferenceVars c = new WordReferenceVars( this.urlHash, this.urllength, this.urlcomps, @@ -133,7 +134,7 @@ public class ReferenceVars implements Reference, Cloneable { return c; } - public void join(final ReferenceVars oe) { + public void join(final WordReferenceVars oe) { // combine the distance this.worddistance = this.worddistance + oe.worddistance + Math.abs(this.posintext - oe.posintext); this.posintext = Math.min(this.posintext, oe.posintext); @@ -203,8 +204,8 @@ public class ReferenceVars implements Reference, Cloneable { return posofphrase; } - public ReferenceRow toRowEntry() { - return new ReferenceRow( + public WordReferenceRow toRowEntry() { + return new WordReferenceRow( urlHash, urllength, // byte-length of complete URL urlcomps, // number of path components @@ -262,7 +263,7 @@ public class ReferenceVars implements Reference, Cloneable { return this.termFrequency; } - public final void min(final ReferenceVars other) { + public final void min(final WordReferenceVars other) { int v; long w; double d; @@ -284,7 +285,7 @@ public class ReferenceVars implements Reference, Cloneable { if (this.termFrequency > (d = other.termFrequency)) this.termFrequency = d; } - public final void max(final ReferenceVars other) { + public final void max(final WordReferenceVars other) { int v; long w; double d; diff --git a/source/de/anomic/plasma/plasmaCondenser.java b/source/de/anomic/plasma/parser/Condenser.java similarity index 93% rename from source/de/anomic/plasma/plasmaCondenser.java rename to source/de/anomic/plasma/parser/Condenser.java index e17a4669d..a620dcf15 100644 --- a/source/de/anomic/plasma/plasmaCondenser.java +++ b/source/de/anomic/plasma/parser/Condenser.java @@ -23,7 +23,7 @@ // compile with javac -sourcepath source source/de/anomic/plasma/plasmaCondenser.java // execute with java -cp source de.anomic.plasma.plasmaCondenser -package de.anomic.plasma; +package de.anomic.plasma.parser; import java.io.BufferedReader; import java.io.ByteArrayInputStream; @@ -49,14 +49,13 @@ import java.util.TreeSet; import de.anomic.htmlFilter.htmlFilterContentScraper; import de.anomic.htmlFilter.htmlFilterImageEntry; import de.anomic.kelondro.order.Bitfield; -import de.anomic.kelondro.text.Phrase; -import de.anomic.kelondro.text.Reference; -import de.anomic.kelondro.text.Word; +import de.anomic.kelondro.text.referencePrototype.WordReferenceRow; import de.anomic.kelondro.util.SetTools; import de.anomic.language.identification.Identificator; +import de.anomic.plasma.plasmaParserDocument; import de.anomic.yacy.yacyURL; -public final class plasmaCondenser { +public final class Condenser { // this is the page analysis class @@ 
-101,7 +100,7 @@ public final class plasmaCondenser { public Bitfield RESULT_FLAGS = new Bitfield(4); Identificator languageIdentificator; - public plasmaCondenser(final plasmaParserDocument document, final boolean indexText, final boolean indexMedia) throws UnsupportedEncodingException { + public Condenser(final plasmaParserDocument document, final boolean indexText, final boolean indexMedia) throws UnsupportedEncodingException { // if addMedia == true, then all the media links are also parsed and added to the words // added media words are flagged with the appropriate media flag this.wordminsize = 3; @@ -133,13 +132,13 @@ public final class plasmaCondenser { // phrase 99 is taken from the media Link url and anchor description // phrase 100 and above are lines from the text - insertTextToWords(document.dc_title(), 1, Reference.flag_app_dc_title, RESULT_FLAGS, true); - insertTextToWords(document.dc_description(), 3, Reference.flag_app_dc_description, RESULT_FLAGS, true); - insertTextToWords(document.dc_creator(), 4, Reference.flag_app_dc_creator, RESULT_FLAGS, true); + insertTextToWords(document.dc_title(), 1, WordReferenceRow.flag_app_dc_title, RESULT_FLAGS, true); + insertTextToWords(document.dc_description(), 3, WordReferenceRow.flag_app_dc_description, RESULT_FLAGS, true); + insertTextToWords(document.dc_creator(), 4, WordReferenceRow.flag_app_dc_creator, RESULT_FLAGS, true); // missing: tags! final String[] titles = document.getSectionTitles(); for (int i = 0; i < titles.length; i++) { - insertTextToWords(titles[i], i + 10, Reference.flag_app_emphasized, RESULT_FLAGS, true); + insertTextToWords(titles[i], i + 10, WordReferenceRow.flag_app_emphasized, RESULT_FLAGS, true); } // anchors: for text indexing we add only the anchor description @@ -164,7 +163,7 @@ public final class plasmaCondenser { } // add the URL components to the word list - insertTextToWords(document.dc_source().toNormalform(false, true), 0, Reference.flag_app_dc_identifier, RESULT_FLAGS, false); + insertTextToWords(document.dc_source().toNormalform(false, true), 0, WordReferenceRow.flag_app_dc_identifier, RESULT_FLAGS, false); if (indexMedia) { // add anchor descriptions: here, we also add the url components @@ -241,11 +240,11 @@ public final class plasmaCondenser { } } - public plasmaCondenser(final InputStream text, final String charset) throws UnsupportedEncodingException { + public Condenser(final InputStream text, final String charset) throws UnsupportedEncodingException { this(text, charset, 3, 2); } - public plasmaCondenser(final InputStream text, final String charset, final int wordminsize, final int wordcut) throws UnsupportedEncodingException { + public Condenser(final InputStream text, final String charset, final int wordminsize, final int wordcut) throws UnsupportedEncodingException { this.wordminsize = wordminsize; this.wordcut = wordcut; this.languageIdentificator = null; // we don't need that here @@ -715,7 +714,7 @@ public final class plasmaCondenser { buffer = new ByteArrayInputStream(text.getBytes()); } try { - return new plasmaCondenser(buffer, "UTF-8", 2, 1).words(); + return new Condenser(buffer, "UTF-8", 2, 1).words(); } catch (final UnsupportedEncodingException e) { return null; } diff --git a/source/de/anomic/kelondro/text/Document.java b/source/de/anomic/plasma/parser/Document.java similarity index 95% rename from source/de/anomic/kelondro/text/Document.java rename to source/de/anomic/plasma/parser/Document.java index 30de51799..0584907c7 100644 --- 
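
With plasmaCondenser renamed to de.anomic.plasma.parser.Condenser, call sites only change their import and constructor name; the word extraction itself is untouched. A minimal sketch under that assumption, mirroring the patched getWords() helper above; the wrapper class and method are illustrative, and the Map value type Word is taken from the plasmaWordIndex hunk further down:

import java.io.ByteArrayInputStream;
import java.io.InputStream;
import java.io.UnsupportedEncodingException;
import java.util.Map;

import de.anomic.plasma.parser.Condenser;
import de.anomic.plasma.parser.Word;

// Illustrative helper: tokenize a text snippet with the renamed Condenser.
final class CondenserSketch {
    static Map<String, Word> wordsOf(final String text) {
        final InputStream buffer = new ByteArrayInputStream(text.getBytes());
        try {
            return new Condenser(buffer, "UTF-8", 2, 1).words(); // same call as in the patched getWords()
        } catch (final UnsupportedEncodingException e) {
            return null;
        }
    }
}
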
a/source/de/anomic/kelondro/text/Document.java +++ b/source/de/anomic/plasma/parser/Document.java @@ -24,7 +24,7 @@ // along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA -package de.anomic.kelondro.text; +package de.anomic.plasma.parser; import java.util.Date; diff --git a/source/de/anomic/kelondro/text/Phrase.java b/source/de/anomic/plasma/parser/Phrase.java similarity index 94% rename from source/de/anomic/kelondro/text/Phrase.java rename to source/de/anomic/plasma/parser/Phrase.java index 8c91189b6..d8eaf5cd9 100644 --- a/source/de/anomic/kelondro/text/Phrase.java +++ b/source/de/anomic/plasma/parser/Phrase.java @@ -24,7 +24,7 @@ // along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA -package de.anomic.kelondro.text; +package de.anomic.plasma.parser; import java.util.HashSet; diff --git a/source/de/anomic/kelondro/text/Word.java b/source/de/anomic/plasma/parser/Word.java similarity index 96% rename from source/de/anomic/kelondro/text/Word.java rename to source/de/anomic/plasma/parser/Word.java index 543536f8d..7112f640f 100644 --- a/source/de/anomic/kelondro/text/Word.java +++ b/source/de/anomic/plasma/parser/Word.java @@ -24,7 +24,7 @@ // along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA -package de.anomic.kelondro.text; +package de.anomic.plasma.parser; import java.util.HashSet; import java.util.Iterator; diff --git a/source/de/anomic/plasma/plasmaDbImporter.java b/source/de/anomic/plasma/plasmaDbImporter.java index a02b3067a..53f142dfe 100644 --- a/source/de/anomic/plasma/plasmaDbImporter.java +++ b/source/de/anomic/plasma/plasmaDbImporter.java @@ -6,10 +6,10 @@ import java.util.TreeSet; import de.anomic.crawler.AbstractImporter; import de.anomic.crawler.Importer; -import de.anomic.kelondro.text.MetadataRowContainer; import de.anomic.kelondro.text.Reference; import de.anomic.kelondro.text.ReferenceContainer; -import de.anomic.kelondro.text.ReferenceRow; +import de.anomic.kelondro.text.metadataPrototype.URLMetadataRow; +import de.anomic.kelondro.text.referencePrototype.WordReferenceRow; import de.anomic.kelondro.util.DateFormatter; public class plasmaDbImporter extends AbstractImporter implements Importer { @@ -109,11 +109,11 @@ public class plasmaDbImporter extends AbstractImporter implements Importer { try { this.wordCounter++; newContainer = indexContainerIterator.next(); - this.wordHash = newContainer.getWordHash(); + this.wordHash = newContainer.getTermHash(); // loop throug the entities of the container and get the // urlhash - final Iterator importWordIdxEntries = newContainer.entries(); + final Iterator importWordIdxEntries = newContainer.entries(); Reference importWordIdxEntry; while (importWordIdxEntries.hasNext()) { // testing if import process was aborted @@ -141,7 +141,7 @@ public class plasmaDbImporter extends AbstractImporter implements Importer { // we need to import the url // getting the url entry - final MetadataRowContainer urlEntry = this.importWordIndex.metadata().load(urlHash, null, 0); + final URLMetadataRow urlEntry = this.importWordIndex.metadata().load(urlHash, null, 0); if (urlEntry != null) { /* write it into the home url db */ @@ -206,7 +206,7 @@ public class plasmaDbImporter extends AbstractImporter implements Importer { final TreeSet containers = this.importWordIndex.index().references(this.wordHash, 
false, 100, false); indexContainerIterator = containers.iterator(); // Make sure we don't get the same wordhash twice, but don't skip a word - if ((indexContainerIterator.hasNext())&&(!this.wordHash.equals((indexContainerIterator.next()).getWordHash()))) { + if ((indexContainerIterator.hasNext())&&(!this.wordHash.equals((indexContainerIterator.next()).getTermHash()))) { indexContainerIterator = containers.iterator(); } } diff --git a/source/de/anomic/plasma/plasmaHTCache.java b/source/de/anomic/plasma/plasmaHTCache.java index 5ddebf492..bf182ca7d 100644 --- a/source/de/anomic/plasma/plasmaHTCache.java +++ b/source/de/anomic/plasma/plasmaHTCache.java @@ -48,9 +48,9 @@ import de.anomic.kelondro.blob.BLOBCompressor; import de.anomic.kelondro.blob.BLOBHeap; import de.anomic.kelondro.blob.MapView; import de.anomic.kelondro.order.Base64Order; -import de.anomic.kelondro.text.Document; import de.anomic.kelondro.util.Log; import de.anomic.kelondro.util.FileUtils; +import de.anomic.plasma.parser.Document; import de.anomic.yacy.yacySeedDB; import de.anomic.yacy.yacyURL; diff --git a/source/de/anomic/plasma/plasmaParserDocument.java b/source/de/anomic/plasma/plasmaParserDocument.java index 8b0dacb9c..7994859d9 100644 --- a/source/de/anomic/plasma/plasmaParserDocument.java +++ b/source/de/anomic/plasma/plasmaParserDocument.java @@ -43,6 +43,7 @@ import de.anomic.htmlFilter.htmlFilterContentScraper; import de.anomic.htmlFilter.htmlFilterImageEntry; import de.anomic.kelondro.util.FileUtils; import de.anomic.plasma.parser.Parser; +import de.anomic.plasma.parser.Condenser; import de.anomic.server.serverCachedFileOutputStream; import de.anomic.yacy.yacyURL; @@ -282,7 +283,7 @@ dc_rights public Iterator getSentences(final boolean pre) { if (this.text == null) return null; - final plasmaCondenser.sentencesFromInputStreamEnum e = plasmaCondenser.sentencesFromInputStream(getText()); + final Condenser.sentencesFromInputStreamEnum e = Condenser.sentencesFromInputStream(getText()); e.pre(pre); return e; } @@ -439,7 +440,7 @@ dc_rights this.favicon = faviconURL; } - public void notifyWebStructure(final plasmaWebStructure webStructure, final plasmaCondenser condenser, final Date docDate) { + public void notifyWebStructure(final plasmaWebStructure webStructure, final Condenser condenser, final Date docDate) { final Integer[] ioLinks = webStructure.generateCitationReference(this, condenser, docDate); // [outlinksSame, outlinksOther] this.inboundLinks = ioLinks[0].intValue(); this.outboundLinks = ioLinks[1].intValue(); diff --git a/source/de/anomic/plasma/plasmaRankingCRProcess.java b/source/de/anomic/plasma/plasmaRankingCRProcess.java index 509d29fc9..9b9a0562f 100644 --- a/source/de/anomic/plasma/plasmaRankingCRProcess.java +++ b/source/de/anomic/plasma/plasmaRankingCRProcess.java @@ -388,7 +388,7 @@ public class plasmaRankingCRProcess { CloneableIterator cr_entry; while (i.hasNext()) { keycollection = i.next(); - referee = keycollection.getWordHash(); + referee = keycollection.getTermHash(); if (referee.length() == 6) refereeDom = referee; else refereeDom = referee.substring(6); cr_entry = keycollection.rows(); diff --git a/source/de/anomic/plasma/plasmaSearchAPI.java b/source/de/anomic/plasma/plasmaSearchAPI.java index 03f80857c..5ad20c3b4 100644 --- a/source/de/anomic/plasma/plasmaSearchAPI.java +++ b/source/de/anomic/plasma/plasmaSearchAPI.java @@ -32,12 +32,13 @@ import java.util.Date; import java.util.Iterator; import java.util.List; +import de.anomic.data.Blacklist; import de.anomic.data.listManager; import 
de.anomic.kelondro.order.Bitfield; -import de.anomic.kelondro.text.MetadataRowContainer; -import de.anomic.kelondro.text.Reference; -import de.anomic.kelondro.text.Blacklist; +import de.anomic.kelondro.text.metadataPrototype.URLMetadataRow; +import de.anomic.kelondro.text.referencePrototype.WordReferenceRow; import de.anomic.kelondro.util.DateFormatter; +import de.anomic.plasma.parser.Condenser; import de.anomic.server.serverObjects; import de.anomic.yacy.yacySeed; import de.anomic.yacy.yacyURL; @@ -54,17 +55,17 @@ public class plasmaSearchAPI { if (post.get("flags","").length() == 0) return null; return new Bitfield(4, post.get("flags")); } - if (post.get("description", "").equals("on")) b.set(Reference.flag_app_dc_description, true); - if (post.get("title", "").equals("on")) b.set(Reference.flag_app_dc_title, true); - if (post.get("creator", "").equals("on")) b.set(Reference.flag_app_dc_creator, true); - if (post.get("subject", "").equals("on")) b.set(Reference.flag_app_dc_subject, true); - if (post.get("url", "").equals("on")) b.set(Reference.flag_app_dc_identifier, true); - if (post.get("emphasized", "").equals("on")) b.set(Reference.flag_app_emphasized, true); - if (post.get("image", "").equals("on")) b.set(plasmaCondenser.flag_cat_hasimage, true); - if (post.get("audio", "").equals("on")) b.set(plasmaCondenser.flag_cat_hasaudio, true); - if (post.get("video", "").equals("on")) b.set(plasmaCondenser.flag_cat_hasvideo, true); - if (post.get("app", "").equals("on")) b.set(plasmaCondenser.flag_cat_hasapp, true); - if (post.get("indexof", "").equals("on")) b.set(plasmaCondenser.flag_cat_indexof, true); + if (post.get("description", "").equals("on")) b.set(WordReferenceRow.flag_app_dc_description, true); + if (post.get("title", "").equals("on")) b.set(WordReferenceRow.flag_app_dc_title, true); + if (post.get("creator", "").equals("on")) b.set(WordReferenceRow.flag_app_dc_creator, true); + if (post.get("subject", "").equals("on")) b.set(WordReferenceRow.flag_app_dc_subject, true); + if (post.get("url", "").equals("on")) b.set(WordReferenceRow.flag_app_dc_identifier, true); + if (post.get("emphasized", "").equals("on")) b.set(WordReferenceRow.flag_app_emphasized, true); + if (post.get("image", "").equals("on")) b.set(Condenser.flag_cat_hasimage, true); + if (post.get("audio", "").equals("on")) b.set(Condenser.flag_cat_hasaudio, true); + if (post.get("video", "").equals("on")) b.set(Condenser.flag_cat_hasvideo, true); + if (post.get("app", "").equals("on")) b.set(Condenser.flag_cat_hasapp, true); + if (post.get("indexof", "").equals("on")) b.set(Condenser.flag_cat_indexof, true); return b; } @@ -96,17 +97,17 @@ public class plasmaSearchAPI { } else { prop.put("searchresult", 3); prop.put("searchresult_allurl", ranked.filteredCount()); - prop.put("searchresult_description", ranked.flagCount()[Reference.flag_app_dc_description]); - prop.put("searchresult_title", ranked.flagCount()[Reference.flag_app_dc_title]); - prop.put("searchresult_creator", ranked.flagCount()[Reference.flag_app_dc_creator]); - prop.put("searchresult_subject", ranked.flagCount()[Reference.flag_app_dc_subject]); - prop.put("searchresult_url", ranked.flagCount()[Reference.flag_app_dc_identifier]); - prop.put("searchresult_emphasized", ranked.flagCount()[Reference.flag_app_emphasized]); - prop.put("searchresult_image", ranked.flagCount()[plasmaCondenser.flag_cat_hasimage]); - prop.put("searchresult_audio", ranked.flagCount()[plasmaCondenser.flag_cat_hasaudio]); - prop.put("searchresult_video", 
ranked.flagCount()[plasmaCondenser.flag_cat_hasvideo]); - prop.put("searchresult_app", ranked.flagCount()[plasmaCondenser.flag_cat_hasapp]); - prop.put("searchresult_indexof", ranked.flagCount()[plasmaCondenser.flag_cat_indexof]); + prop.put("searchresult_description", ranked.flagCount()[WordReferenceRow.flag_app_dc_description]); + prop.put("searchresult_title", ranked.flagCount()[WordReferenceRow.flag_app_dc_title]); + prop.put("searchresult_creator", ranked.flagCount()[WordReferenceRow.flag_app_dc_creator]); + prop.put("searchresult_subject", ranked.flagCount()[WordReferenceRow.flag_app_dc_subject]); + prop.put("searchresult_url", ranked.flagCount()[WordReferenceRow.flag_app_dc_identifier]); + prop.put("searchresult_emphasized", ranked.flagCount()[WordReferenceRow.flag_app_emphasized]); + prop.put("searchresult_image", ranked.flagCount()[Condenser.flag_cat_hasimage]); + prop.put("searchresult_audio", ranked.flagCount()[Condenser.flag_cat_hasaudio]); + prop.put("searchresult_video", ranked.flagCount()[Condenser.flag_cat_hasvideo]); + prop.put("searchresult_app", ranked.flagCount()[Condenser.flag_cat_hasapp]); + prop.put("searchresult_indexof", ranked.flagCount()[Condenser.flag_cat_indexof]); } return ranked; } @@ -126,7 +127,7 @@ public class plasmaSearchAPI { prop.put("genUrlList_lines", maxlines); int i = 0; yacyURL url; - MetadataRowContainer entry; + URLMetadataRow entry; String us; long rn = -1; while ((ranked.size() > 0) && ((entry = ranked.bestURL(false)) != null)) { @@ -161,17 +162,17 @@ public class plasmaSearchAPI { prop.putNum("genUrlList_urlList_"+i+"_urlExists_urlcomps", entry.word().urlcomps()); prop.putNum("genUrlList_urlList_"+i+"_urlExists_urllength", entry.word().urllength()); prop.put("genUrlList_urlList_"+i+"_urlExists_props", - ((entry.word().flags().get(plasmaCondenser.flag_cat_indexof)) ? "appears on index page, " : "") + - ((entry.word().flags().get(plasmaCondenser.flag_cat_hasimage)) ? "contains images, " : "") + - ((entry.word().flags().get(plasmaCondenser.flag_cat_hasaudio)) ? "contains audio, " : "") + - ((entry.word().flags().get(plasmaCondenser.flag_cat_hasvideo)) ? "contains video, " : "") + - ((entry.word().flags().get(plasmaCondenser.flag_cat_hasapp)) ? "contains applications, " : "") + - ((entry.word().flags().get(Reference.flag_app_dc_identifier)) ? "appears in url, " : "") + - ((entry.word().flags().get(Reference.flag_app_dc_title)) ? "appears in title, " : "") + - ((entry.word().flags().get(Reference.flag_app_dc_creator)) ? "appears in author, " : "") + - ((entry.word().flags().get(Reference.flag_app_dc_subject)) ? "appears in subject, " : "") + - ((entry.word().flags().get(Reference.flag_app_dc_description)) ? "appears in description, " : "") + - ((entry.word().flags().get(Reference.flag_app_emphasized)) ? "appears emphasized, " : "") + + ((entry.word().flags().get(Condenser.flag_cat_indexof)) ? "appears on index page, " : "") + + ((entry.word().flags().get(Condenser.flag_cat_hasimage)) ? "contains images, " : "") + + ((entry.word().flags().get(Condenser.flag_cat_hasaudio)) ? "contains audio, " : "") + + ((entry.word().flags().get(Condenser.flag_cat_hasvideo)) ? "contains video, " : "") + + ((entry.word().flags().get(Condenser.flag_cat_hasapp)) ? "contains applications, " : "") + + ((entry.word().flags().get(WordReferenceRow.flag_app_dc_identifier)) ? "appears in url, " : "") + + ((entry.word().flags().get(WordReferenceRow.flag_app_dc_title)) ? "appears in title, " : "") + + ((entry.word().flags().get(WordReferenceRow.flag_app_dc_creator)) ? 
"appears in author, " : "") + + ((entry.word().flags().get(WordReferenceRow.flag_app_dc_subject)) ? "appears in subject, " : "") + + ((entry.word().flags().get(WordReferenceRow.flag_app_dc_description)) ? "appears in description, " : "") + + ((entry.word().flags().get(WordReferenceRow.flag_app_emphasized)) ? "appears emphasized, " : "") + ((yacyURL.probablyRootURL(entry.word().urlHash())) ? "probably root url" : "") ); if (plasmaSwitchboard.urlBlacklist.isListed(Blacklist.BLACKLIST_DHT, url)) { diff --git a/source/de/anomic/plasma/plasmaSearchEvent.java b/source/de/anomic/plasma/plasmaSearchEvent.java index ca4f10ba5..dac2539cc 100644 --- a/source/de/anomic/plasma/plasmaSearchEvent.java +++ b/source/de/anomic/plasma/plasmaSearchEvent.java @@ -39,17 +39,17 @@ import java.util.concurrent.ConcurrentHashMap; import de.anomic.crawler.ResultURLs; import de.anomic.kelondro.order.Bitfield; -import de.anomic.kelondro.text.MetadataRowContainer; import de.anomic.kelondro.text.Reference; import de.anomic.kelondro.text.ReferenceContainer; -import de.anomic.kelondro.text.ReferenceVars; -import de.anomic.kelondro.text.URLMetadata; -import de.anomic.kelondro.text.Word; +import de.anomic.kelondro.text.metadataPrototype.URLMetadataRow; +import de.anomic.kelondro.text.referencePrototype.WordReferenceVars; import de.anomic.kelondro.util.MemoryControl; import de.anomic.kelondro.util.SetTools; import de.anomic.kelondro.util.SortStack; import de.anomic.kelondro.util.SortStore; import de.anomic.kelondro.util.Log; +import de.anomic.plasma.parser.Word; +import de.anomic.plasma.parser.Condenser; import de.anomic.plasma.plasmaSnippetCache.MediaSnippet; import de.anomic.server.serverProfiling; import de.anomic.yacy.yacySearch; @@ -180,7 +180,7 @@ public final class plasmaSearchEvent { for (Map.Entry entry : this.rankedCache.searchContainerMaps()[0].entrySet()) { wordhash = entry.getKey(); final ReferenceContainer container = entry.getValue(); - assert (container.getWordHash().equals(wordhash)); + assert (container.getTermHash().equals(wordhash)); if (container.size() > maxcount) { IAmaxcounthash = wordhash; maxcount = container.size(); @@ -264,7 +264,7 @@ public final class plasmaSearchEvent { } } - ResultEntry obtainResultEntry(final MetadataRowContainer page, final int snippetFetchMode) { + ResultEntry obtainResultEntry(final URLMetadataRow page, final int snippetFetchMode) { // a search result entry needs some work to produce a result Entry: // - check if url entry exists in LURL-db @@ -280,7 +280,7 @@ public final class plasmaSearchEvent { // find the url entry long startTime = System.currentTimeMillis(); - final URLMetadata metadata = page.metadata(); + final URLMetadataRow.Components metadata = page.metadata(); final String pagetitle = metadata.dc_title().toLowerCase(); if (metadata.url() == null) { registerFailure(page.hash(), "url corrupted (null)"); @@ -304,7 +304,7 @@ public final class plasmaSearchEvent { // check constraints if ((query.constraint != null) && - (query.constraint.get(plasmaCondenser.flag_cat_indexof)) && + (query.constraint.get(Condenser.flag_cat_indexof)) && (!(metadata.dc_title().startsWith("Index of")))) { final Iterator wi = query.queryHashes.iterator(); while (wi.hasNext()) try { wordIndex.index().remove(wi.next(), page.hash()); } catch (IOException e) {} @@ -337,7 +337,7 @@ public final class plasmaSearchEvent { if (query.contentdom == plasmaSearchQuery.CONTENTDOM_TEXT) { // attach text snippet startTime = System.currentTimeMillis(); - final plasmaSnippetCache.TextSnippet snippet = 
plasmaSnippetCache.retrieveTextSnippet(metadata, snippetFetchWordHashes, (snippetFetchMode == 2), ((query.constraint != null) && (query.constraint.get(plasmaCondenser.flag_cat_indexof))), 180, 3000, (snippetFetchMode == 2) ? Integer.MAX_VALUE : 30000, query.isGlobal()); + final plasmaSnippetCache.TextSnippet snippet = plasmaSnippetCache.retrieveTextSnippet(metadata, snippetFetchWordHashes, (snippetFetchMode == 2), ((query.constraint != null) && (query.constraint.get(Condenser.flag_cat_indexof))), 180, 3000, (snippetFetchMode == 2) ? Integer.MAX_VALUE : 30000, query.isGlobal()); final long snippetComputationTime = System.currentTimeMillis() - startTime; Log.logInfo("SEARCH_EVENT", "text snippet load time for " + metadata.url() + ": " + snippetComputationTime + ", " + ((snippet.getErrorCode() < 11) ? "snippet found" : ("no snippet found (" + snippet.getError() + ")"))); @@ -512,7 +512,7 @@ public final class plasmaSearchEvent { public void run() { // start fetching urls and snippets - MetadataRowContainer page; + URLMetadataRow page; final int fetchAhead = snippetMode == 0 ? 0 : 10; while (System.currentTimeMillis() < this.timeout) { this.lastLifeSign = System.currentTimeMillis(); @@ -803,8 +803,8 @@ public final class plasmaSearchEvent { public static class ResultEntry { // payload objects - private final MetadataRowContainer urlentry; - private final URLMetadata urlcomps; // buffer for components + private final URLMetadataRow urlentry; + private final URLMetadataRow.Components urlcomps; // buffer for components private String alternative_urlstring; private String alternative_urlname; private final plasmaSnippetCache.TextSnippet textSnippet; @@ -813,7 +813,7 @@ public final class plasmaSearchEvent { // statistic objects public long dbRetrievalTime, snippetComputationTime; - public ResultEntry(final MetadataRowContainer urlentry, final plasmaWordIndex wordIndex, + public ResultEntry(final URLMetadataRow urlentry, final plasmaWordIndex wordIndex, final plasmaSnippetCache.TextSnippet textSnippet, final ArrayList mediaSnippets, final long dbRetrievalTime, final long snippetComputationTime) { @@ -837,7 +837,7 @@ public final class plasmaSearchEvent { // seed is not known from here try { wordIndex.index().remove( - Word.words2hashes(plasmaCondenser.getWords( + Word.words2hashes(Condenser.getWords( ("yacyshare " + filename.replace('?', ' ') + " " + @@ -899,10 +899,10 @@ public final class plasmaSearchEvent { public int lapp() { return urlentry.lapp(); } - public ReferenceVars word() { + public WordReferenceVars word() { final Reference word = urlentry.word(); - assert word instanceof ReferenceVars; - return (ReferenceVars) word; + assert word instanceof WordReferenceVars; + return (WordReferenceVars) word; } public boolean hasTextSnippet() { return (this.textSnippet != null) && (this.textSnippet.getErrorCode() < 11); diff --git a/source/de/anomic/plasma/plasmaSearchQuery.java b/source/de/anomic/plasma/plasmaSearchQuery.java index c0eebde90..3da034649 100644 --- a/source/de/anomic/plasma/plasmaSearchQuery.java +++ b/source/de/anomic/plasma/plasmaSearchQuery.java @@ -31,8 +31,9 @@ import de.anomic.htmlFilter.htmlFilterCharacterCoding; import de.anomic.kelondro.order.Base64Order; import de.anomic.kelondro.order.Bitfield; import de.anomic.kelondro.order.NaturalOrder; -import de.anomic.kelondro.text.Word; import de.anomic.kelondro.util.SetTools; +import de.anomic.plasma.parser.Word; +import de.anomic.plasma.parser.Condenser; import de.anomic.yacy.yacySeed; import de.anomic.yacy.yacySeedDB; import 
de.anomic.yacy.yacyURL; @@ -234,7 +235,7 @@ public final class plasmaSearchQuery { public static final boolean matches(final String text, final TreeSet keyhashes) { // returns true if any of the word hashes in keyhashes appear in the String text // to do this, all words in the string must be recognized and transcoded to word hashes - final TreeSet wordhashes = Word.words2hashes(plasmaCondenser.getWords(text).keySet()); + final TreeSet wordhashes = Word.words2hashes(Condenser.getWords(text).keySet()); return SetTools.anymatch(wordhashes, keyhashes); } diff --git a/source/de/anomic/plasma/plasmaSearchRankingProcess.java b/source/de/anomic/plasma/plasmaSearchRankingProcess.java index 3b5a3096e..81eb6b494 100644 --- a/source/de/anomic/plasma/plasmaSearchRankingProcess.java +++ b/source/de/anomic/plasma/plasmaSearchRankingProcess.java @@ -39,16 +39,16 @@ import java.util.concurrent.ConcurrentHashMap; import de.anomic.htmlFilter.htmlFilterContentScraper; import de.anomic.kelondro.index.BinSearch; import de.anomic.kelondro.order.Digest; -import de.anomic.kelondro.text.MetadataRowContainer; import de.anomic.kelondro.text.Reference; import de.anomic.kelondro.text.ReferenceContainer; import de.anomic.kelondro.text.ReferenceOrder; -import de.anomic.kelondro.text.ReferenceVars; -import de.anomic.kelondro.text.URLMetadata; -import de.anomic.kelondro.text.Word; +import de.anomic.kelondro.text.metadataPrototype.URLMetadataRow; +import de.anomic.kelondro.text.referencePrototype.WordReferenceVars; import de.anomic.kelondro.util.ScoreCluster; import de.anomic.kelondro.util.SortStack; import de.anomic.kelondro.util.FileUtils; +import de.anomic.plasma.parser.Word; +import de.anomic.plasma.parser.Condenser; import de.anomic.server.serverProfiling; import de.anomic.yacy.yacyURL; @@ -59,8 +59,8 @@ public final class plasmaSearchRankingProcess { private static boolean useYBR = true; private static final int maxDoubleDomAll = 20, maxDoubleDomSpecial = 10000; - private final SortStack stack; - private final HashMap> doubleDomCache; // key = domhash (6 bytes); value = like stack + private final SortStack stack; + private final HashMap> doubleDomCache; // key = domhash (6 bytes); value = like stack private final HashMap handover; // key = urlhash, value = urlstring; used for double-check of urls that had been handed over to search process private final plasmaSearchQuery query; private final int maxentries; @@ -83,8 +83,8 @@ public final class plasmaSearchRankingProcess { // attention: if minEntries is too high, this method will not terminate within the maxTime // sortorder: 0 = hash, 1 = url, 2 = ranking this.localSearchContainerMaps = null; - this.stack = new SortStack(maxentries); - this.doubleDomCache = new HashMap>(); + this.stack = new SortStack(maxentries); + this.doubleDomCache = new HashMap>(); this.handover = new HashMap(); this.order = (query == null) ? 
null : new ReferenceOrder(query.ranking, query.targetlang); this.query = query; @@ -103,7 +103,7 @@ public final class plasmaSearchRankingProcess { for (int i = 0; i < 8; i++) {this.domZones[i] = 0;} } - public long ranking(final ReferenceVars word) { + public long ranking(final WordReferenceVars word) { return order.cardinal(word); } @@ -148,13 +148,13 @@ public final class plasmaSearchRankingProcess { long timer = System.currentTimeMillis(); // normalize entries - final ArrayList decodedEntries = this.order.normalizeWith(index); + final ArrayList decodedEntries = this.order.normalizeWith(index); serverProfiling.update("SEARCH", new plasmaProfiling.searchEvent(query.id(true), plasmaSearchEvent.NORMALIZING, index.size(), System.currentTimeMillis() - timer), false); // iterate over normalized entries and select some that are better than currently stored timer = System.currentTimeMillis(); - final Iterator i = decodedEntries.iterator(); - ReferenceVars iEntry; + final Iterator i = decodedEntries.iterator(); + WordReferenceVars iEntry; Long r; while (i.hasNext()) { iEntry = i.next(); @@ -175,10 +175,10 @@ public final class plasmaSearchRankingProcess { // check document domain if (query.contentdom != plasmaSearchQuery.CONTENTDOM_TEXT) { - if ((query.contentdom == plasmaSearchQuery.CONTENTDOM_AUDIO) && (!(iEntry.flags().get(plasmaCondenser.flag_cat_hasaudio)))) continue; - if ((query.contentdom == plasmaSearchQuery.CONTENTDOM_VIDEO) && (!(iEntry.flags().get(plasmaCondenser.flag_cat_hasvideo)))) continue; - if ((query.contentdom == plasmaSearchQuery.CONTENTDOM_IMAGE) && (!(iEntry.flags().get(plasmaCondenser.flag_cat_hasimage)))) continue; - if ((query.contentdom == plasmaSearchQuery.CONTENTDOM_APP ) && (!(iEntry.flags().get(plasmaCondenser.flag_cat_hasapp )))) continue; + if ((query.contentdom == plasmaSearchQuery.CONTENTDOM_AUDIO) && (!(iEntry.flags().get(Condenser.flag_cat_hasaudio)))) continue; + if ((query.contentdom == plasmaSearchQuery.CONTENTDOM_VIDEO) && (!(iEntry.flags().get(Condenser.flag_cat_hasvideo)))) continue; + if ((query.contentdom == plasmaSearchQuery.CONTENTDOM_IMAGE) && (!(iEntry.flags().get(Condenser.flag_cat_hasimage)))) continue; + if ((query.contentdom == plasmaSearchQuery.CONTENTDOM_APP ) && (!(iEntry.flags().get(Condenser.flag_cat_hasapp )))) continue; } // check tld domain @@ -252,10 +252,10 @@ public final class plasmaSearchRankingProcess { // - root-domain guessing to prefer the root domain over other urls if search word appears in domain name - private SortStack.stackElement bestRWI(final boolean skipDoubleDom) { + private SortStack.stackElement bestRWI(final boolean skipDoubleDom) { // returns from the current RWI list the best entry and removes this entry from the list - SortStack m; - SortStack.stackElement rwi; + SortStack m; + SortStack.stackElement rwi; while (stack.size() > 0) { rwi = stack.pop(); if (rwi == null) continue; // in case that a synchronization problem occurred just go lazy over it @@ -265,7 +265,7 @@ public final class plasmaSearchRankingProcess { m = this.doubleDomCache.get(domhash); if (m == null) { // first appearance of dom - m = new SortStack((query.specialRights) ? maxDoubleDomSpecial : maxDoubleDomAll); + m = new SortStack((query.specialRights) ? maxDoubleDomSpecial : maxDoubleDomAll); this.doubleDomCache.put(domhash, m); return rwi; } @@ -274,9 +274,9 @@ public final class plasmaSearchRankingProcess { } // no more entries in sorted RWI entries. 
Now take Elements from the doubleDomCache // find best entry from all caches - final Iterator> i = this.doubleDomCache.values().iterator(); - SortStack.stackElement bestEntry = null; - SortStack.stackElement o; + final Iterator> i = this.doubleDomCache.values().iterator(); + SortStack.stackElement bestEntry = null; + SortStack.stackElement o; while (i.hasNext()) { m = i.next(); if (m == null) continue; @@ -298,15 +298,15 @@ public final class plasmaSearchRankingProcess { return bestEntry; } - public MetadataRowContainer bestURL(final boolean skipDoubleDom) { + public URLMetadataRow bestURL(final boolean skipDoubleDom) { // returns from the current RWI list the best URL entry and removed this entry from the list while ((stack.size() > 0) || (size() > 0)) { if (((stack.size() == 0) && (size() == 0))) break; - final SortStack.stackElement obrwi = bestRWI(skipDoubleDom); + final SortStack.stackElement obrwi = bestRWI(skipDoubleDom); if (obrwi == null) continue; // *** ? this happened and the thread was suspended silently. cause? - final MetadataRowContainer u = wordIndex.metadata().load(obrwi.element.urlHash(), obrwi.element, obrwi.weight.longValue()); + final URLMetadataRow u = wordIndex.metadata().load(obrwi.element.urlHash(), obrwi.element, obrwi.weight.longValue()); if (u != null) { - final URLMetadata metadata = u.metadata(); + final URLMetadataRow.Components metadata = u.metadata(); if (metadata.url() != null) this.handover.put(u.hash(), metadata.url().toNormalform(true, false)); // remember that we handed over this url return u; } @@ -318,7 +318,7 @@ public final class plasmaSearchRankingProcess { public int size() { //assert sortedRWIEntries.size() == urlhashes.size() : "sortedRWIEntries.size() = " + sortedRWIEntries.size() + ", urlhashes.size() = " + urlhashes.size(); int c = stack.size(); - final Iterator> i = this.doubleDomCache.values().iterator(); + final Iterator> i = this.doubleDomCache.values().iterator(); while (i.hasNext()) c += i.next().size(); return c; } @@ -355,7 +355,7 @@ public final class plasmaSearchRankingProcess { } public Reference remove(final String urlHash) { - final SortStack.stackElement se = stack.remove(urlHash.hashCode()); + final SortStack.stackElement se = stack.remove(urlHash.hashCode()); if (se == null) return null; urlhashes.remove(urlHash); return se.element; diff --git a/source/de/anomic/plasma/plasmaSnippetCache.java b/source/de/anomic/plasma/plasmaSnippetCache.java index 00d1fe464..28784c629 100644 --- a/source/de/anomic/plasma/plasmaSnippetCache.java +++ b/source/de/anomic/plasma/plasmaSnippetCache.java @@ -43,13 +43,14 @@ import de.anomic.htmlFilter.htmlFilterCharacterCoding; import de.anomic.htmlFilter.htmlFilterImageEntry; import de.anomic.http.httpClient; import de.anomic.http.httpResponseHeader; -import de.anomic.kelondro.text.Document; -import de.anomic.kelondro.text.URLMetadata; -import de.anomic.kelondro.text.Word; +import de.anomic.kelondro.text.metadataPrototype.URLMetadataRow; import de.anomic.kelondro.util.ScoreCluster; import de.anomic.kelondro.util.SetTools; import de.anomic.kelondro.util.Log; +import de.anomic.plasma.parser.Document; import de.anomic.plasma.parser.ParserException; +import de.anomic.plasma.parser.Word; +import de.anomic.plasma.parser.Condenser; import de.anomic.yacy.yacySearch; import de.anomic.yacy.yacyURL; @@ -302,7 +303,7 @@ public class plasmaSnippetCache { } @SuppressWarnings("unchecked") - public static TextSnippet retrieveTextSnippet(final URLMetadata comp, final Set queryhashes, final boolean fetchOnline, 
final boolean pre, final int snippetMaxLength, final int timeout, final int maxDocLen, final boolean reindexing) { + public static TextSnippet retrieveTextSnippet(final URLMetadataRow.Components comp, final Set queryhashes, final boolean fetchOnline, final boolean pre, final int snippetMaxLength, final int timeout, final int maxDocLen, final boolean reindexing) { // heise = "0OQUNU3JSs05" final yacyURL url = comp.url(); if (queryhashes.size() == 0) { @@ -796,7 +797,7 @@ public class plasmaSnippetCache { private static HashMap hashSentence(final String sentence) { // generates a word-wordPos mapping final HashMap map = new HashMap(); - final Enumeration words = plasmaCondenser.wordTokenizer(sentence, "UTF-8"); + final Enumeration words = Condenser.wordTokenizer(sentence, "UTF-8"); int pos = 0; StringBuilder word; String hash; diff --git a/source/de/anomic/plasma/plasmaSwitchboard.java b/source/de/anomic/plasma/plasmaSwitchboard.java index 948e399a9..b3fc55476 100644 --- a/source/de/anomic/plasma/plasmaSwitchboard.java +++ b/source/de/anomic/plasma/plasmaSwitchboard.java @@ -122,6 +122,7 @@ import de.anomic.crawler.ResultURLs; import de.anomic.crawler.RobotsTxt; import de.anomic.crawler.ZURL; import de.anomic.crawler.CrawlProfile.entry; +import de.anomic.data.Blacklist; import de.anomic.data.URLLicense; import de.anomic.data.blogBoard; import de.anomic.data.blogBoardComments; @@ -139,18 +140,17 @@ import de.anomic.http.httpd; import de.anomic.http.httpdRobotsTxtConfig; import de.anomic.kelondro.order.Digest; import de.anomic.kelondro.order.NaturalOrder; -import de.anomic.kelondro.text.Document; -import de.anomic.kelondro.text.MetadataRowContainer; -import de.anomic.kelondro.text.URLMetadata; -import de.anomic.kelondro.text.Blacklist; -import de.anomic.kelondro.text.Word; +import de.anomic.kelondro.text.metadataPrototype.URLMetadataRow; import de.anomic.kelondro.util.DateFormatter; import de.anomic.kelondro.util.FileUtils; import de.anomic.kelondro.util.Log; import de.anomic.kelondro.util.MemoryControl; import de.anomic.kelondro.util.SetTools; import de.anomic.net.UPnP; +import de.anomic.plasma.parser.Document; import de.anomic.plasma.parser.ParserException; +import de.anomic.plasma.parser.Word; +import de.anomic.plasma.parser.Condenser; import de.anomic.server.serverAbstractSwitch; import de.anomic.server.serverBusyThread; import de.anomic.server.serverCore; @@ -930,7 +930,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch words = null; try { - words = new plasmaCondenser(document, true, true).words().keySet(); + words = new Condenser(document, true, true).words().keySet(); } catch (final UnsupportedEncodingException e) { e.printStackTrace(); } diff --git a/source/de/anomic/plasma/plasmaSwitchboardConstants.java b/source/de/anomic/plasma/plasmaSwitchboardConstants.java index 97c4e3207..322cf5413 100644 --- a/source/de/anomic/plasma/plasmaSwitchboardConstants.java +++ b/source/de/anomic/plasma/plasmaSwitchboardConstants.java @@ -317,7 +317,7 @@ public final class plasmaSwitchboardConstants { * * @see DefaultBlacklist for a detailed overview about the syntax of the default implementation */ - public static final String BLACKLIST_CLASS_DEFAULT = "de.anomic.kelondro.text.DefaultBlacklist"; + public static final String BLACKLIST_CLASS_DEFAULT = "de.anomic.data.DefaultBlacklist"; public static final String LIST_BLUE = "plasmaBlueList"; public static final String LIST_BLUE_DEFAULT = null; public static final String LIST_BADWORDS_DEFAULT = "yacy.badwords"; diff --git 
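
After this patch the appearance flags formerly referenced through Reference live on WordReferenceRow, and the document-category flags formerly on plasmaCondenser live on Condenser; the Bitfield itself is unchanged. A small sketch of the relocated constants, following the plasmaSearchAPI hunks above; the helper class and method are illustrative only:

import de.anomic.kelondro.order.Bitfield;
import de.anomic.kelondro.text.referencePrototype.WordReferenceRow;
import de.anomic.plasma.parser.Condenser;

// Illustrative helper: build a constraint bitfield using the relocated flag constants.
final class FlagConstantsSketch {
    static Bitfield titleOnIndexPages() {
        final Bitfield b = new Bitfield(4);              // 4-byte flag field, as in plasmaSearchAPI/Condenser
        b.set(WordReferenceRow.flag_app_dc_title, true); // was Reference.flag_app_dc_title
        b.set(Condenser.flag_cat_indexof, true);         // was plasmaCondenser.flag_cat_indexof
        return b;
    }
}
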
a/source/de/anomic/plasma/plasmaWebStructure.java b/source/de/anomic/plasma/plasmaWebStructure.java index 9c17aefad..4af310904 100644 --- a/source/de/anomic/plasma/plasmaWebStructure.java +++ b/source/de/anomic/plasma/plasmaWebStructure.java @@ -42,6 +42,7 @@ import de.anomic.kelondro.order.MicroDate; import de.anomic.kelondro.util.DateFormatter; import de.anomic.kelondro.util.Log; import de.anomic.kelondro.util.FileUtils; +import de.anomic.plasma.parser.Condenser; import de.anomic.yacy.yacyURL; public class plasmaWebStructure { @@ -90,7 +91,7 @@ public class plasmaWebStructure { } } - public Integer[] /*(outlinksSame, outlinksOther)*/ generateCitationReference(final plasmaParserDocument document, final plasmaCondenser condenser, final Date docDate) { + public Integer[] /*(outlinksSame, outlinksOther)*/ generateCitationReference(final plasmaParserDocument document, final Condenser condenser, final Date docDate) { final yacyURL url = document.dc_source(); // generate citation reference diff --git a/source/de/anomic/plasma/plasmaWordIndex.java b/source/de/anomic/plasma/plasmaWordIndex.java index 52e552c08..5c5e460c2 100644 --- a/source/de/anomic/plasma/plasmaWordIndex.java +++ b/source/de/anomic/plasma/plasmaWordIndex.java @@ -38,6 +38,7 @@ import java.util.TreeSet; import de.anomic.crawler.CrawlProfile; import de.anomic.crawler.IndexingStack; +import de.anomic.data.Blacklist; import de.anomic.htmlFilter.htmlFilterContentScraper; import de.anomic.http.httpdProxyCacheEntry; import de.anomic.kelondro.blob.BLOBArray; @@ -47,16 +48,16 @@ import de.anomic.kelondro.text.BufferedIndex; import de.anomic.kelondro.text.BufferedIndexCollection; import de.anomic.kelondro.text.IndexCell; import de.anomic.kelondro.text.IndexCollectionMigration; -import de.anomic.kelondro.text.MetadataRowContainer; import de.anomic.kelondro.text.ReferenceContainer; import de.anomic.kelondro.text.IODispatcher; -import de.anomic.kelondro.text.ReferenceRow; import de.anomic.kelondro.text.MetadataRepository; -import de.anomic.kelondro.text.Word; -import de.anomic.kelondro.text.Blacklist; +import de.anomic.kelondro.text.metadataPrototype.URLMetadataRow; +import de.anomic.kelondro.text.referencePrototype.WordReferenceRow; import de.anomic.kelondro.util.FileUtils; import de.anomic.kelondro.util.kelondroException; import de.anomic.kelondro.util.Log; +import de.anomic.plasma.parser.Word; +import de.anomic.plasma.parser.Condenser; import de.anomic.tools.iso639; import de.anomic.xml.RSSFeed; import de.anomic.xml.RSSMessage; @@ -146,7 +147,7 @@ public final class plasmaWordIndex { new IndexCollectionMigration( indexPrimaryTextLocation, wordOrder, - ReferenceRow.urlEntryRow, + WordReferenceRow.urlEntryRow, entityCacheMaxSize, targetFileSize, maxFileSize, @@ -156,7 +157,7 @@ public final class plasmaWordIndex { new BufferedIndexCollection( indexPrimaryTextLocation, wordOrder, - ReferenceRow.urlEntryRow, + WordReferenceRow.urlEntryRow, entityCacheMaxSize, useCommons, redundancy, @@ -167,7 +168,7 @@ public final class plasmaWordIndex { this.index = new IndexCell( new File(indexPrimaryTextLocation, "RICELL"), wordOrder, - ReferenceRow.urlEntryRow, + WordReferenceRow.urlEntryRow, entityCacheMaxSize, targetFileSize, maxFileSize, @@ -408,7 +409,7 @@ public final class plasmaWordIndex { * @param outlinksOther * @return */ - public int addPageIndex(final yacyURL url, final Date urlModified, final plasmaParserDocument document, final plasmaCondenser condenser, final String language, final char doctype, final int outlinksSame, final int 
outlinksOther) { + public int addPageIndex(final yacyURL url, final Date urlModified, final plasmaParserDocument document, final Condenser condenser, final String language, final char doctype, final int outlinksSame, final int outlinksOther) { int wordCount = 0; final int urlLength = url.toNormalform(true, true).length(); final int urlComps = htmlFilterContentScraper.urlComps(url.toString()).length; @@ -417,14 +418,14 @@ public final class plasmaWordIndex { final Iterator> i = condenser.words().entrySet().iterator(); Map.Entry wentry; String word; - ReferenceRow ientry; + WordReferenceRow ientry; Word wprop; while (i.hasNext()) { wentry = i.next(); word = wentry.getKey(); wprop = wentry.getValue(); assert (wprop.flags != null); - ientry = new ReferenceRow(url.hash(), + ientry = new WordReferenceRow(url.hash(), urlLength, urlComps, (document == null) ? urlLength : document.dc_title().length(), wprop.count, condenser.RESULT_NUMB_WORDS, @@ -458,7 +459,7 @@ public final class plasmaWordIndex { queuePreStack.close(); } - public MetadataRowContainer storeDocument(final IndexingStack.QueueEntry entry, final plasmaParserDocument document, final plasmaCondenser condenser) throws IOException { + public URLMetadataRow storeDocument(final IndexingStack.QueueEntry entry, final plasmaParserDocument document, final Condenser condenser) throws IOException { final long startTime = System.currentTimeMillis(); // CREATE INDEX @@ -511,7 +512,7 @@ public final class plasmaWordIndex { // create a new loaded URL db entry final long ldate = System.currentTimeMillis(); - final MetadataRowContainer newEntry = new MetadataRowContainer( + final URLMetadataRow newEntry = new URLMetadataRow( entry.url(), // URL dc_title, // document description document.dc_creator(), // author @@ -649,7 +650,7 @@ public final class plasmaWordIndex { public void run() { Log.logInfo("INDEXCLEANER", "IndexCleaner-Thread started"); ReferenceContainer container = null; - ReferenceRow entry = null; + WordReferenceRow entry = null; yacyURL url = null; final HashSet urlHashs = new HashSet(); try { @@ -657,14 +658,14 @@ public final class plasmaWordIndex { while (indexContainerIterator.hasNext() && run) { waiter(); container = indexContainerIterator.next(); - final Iterator containerIterator = container.entries(); - wordHashNow = container.getWordHash(); + final Iterator containerIterator = container.entries(); + wordHashNow = container.getTermHash(); while (containerIterator.hasNext() && run) { waiter(); entry = containerIterator.next(); // System.out.println("Wordhash: "+wordHash+" UrlHash: // "+entry.getUrlHash()); - final MetadataRowContainer ue = metadata.load(entry.urlHash(), entry, 0); + final URLMetadataRow ue = metadata.load(entry.urlHash(), entry, 0); if (ue == null) { urlHashs.add(entry.urlHash()); } else { @@ -675,9 +676,9 @@ public final class plasmaWordIndex { } } if (urlHashs.size() > 0) try { - final int removed = index.remove(container.getWordHash(), urlHashs); - Log.logFine("INDEXCLEANER", container.getWordHash() + ": " + removed + " of " + container.size() + " URL-entries deleted"); - lastWordHash = container.getWordHash(); + final int removed = index.remove(container.getTermHash(), urlHashs); + Log.logFine("INDEXCLEANER", container.getTermHash() + ": " + removed + " of " + container.size() + " URL-entries deleted"); + lastWordHash = container.getTermHash(); lastDeletionCounter = urlHashs.size(); urlHashs.clear(); } catch (IOException e) { @@ -686,10 +687,10 @@ public final class plasmaWordIndex { if 
(!containerIterator.hasNext()) { // We may not be finished yet, try to get the next chunk of wordHashes - final TreeSet containers = index.references(container.getWordHash(), false, 100, false); + final TreeSet containers = index.references(container.getTermHash(), false, 100, false); indexContainerIterator = containers.iterator(); // Make sure we don't get the same wordhash twice, but don't skip a word - if ((indexContainerIterator.hasNext()) && (!container.getWordHash().equals(indexContainerIterator.next().getWordHash()))) { + if ((indexContainerIterator.hasNext()) && (!container.getTermHash().equals(indexContainerIterator.next().getTermHash()))) { indexContainerIterator = containers.iterator(); } } diff --git a/source/de/anomic/yacy/dht/Dispatcher.java b/source/de/anomic/yacy/dht/Dispatcher.java index 3c2fcacc8..98d2d41cb 100755 --- a/source/de/anomic/yacy/dht/Dispatcher.java +++ b/source/de/anomic/yacy/dht/Dispatcher.java @@ -35,8 +35,8 @@ import java.util.Map; import de.anomic.kelondro.order.Base64Order; import de.anomic.kelondro.text.BufferedIndex; import de.anomic.kelondro.text.ReferenceContainer; -import de.anomic.kelondro.text.ReferenceRow; import de.anomic.kelondro.text.MetadataRepository; +import de.anomic.kelondro.text.referencePrototype.WordReferenceRow; import de.anomic.kelondro.util.Log; import de.anomic.server.serverProcessor; import de.anomic.yacy.yacySeed; @@ -181,7 +181,7 @@ public class Dispatcher { (System.currentTimeMillis() < timeout) && ((container = indexContainerIterator.next()) != null) && ((containers.size() == 0) || - (Base64Order.enhancedComparator.compare(container.getWordHash(), limitHash) < 0)) + (Base64Order.enhancedComparator.compare(container.getTermHash(), limitHash) < 0)) ) { if (container.size() == 0) continue; @@ -190,15 +190,15 @@ public class Dispatcher { } // then remove the container from the backend HashSet urlHashes = new HashSet(); - Iterator it; + Iterator it; for (ReferenceContainer c: containers) { urlHashes.clear(); it = c.entries(); while (it.hasNext()) { urlHashes.add(it.next().urlHash()); } - if (this.log.isFine()) this.log.logFine("selected " + urlHashes.size() + " urls for word '" + c.getWordHash() + "'"); - if (urlHashes.size() > 0) this.backend.remove(c.getWordHash(), urlHashes); + if (this.log.isFine()) this.log.logFine("selected " + urlHashes.size() + " urls for word '" + c.getTermHash() + "'"); + if (urlHashes.size() > 0) this.backend.remove(c.getTermHash(), urlHashes); } // finished. 
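
The accessor rename from getWordHash() to getTermHash() on ReferenceContainer runs through the whole patch (plasmaDbImporter, plasmaWordIndex, Dispatcher, Transmission, yacy.java). A minimal sketch of the iteration pattern with the new name, using only calls visible in these hunks; the wrapper class and generic parameter are assumptions:

import java.util.Iterator;

import de.anomic.kelondro.text.ReferenceContainer;
import de.anomic.kelondro.util.Log;

// Illustrative helper: log the term hash of each container in an index chunk.
final class TermHashSketch {
    static void logTermHashes(final Iterator<ReferenceContainer> indexContainerIterator) {
        while (indexContainerIterator.hasNext()) {
            final ReferenceContainer container = indexContainerIterator.next();
            Log.logInfo("SKETCH", "term hash: " + container.getTermHash()); // was container.getWordHash()
        }
    }
}
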
The caller must take care of the containers and must put them back if not needed
@@ -222,15 +222,15 @@ public class Dispatcher {
 
         // check all entries and split them to the partitions
         ReferenceContainer[] partitionBuffer = new ReferenceContainer[partitionCount];
-        ReferenceRow re;
+        WordReferenceRow re;
         for (ReferenceContainer container: containers) {
             // init the new partitions
             for (int j = 0; j < partitionBuffer.length; j++) {
-                partitionBuffer[j] = new ReferenceContainer(container.getWordHash(), container.row(), container.size() / partitionCount);
+                partitionBuffer[j] = new ReferenceContainer(container.getTermHash(), container.row(), container.size() / partitionCount);
             }
 
             // split the container
-            Iterator<ReferenceRow> i = container.entries();
+            Iterator<WordReferenceRow> i = container.entries();
             while (i.hasNext()) {
                 re = i.next();
                 if (re == null) continue;
@@ -263,7 +263,7 @@ public class Dispatcher {
 
         for (int vertical = 0; vertical < containers.length; vertical++) {
             // the 'new' primary target is the word hash of the last container
            lastContainer = containers[vertical].get(containers[vertical].size() - 1);
-            primaryTarget = FlatWordPartitionScheme.positionToHash(this.seeds.scheme.dhtPosition(lastContainer.getWordHash(), vertical));
+            primaryTarget = FlatWordPartitionScheme.positionToHash(this.seeds.scheme.dhtPosition(lastContainer.getTermHash(), vertical));
             // get or make a entry object
             entry = this.transmissionCloud.get(primaryTarget); // if this is not null, the entry is extended here
diff --git a/source/de/anomic/yacy/dht/Transmission.java b/source/de/anomic/yacy/dht/Transmission.java
index 40c7745fd..d3bd203b7 100644
--- a/source/de/anomic/yacy/dht/Transmission.java
+++ b/source/de/anomic/yacy/dht/Transmission.java
@@ -32,11 +32,11 @@ import java.util.Iterator;
 
 import de.anomic.kelondro.index.Row;
 import de.anomic.kelondro.text.Index;
-import de.anomic.kelondro.text.MetadataRowContainer;
 import de.anomic.kelondro.text.ReferenceContainer;
 import de.anomic.kelondro.text.ReferenceContainerCache;
-import de.anomic.kelondro.text.ReferenceRow;
 import de.anomic.kelondro.text.MetadataRepository;
+import de.anomic.kelondro.text.metadataPrototype.URLMetadataRow;
+import de.anomic.kelondro.text.referencePrototype.WordReferenceRow;
 import de.anomic.kelondro.util.Log;
 import de.anomic.plasma.plasmaWordIndex;
 import de.anomic.server.serverProcessorJob;
@@ -88,7 +88,7 @@ public class Transmission {
          */
         private String primaryTarget;
         private ReferenceContainerCache containers;
-        private HashMap<String, MetadataRowContainer> references;
+        private HashMap<String, URLMetadataRow> references;
         private HashSet badReferences;
         private ArrayList targets;
         private int hit, miss;
@@ -109,7 +109,7 @@ public class Transmission {
             this.primaryTarget = primaryTarget;
             this.containers = new ReferenceContainerCache(payloadrow, plasmaWordIndex.wordOrder);
             this.containers.initWriteMode();
-            this.references = new HashMap<String, MetadataRowContainer>();
+            this.references = new HashMap<String, URLMetadataRow>();
             this.badReferences = new HashSet();
             this.targets = targets;
             this.hit = 0;
@@ -123,12 +123,12 @@ public class Transmission {
          */
         public void add(ReferenceContainer container) {
             // iterate through the entries in the container and check if the reference is in the repository
-            Iterator<ReferenceRow> i = container.entries();
+            Iterator<WordReferenceRow> i = container.entries();
             ArrayList notFound = new ArrayList();
             while (i.hasNext()) {
-                ReferenceRow e = i.next();
+                WordReferenceRow e = i.next();
                 if (references.containsKey(e.urlHash()) || badReferences.contains(e.urlHash())) continue;
-                MetadataRowContainer r = repository.load(e.urlHash(), null, 0);
+                URLMetadataRow r = repository.load(e.urlHash(), null, 0);
                 if (r == null) {
                     notFound.add(e.urlHash());
                     badReferences.add(e.urlHash());
@@ -204,7 +204,7 @@ public class Transmission {
             Iterator i = this.containers.iterator();
             ReferenceContainer firstContainer = (i == null) ? null : i.next();
             log.logInfo("Index transfer of " + this.containers.size() +
-                        " words [" + ((firstContainer == null) ? null : firstContainer.getWordHash()) + " .. " + this.primaryTarget + "]" +
+                        " words [" + ((firstContainer == null) ? null : firstContainer.getTermHash()) + " .. " + this.primaryTarget + "]" +
                         " and " + this.references.size() + " URLs" +
                         " to peer " + target.getName() + ":" + target.hash +
                         " in " + (transferTime / 1000) +
diff --git a/source/de/anomic/yacy/yacyClient.java b/source/de/anomic/yacy/yacyClient.java
index 227773cc7..37656c770 100644
--- a/source/de/anomic/yacy/yacyClient.java
+++ b/source/de/anomic/yacy/yacyClient.java
@@ -60,6 +60,7 @@ import org.apache.commons.httpclient.methods.multipart.Part;
 
 import de.anomic.crawler.HTTPLoader;
 import de.anomic.crawler.ResultURLs;
+import de.anomic.data.Blacklist;
 import de.anomic.http.DefaultCharsetFilePart;
 import de.anomic.http.DefaultCharsetStringPart;
 import de.anomic.http.httpClient;
@@ -69,14 +70,11 @@ import de.anomic.http.httpRequestHeader;
 import de.anomic.kelondro.order.Base64Order;
 import de.anomic.kelondro.order.Bitfield;
 import de.anomic.kelondro.order.Digest;
-import de.anomic.kelondro.text.MetadataRowContainer;
 import de.anomic.kelondro.text.Reference;
 import de.anomic.kelondro.text.ReferenceContainer;
 import de.anomic.kelondro.text.ReferenceContainerCache;
-import de.anomic.kelondro.text.ReferenceRow;
-import de.anomic.kelondro.text.URLMetadata;
-import de.anomic.kelondro.text.Word;
-import de.anomic.kelondro.text.Blacklist;
+import de.anomic.kelondro.text.metadataPrototype.URLMetadataRow;
+import de.anomic.kelondro.text.referencePrototype.WordReferenceRow;
 import de.anomic.kelondro.util.ByteBuffer;
 import de.anomic.kelondro.util.FileUtils;
 import de.anomic.plasma.plasmaSearchRankingProcess;
@@ -85,6 +83,7 @@ import de.anomic.plasma.plasmaSnippetCache;
 import de.anomic.plasma.plasmaSwitchboard;
 import de.anomic.plasma.plasmaSwitchboardConstants;
 import de.anomic.plasma.plasmaWordIndex;
+import de.anomic.plasma.parser.Word;
 import de.anomic.server.serverCore;
 import de.anomic.server.serverDomains;
 import de.anomic.tools.crypt;
@@ -533,15 +532,15 @@ public final class yacyClient {
         }
 
         // insert results to containers
-        MetadataRowContainer urlEntry;
+        URLMetadataRow urlEntry;
         final String[] urls = new String[results];
         for (int n = 0; n < results; n++) {
             // get one single search result
-            urlEntry = MetadataRowContainer.importEntry(result.get("resource" + n));
+            urlEntry = URLMetadataRow.importEntry(result.get("resource" + n));
             if (urlEntry == null) continue;
             assert (urlEntry.hash().length() == 12) : "urlEntry.hash() = " + urlEntry.hash();
             if (urlEntry.hash().length() != 12) continue; // bad url hash
-            final URLMetadata metadata = urlEntry.metadata();
+            final URLMetadataRow.Components metadata = urlEntry.metadata();
             if (blacklist.isListed(Blacklist.BLACKLIST_SEARCH, metadata.url())) {
                 yacyCore.log.logInfo("remote search (client): filtered blacklisted url " + metadata.url() + " from peer " + target.getName());
                 continue; // block with backlist
@@ -796,7 +795,7 @@ public final class yacyClient {
         return "wrong protocol: " + protocol;
     }
 
-    public static HashMap crawlReceipt(final yacySeed mySeed, final yacySeed target, final String process, final String result, final String reason, final MetadataRowContainer entry, final String wordhashes) {
+    public static HashMap crawlReceipt(final yacySeed mySeed, final yacySeed target, final String process, final String result, final String reason, final URLMetadataRow entry, final String wordhashes) {
         assert (target != null);
         assert (mySeed != null);
         assert (mySeed != target);
@@ -859,7 +858,7 @@ public final class yacyClient {
     public static String transferIndex(
             final yacySeed targetSeed,
             final ReferenceContainerCache indexes,
-            final HashMap<String, MetadataRowContainer> urlCache,
+            final HashMap<String, URLMetadataRow> urlCache,
             final boolean gzipBody,
             final int timeout) {
 
@@ -868,7 +867,7 @@ public final class yacyClient {
 
         try {
             // check if we got all necessary urls in the urlCache (only for debugging)
-            Iterator<ReferenceRow> eenum;
+            Iterator<WordReferenceRow> eenum;
             Reference entry;
             for (ReferenceContainer ic: indexes) {
                 eenum = ic.entries();
@@ -911,7 +910,7 @@ public final class yacyClient {
         if (uhs.length == 0) { return null; } // all url's known
 
         // extract the urlCache from the result
-        final MetadataRowContainer[] urls = new MetadataRowContainer[uhs.length];
+        final URLMetadataRow[] urls = new URLMetadataRow[uhs.length];
         for (int i = 0; i < uhs.length; i++) {
             urls[i] = urlCache.get(uhs[i]);
             if (urls[i] == null) {
@@ -963,13 +962,13 @@ public final class yacyClient {
 
         int indexcount = 0;
         final StringBuilder entrypost = new StringBuilder(indexes.size() * 73);
-        Iterator<ReferenceRow> eenum;
+        Iterator<WordReferenceRow> eenum;
         Reference entry;
         for (ReferenceContainer ic: indexes) {
             eenum = ic.entries();
             while (eenum.hasNext()) {
                 entry = eenum.next();
-                entrypost.append(ic.getWordHash())
+                entrypost.append(ic.getTermHash())
                          .append(entry.toPropertyForm())
                          .append(serverCore.CRLF_STRING);
                 indexcount++;
@@ -1001,7 +1000,7 @@ public final class yacyClient {
         }
     }
 
-    private static HashMap transferURL(final yacySeed targetSeed, final MetadataRowContainer[] urls, boolean gzipBody, final int timeout) {
+    private static HashMap transferURL(final yacySeed targetSeed, final URLMetadataRow[] urls, boolean gzipBody, final int timeout) {
         // this post a message to the remote message board
         final String address = targetSeed.getPublicAddress();
         if (address == null) { return null; }
diff --git a/source/de/anomic/yacy/yacyNewsPool.java b/source/de/anomic/yacy/yacyNewsPool.java
index e5e4c5b6b..a5bcbd1ba 100644
--- a/source/de/anomic/yacy/yacyNewsPool.java
+++ b/source/de/anomic/yacy/yacyNewsPool.java
@@ -50,7 +50,7 @@ import java.util.HashSet;
 import java.util.Iterator;
 import java.util.Map;
 
-import de.anomic.kelondro.text.Blacklist;
+import de.anomic.data.Blacklist;
 import de.anomic.plasma.plasmaSwitchboard;
 
 public class yacyNewsPool {
diff --git a/source/de/anomic/yacy/yacySearch.java b/source/de/anomic/yacy/yacySearch.java
index fa2850444..87879e749 100644
--- a/source/de/anomic/yacy/yacySearch.java
+++ b/source/de/anomic/yacy/yacySearch.java
@@ -51,8 +51,8 @@ import java.util.Set;
 import java.util.TreeMap;
 
 import de.anomic.crawler.ResultURLs;
+import de.anomic.data.Blacklist;
 import de.anomic.kelondro.order.Bitfield;
-import de.anomic.kelondro.text.Blacklist;
 import de.anomic.kelondro.util.ScoreCluster;
 import de.anomic.kelondro.util.Log;
 import de.anomic.plasma.plasmaSearchQuery;
diff --git a/source/de/anomic/yacy/yacySeed.java b/source/de/anomic/yacy/yacySeed.java
index 0861c7c31..0eb11ff6f 100644
--- a/source/de/anomic/yacy/yacySeed.java
+++ b/source/de/anomic/yacy/yacySeed.java
@@ -57,9 +57,9 @@ import java.util.TreeMap;
 
 import de.anomic.kelondro.order.Base64Order;
 import de.anomic.kelondro.order.Digest;
-import de.anomic.kelondro.text.Word;
 import de.anomic.kelondro.util.DateFormatter;
 import de.anomic.net.natLib;
+import de.anomic.plasma.parser.Word;
 import de.anomic.server.serverCodings;
 import de.anomic.server.serverDomains;
 import de.anomic.server.serverSystem;
diff --git a/source/de/anomic/ymage/ymageOSM.java b/source/de/anomic/ymage/ymageOSM.java
index 04ae2c48c..2fbb2d2cc 100644
--- a/source/de/anomic/ymage/ymageOSM.java
+++ b/source/de/anomic/ymage/ymageOSM.java
@@ -35,10 +35,10 @@ import java.net.MalformedURLException;
 
 import javax.imageio.ImageIO;
 
-import de.anomic.kelondro.text.Document;
 import de.anomic.kelondro.util.Log;
 import de.anomic.plasma.plasmaHTCache;
 import de.anomic.plasma.plasmaSwitchboard;
+import de.anomic.plasma.parser.Document;
 import de.anomic.yacy.yacyURL;
 
 public class ymageOSM {
diff --git a/source/yacy.java b/source/yacy.java
index 898b1a46b..4b232ca8a 100644
--- a/source/yacy.java
+++ b/source/yacy.java
@@ -56,12 +56,11 @@ import de.anomic.kelondro.blob.BLOBHeap;
 import de.anomic.kelondro.blob.MapDataMining;
 import de.anomic.kelondro.index.RowCollection;
 import de.anomic.kelondro.order.Base64Order;
-import de.anomic.kelondro.text.MetadataRowContainer;
 import de.anomic.kelondro.text.Reference;
 import de.anomic.kelondro.text.ReferenceContainer;
-import de.anomic.kelondro.text.ReferenceRow;
 import de.anomic.kelondro.text.MetadataRepository;
-import de.anomic.kelondro.text.Word;
+import de.anomic.kelondro.text.metadataPrototype.URLMetadataRow;
+import de.anomic.kelondro.text.referencePrototype.WordReferenceRow;
 import de.anomic.kelondro.util.DateFormatter;
 import de.anomic.kelondro.util.MemoryControl;
 import de.anomic.kelondro.util.ScoreCluster;
@@ -70,6 +69,7 @@ import de.anomic.kelondro.util.FileUtils;
 import de.anomic.plasma.plasmaSwitchboard;
 import de.anomic.plasma.plasmaSwitchboardConstants;
 import de.anomic.plasma.plasmaWordIndex;
+import de.anomic.plasma.parser.Word;
 import de.anomic.server.serverCore;
 import de.anomic.server.serverSemaphore;
 import de.anomic.server.serverSystem;
@@ -689,13 +689,13 @@ public final class yacy {
                     wordIdxContainer = indexContainerIterator.next();
 
                     // the combined container will fit, read the container
-                    final Iterator<ReferenceRow> wordIdxEntries = wordIdxContainer.entries();
+                    final Iterator<WordReferenceRow> wordIdxEntries = wordIdxContainer.entries();
                     Reference iEntry;
                     while (wordIdxEntries.hasNext()) {
                         iEntry = wordIdxEntries.next();
                         final String urlHash = iEntry.urlHash();
                         if ((currentUrlDB.exists(urlHash)) && (!minimizedUrlDB.exists(urlHash))) try {
-                            final MetadataRowContainer urlEntry = currentUrlDB.load(urlHash, null, 0);
+                            final URLMetadataRow urlEntry = currentUrlDB.load(urlHash, null, 0);
                             urlCounter++;
                             minimizedUrlDB.store(urlEntry);
                             if (urlCounter % 500 == 0) {
@@ -705,7 +705,7 @@ public final class yacy {
                     }
 
                     if (wordCounter%500 == 0) {
-                        wordChunkEndHash = wordIdxContainer.getWordHash();
+                        wordChunkEndHash = wordIdxContainer.getTermHash();
                         wordChunkEnd = System.currentTimeMillis();
                         final long duration = wordChunkEnd - wordChunkStart;
                         log.logInfo(wordCounter + " words scanned " +
@@ -881,10 +881,10 @@ public final class yacy {
                 while (indexContainerIterator.hasNext()) {
                     counter++;
                     container = indexContainerIterator.next();
-                    bos.write((container.getWordHash()).getBytes());
+                    bos.write((container.getTermHash()).getBytes());
                     bos.write(serverCore.CRLF);
                     if (counter % 500 == 0) {
-                        log.logInfo("Found " + counter + " Hashs until now. Last found Hash: " + container.getWordHash());
+                        log.logInfo("Found " + counter + " Hashs until now. Last found Hash: " + container.getTermHash());
                     }
                 }
             }
@@ -898,17 +898,17 @@ public final class yacy {
                 while (indexContainerIterator.hasNext()) {
                     counter++;
                     container = indexContainerIterator.next();
-                    bos.write((container.getWordHash()).getBytes());
+                    bos.write((container.getTermHash()).getBytes());
                     bos.write(serverCore.CRLF);
                     if (counter % 500 == 0) {
-                        log.logInfo("Found " + counter + " Hashs until now. Last found Hash: " + container.getWordHash());
+                        log.logInfo("Found " + counter + " Hashs until now. Last found Hash: " + container.getTermHash());
                     }
                 }
             }
             bos.flush();
             bos.close();
         }
-        log.logInfo("Total number of Hashs: " + counter + ". Last found Hash: " + (container == null ? "null" : container.getWordHash()));
+        log.logInfo("Total number of Hashs: " + counter + ". Last found Hash: " + (container == null ? "null" : container.getTermHash()));
     } catch (final IOException e) {
         log.logSevere("IOException", e);
     }