From c9364246cc6702c61a5d88f7d0c5d091a4f1a252 Mon Sep 17 00:00:00 2001 From: orbiter Date: Fri, 17 Nov 2006 14:17:20 +0000 Subject: [PATCH] introduced new RWI-Object. This will be used for the final version of the collections. The new object is not yet used. git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@2966 6c8d7289-2bf4-0310-a012-ef5d649a1542 --- htroot/yacy/transferRWI.java | 6 +- source/de/anomic/index/indexCachedRI.java | 39 +- source/de/anomic/index/indexRWIEntry.java | 2 +- source/de/anomic/index/indexRWIEntryNew.java | 358 ++++++++++++++++++ source/de/anomic/index/indexRWIEntryOld.java | 18 +- source/de/anomic/index/indexURLEntryNew.java | 11 +- source/de/anomic/index/indexURLEntryOld.java | 3 +- source/de/anomic/kelondro/kelondroRow.java | 21 +- source/de/anomic/kelondro/kelondroTree.java | 4 +- .../de/anomic/plasma/plasmaSwitchboard.java | 2 +- source/de/anomic/yacy/yacyClient.java | 2 +- 11 files changed, 408 insertions(+), 58 deletions(-) create mode 100644 source/de/anomic/index/indexRWIEntryNew.java diff --git a/htroot/yacy/transferRWI.java b/htroot/yacy/transferRWI.java index 2fa8ea4fd..d5a0d392a 100644 --- a/htroot/yacy/transferRWI.java +++ b/htroot/yacy/transferRWI.java @@ -52,6 +52,7 @@ import java.util.LinkedList; import de.anomic.http.httpHeader; import de.anomic.index.indexRWIEntry; +import de.anomic.index.indexRWIEntryNew; import de.anomic.index.indexRWIEntryOld; import de.anomic.plasma.plasmaSwitchboard; import de.anomic.plasma.urlPattern.plasmaURLPattern; @@ -162,7 +163,10 @@ public final class transferRWI { if (p > 0) { wordHash = estring.substring(0, p); wordhashes[received] = wordHash; - iEntry = new indexRWIEntryOld(estring.substring(p)); + if (estring.indexOf("x=") > 0) + iEntry = new indexRWIEntryNew(estring.substring(p)); + else + iEntry = new indexRWIEntryOld(estring.substring(p)); urlHash = iEntry.urlHash(); if ((blockBlacklist) && (plasmaSwitchboard.urlBlacklist.hashInBlacklistedCache(plasmaURLPattern.BLACKLIST_DHT, 
urlHash))) { int deleted = sb.wordIndex.tryRemoveURLs(urlHash); diff --git a/source/de/anomic/index/indexCachedRI.java b/source/de/anomic/index/indexCachedRI.java index 1fdf34efb..05258d06e 100644 --- a/source/de/anomic/index/indexCachedRI.java +++ b/source/de/anomic/index/indexCachedRI.java @@ -27,14 +27,12 @@ package de.anomic.index; -import java.util.Date; import java.util.HashMap; import java.util.Iterator; import java.util.Map; import java.util.Set; import java.util.TreeSet; -import de.anomic.kelondro.kelondroBase64Order; import de.anomic.kelondro.kelondroMergeIterator; import de.anomic.kelondro.kelondroNaturalOrder; import de.anomic.kelondro.kelondroOrder; @@ -44,16 +42,17 @@ import de.anomic.server.logging.serverLog; public class indexCachedRI implements indexRI { private kelondroRow payloadrow; - private kelondroOrder indexOrder = new kelondroNaturalOrder(true); + private kelondroOrder indexOrder; private indexRAMRI riExtern, riIntern; private indexRI backend; public boolean busyCacheFlush; // shows if a cache flush is currently performed private int idleDivisor, busyDivisor; - public indexCachedRI(indexRAMRI riExtern, indexRAMRI riIntern, indexRI backend, kelondroRow payloadrow, serverLog log) { + public indexCachedRI(indexRAMRI riExtern, indexRAMRI riIntern, indexRI backend, kelondroOrder payloadorder, kelondroRow payloadrow, serverLog log) { this.riExtern = riExtern; this.riIntern = riIntern; this.backend = backend; + this.indexOrder = payloadorder; this.payloadrow = payloadrow; this.busyCacheFlush = false; this.busyDivisor = 5000; @@ -152,38 +151,6 @@ public class indexCachedRI implements indexRI { busyCacheFlush = false; } - private static final int hour = 3600000; - private static final int day = 86400000; - - public static int microDateDays(Date modified) { - return microDateDays(modified.getTime()); - } - - public static int microDateDays(long modified) { - // this calculates a virtual age from a given date - // the purpose is to have an age in days 
of a given modified date - // from a fixed standpoint in the past - // one day has 60*60*24 seconds = 86400 seconds - // we take mod 64**3 = 262144, this is the mask of the storage - return (int) ((modified / day) % 262144); - } - - public static String microDateHoursStr(long time) { - return kelondroBase64Order.enhancedCoder.encodeLong(microDateHoursInt(time), 3); - } - - public static int microDateHoursInt(long time) { - return (int) ((time / hour) % 262144); - } - - public static int microDateHoursAge(String mdhs) { - return microDateHoursInt(System.currentTimeMillis()) - (int) kelondroBase64Order.enhancedCoder.decodeLong(mdhs); - } - - public static long reverseMicroDateDays(int microDateDays) { - return ((long) microDateDays) * ((long) day); - } - public indexContainer getContainer(String wordHash, Set urlselection, boolean deleteIfEmpty, long maxTime) { // get from cache indexContainer container = riExtern.getContainer(wordHash, urlselection, true, maxTime); diff --git a/source/de/anomic/index/indexRWIEntry.java b/source/de/anomic/index/indexRWIEntry.java index f4e7caa84..22c509695 100644 --- a/source/de/anomic/index/indexRWIEntry.java +++ b/source/de/anomic/index/indexRWIEntry.java @@ -31,7 +31,7 @@ import de.anomic.kelondro.kelondroRow; public interface indexRWIEntry { public Object clone(); - public String toPropertyForm(boolean displayFormat); + public String toPropertyForm(); public kelondroRow.Entry toKelondroEntry(); public String urlHash(); diff --git a/source/de/anomic/index/indexRWIEntryNew.java b/source/de/anomic/index/indexRWIEntryNew.java new file mode 100644 index 000000000..9d67779e1 --- /dev/null +++ b/source/de/anomic/index/indexRWIEntryNew.java @@ -0,0 +1,358 @@ +// indexRWIEntryNew.java +// (C) 2006 by Michael Peter Christen; mc@anomic.de, Frankfurt a. 
M., Germany +// first published 17.11.2006 on http://www.anomic.de +// +// This is a part of YaCy, a peer-to-peer based web search engine +// +// $LastChangedDate: 2006-04-02 22:40:07 +0200 (So, 02 Apr 2006) $ +// $LastChangedRevision: 1986 $ +// $LastChangedBy: orbiter $ +// +// LICENSE +// +// This program is free software; you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation; either version 2 of the License, or +// (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with this program; if not, write to the Free Software +// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + + +package de.anomic.index; + +import de.anomic.kelondro.kelondroColumn; +import de.anomic.kelondro.kelondroRow; +import de.anomic.kelondro.kelondroRow.Entry; +import de.anomic.plasma.plasmaURL; +import de.anomic.plasma.plasmaWordIndex; +import de.anomic.yacy.yacySeedDB; + +public class indexRWIEntryNew implements Cloneable, indexRWIEntry { + + // this object stores attributes to URL references inside RWI collections + + + public static kelondroRow urlEntryRow = new kelondroRow(new kelondroColumn[]{ + new kelondroColumn("h", kelondroColumn.celltype_string, kelondroColumn.encoder_bytes, yacySeedDB.commonHashLength, "urlhash"), + new kelondroColumn("a", kelondroColumn.celltype_cardinal, kelondroColumn.encoder_b256, 2, "lastModified"), + new kelondroColumn("s", kelondroColumn.celltype_cardinal, kelondroColumn.encoder_b256, 2, "freshUntil"), + new kelondroColumn("u", kelondroColumn.celltype_cardinal, kelondroColumn.encoder_b256, 1, "wordsInTitle"), + new 
kelondroColumn("w", kelondroColumn.celltype_cardinal, kelondroColumn.encoder_b256, 2, "wordsInText"), + new kelondroColumn("p", kelondroColumn.celltype_cardinal, kelondroColumn.encoder_b256, 2, "phrasesInText"), + new kelondroColumn("d", kelondroColumn.celltype_binary, kelondroColumn.encoder_bytes, 1, "doctype"), + new kelondroColumn("l", kelondroColumn.celltype_string, kelondroColumn.encoder_bytes, 2, "language"), + new kelondroColumn("x", kelondroColumn.celltype_cardinal, kelondroColumn.encoder_b256, 1, "llocal"), + new kelondroColumn("y", kelondroColumn.celltype_cardinal, kelondroColumn.encoder_b256, 1, "lother"), + new kelondroColumn("m", kelondroColumn.celltype_cardinal, kelondroColumn.encoder_b256, 1, "urlLength"), + new kelondroColumn("n", kelondroColumn.celltype_cardinal, kelondroColumn.encoder_b256, 1, "urlComps"), + new kelondroColumn("g", kelondroColumn.celltype_string, kelondroColumn.encoder_bytes, 1, "typeofword"), + new kelondroColumn("z", kelondroColumn.celltype_binary, kelondroColumn.encoder_bytes, 4, "flags"), + new kelondroColumn("c", kelondroColumn.celltype_cardinal, kelondroColumn.encoder_b256, 1, "hitcount"), + new kelondroColumn("t", kelondroColumn.celltype_cardinal, kelondroColumn.encoder_b256, 2, "posintext"), + new kelondroColumn("r", kelondroColumn.celltype_cardinal, kelondroColumn.encoder_b256, 1, "posinphrase"), + new kelondroColumn("o", kelondroColumn.celltype_cardinal, kelondroColumn.encoder_b256, 1, "posofphrase"), + new kelondroColumn("i", kelondroColumn.celltype_cardinal, kelondroColumn.encoder_b256, 1, "worddistance"), + new kelondroColumn("k", kelondroColumn.celltype_cardinal, kelondroColumn.encoder_b256, 1, "reserve") + }); + // available chars: b,e,j,q + + // static properties + private static final int col_urlhash = 0; // h 12 the url hash b64-encoded + private static final int col_lastModified = 1; // a 2 last-modified time of the document where word appears + private static final int col_freshUntil = 2; // s 2 TTL for the 
word, so it can be removed easily if the TTL is short + private static final int col_wordsInTitle = 3; // u 1 words in description/length (longer are better?) + private static final int col_wordsInText = 4; // w 2 total number of words in document + private static final int col_phrasesInText = 5; // p 2 total number of phrases in document + private static final int col_doctype = 6; // d 1 type of document + private static final int col_language = 7; // l 2 (guessed) language of document + private static final int col_llocal = 8; // x 1 outlinks to same domain + private static final int col_lother = 9; // y 1 outlinks to other domain + private static final int col_urlLength = 10; // m 1 byte-length of complete URL + private static final int col_urlComps = 11; // n 1 number of path components + + // dynamic properties + private static final int col_typeofword = 12; // g 1 grammatical classification + private static final int col_flags = 13; // z 4 b64-encoded flags; this has space for 24 bit + private static final int col_hitcount = 14; // c 1 number of occurrences of this word in text + private static final int col_posintext = 15; // t 2 first appearance of word in text + private static final int col_posinphrase = 16; // r 1 position of word in its phrase + private static final int col_posofphrase = 17; // o 1 number of the phrase where word appears + private static final int col_worddistance = 18; // i 1 initial zero; may be used as reserve: is filled during search + private static final int col_reserve = 19; // k 1 reserve + + // more needed attributes: + // - boolean: appearance attributes: title, appears in header, anchor-descr, image-tag, hervorhebungen, meta-tags, word in link, etc + // - boolean: URL attributes + + private kelondroRow.Entry entry; + + public indexRWIEntryNew(String urlHash, + int urlLength, // byte-length of complete URL + int urlComps, // number of path components + int titleLength, // length of description/length (longer are better?) 
+ int hitcount, // how often this word appears in the text + int wordcount, // total number of words + int phrasecount, // total number of phrases + int posintext, // position of word in all words + int posinphrase, // position of word in its phrase + int posofphrase, // number of the phrase where word appears + int worddistance, // word distance; this is 0 by default, and set to the difference of posintext from two indexes if these are combined (simultaneous search). If stored, this shows that the result was obtained by remote search + int sizeOfPage, // # of bytes of the page TODO: not needed any more + long lastmodified, // last-modified time of the document where word appears + long updatetime, // update time; this is needed to compute a TTL for the word, so it can be removed easily if the TTL is short + int quality, // the entropy value + String language, // (guessed) language of document + char doctype, // type of document + int outlinksSame, // outlinks to same domain + int outlinksOther,// outlinks to other domain + boolean local // not needed.
TODO: remove this + ) { + + assert (urlHash.length() == 12) : "urlhash = " + urlHash; + if ((language == null) || (language.length() != urlEntryRow.width(col_language))) language = "uk"; + this.entry = urlEntryRow.newEntry(); + int mddlm = plasmaWordIndex.microDateDays(lastmodified); + int mddct = plasmaWordIndex.microDateDays(updatetime); + this.entry.setCol(col_urlhash, urlHash, null); + this.entry.setCol(col_lastModified, mddlm); + this.entry.setCol(col_freshUntil, Math.max(0, mddlm + (mddct - mddlm) * 2)); // TTL computation + this.entry.setCol(col_wordsInTitle, titleLength / 6); // word count estimation; TODO: change value handover to number of words + this.entry.setCol(col_wordsInText, wordcount); + this.entry.setCol(col_phrasesInText, phrasecount); + this.entry.setCol(col_doctype, new byte[]{(byte) doctype}); + this.entry.setCol(col_language, language, null); + this.entry.setCol(col_llocal, outlinksSame); + this.entry.setCol(col_lother, outlinksOther); + this.entry.setCol(col_urlLength, urlLength); + this.entry.setCol(col_urlComps, urlComps); + this.entry.setCol(col_typeofword, 0); // TODO: grammatical classification + this.entry.setCol(col_flags, null); // TODO: generate flags + this.entry.setCol(col_hitcount, hitcount); + this.entry.setCol(col_posintext, posintext); + this.entry.setCol(col_posinphrase, posinphrase); + this.entry.setCol(col_posofphrase, posofphrase); + this.entry.setCol(col_worddistance, worddistance); + this.entry.setCol(col_reserve, 0); + } + + public indexRWIEntryNew(indexRWIEntryOld oldEntry) { + this.entry = urlEntryRow.newEntry(); + int mddlm = plasmaWordIndex.microDateDays(oldEntry.lastModified()); + int mddct = plasmaWordIndex.microDateDays(System.currentTimeMillis()); + this.entry.setCol(col_urlhash, oldEntry.urlHash(), null); + this.entry.setCol(col_lastModified, mddlm); + this.entry.setCol(col_freshUntil, Math.max(0, mddlm + (mddct - mddlm) * 2)); // TTL computation + this.entry.setCol(col_wordsInTitle, 20); // guessed + 
this.entry.setCol(col_wordsInText, oldEntry.wordcount()); + this.entry.setCol(col_phrasesInText, oldEntry.phrasecount()); + this.entry.setCol(col_doctype, new byte[]{(byte) oldEntry.doctype()}); + this.entry.setCol(col_language, oldEntry.getLanguage(), null); + this.entry.setCol(col_llocal, 0); + this.entry.setCol(col_lother, 0); + int domlen = plasmaURL.domLengthEstimation(oldEntry.urlHash()); + this.entry.setCol(col_urlLength, domlen * 2); // estimated + this.entry.setCol(col_urlComps, domlen / 3); // estimated + this.entry.setCol(col_typeofword, 0); + this.entry.setCol(col_flags, null); + this.entry.setCol(col_hitcount, oldEntry.hitcount()); + this.entry.setCol(col_posintext, oldEntry.posintext()); + this.entry.setCol(col_posinphrase, oldEntry.posinphrase()); + this.entry.setCol(col_posofphrase, oldEntry.posofphrase()); + this.entry.setCol(col_worddistance, oldEntry.worddistance()); + this.entry.setCol(col_reserve, 0); + } + + public indexRWIEntryNew(String urlHash, String code) { + // the code is the external form of the row minus the leading urlHash entry + this.entry = urlEntryRow.newEntry((urlHash + code).getBytes()); + } + + public indexRWIEntryNew(String external) { + this.entry = urlEntryRow.newEntry(external, true); + } + + public indexRWIEntryNew(byte[] row) { + this.entry = urlEntryRow.newEntry(row); + } + + public indexRWIEntryNew(kelondroRow.Entry rentry) { + // FIXME: see if cloning is necessary + this.entry = rentry; + } + + public static int days(long time) { + // calculates the number of days since 1.1.1970 and returns this as an int + return (int) (time / 86400000); + } + + public Object clone() { + byte[] b = new byte[urlEntryRow.objectsize()]; + System.arraycopy(entry.bytes(), 0, b, 0, urlEntryRow.objectsize()); + return new indexRWIEntryNew(b); + } + + public String toPropertyForm() { + return entry.toPropertyForm(true, true, false); + } + + public Entry toKelondroEntry() { + return this.entry; + } + + public String urlHash() { + return
this.entry.getColString(col_urlhash, null); + } + + public int quality() { + return 0; // not used any more + } + + public int virtualAge() { + return plasmaWordIndex.microDateDays(lastModified()); + } + + public long lastModified() { + return plasmaWordIndex.reverseMicroDateDays((int) this.entry.getColLong(col_lastModified)); + } + + public int hitcount() { + return (int) this.entry.getColLong(col_hitcount); + } + + public int posintext() { + return (int) this.entry.getColLong(col_posintext); + } + + public int posinphrase() { + return (int) this.entry.getColLong(col_posinphrase); + } + + public int posofphrase() { + return (int) this.entry.getColLong(col_posofphrase); + } + + public int wordcount() { + return (int) this.entry.getColLong(col_wordsInText); + } + + public int phrasecount() { + return (int) this.entry.getColLong(col_phrasesInText); + } + + public String getLanguage() { + return this.entry.getColString(col_language, null); + } + + public char getType() { + return (char) this.entry.getColByte(col_doctype); + } + + public boolean isLocal() { + return false; // not used + } + + public static indexRWIEntryNew combineDistance(indexRWIEntryNew ie1, indexRWIEntry ie2) { + // returns a modified entry of the first argument + ie1.entry.setCol(col_worddistance, ie1.worddistance() + ie2.worddistance() + Math.abs(ie1.posintext() - ie2.posintext())); + ie1.entry.setCol(col_posintext, Math.min(ie1.posintext(), ie2.posintext())); + ie1.entry.setCol(col_posinphrase, (ie1.posofphrase() == ie2.posofphrase()) ? 
ie1.posofphrase() : 0 /*unknown*/); + ie1.entry.setCol(col_posofphrase, Math.min(ie1.posofphrase(), ie2.posofphrase())); + ie1.entry.setCol(col_wordsInText, (ie1.wordcount() + ie2.wordcount()) / 2); + return ie1; + } + + public void combineDistance(indexRWIEntry oe) { + combineDistance(this, oe); + } + + public int worddistance() { + return (int) this.entry.getColLong(col_worddistance); + } + + public static final void min(indexRWIEntryNew t, indexRWIEntry other) { + if (t.hitcount() > other.hitcount()) t.entry.setCol(col_hitcount, other.hitcount()); + if (t.wordcount() > other.wordcount()) t.entry.setCol(col_wordsInText, other.wordcount()); + if (t.phrasecount() > other.phrasecount()) t.entry.setCol(col_phrasesInText, other.phrasecount()); + if (t.posintext() > other.posintext()) t.entry.setCol(col_posintext, other.posintext()); + if (t.posinphrase() > other.posinphrase()) t.entry.setCol(col_posinphrase, other.posinphrase()); + if (t.posofphrase() > other.posofphrase()) t.entry.setCol(col_posofphrase, other.posofphrase()); + if (t.worddistance() > other.worddistance()) t.entry.setCol(col_worddistance, other.worddistance()); + if (t.lastModified() > other.lastModified()) t.entry.setCol(col_lastModified, other.lastModified()); + } + + public static final void max(indexRWIEntryNew t, indexRWIEntry other) { + if (t.hitcount() < other.hitcount()) t.entry.setCol(col_hitcount, other.hitcount()); + if (t.wordcount() < other.wordcount()) t.entry.setCol(col_wordsInText, other.wordcount()); + if (t.phrasecount() < other.phrasecount()) t.entry.setCol(col_phrasesInText, other.phrasecount()); + if (t.posintext() < other.posintext()) t.entry.setCol(col_posintext, other.posintext()); + if (t.posinphrase() < other.posinphrase()) t.entry.setCol(col_posinphrase, other.posinphrase()); + if (t.posofphrase() < other.posofphrase()) t.entry.setCol(col_posofphrase, other.posofphrase()); + if (t.worddistance() < other.worddistance()) t.entry.setCol(col_worddistance, other.worddistance()); 
+ if (t.lastModified() < other.lastModified()) t.entry.setCol(col_lastModified, other.lastModified()); + } + + + public void min(indexRWIEntry other) { + min(this, other); + } + + public void max(indexRWIEntry other) { + max(this, other); + } + + static void normalize(indexRWIEntryNew t, indexRWIEntry min, indexRWIEntry max) { + assert (t.urlHash().length() == 12) : "turlhash = " + t.urlHash(); + assert (min.urlHash().length() == 12) : "minurlhash = " + min.urlHash(); + assert (max.urlHash().length() == 12) : "maxurlhash = " + max.urlHash(); + if (1 + max.worddistance() - min.worddistance() == 0) System.out.println("min = " + min.toPropertyForm() + "\nmax=" + max.toPropertyForm()); + //System.out.println("Normalize:\nentry = " + t.toPropertyForm(true)); + //System.out.println("min = " + min.toPropertyForm(true)); + //System.out.println("max = " + max.toPropertyForm(true)); + t.entry.setCol(col_hitcount , (t.hitcount() == 0) ? 0 : 1 + 255 * (t.hitcount() - min.hitcount() ) / (1 + max.hitcount() - min.hitcount())); + t.entry.setCol(col_wordsInText , (t.wordcount() == 0) ? 0 : 1 + 255 * (t.wordcount() - min.wordcount() ) / (1 + max.wordcount() - min.wordcount())); + t.entry.setCol(col_phrasesInText, (t.phrasecount() == 0) ? 0 : 1 + 255 * (t.phrasecount() - min.phrasecount() ) / (1 + max.phrasecount() - min.phrasecount())); + t.entry.setCol(col_posintext , (t.posintext() == 0) ? 0 : 1 + 255 * (t.posintext() - min.posintext() ) / (1 + max.posintext() - min.posintext())); + t.entry.setCol(col_posinphrase , (t.posinphrase() == 0) ? 0 : 1 + 255 * (t.posinphrase() - min.posinphrase() ) / (1 + max.posinphrase() - min.posinphrase())); + t.entry.setCol(col_posofphrase , (t.posofphrase() == 0) ? 0 : 1 + 255 * (t.posofphrase() - min.posofphrase() ) / (1 + max.posofphrase() - min.posofphrase())); + t.entry.setCol(col_worddistance , (t.worddistance() == 0) ? 
0 : 1 + 255 * (t.worddistance() - min.worddistance()) / (1 + max.worddistance() - min.worddistance())); // FIXME: a division by zero can occur here, which can only happen if the normalization did not work. + t.entry.setCol(col_lastModified , (t.lastModified() == 0) ? 0 : 1 + 255 * (t.lastModified() - min.lastModified()) / (1 + max.lastModified() - min.lastModified())); + //System.out.println("out = " + t.toPropertyForm(true)); + } + + public void normalize(indexRWIEntry min, indexRWIEntry max) { + normalize(this, min, max); + } + + public indexRWIEntry generateNormalized(indexRWIEntry min, indexRWIEntry max) { + assert (this.urlHash().length() == 12) : "this.urlhash = " + this.urlHash(); + indexRWIEntryNew e = (indexRWIEntryNew) this.clone(); + e.normalize(min, max); + return e; + } + + public boolean isNewer(indexRWIEntry other) { + if (other == null) return true; + if (this.lastModified() > other.lastModified()) return true; + if (this.lastModified() == other.lastModified()) { + if (this.quality() > other.quality()) return true; + } + return false; + } + + public boolean isOlder(indexRWIEntry other) { + if (other == null) return false; + if (this.lastModified() < other.lastModified()) return true; + if (this.lastModified() == other.lastModified()) { + if (this.quality() < other.quality()) return true; + } + return false; + } + +} \ No newline at end of file diff --git a/source/de/anomic/index/indexRWIEntryOld.java b/source/de/anomic/index/indexRWIEntryOld.java index 399df2144..23582916c 100644 --- a/source/de/anomic/index/indexRWIEntryOld.java +++ b/source/de/anomic/index/indexRWIEntryOld.java @@ -1,6 +1,6 @@ -// indexURLEntryNew.java +// indexRWIEntryOld.java // (C) 2006 by Michael Peter Christen; mc@anomic.de, Frankfurt a.
M., Germany -// first published 21.07.2006 on http://www.anomic.de +// first published 17.11.2006 on http://www.anomic.de // // This is a part of YaCy, a peer-to-peer based web search engine // @@ -105,7 +105,7 @@ public class indexRWIEntryOld implements Cloneable, indexRWIEntry { this.entry.setCol(col_lastModified, lastmodified); this.entry.setCol(col_hitcount, hitcount); this.entry.setCol(col_language, language, null); - this.entry.setCol(col_doctype, (byte) doctype); + this.entry.setCol(col_doctype, new byte[]{(byte) doctype}); this.entry.setCol(col_localflag, (byte) ((local) ? plasmaURL.LT_LOCAL : plasmaURL.LT_GLOBAL)); this.entry.setCol(col_posintext, posintext); this.entry.setCol(col_posinphrase, posinphrase); @@ -122,7 +122,7 @@ public class indexRWIEntryOld implements Cloneable, indexRWIEntry { } public indexRWIEntryOld(String external) { - this.entry = urlEntryRow.newEntry(external); + this.entry = urlEntryRow.newEntry(external, false); } public indexRWIEntryOld(byte[] row) { @@ -140,8 +140,8 @@ public class indexRWIEntryOld implements Cloneable, indexRWIEntry { return new indexRWIEntryOld(b); } - public String toPropertyForm(boolean displayFormat) { - return entry.toPropertyForm(true, displayFormat, displayFormat); + public String toPropertyForm() { + return entry.toPropertyForm(true, false, false); } public Entry toKelondroEntry() { @@ -155,6 +155,10 @@ public class indexRWIEntryOld implements Cloneable, indexRWIEntry { public int quality() { return (int) this.entry.getColLong(col_quality); } + + public char doctype() { + return (char) this.entry.getColByte(col_doctype); + } public int virtualAge() { return plasmaWordIndex.microDateDays(lastModified()); @@ -255,7 +259,7 @@ public class indexRWIEntryOld implements Cloneable, indexRWIEntry { assert (t.urlHash().length() == 12) : "turlhash = " + t.urlHash(); assert (min.urlHash().length() == 12) : "minurlhash = " + min.urlHash(); assert (max.urlHash().length() == 12) : "maxurlhash = " + max.urlHash(); - if 
(1 + max.worddistance() - min.worddistance() == 0) System.out.println("min = " + min.toPropertyForm(true) + "\nmax=" + max.toPropertyForm(true)); + if (1 + max.worddistance() - min.worddistance() == 0) System.out.println("min = " + min.toPropertyForm() + "\nmax=" + max.toPropertyForm()); //System.out.println("Normalize:\nentry = " + t.toPropertyForm(true)); //System.out.println("min = " + min.toPropertyForm(true)); //System.out.println("max = " + max.toPropertyForm(true)); diff --git a/source/de/anomic/index/indexURLEntryNew.java b/source/de/anomic/index/indexURLEntryNew.java index a79200e77..a3aebc110 100644 --- a/source/de/anomic/index/indexURLEntryNew.java +++ b/source/de/anomic/index/indexURLEntryNew.java @@ -180,7 +180,13 @@ public class indexURLEntryNew implements indexURLEntry { this.entry.setCol(col_lvideo, Integer.parseInt(prop.getProperty("lvideo", "0"))); this.entry.setCol(col_lapp, Integer.parseInt(prop.getProperty("lapp", "0"))); this.snippet = crypt.simpleDecode(prop.getProperty("snippet", ""), null); - this.word = (prop.containsKey("word")) ? 
new indexRWIEntryOld(kelondroBase64Order.enhancedCoder.decodeString(prop.getProperty("word", ""))) : null; + this.word = null; + if (prop.containsKey("word")) { + this.word = new indexRWIEntryOld(kelondroBase64Order.enhancedCoder.decodeString(prop.getProperty("word", ""))); + } + if (prop.containsKey("wi")) { + this.word = new indexRWIEntryNew(kelondroBase64Order.enhancedCoder.decodeString(prop.getProperty("wi", ""))); + } } private StringBuffer corePropList() { @@ -213,7 +219,8 @@ public class indexURLEntryNew implements indexURLEntry { if (this.word != null) { // append also word properties - s.append(",word=").append(kelondroBase64Order.enhancedCoder.encodeString(word.toPropertyForm(false))); + if (this.word instanceof indexRWIEntryOld) s.append(",word=").append(kelondroBase64Order.enhancedCoder.encodeString(word.toPropertyForm())); + if (this.word instanceof indexRWIEntryNew) s.append(",wi=").append(kelondroBase64Order.enhancedCoder.encodeString(word.toPropertyForm())); } return s; diff --git a/source/de/anomic/index/indexURLEntryOld.java b/source/de/anomic/index/indexURLEntryOld.java index 1ad4527f7..17da67913 100644 --- a/source/de/anomic/index/indexURLEntryOld.java +++ b/source/de/anomic/index/indexURLEntryOld.java @@ -297,7 +297,8 @@ public class indexURLEntryOld implements indexURLEntry { if (this.word != null) { // append also word properties - corePropStr.append(",word=").append(kelondroBase64Order.enhancedCoder.encodeString(word.toPropertyForm(false))); + if (this.word instanceof indexRWIEntryOld) corePropStr.append(",word=").append(kelondroBase64Order.enhancedCoder.encodeString(word.toPropertyForm())); + if (this.word instanceof indexRWIEntryNew) corePropStr.append(",wi=").append(kelondroBase64Order.enhancedCoder.encodeString(word.toPropertyForm())); } return corePropStr; diff --git a/source/de/anomic/kelondro/kelondroRow.java b/source/de/anomic/kelondro/kelondroRow.java index 3a5b0359e..4cafca8f7 100644 --- a/source/de/anomic/kelondro/kelondroRow.java 
+++ b/source/de/anomic/kelondro/kelondroRow.java @@ -143,9 +143,9 @@ public class kelondroRow { return new Entry(cells); } - public Entry newEntry(String external) { + public Entry newEntry(String external, boolean decimalCardinal) { if (external == null) return null; - return new Entry(external); + return new Entry(external, decimalCardinal); } public class Entry implements Comparable { @@ -188,7 +188,7 @@ public class kelondroRow { } } - public Entry(String external) { + public Entry(String external, boolean decimalCardinal) { // parse external form if (external.charAt(0) == '{') external = external.substring(1, external.length() - 1); String[] elts = external.split(","); @@ -202,8 +202,17 @@ public class kelondroRow { nick = elts[i].substring(0, p).trim(); if (p + 1 == elts[i].length()) setCol(nick, null); - else - setCol(nick, elts[i].substring(p + 1).trim().getBytes()); + else { + if ((decimalCardinal) && (row[i].celltype() == kelondroColumn.celltype_cardinal)) { + try { + setCol(nick, Long.parseLong(elts[i].substring(p + 1).trim())); + } catch (NumberFormatException e) { + setCol(nick, 0); + } + } else { + setCol(nick, elts[i].substring(p + 1).trim().getBytes()); + } + } } } } @@ -416,7 +425,7 @@ public class kelondroRow { for (int i = 0; i < row.length; i++) { bb.append((longname) ? 
row[i].description() : row[i].nickname()); bb.append('='); - if ((row[i].celltype() == kelondroColumn.celltype_cardinal) && (decimalCardinal)) + if ((decimalCardinal) && (row[i].celltype() == kelondroColumn.celltype_cardinal)) bb.append(Long.toString(getColLong(i))); else bb.append(rowinstance, colstart[i], row[i].cellwidth()); diff --git a/source/de/anomic/kelondro/kelondroTree.java b/source/de/anomic/kelondro/kelondroTree.java index 997472810..6531e847e 100644 --- a/source/de/anomic/kelondro/kelondroTree.java +++ b/source/de/anomic/kelondro/kelondroTree.java @@ -1172,12 +1172,12 @@ public class kelondroTree extends kelondroRecords implements kelondroIndex { if (hl > hr) return hl + 1; return hr + 1; } - + /* public String np(Object n) { if (n == null) return "NULL"; return n.toString(); } - + */ public void print() throws IOException { super.print(false); int height = height(); diff --git a/source/de/anomic/plasma/plasmaSwitchboard.java b/source/de/anomic/plasma/plasmaSwitchboard.java index cafeeccf1..57eeda707 100644 --- a/source/de/anomic/plasma/plasmaSwitchboard.java +++ b/source/de/anomic/plasma/plasmaSwitchboard.java @@ -2140,7 +2140,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser prop.put("type_results_" + i + "_size", Long.toString(urlentry.size())); prop.put("type_results_" + i + "_words", URLEncoder.encode(query.queryWords.toString(),"UTF-8")); prop.put("type_results_" + i + "_former", formerSearch); - prop.put("type_results_" + i + "_rankingprops", urlentry.word().toPropertyForm(true) + ", domLengthEstimated=" + plasmaURL.domLengthEstimation(urlhash) + + prop.put("type_results_" + i + "_rankingprops", urlentry.word().toPropertyForm() + ", domLengthEstimated=" + plasmaURL.domLengthEstimation(urlhash) + ((plasmaURL.probablyRootURL(urlhash)) ? ", probablyRootURL" : "") + (((wordURL = plasmaURL.probablyWordURL(urlhash, query.words(""))) != null) ? 
", probablyWordURL=" + wordURL.toNormalform() : "")); // adding snippet if available diff --git a/source/de/anomic/yacy/yacyClient.java b/source/de/anomic/yacy/yacyClient.java index 615c874c8..16ba9f067 100644 --- a/source/de/anomic/yacy/yacyClient.java +++ b/source/de/anomic/yacy/yacyClient.java @@ -1057,7 +1057,7 @@ public final class yacyClient { while (eenum.hasNext()) { entry = (indexRWIEntry) eenum.next(); entrypost.append(indexes[i].getWordHash()) - .append(entry.toPropertyForm(false)) + .append(entry.toPropertyForm()) .append(serverCore.crlfString); indexcount++; }