mirror of
https://github.com/yacy/yacy_search_server.git
synced 2024-09-19 00:01:41 +02:00
introduced new RWI-Object.
This will be used for the final version of the collections. The new object is not yet used. git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@2966 6c8d7289-2bf4-0310-a012-ef5d649a1542
This commit is contained in:
parent
f442af956c
commit
c9364246cc
|
@ -52,6 +52,7 @@ import java.util.LinkedList;
|
|||
|
||||
import de.anomic.http.httpHeader;
|
||||
import de.anomic.index.indexRWIEntry;
|
||||
import de.anomic.index.indexRWIEntryNew;
|
||||
import de.anomic.index.indexRWIEntryOld;
|
||||
import de.anomic.plasma.plasmaSwitchboard;
|
||||
import de.anomic.plasma.urlPattern.plasmaURLPattern;
|
||||
|
@ -162,7 +163,10 @@ public final class transferRWI {
|
|||
if (p > 0) {
|
||||
wordHash = estring.substring(0, p);
|
||||
wordhashes[received] = wordHash;
|
||||
iEntry = new indexRWIEntryOld(estring.substring(p));
|
||||
if (estring.indexOf("x=") > 0)
|
||||
iEntry = new indexRWIEntryNew(estring.substring(p));
|
||||
else
|
||||
iEntry = new indexRWIEntryOld(estring.substring(p));
|
||||
urlHash = iEntry.urlHash();
|
||||
if ((blockBlacklist) && (plasmaSwitchboard.urlBlacklist.hashInBlacklistedCache(plasmaURLPattern.BLACKLIST_DHT, urlHash))) {
|
||||
int deleted = sb.wordIndex.tryRemoveURLs(urlHash);
|
||||
|
|
|
@ -27,14 +27,12 @@
|
|||
|
||||
package de.anomic.index;
|
||||
|
||||
import java.util.Date;
|
||||
import java.util.HashMap;
|
||||
import java.util.Iterator;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
import java.util.TreeSet;
|
||||
|
||||
import de.anomic.kelondro.kelondroBase64Order;
|
||||
import de.anomic.kelondro.kelondroMergeIterator;
|
||||
import de.anomic.kelondro.kelondroNaturalOrder;
|
||||
import de.anomic.kelondro.kelondroOrder;
|
||||
|
@ -44,16 +42,17 @@ import de.anomic.server.logging.serverLog;
|
|||
public class indexCachedRI implements indexRI {
|
||||
|
||||
private kelondroRow payloadrow;
|
||||
private kelondroOrder indexOrder = new kelondroNaturalOrder(true);
|
||||
private kelondroOrder indexOrder;
|
||||
private indexRAMRI riExtern, riIntern;
|
||||
private indexRI backend;
|
||||
public boolean busyCacheFlush; // shows if a cache flush is currently performed
|
||||
private int idleDivisor, busyDivisor;
|
||||
|
||||
public indexCachedRI(indexRAMRI riExtern, indexRAMRI riIntern, indexRI backend, kelondroRow payloadrow, serverLog log) {
|
||||
public indexCachedRI(indexRAMRI riExtern, indexRAMRI riIntern, indexRI backend, kelondroOrder payloadorder, kelondroRow payloadrow, serverLog log) {
|
||||
this.riExtern = riExtern;
|
||||
this.riIntern = riIntern;
|
||||
this.backend = backend;
|
||||
this.indexOrder = payloadorder;
|
||||
this.payloadrow = payloadrow;
|
||||
this.busyCacheFlush = false;
|
||||
this.busyDivisor = 5000;
|
||||
|
@ -152,38 +151,6 @@ public class indexCachedRI implements indexRI {
|
|||
busyCacheFlush = false;
|
||||
}
|
||||
|
||||
private static final int hour = 3600000;
|
||||
private static final int day = 86400000;
|
||||
|
||||
public static int microDateDays(Date modified) {
|
||||
return microDateDays(modified.getTime());
|
||||
}
|
||||
|
||||
public static int microDateDays(long modified) {
|
||||
// this calculates a virtual age from a given date
|
||||
// the purpose is to have an age in days of a given modified date
|
||||
// from a fixed standpoint in the past
|
||||
// one day has 60*60*24 seconds = 86400 seconds
|
||||
// we take mod 64**3 = 262144, this is the mask of the storage
|
||||
return (int) ((modified / day) % 262144);
|
||||
}
|
||||
|
||||
public static String microDateHoursStr(long time) {
|
||||
return kelondroBase64Order.enhancedCoder.encodeLong(microDateHoursInt(time), 3);
|
||||
}
|
||||
|
||||
public static int microDateHoursInt(long time) {
|
||||
return (int) ((time / hour) % 262144);
|
||||
}
|
||||
|
||||
public static int microDateHoursAge(String mdhs) {
|
||||
return microDateHoursInt(System.currentTimeMillis()) - (int) kelondroBase64Order.enhancedCoder.decodeLong(mdhs);
|
||||
}
|
||||
|
||||
public static long reverseMicroDateDays(int microDateDays) {
|
||||
return ((long) microDateDays) * ((long) day);
|
||||
}
|
||||
|
||||
public indexContainer getContainer(String wordHash, Set urlselection, boolean deleteIfEmpty, long maxTime) {
|
||||
// get from cache
|
||||
indexContainer container = riExtern.getContainer(wordHash, urlselection, true, maxTime);
|
||||
|
|
|
@ -31,7 +31,7 @@ import de.anomic.kelondro.kelondroRow;
|
|||
public interface indexRWIEntry {
|
||||
|
||||
public Object clone();
|
||||
public String toPropertyForm(boolean displayFormat);
|
||||
public String toPropertyForm();
|
||||
public kelondroRow.Entry toKelondroEntry();
|
||||
|
||||
public String urlHash();
|
||||
|
|
358
source/de/anomic/index/indexRWIEntryNew.java
Normal file
358
source/de/anomic/index/indexRWIEntryNew.java
Normal file
|
@ -0,0 +1,358 @@
|
|||
// indexRWIEntryNew.java
|
||||
// (C) 2006 by Michael Peter Christen; mc@anomic.de, Frankfurt a. M., Germany
|
||||
// first published 17.11.2006 on http://www.anomic.de
|
||||
//
|
||||
// This is a part of YaCy, a peer-to-peer based web search engine
|
||||
//
|
||||
// $LastChangedDate: 2006-04-02 22:40:07 +0200 (So, 02 Apr 2006) $
|
||||
// $LastChangedRevision: 1986 $
|
||||
// $LastChangedBy: orbiter $
|
||||
//
|
||||
// LICENSE
|
||||
//
|
||||
// This program is free software; you can redistribute it and/or modify
|
||||
// it under the terms of the GNU General Public License as published by
|
||||
// the Free Software Foundation; either version 2 of the License, or
|
||||
// (at your option) any later version.
|
||||
//
|
||||
// This program is distributed in the hope that it will be useful,
|
||||
// but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
// GNU General Public License for more details.
|
||||
//
|
||||
// You should have received a copy of the GNU General Public License
|
||||
// along with this program; if not, write to the Free Software
|
||||
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
||||
|
||||
|
||||
package de.anomic.index;
|
||||
|
||||
import de.anomic.kelondro.kelondroColumn;
|
||||
import de.anomic.kelondro.kelondroRow;
|
||||
import de.anomic.kelondro.kelondroRow.Entry;
|
||||
import de.anomic.plasma.plasmaURL;
|
||||
import de.anomic.plasma.plasmaWordIndex;
|
||||
import de.anomic.yacy.yacySeedDB;
|
||||
|
||||
public class indexRWIEntryNew implements Cloneable, indexRWIEntry {
|
||||
|
||||
// this object stores attributes to URL references inside RWI collections
|
||||
|
||||
|
||||
public static kelondroRow urlEntryRow = new kelondroRow(new kelondroColumn[]{
|
||||
new kelondroColumn("h", kelondroColumn.celltype_string, kelondroColumn.encoder_bytes, yacySeedDB.commonHashLength, "urlhash"),
|
||||
new kelondroColumn("a", kelondroColumn.celltype_cardinal, kelondroColumn.encoder_b256, 2, "lastModified"),
|
||||
new kelondroColumn("s", kelondroColumn.celltype_cardinal, kelondroColumn.encoder_b256, 2, "freshUntil"),
|
||||
new kelondroColumn("u", kelondroColumn.celltype_cardinal, kelondroColumn.encoder_b256, 1, "wordsInTitle"),
|
||||
new kelondroColumn("w", kelondroColumn.celltype_cardinal, kelondroColumn.encoder_b256, 2, "wordsInText"),
|
||||
new kelondroColumn("p", kelondroColumn.celltype_cardinal, kelondroColumn.encoder_b256, 2, "phrasesInText"),
|
||||
new kelondroColumn("d", kelondroColumn.celltype_binary, kelondroColumn.encoder_bytes, 1, "doctype"),
|
||||
new kelondroColumn("l", kelondroColumn.celltype_string, kelondroColumn.encoder_bytes, 2, "language"),
|
||||
new kelondroColumn("x", kelondroColumn.celltype_cardinal, kelondroColumn.encoder_b256, 1, "llocal"),
|
||||
new kelondroColumn("y", kelondroColumn.celltype_cardinal, kelondroColumn.encoder_b256, 1, "lother"),
|
||||
new kelondroColumn("m", kelondroColumn.celltype_cardinal, kelondroColumn.encoder_b256, 1, "urlLength"),
|
||||
new kelondroColumn("n", kelondroColumn.celltype_cardinal, kelondroColumn.encoder_b256, 1, "urlComps"),
|
||||
new kelondroColumn("g", kelondroColumn.celltype_string, kelondroColumn.encoder_bytes, 1, "typeofword"),
|
||||
new kelondroColumn("z", kelondroColumn.celltype_binary, kelondroColumn.encoder_bytes, 4, "flags"),
|
||||
new kelondroColumn("c", kelondroColumn.celltype_cardinal, kelondroColumn.encoder_b256, 1, "hitcount"),
|
||||
new kelondroColumn("t", kelondroColumn.celltype_cardinal, kelondroColumn.encoder_b256, 2, "posintext"),
|
||||
new kelondroColumn("r", kelondroColumn.celltype_cardinal, kelondroColumn.encoder_b256, 1, "posinphrase"),
|
||||
new kelondroColumn("o", kelondroColumn.celltype_cardinal, kelondroColumn.encoder_b256, 1, "posofphrase"),
|
||||
new kelondroColumn("i", kelondroColumn.celltype_cardinal, kelondroColumn.encoder_b256, 1, "worddistance"),
|
||||
new kelondroColumn("k", kelondroColumn.celltype_cardinal, kelondroColumn.encoder_b256, 1, "reserve")
|
||||
});
|
||||
// available chars: b,e,j,q
|
||||
|
||||
// static properties
|
||||
private static final int col_urlhash = 0; // h 12 the url hash b64-encoded
|
||||
private static final int col_lastModified = 1; // a 2 last-modified time of the document where word appears
|
||||
private static final int col_freshUntil = 2; // s 2 TTL for the word, so it can be removed easily if the TTL is short
|
||||
private static final int col_wordsInTitle = 3; // u 1 words in description/length (longer are better?)
|
||||
private static final int col_wordsInText = 4; // w 2 total number of words in document
|
||||
private static final int col_phrasesInText = 5; // p 2 total number of phrases in document
|
||||
private static final int col_doctype = 6; // d 1 type of document
|
||||
private static final int col_language = 7; // l 2 (guessed) language of document
|
||||
private static final int col_llocal = 8; // x 1 outlinks to same domain
|
||||
private static final int col_lother = 9; // y 1 outlinks to other domain
|
||||
private static final int col_urlLength = 10; // m 1 byte-length of complete URL
|
||||
private static final int col_urlComps = 11; // n 1 number of path components
|
||||
|
||||
// dynamic properties
|
||||
private static final int col_typeofword = 12; // g 1 grammatical classification
|
||||
private static final int col_flags = 13; // z 4 b64-encoded flags; this has space for 24 bit
|
||||
private static final int col_hitcount = 14; // c 1 number of occurrences of this word in text
|
||||
private static final int col_posintext = 15; // t 2 first appearance of word in text
|
||||
private static final int col_posinphrase = 16; // r 1 position of word in its phrase
|
||||
private static final int col_posofphrase = 17; // o 1 number of the phrase where word appears
|
||||
private static final int col_worddistance = 18; // i 1 initial zero; may be used as reserve: is filled during search
|
||||
private static final int col_reserve = 19; // k 1 reserve
|
||||
|
||||
// more needed attributes:
|
||||
// - boolean: appearance attributes: title, appears in header, anchor-descr, image-tag, hervorhebungen, meta-tags, word in link, etc
|
||||
// - boolean: URL attributes
|
||||
|
||||
private kelondroRow.Entry entry;
|
||||
|
||||
public indexRWIEntryNew(String urlHash,
|
||||
int urlLength, // byte-length of complete URL
|
||||
int urlComps, // number of path components
|
||||
int titleLength, // length of description/length (longer are better?)
|
||||
int hitcount, // how often appears this word in the text
|
||||
int wordcount, // total number of words
|
||||
int phrasecount, // total number of phrases
|
||||
int posintext, // position of word in all words
|
||||
int posinphrase, // position of word in its phrase
|
||||
int posofphrase, // number of the phrase where word appears
|
||||
int worddistance, // word distance; this is 0 by default, and set to the difference of posintext from two indexes if these are combined (simultanous search). If stored, this shows that the result was obtained by remote search
|
||||
int sizeOfPage, // # of bytes of the page TODO: not needed any more
|
||||
long lastmodified, // last-modified time of the document where word appears
|
||||
long updatetime, // update time; this is needed to compute a TTL for the word, so it can be removed easily if the TTL is short
|
||||
int quality, // the entropy value
|
||||
String language, // (guessed) language of document
|
||||
char doctype, // type of document
|
||||
int outlinksSame, // outlinks to same domain
|
||||
int outlinksOther,// outlinks to other domain
|
||||
boolean local // not needed. TODO: remove this
|
||||
) {
|
||||
|
||||
assert (urlHash.length() == 12) : "urlhash = " + urlHash;
|
||||
if ((language == null) || (language.length() != urlEntryRow.width(col_language))) language = "uk";
|
||||
this.entry = urlEntryRow.newEntry();
|
||||
int mddlm = plasmaWordIndex.microDateDays(lastmodified);
|
||||
int mddct = plasmaWordIndex.microDateDays(updatetime);
|
||||
this.entry.setCol(col_urlhash, urlHash, null);
|
||||
this.entry.setCol(col_lastModified, mddlm);
|
||||
this.entry.setCol(col_freshUntil, Math.max(0, mddlm + (mddct - mddlm) * 2)); // TTL computation
|
||||
this.entry.setCol(col_wordsInTitle, titleLength / 6); // word count estimation; TODO: change value handover to number of words
|
||||
this.entry.setCol(col_wordsInText, wordcount);
|
||||
this.entry.setCol(col_phrasesInText, phrasecount);
|
||||
this.entry.setCol(col_doctype, new byte[]{(byte) doctype});
|
||||
this.entry.setCol(col_language, language, null);
|
||||
this.entry.setCol(col_llocal, outlinksSame);
|
||||
this.entry.setCol(col_lother, outlinksOther);
|
||||
this.entry.setCol(col_urlLength, urlLength);
|
||||
this.entry.setCol(col_urlComps, urlComps);
|
||||
this.entry.setCol(col_typeofword, 0); // TODO: grammatical classification
|
||||
this.entry.setCol(col_flags, null); // TODO: generate flags
|
||||
this.entry.setCol(col_hitcount, hitcount);
|
||||
this.entry.setCol(col_posintext, posintext);
|
||||
this.entry.setCol(col_posinphrase, posinphrase);
|
||||
this.entry.setCol(col_posofphrase, posofphrase);
|
||||
this.entry.setCol(col_worddistance, worddistance);
|
||||
this.entry.setCol(col_reserve, 0);
|
||||
}
|
||||
|
||||
public indexRWIEntryNew(indexRWIEntryOld oldEntry) {
|
||||
this.entry = urlEntryRow.newEntry();
|
||||
int mddlm = plasmaWordIndex.microDateDays(oldEntry.lastModified());
|
||||
int mddct = plasmaWordIndex.microDateDays(System.currentTimeMillis());
|
||||
this.entry.setCol(col_urlhash, oldEntry.urlHash(), null);
|
||||
this.entry.setCol(col_lastModified, mddlm);
|
||||
this.entry.setCol(col_freshUntil, Math.max(0, mddlm + (mddct - mddlm) * 2)); // TTL computation
|
||||
this.entry.setCol(col_wordsInTitle, 20); // guessed
|
||||
this.entry.setCol(col_wordsInText, oldEntry.wordcount());
|
||||
this.entry.setCol(col_phrasesInText, oldEntry.phrasecount());
|
||||
this.entry.setCol(col_doctype, new byte[]{(byte) oldEntry.doctype()});
|
||||
this.entry.setCol(col_language, oldEntry.getLanguage(), null);
|
||||
this.entry.setCol(col_llocal, 0);
|
||||
this.entry.setCol(col_lother, 0);
|
||||
int domlen = plasmaURL.domLengthEstimation(oldEntry.urlHash());
|
||||
this.entry.setCol(col_urlLength, domlen * 2); // estimated
|
||||
this.entry.setCol(col_urlComps, domlen / 3); // estimated
|
||||
this.entry.setCol(col_typeofword, 0);
|
||||
this.entry.setCol(col_flags, null);
|
||||
this.entry.setCol(col_hitcount, oldEntry.hitcount());
|
||||
this.entry.setCol(col_posintext, oldEntry.posintext());
|
||||
this.entry.setCol(col_posinphrase, oldEntry.posinphrase());
|
||||
this.entry.setCol(col_posofphrase, oldEntry.posofphrase());
|
||||
this.entry.setCol(col_worddistance, oldEntry.worddistance());
|
||||
this.entry.setCol(col_reserve, 0);
|
||||
}
|
||||
|
||||
public indexRWIEntryNew(String urlHash, String code) {
|
||||
// the code is the external form of the row minus the leading urlHash entry
|
||||
this.entry = urlEntryRow.newEntry((urlHash + code).getBytes());
|
||||
}
|
||||
|
||||
public indexRWIEntryNew(String external) {
|
||||
this.entry = urlEntryRow.newEntry(external, true);
|
||||
}
|
||||
|
||||
public indexRWIEntryNew(byte[] row) {
|
||||
this.entry = urlEntryRow.newEntry(row);
|
||||
}
|
||||
|
||||
public indexRWIEntryNew(kelondroRow.Entry rentry) {
|
||||
// FIXME: see if cloning is necessary
|
||||
this.entry = rentry;
|
||||
}
|
||||
|
||||
public static int days(long time) {
|
||||
// calculates the number of days since 1.1.1970 and returns this as 4-byte array
|
||||
return (int) (time / 86400000);
|
||||
}
|
||||
|
||||
public Object clone() {
|
||||
byte[] b = new byte[urlEntryRow.objectsize()];
|
||||
System.arraycopy(entry.bytes(), 0, b, 0, urlEntryRow.objectsize());
|
||||
return new indexRWIEntryNew(b);
|
||||
}
|
||||
|
||||
public String toPropertyForm() {
|
||||
return entry.toPropertyForm(true, true, false);
|
||||
}
|
||||
|
||||
public Entry toKelondroEntry() {
|
||||
return this.entry;
|
||||
}
|
||||
|
||||
public String urlHash() {
|
||||
return this.entry.getColString(col_urlhash, null);
|
||||
}
|
||||
|
||||
public int quality() {
|
||||
return 0; // not used any more
|
||||
}
|
||||
|
||||
public int virtualAge() {
|
||||
return plasmaWordIndex.microDateDays(lastModified());
|
||||
}
|
||||
|
||||
public long lastModified() {
|
||||
return plasmaWordIndex.reverseMicroDateDays((int) this.entry.getColLong(col_lastModified));
|
||||
}
|
||||
|
||||
public int hitcount() {
|
||||
return (int) this.entry.getColLong(col_hitcount);
|
||||
}
|
||||
|
||||
public int posintext() {
|
||||
return (int) this.entry.getColLong(col_posintext);
|
||||
}
|
||||
|
||||
public int posinphrase() {
|
||||
return (int) this.entry.getColLong(col_posinphrase);
|
||||
}
|
||||
|
||||
public int posofphrase() {
|
||||
return (int) this.entry.getColLong(col_posofphrase);
|
||||
}
|
||||
|
||||
public int wordcount() {
|
||||
return (int) this.entry.getColLong(col_wordsInText);
|
||||
}
|
||||
|
||||
public int phrasecount() {
|
||||
return (int) this.entry.getColLong(col_phrasesInText);
|
||||
}
|
||||
|
||||
public String getLanguage() {
|
||||
return this.entry.getColString(col_language, null);
|
||||
}
|
||||
|
||||
public char getType() {
|
||||
return (char) this.entry.getColByte(col_doctype);
|
||||
}
|
||||
|
||||
public boolean isLocal() {
|
||||
return false; // not used
|
||||
}
|
||||
|
||||
public static indexRWIEntryNew combineDistance(indexRWIEntryNew ie1, indexRWIEntry ie2) {
|
||||
// returns a modified entry of the first argument
|
||||
ie1.entry.setCol(col_worddistance, ie1.worddistance() + ie2.worddistance() + Math.abs(ie1.posintext() - ie2.posintext()));
|
||||
ie1.entry.setCol(col_posintext, Math.min(ie1.posintext(), ie2.posintext()));
|
||||
ie1.entry.setCol(col_posinphrase, (ie1.posofphrase() == ie2.posofphrase()) ? ie1.posofphrase() : 0 /*unknown*/);
|
||||
ie1.entry.setCol(col_posofphrase, Math.min(ie1.posofphrase(), ie2.posofphrase()));
|
||||
ie1.entry.setCol(col_wordsInText, (ie1.wordcount() + ie2.wordcount()) / 2);
|
||||
return ie1;
|
||||
}
|
||||
|
||||
public void combineDistance(indexRWIEntry oe) {
|
||||
combineDistance(this, oe);
|
||||
}
|
||||
|
||||
public int worddistance() {
|
||||
return (int) this.entry.getColLong(col_worddistance);
|
||||
}
|
||||
|
||||
public static final void min(indexRWIEntryNew t, indexRWIEntry other) {
|
||||
if (t.hitcount() > other.hitcount()) t.entry.setCol(col_hitcount, other.hitcount());
|
||||
if (t.wordcount() > other.wordcount()) t.entry.setCol(col_wordsInText, other.wordcount());
|
||||
if (t.phrasecount() > other.phrasecount()) t.entry.setCol(col_phrasesInText, other.phrasecount());
|
||||
if (t.posintext() > other.posintext()) t.entry.setCol(col_posintext, other.posintext());
|
||||
if (t.posinphrase() > other.posinphrase()) t.entry.setCol(col_posinphrase, other.posinphrase());
|
||||
if (t.posofphrase() > other.posofphrase()) t.entry.setCol(col_posofphrase, other.posofphrase());
|
||||
if (t.worddistance() > other.worddistance()) t.entry.setCol(col_worddistance, other.worddistance());
|
||||
if (t.lastModified() > other.lastModified()) t.entry.setCol(col_lastModified, other.lastModified());
|
||||
}
|
||||
|
||||
public static final void max(indexRWIEntryNew t, indexRWIEntry other) {
|
||||
if (t.hitcount() < other.hitcount()) t.entry.setCol(col_hitcount, other.hitcount());
|
||||
if (t.wordcount() < other.wordcount()) t.entry.setCol(col_wordsInText, other.wordcount());
|
||||
if (t.phrasecount() < other.phrasecount()) t.entry.setCol(col_phrasesInText, other.phrasecount());
|
||||
if (t.posintext() < other.posintext()) t.entry.setCol(col_posintext, other.posintext());
|
||||
if (t.posinphrase() < other.posinphrase()) t.entry.setCol(col_posinphrase, other.posinphrase());
|
||||
if (t.posofphrase() < other.posofphrase()) t.entry.setCol(col_posofphrase, other.posofphrase());
|
||||
if (t.worddistance() < other.worddistance()) t.entry.setCol(col_worddistance, other.worddistance());
|
||||
if (t.lastModified() < other.lastModified()) t.entry.setCol(col_lastModified, other.lastModified());
|
||||
}
|
||||
|
||||
|
||||
public void min(indexRWIEntry other) {
|
||||
min(this, other);
|
||||
}
|
||||
|
||||
public void max(indexRWIEntry other) {
|
||||
max(this, other);
|
||||
}
|
||||
|
||||
static void normalize(indexRWIEntryNew t, indexRWIEntry min, indexRWIEntry max) {
|
||||
assert (t.urlHash().length() == 12) : "turlhash = " + t.urlHash();
|
||||
assert (min.urlHash().length() == 12) : "minurlhash = " + min.urlHash();
|
||||
assert (max.urlHash().length() == 12) : "maxurlhash = " + max.urlHash();
|
||||
if (1 + max.worddistance() - min.worddistance() == 0) System.out.println("min = " + min.toPropertyForm() + "\nmax=" + max.toPropertyForm());
|
||||
//System.out.println("Normalize:\nentry = " + t.toPropertyForm(true));
|
||||
//System.out.println("min = " + min.toPropertyForm(true));
|
||||
//System.out.println("max = " + max.toPropertyForm(true));
|
||||
t.entry.setCol(col_hitcount , (t.hitcount() == 0) ? 0 : 1 + 255 * (t.hitcount() - min.hitcount() ) / (1 + max.hitcount() - min.hitcount()));
|
||||
t.entry.setCol(col_wordsInText , (t.wordcount() == 0) ? 0 : 1 + 255 * (t.wordcount() - min.wordcount() ) / (1 + max.wordcount() - min.wordcount()));
|
||||
t.entry.setCol(col_phrasesInText, (t.phrasecount() == 0) ? 0 : 1 + 255 * (t.phrasecount() - min.phrasecount() ) / (1 + max.phrasecount() - min.phrasecount()));
|
||||
t.entry.setCol(col_posintext , (t.posintext() == 0) ? 0 : 1 + 255 * (t.posintext() - min.posintext() ) / (1 + max.posintext() - min.posintext()));
|
||||
t.entry.setCol(col_posinphrase , (t.posinphrase() == 0) ? 0 : 1 + 255 * (t.posinphrase() - min.posinphrase() ) / (1 + max.posinphrase() - min.posinphrase()));
|
||||
t.entry.setCol(col_posofphrase , (t.posofphrase() == 0) ? 0 : 1 + 255 * (t.posofphrase() - min.posofphrase() ) / (1 + max.posofphrase() - min.posofphrase()));
|
||||
t.entry.setCol(col_worddistance , (t.worddistance() == 0) ? 0 : 1 + 255 * (t.worddistance() - min.worddistance()) / (1 + max.worddistance() - min.worddistance())); // FIXME: hier gibts ein division by zero, was nur sein kann wenn die Normalisierung nicht geklappt hat.
|
||||
t.entry.setCol(col_lastModified , (t.lastModified() == 0) ? 0 : 1 + 255 * (t.lastModified() - min.lastModified()) / (1 + max.lastModified() - min.lastModified()));
|
||||
//System.out.println("out = " + t.toPropertyForm(true));
|
||||
}
|
||||
|
||||
public void normalize(indexRWIEntry min, indexRWIEntry max) {
|
||||
normalize(this, min, max);
|
||||
}
|
||||
|
||||
public indexRWIEntry generateNormalized(indexRWIEntry min, indexRWIEntry max) {
|
||||
assert (this.urlHash().length() == 12) : "this.urlhash = " + this.urlHash();
|
||||
indexRWIEntryNew e = (indexRWIEntryNew) this.clone();
|
||||
e.normalize(min, max);
|
||||
return e;
|
||||
}
|
||||
|
||||
public boolean isNewer(indexRWIEntry other) {
|
||||
if (other == null) return true;
|
||||
if (this.lastModified() > other.lastModified()) return true;
|
||||
if (this.lastModified() == other.lastModified()) {
|
||||
if (this.quality() > other.quality()) return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
public boolean isOlder(indexRWIEntry other) {
|
||||
if (other == null) return false;
|
||||
if (this.lastModified() < other.lastModified()) return true;
|
||||
if (this.lastModified() == other.lastModified()) {
|
||||
if (this.quality() < other.quality()) return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
}
|
|
@ -1,6 +1,6 @@
|
|||
// indexURLEntryNew.java
|
||||
// indexRWIEntryOld.java
|
||||
// (C) 2006 by Michael Peter Christen; mc@anomic.de, Frankfurt a. M., Germany
|
||||
// first published 21.07.2006 on http://www.anomic.de
|
||||
// first published 17.11.2006 on http://www.anomic.de
|
||||
//
|
||||
// This is a part of YaCy, a peer-to-peer based web search engine
|
||||
//
|
||||
|
@ -105,7 +105,7 @@ public class indexRWIEntryOld implements Cloneable, indexRWIEntry {
|
|||
this.entry.setCol(col_lastModified, lastmodified);
|
||||
this.entry.setCol(col_hitcount, hitcount);
|
||||
this.entry.setCol(col_language, language, null);
|
||||
this.entry.setCol(col_doctype, (byte) doctype);
|
||||
this.entry.setCol(col_doctype, new byte[]{(byte) doctype});
|
||||
this.entry.setCol(col_localflag, (byte) ((local) ? plasmaURL.LT_LOCAL : plasmaURL.LT_GLOBAL));
|
||||
this.entry.setCol(col_posintext, posintext);
|
||||
this.entry.setCol(col_posinphrase, posinphrase);
|
||||
|
@ -122,7 +122,7 @@ public class indexRWIEntryOld implements Cloneable, indexRWIEntry {
|
|||
}
|
||||
|
||||
public indexRWIEntryOld(String external) {
|
||||
this.entry = urlEntryRow.newEntry(external);
|
||||
this.entry = urlEntryRow.newEntry(external, false);
|
||||
}
|
||||
|
||||
public indexRWIEntryOld(byte[] row) {
|
||||
|
@ -140,8 +140,8 @@ public class indexRWIEntryOld implements Cloneable, indexRWIEntry {
|
|||
return new indexRWIEntryOld(b);
|
||||
}
|
||||
|
||||
public String toPropertyForm(boolean displayFormat) {
|
||||
return entry.toPropertyForm(true, displayFormat, displayFormat);
|
||||
public String toPropertyForm() {
|
||||
return entry.toPropertyForm(true, false, false);
|
||||
}
|
||||
|
||||
public Entry toKelondroEntry() {
|
||||
|
@ -155,6 +155,10 @@ public class indexRWIEntryOld implements Cloneable, indexRWIEntry {
|
|||
public int quality() {
|
||||
return (int) this.entry.getColLong(col_quality);
|
||||
}
|
||||
|
||||
public char doctype() {
|
||||
return (char) this.entry.getColByte(col_doctype);
|
||||
}
|
||||
|
||||
public int virtualAge() {
|
||||
return plasmaWordIndex.microDateDays(lastModified());
|
||||
|
@ -255,7 +259,7 @@ public class indexRWIEntryOld implements Cloneable, indexRWIEntry {
|
|||
assert (t.urlHash().length() == 12) : "turlhash = " + t.urlHash();
|
||||
assert (min.urlHash().length() == 12) : "minurlhash = " + min.urlHash();
|
||||
assert (max.urlHash().length() == 12) : "maxurlhash = " + max.urlHash();
|
||||
if (1 + max.worddistance() - min.worddistance() == 0) System.out.println("min = " + min.toPropertyForm(true) + "\nmax=" + max.toPropertyForm(true));
|
||||
if (1 + max.worddistance() - min.worddistance() == 0) System.out.println("min = " + min.toPropertyForm() + "\nmax=" + max.toPropertyForm());
|
||||
//System.out.println("Normalize:\nentry = " + t.toPropertyForm(true));
|
||||
//System.out.println("min = " + min.toPropertyForm(true));
|
||||
//System.out.println("max = " + max.toPropertyForm(true));
|
||||
|
|
|
@ -180,7 +180,13 @@ public class indexURLEntryNew implements indexURLEntry {
|
|||
this.entry.setCol(col_lvideo, Integer.parseInt(prop.getProperty("lvideo", "0")));
|
||||
this.entry.setCol(col_lapp, Integer.parseInt(prop.getProperty("lapp", "0")));
|
||||
this.snippet = crypt.simpleDecode(prop.getProperty("snippet", ""), null);
|
||||
this.word = (prop.containsKey("word")) ? new indexRWIEntryOld(kelondroBase64Order.enhancedCoder.decodeString(prop.getProperty("word", ""))) : null;
|
||||
this.word = null;
|
||||
if (prop.containsKey("word")) {
|
||||
this.word = new indexRWIEntryOld(kelondroBase64Order.enhancedCoder.decodeString(prop.getProperty("word", "")));
|
||||
}
|
||||
if (prop.containsKey("wi")) {
|
||||
this.word = new indexRWIEntryNew(kelondroBase64Order.enhancedCoder.decodeString(prop.getProperty("wi", "")));
|
||||
}
|
||||
}
|
||||
|
||||
private StringBuffer corePropList() {
|
||||
|
@ -213,7 +219,8 @@ public class indexURLEntryNew implements indexURLEntry {
|
|||
|
||||
if (this.word != null) {
|
||||
// append also word properties
|
||||
s.append(",word=").append(kelondroBase64Order.enhancedCoder.encodeString(word.toPropertyForm(false)));
|
||||
if (this.word instanceof indexRWIEntryOld) s.append(",word=").append(kelondroBase64Order.enhancedCoder.encodeString(word.toPropertyForm()));
|
||||
if (this.word instanceof indexRWIEntryNew) s.append(",wi=").append(kelondroBase64Order.enhancedCoder.encodeString(word.toPropertyForm()));
|
||||
}
|
||||
return s;
|
||||
|
||||
|
|
|
@ -297,7 +297,8 @@ public class indexURLEntryOld implements indexURLEntry {
|
|||
|
||||
if (this.word != null) {
|
||||
// append also word properties
|
||||
corePropStr.append(",word=").append(kelondroBase64Order.enhancedCoder.encodeString(word.toPropertyForm(false)));
|
||||
if (this.word instanceof indexRWIEntryOld) corePropStr.append(",word=").append(kelondroBase64Order.enhancedCoder.encodeString(word.toPropertyForm()));
|
||||
if (this.word instanceof indexRWIEntryNew) corePropStr.append(",wi=").append(kelondroBase64Order.enhancedCoder.encodeString(word.toPropertyForm()));
|
||||
}
|
||||
return corePropStr;
|
||||
|
||||
|
|
|
@ -143,9 +143,9 @@ public class kelondroRow {
|
|||
return new Entry(cells);
|
||||
}
|
||||
|
||||
public Entry newEntry(String external) {
|
||||
public Entry newEntry(String external, boolean decimalCardinal) {
|
||||
if (external == null) return null;
|
||||
return new Entry(external);
|
||||
return new Entry(external, decimalCardinal);
|
||||
}
|
||||
|
||||
public class Entry implements Comparable {
|
||||
|
@ -188,7 +188,7 @@ public class kelondroRow {
|
|||
}
|
||||
}
|
||||
|
||||
public Entry(String external) {
|
||||
public Entry(String external, boolean decimalCardinal) {
|
||||
// parse external form
|
||||
if (external.charAt(0) == '{') external = external.substring(1, external.length() - 1);
|
||||
String[] elts = external.split(",");
|
||||
|
@ -202,8 +202,17 @@ public class kelondroRow {
|
|||
nick = elts[i].substring(0, p).trim();
|
||||
if (p + 1 == elts[i].length())
|
||||
setCol(nick, null);
|
||||
else
|
||||
setCol(nick, elts[i].substring(p + 1).trim().getBytes());
|
||||
else {
|
||||
if ((decimalCardinal) && (row[i].celltype() == kelondroColumn.celltype_cardinal)) {
|
||||
try {
|
||||
setCol(nick, Long.parseLong(elts[i].substring(p + 1).trim()));
|
||||
} catch (NumberFormatException e) {
|
||||
setCol(nick, 0);
|
||||
}
|
||||
} else {
|
||||
setCol(nick, elts[i].substring(p + 1).trim().getBytes());
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -416,7 +425,7 @@ public class kelondroRow {
|
|||
for (int i = 0; i < row.length; i++) {
|
||||
bb.append((longname) ? row[i].description() : row[i].nickname());
|
||||
bb.append('=');
|
||||
if ((row[i].celltype() == kelondroColumn.celltype_cardinal) && (decimalCardinal))
|
||||
if ((decimalCardinal) && (row[i].celltype() == kelondroColumn.celltype_cardinal))
|
||||
bb.append(Long.toString(getColLong(i)));
|
||||
else
|
||||
bb.append(rowinstance, colstart[i], row[i].cellwidth());
|
||||
|
|
|
@ -1172,12 +1172,12 @@ public class kelondroTree extends kelondroRecords implements kelondroIndex {
|
|||
if (hl > hr) return hl + 1;
|
||||
return hr + 1;
|
||||
}
|
||||
|
||||
/*
|
||||
public String np(Object n) {
|
||||
if (n == null) return "NULL";
|
||||
return n.toString();
|
||||
}
|
||||
|
||||
*/
|
||||
public void print() throws IOException {
|
||||
super.print(false);
|
||||
int height = height();
|
||||
|
|
|
@ -2140,7 +2140,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
|
|||
prop.put("type_results_" + i + "_size", Long.toString(urlentry.size()));
|
||||
prop.put("type_results_" + i + "_words", URLEncoder.encode(query.queryWords.toString(),"UTF-8"));
|
||||
prop.put("type_results_" + i + "_former", formerSearch);
|
||||
prop.put("type_results_" + i + "_rankingprops", urlentry.word().toPropertyForm(true) + ", domLengthEstimated=" + plasmaURL.domLengthEstimation(urlhash) +
|
||||
prop.put("type_results_" + i + "_rankingprops", urlentry.word().toPropertyForm() + ", domLengthEstimated=" + plasmaURL.domLengthEstimation(urlhash) +
|
||||
((plasmaURL.probablyRootURL(urlhash)) ? ", probablyRootURL" : "") +
|
||||
(((wordURL = plasmaURL.probablyWordURL(urlhash, query.words(""))) != null) ? ", probablyWordURL=" + wordURL.toNormalform() : ""));
|
||||
// adding snippet if available
|
||||
|
|
|
@ -1057,7 +1057,7 @@ public final class yacyClient {
|
|||
while (eenum.hasNext()) {
|
||||
entry = (indexRWIEntry) eenum.next();
|
||||
entrypost.append(indexes[i].getWordHash())
|
||||
.append(entry.toPropertyForm(false))
|
||||
.append(entry.toPropertyForm())
|
||||
.append(serverCore.crlfString);
|
||||
indexcount++;
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue
Block a user