- fixed some bugs in ranking computation

- introduced generalized method to organize ranked results (2 new classes)
- added a post-ranking step after snippet-fetch (previously, fetched results were only listed unranked) using the new ranking data structures
- fixed some missing data fields in RWI ranking attributes and corrected the hand-over between data structures

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@4498 6c8d7289-2bf4-0310-a012-ef5d649a1542
This commit is contained in:
orbiter 2008-02-21 10:06:57 +00:00
parent f4c73d8c68
commit 727feb4358
10 changed files with 542 additions and 252 deletions

View File

@ -38,6 +38,7 @@ import de.anomic.http.httpHeader;
import de.anomic.index.indexContainer;
import de.anomic.kelondro.kelondroBase64Order;
import de.anomic.kelondro.kelondroBitfield;
import de.anomic.kelondro.kelondroSortStack;
import de.anomic.net.natLib;
import de.anomic.plasma.plasmaProfiling;
import de.anomic.plasma.plasmaSearchEvent;
@ -148,7 +149,7 @@ public final class search {
int indexabstractContainercount = 0;
int joincount = 0;
plasmaSearchQuery theQuery = null;
ArrayList<ResultEntry> accu = null;
ArrayList<kelondroSortStack<ResultEntry>.stackElement> accu = null;
plasmaSearchEvent theSearch = null;
if ((query.length() == 0) && (abstractSet != null)) {
// this is _not_ a normal search, only a request for index abstracts
@ -258,10 +259,10 @@ public final class search {
long timer = System.currentTimeMillis();
StringBuffer links = new StringBuffer();
String resource = null;
plasmaSearchEvent.ResultEntry entry;
kelondroSortStack<plasmaSearchEvent.ResultEntry>.stackElement entry;
for (int i = 0; i < accu.size(); i++) {
entry = (plasmaSearchEvent.ResultEntry) accu.get(i);
resource = entry.resource();
entry = accu.get(i);
resource = entry.element.resource();
if (resource != null) {
links.append("resource").append(i).append('=').append(resource).append(serverCore.CRLF_STRING);
}

View File

@ -86,4 +86,5 @@ public interface indexRWIEntry {
public boolean isOlder(indexRWIEntry other);
public int hashCode();
}

View File

@ -31,16 +31,14 @@ import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;
import de.anomic.kelondro.kelondroAbstractOrder;
import de.anomic.kelondro.kelondroBitfield;
import de.anomic.kelondro.kelondroMScoreCluster;
import de.anomic.kelondro.kelondroOrder;
import de.anomic.plasma.plasmaCondenser;
import de.anomic.plasma.plasmaSearchRankingProcess;
import de.anomic.plasma.plasmaSearchRankingProfile;
import de.anomic.yacy.yacyURL;
public class indexRWIEntryOrder extends kelondroAbstractOrder<indexRWIVarEntry> implements kelondroOrder<indexRWIVarEntry> {
public class indexRWIEntryOrder {
private indexRWIVarEntry min, max;
private plasmaSearchRankingProfile ranking;
private kelondroMScoreCluster<String> doms; // collected for "authority" heuristic
@ -69,8 +67,8 @@ public class indexRWIEntryOrder extends kelondroAbstractOrder<indexRWIVarEntry>
mmf0.start(); // fork here
minmaxfinder mmf1 = new minmaxfinder(container, middle, container.size());
mmf1.run(); // execute other fork in this thread
if (this.min == null) this.min = mmf1.entryMin; else indexRWIVarEntry.min(this.min, mmf1.entryMin);
if (this.max == null) this.max = mmf1.entryMax; else indexRWIVarEntry.max(this.max, mmf1.entryMax);
if (this.min == null) this.min = mmf1.entryMin.clone(); else this.min.min(mmf1.entryMin);
if (this.max == null) this.max = mmf1.entryMax.clone(); else this.max.max(mmf1.entryMax);
Map.Entry<String, Integer> entry;
Iterator<Map.Entry<String, Integer>> di = mmf1.domcount().entrySet().iterator();
while (di.hasNext()) {
@ -78,8 +76,8 @@ public class indexRWIEntryOrder extends kelondroAbstractOrder<indexRWIVarEntry>
this.doms.addScore(entry.getKey(), ((Integer) entry.getValue()).intValue());
}
try {mmf0.join();} catch (InterruptedException e) {} // wait for fork thread to finish
if (this.min == null) this.min = mmf0.entryMin; else indexRWIVarEntry.min(this.min, mmf0.entryMin);
if (this.max == null) this.max = mmf0.entryMax; else indexRWIVarEntry.max(this.max, mmf0.entryMax);
if (this.min == null) this.min = mmf0.entryMin.clone(); else this.min.min(mmf0.entryMin);
if (this.max == null) this.max = mmf0.entryMax.clone(); else this.max.max(mmf0.entryMax);
di = mmf0.domcount().entrySet().iterator();
while (di.hasNext()) {
entry = di.next();
@ -93,8 +91,8 @@ public class indexRWIEntryOrder extends kelondroAbstractOrder<indexRWIVarEntry>
// run minmax in one thread
minmaxfinder mmf = new minmaxfinder(container, 0, container.size());
mmf.run(); // execute without multi-threading
if (this.min == null) this.min = mmf.entryMin; else indexRWIVarEntry.min(this.min, mmf.entryMin);
if (this.max == null) this.max = mmf.entryMax; else indexRWIVarEntry.max(this.max, mmf.entryMax);
if (this.min == null) this.min = mmf.entryMin.clone(); else this.min.min(mmf.entryMin);
if (this.max == null) this.max = mmf.entryMax.clone(); else this.max.max(mmf.entryMax);
Map.Entry<String, Integer> entry;
Iterator<Map.Entry<String, Integer>> di = mmf.domcount().entrySet().iterator();
while (di.hasNext()) {
@ -109,44 +107,34 @@ public class indexRWIEntryOrder extends kelondroAbstractOrder<indexRWIVarEntry>
return result;
}
public kelondroOrder<indexRWIVarEntry> clone() {
return null;
}
public int authority(String urlHash) {
return (doms.getScore(urlHash.substring(6)) << 8) / (1 + this.maxdomcount);
}
public long cardinal(byte[] key) {
return cardinal(new indexRWIVarEntry(new indexRWIRowEntry(key)));
}
public long cardinal(indexRWIRowEntry t) {
return cardinal(new indexRWIVarEntry(t));
}
public long cardinal(indexRWIVarEntry t) {
//return Long.MAX_VALUE - preRanking(ranking, iEntry, this.entryMin, this.entryMax, this.searchWords);
// the normalizedEntry must be a normalized indexEntry
kelondroBitfield flags = t.flags();
long tf = ((max.termFrequency() == min.termFrequency()) ? 0 : (((int)(((t.termFrequency()-min.termFrequency())*256.0)/(max.termFrequency() - min.termFrequency())))) << ranking.coeff_termfrequency);
//System.out.println("tf(" + t.urlHash + ") = " + Math.floor(1000 * t.termFrequency()) + ", min = " + Math.floor(1000 * min.termFrequency()) + ", max = " + Math.floor(1000 * max.termFrequency()) + ", tf-normed = " + tf);
long r =
((256 - yacyURL.domLengthNormalized(t.urlHash())) << ranking.coeff_domlength)
((256 - yacyURL.domLengthNormalized(t.urlHash())) << ranking.coeff_domlength)
+ ((256 - (plasmaSearchRankingProcess.ybr(t.urlHash()) << 4)) << ranking.coeff_ybr)
+ ((t.urlcomps() == 0) ? 0 : ((256 - (((t.urlcomps() - min.urlcomps() ) << 8) / (1 + max.urlcomps() - min.urlcomps()) )) << ranking.coeff_urlcomps))
+ ((t.urllength() == 0) ? 0 : ((256 - (((t.urllength() - min.urllength() ) << 8) / (1 + max.urllength() - min.urllength()) )) << ranking.coeff_urllength))
+ ((t.posintext() == 0) ? 0 : ((256 - (((t.posintext() - min.posintext() ) << 8) / (1 + max.posintext() - min.posintext()) )) << ranking.coeff_posintext))
+ ((t.posofphrase() == 0) ? 0 : ((256 - (((t.posofphrase() - min.posofphrase() ) << 8) / (1 + max.posofphrase() - min.posofphrase()) )) << ranking.coeff_posofphrase))
+ ((t.posinphrase() == 0) ? 0 : ((256 - (((t.posinphrase() - min.posinphrase() ) << 8) / (1 + max.posinphrase() - min.posinphrase()) )) << ranking.coeff_posinphrase))
+ ((256 - (((t.worddistance() - min.worddistance() ) << 8) / (1 + max.worddistance() - min.worddistance()) )) << ranking.coeff_worddistance)
+ ( (((t.virtualAge() - min.virtualAge() ) << 8) / (1 + max.virtualAge() - min.virtualAge()) ) << ranking.coeff_date)
+ ( (((t.wordsintitle() - min.wordsintitle() ) << 8) / (1 + max.wordsintitle() - min.wordsintitle()) ) << ranking.coeff_wordsintitle)
+ ( (((t.wordsintext() - min.wordsintext() ) << 8) / (1 + max.wordsintext() - min.wordsintext()) ) << ranking.coeff_wordsintext)
+ ( (((t.phrasesintext()- min.phrasesintext() ) << 8) / (1 + max.phrasesintext()- min.phrasesintext()) ) << ranking.coeff_phrasesintext)
+ ( (((t.llocal() - min.llocal() ) << 8) / (1 + max.llocal() - min.llocal()) ) << ranking.coeff_llocal)
+ ( (((t.lother() - min.lother() ) << 8) / (1 + max.lother() - min.lother()) ) << ranking.coeff_lother)
+ ( (((t.hitcount() - min.hitcount() ) << 8) / (1 + max.hitcount() - min.hitcount()) ) << ranking.coeff_hitcount)
+ (((int)((((t.termFrequency()- min.termFrequency() )*256.0)/ (1 + max.termFrequency()- min.termFrequency()))))<< ranking.coeff_termfrequency)
+ ( authority(t.urlHash()) << ranking.coeff_authority)
+ ((max.urlcomps() == min.urlcomps() ) ? 0 : (256 - (((t.urlcomps() - min.urlcomps() ) << 8) / (max.urlcomps() - min.urlcomps()) )) << ranking.coeff_urlcomps)
+ ((max.urllength() == min.urllength() ) ? 0 : (256 - (((t.urllength() - min.urllength() ) << 8) / (max.urllength() - min.urllength()) )) << ranking.coeff_urllength)
+ ((max.posintext() == min.posintext() ) ? 0 : (256 - (((t.posintext() - min.posintext() ) << 8) / (max.posintext() - min.posintext()) )) << ranking.coeff_posintext)
+ ((max.posofphrase() == min.posofphrase()) ? 0 : (256 - (((t.posofphrase() - min.posofphrase() ) << 8) / (max.posofphrase() - min.posofphrase()) )) << ranking.coeff_posofphrase)
+ ((max.posinphrase() == min.posinphrase()) ? 0 : (256 - (((t.posinphrase() - min.posinphrase() ) << 8) / (max.posinphrase() - min.posinphrase()) )) << ranking.coeff_posinphrase)
+ ((max.worddistance() == min.worddistance()) ? 0 : (256 - (((t.worddistance() - min.worddistance() ) << 8) / (max.worddistance() - min.worddistance()) )) << ranking.coeff_worddistance)
+ ((max.virtualAge() == min.virtualAge()) ? 0 : (((t.virtualAge() - min.virtualAge() ) << 8) / (max.virtualAge() - min.virtualAge()) ) << ranking.coeff_date)
+ ((max.wordsintitle() == min.wordsintitle()) ? 0 : (((t.wordsintitle() - min.wordsintitle() ) << 8) / (max.wordsintitle() - min.wordsintitle()) ) << ranking.coeff_wordsintitle)
+ ((max.wordsintext() == min.wordsintext()) ? 0 : (((t.wordsintext() - min.wordsintext() ) << 8) / (max.wordsintext() - min.wordsintext()) ) << ranking.coeff_wordsintext)
+ ((max.phrasesintext() == min.phrasesintext()) ? 0 : (((t.phrasesintext()- min.phrasesintext() ) << 8) / (max.phrasesintext()- min.phrasesintext()) ) << ranking.coeff_phrasesintext)
+ ((max.llocal() == min.llocal()) ? 0 : (((t.llocal() - min.llocal() ) << 8) / (max.llocal() - min.llocal()) ) << ranking.coeff_llocal)
+ ((max.lother() == min.lother()) ? 0 : (((t.lother() - min.lother() ) << 8) / (max.lother() - min.lother()) ) << ranking.coeff_lother)
+ ((max.hitcount() == min.hitcount()) ? 0 : (((t.hitcount() - min.hitcount() ) << 8) / (max.hitcount() - min.hitcount()) ) << ranking.coeff_hitcount)
+ tf
+ (authority(t.urlHash()) << ranking.coeff_authority)
+ (((flags.get(indexRWIEntry.flag_app_dc_identifier)) ? 255 << ranking.coeff_appurl : 0))
+ (((flags.get(indexRWIEntry.flag_app_dc_title)) ? 255 << ranking.coeff_app_dc_title : 0))
+ (((flags.get(indexRWIEntry.flag_app_dc_creator)) ? 255 << ranking.coeff_app_dc_creator : 0))
@ -163,20 +151,6 @@ public class indexRWIEntryOrder extends kelondroAbstractOrder<indexRWIVarEntry>
return Long.MAX_VALUE - r; // returns a reversed number: the lower the number the better the ranking. This is used for simple sorting with a TreeMap
}
public int compare(indexRWIVarEntry a, indexRWIVarEntry b) {
long ca = cardinal(a);
long cb = cardinal(b);
return (ca > cb) ? 1 : (ca < cb) ? -1 : 0;
}
public String signature() {
return "rx";
}
public boolean wellformed(indexRWIVarEntry a) {
return true;
}
public static class minmaxfinder extends Thread {
@ -208,8 +182,8 @@ public class indexRWIEntryOrder extends kelondroAbstractOrder<indexRWIVarEntry>
iEntry = new indexRWIVarEntry(new indexRWIRowEntry(container.get(p++)));
this.decodedEntries.add(iEntry);
// find min/max
if (this.entryMin == null) this.entryMin = new indexRWIVarEntry(iEntry); else indexRWIVarEntry.min(this.entryMin, iEntry);
if (this.entryMax == null) this.entryMax = new indexRWIVarEntry(iEntry); else indexRWIVarEntry.max(this.entryMax, iEntry);
if (this.entryMin == null) this.entryMin = iEntry.clone(); else this.entryMin.min(iEntry);
if (this.entryMax == null) this.entryMax = iEntry.clone(); else this.entryMax.max(iEntry);
// update domcount
dom = iEntry.urlHash().substring(6);
count = (Integer) doms.get(dom);

View File

@ -269,4 +269,7 @@ public final class indexRWIRowEntry implements indexRWIEntry {
return false;
}
// Identity is derived from the URL hash, so entries referring to the same URL
// collide in hash-based collections (used as the double-check handle in kelondroSortStack).
public int hashCode() {
return this.urlHash().hashCode();
}
}

View File

@ -27,6 +27,7 @@
package de.anomic.index;
import de.anomic.kelondro.kelondroBitfield;
import de.anomic.plasma.plasmaWordIndex;
public class indexRWIVarEntry implements indexRWIEntry {
@ -40,7 +41,52 @@ public class indexRWIVarEntry implements indexRWIEntry {
worddistance, wordsintext, wordsintitle;
public double termFrequency;
public indexRWIVarEntry(indexRWIEntry e) {
/**
 * Full field constructor: builds a variable (mutable) RWI entry from explicit attribute values.
 * Derived fields (virtualAge, freshUntil) are computed from the modification/update timestamps.
 *
 * Note: the parameter order here is relied upon by clone(); keep them in sync.
 */
public indexRWIVarEntry(String urlHash,
        int urlLength,     // byte-length of complete URL
        int urlComps,      // number of path components
        int titleLength,   // length of description/length (longer are better?)
        int hitcount,      // how often appears this word in the text
        int wordcount,     // total number of words
        int phrasecount,   // total number of phrases
        int posintext,     // position of word in all words
        int posinphrase,   // position of word in its phrase
        int posofphrase,   // number of the phrase where word appears
        long lastmodified, // last-modified time of the document where word appears
        long updatetime,   // update time; this is needed to compute a TTL for the word, so it can be removed easily if the TTL is short
        String language,   // (guessed) language of document
        char doctype,      // type of document
        int outlinksSame,  // outlinks to same domain
        int outlinksOther, // outlinks to other domain
        kelondroBitfield flags, // attributes to the url and to the word according the url
        int worddistance,
        double termfrequency
        ) {
    // fall back to "uk" when no valid two-letter language code is given
    // NOTE(review): presumably "uk" is meant as an 'unknown' placeholder — confirm
    if ((language == null) || (language.length() != 2)) language = "uk";
    int mddlm = plasmaWordIndex.microDateDays(lastmodified);
    int mddct = plasmaWordIndex.microDateDays(updatetime);
    this.flags = flags;
    // freshness window: extrapolate the gap between modification and update time
    this.freshUntil = Math.max(0, mddlm + (mddct - mddlm) * 2);
    this.lastModified = lastmodified;
    this.language = language;
    this.urlHash = urlHash;
    this.type = doctype;
    this.hitcount = hitcount;
    this.llocal = outlinksSame;
    this.lother = outlinksOther;
    // fixed: was erroneously assigned outlinksOther (copy-paste from the line above),
    // which lost the phrase count and corrupted the phrasesintext ranking attribute
    this.phrasesintext = phrasecount;
    this.posintext = posintext;
    this.posinphrase = posinphrase;
    this.posofphrase = posofphrase;
    this.urlcomps = urlComps;
    this.urllength = urlLength;
    this.virtualAge = mddlm;
    this.worddistance = worddistance;
    this.wordsintext = wordcount;
    this.wordsintitle = titleLength;
    this.termFrequency = termfrequency;
}
public indexRWIVarEntry(indexRWIRowEntry e) {
this.flags = e.flags();
this.freshUntil = e.freshUntil();
this.lastModified = e.lastModified();
@ -60,18 +106,43 @@ public class indexRWIVarEntry implements indexRWIEntry {
this.worddistance = 0;
this.wordsintext = e.wordsintext();
this.wordsintitle = e.wordsintitle();
this.termFrequency = 0.0;
this.termFrequency = e.termFrequency();
}
// Creates a copy of this entry using the full field constructor.
// NOTE(review): the argument order must match the field constructor exactly:
// (urlHash, urlLength, urlComps, titleLength, hitcount, wordcount, phrasecount,
//  posintext, posinphrase, posofphrase, lastmodified, updatetime, language, doctype,
//  outlinksSame, outlinksOther, flags, worddistance, termfrequency)
public indexRWIVarEntry clone() {
indexRWIVarEntry c = new indexRWIVarEntry(
this.urlHash,
this.urllength, // urlLength
this.urlcomps, // urlComps
this.wordsintitle, // passed as titleLength; the constructor stores it back into wordsintitle
this.hitcount,
this.wordsintext, // wordcount
this.phrasesintext, // phrasecount slot — verify the constructor preserves this field on round-trip
this.posintext,
this.posinphrase,
this.posofphrase,
this.lastModified,
System.currentTimeMillis(), // updatetime is 'now', so freshUntil is recomputed and may differ from the original's — TODO confirm intended
this.language,
this.type, // doctype
this.llocal, // outlinksSame
this.lother, // outlinksOther
this.flags,
this.worddistance,
this.termFrequency);
return c;
}
public void join(indexRWIVarEntry oe) {
// combine the distance
this.worddistance = this.worddistance() + oe.worddistance() + Math.abs(this.posintext() - oe.posintext());
this.posintext = Math.min(this.posintext(), oe.posintext());
this.posinphrase = (this.posofphrase() == oe.posofphrase()) ? Math.min(this.posinphrase(), oe.posinphrase()) : 0;
this.posofphrase = Math.min(this.posofphrase(), oe.posofphrase());
this.worddistance = this.worddistance + oe.worddistance + Math.abs(this.posintext - oe.posintext);
this.posintext = Math.min(this.posintext, oe.posintext);
this.posinphrase = (this.posofphrase == oe.posofphrase) ? Math.min(this.posinphrase, oe.posinphrase) : 0;
this.posofphrase = Math.min(this.posofphrase, oe.posofphrase);
// combine term frequency
this.wordsintext = this.wordsintext() + oe.wordsintext();
this.wordsintext = this.wordsintext + oe.wordsintext;
this.termFrequency = this.termFrequency + oe.termFrequency;
}
public kelondroBitfield flags() {
@ -191,66 +262,65 @@ public class indexRWIVarEntry implements indexRWIEntry {
return this.termFrequency;
}
public static final void min(indexRWIVarEntry t, indexRWIVarEntry other) {
public final void min(indexRWIVarEntry other) {
int v;
long w;
double d;
if (t.hitcount() > (v = other.hitcount())) t.hitcount = v;
if (t.llocal() > (v = other.llocal())) t.llocal = v;
if (t.lother() > (v = other.lother())) t.lother = v;
if (t.virtualAge() > (v = other.virtualAge())) t.virtualAge = v;
if (t.wordsintext() > (v = other.wordsintext())) t.wordsintext = v;
if (t.phrasesintext() > (v = other.phrasesintext())) t.phrasesintext = v;
if (t.posintext() > (v = other.posintext())) t.posintext = v;
if (t.posinphrase() > (v = other.posinphrase())) t.posinphrase = v;
if (t.posofphrase() > (v = other.posofphrase())) t.posofphrase = v;
if (t.worddistance() > (v = other.worddistance())) t.worddistance = v;
if (t.lastModified() > (w = other.lastModified())) t.lastModified = w;
if (t.freshUntil() > (w = other.freshUntil())) t.freshUntil = w;
if (t.urllength() > (v = other.urllength())) t.urllength = v;
if (t.urlcomps() > (v = other.urlcomps())) t.urlcomps = v;
if (t.wordsintitle() > (v = other.wordsintitle())) t.wordsintitle = v;
if (t.termFrequency > (d = other.termFrequency())) t.termFrequency = d;
if (this.hitcount > (v = other.hitcount)) this.hitcount = v;
if (this.llocal > (v = other.llocal)) this.llocal = v;
if (this.lother > (v = other.lother)) this.lother = v;
if (this.virtualAge > (v = other.virtualAge)) this.virtualAge = v;
if (this.wordsintext > (v = other.wordsintext)) this.wordsintext = v;
if (this.phrasesintext > (v = other.phrasesintext)) this.phrasesintext = v;
if (this.posintext > (v = other.posintext)) this.posintext = v;
if (this.posinphrase > (v = other.posinphrase)) this.posinphrase = v;
if (this.posofphrase > (v = other.posofphrase)) this.posofphrase = v;
if (this.worddistance > (v = other.worddistance)) this.worddistance = v;
if (this.lastModified > (w = other.lastModified)) this.lastModified = w;
if (this.freshUntil > (w = other.freshUntil)) this.freshUntil = w;
if (this.urllength > (v = other.urllength)) this.urllength = v;
if (this.urlcomps > (v = other.urlcomps)) this.urlcomps = v;
if (this.wordsintitle > (v = other.wordsintitle)) this.wordsintitle = v;
if (this.termFrequency > (d = other.termFrequency)) this.termFrequency = d;
}
public static final void max(indexRWIVarEntry t, indexRWIVarEntry other) {
public final void max(indexRWIVarEntry other) {
int v;
long w;
double d;
if (t.hitcount() < (v = other.hitcount())) t.hitcount = v;
if (t.llocal() < (v = other.llocal())) t.llocal = v;
if (t.lother() < (v = other.lother())) t.lother = v;
if (t.virtualAge() < (v = other.virtualAge())) t.virtualAge = v;
if (t.wordsintext() < (v = other.wordsintext())) t.wordsintext = v;
if (t.phrasesintext() < (v = other.phrasesintext())) t.phrasesintext = v;
if (t.posintext() < (v = other.posintext())) t.posintext = v;
if (t.posinphrase() < (v = other.posinphrase())) t.posinphrase = v;
if (t.posofphrase() < (v = other.posofphrase())) t.posofphrase = v;
if (t.worddistance() < (v = other.worddistance())) t.worddistance = v;
if (t.lastModified() < (w = other.lastModified())) t.lastModified = w;
if (t.freshUntil() < (w = other.freshUntil())) t.freshUntil = w;
if (t.urllength() < (v = other.urllength())) t.urllength = v;
if (t.urlcomps() < (v = other.urlcomps())) t.urlcomps = v;
if (t.wordsintitle() < (v = other.wordsintitle())) t.wordsintitle = v;
if (t.termFrequency < (d = other.termFrequency())) t.termFrequency = d;
if (this.hitcount < (v = other.hitcount)) this.hitcount = v;
if (this.llocal < (v = other.llocal)) this.llocal = v;
if (this.lother < (v = other.lother)) this.lother = v;
if (this.virtualAge < (v = other.virtualAge)) this.virtualAge = v;
if (this.wordsintext < (v = other.wordsintext)) this.wordsintext = v;
if (this.phrasesintext < (v = other.phrasesintext)) this.phrasesintext = v;
if (this.posintext < (v = other.posintext)) this.posintext = v;
if (this.posinphrase < (v = other.posinphrase)) this.posinphrase = v;
if (this.posofphrase < (v = other.posofphrase)) this.posofphrase = v;
if (this.worddistance < (v = other.worddistance)) this.worddistance = v;
if (this.lastModified < (w = other.lastModified)) this.lastModified = w;
if (this.freshUntil < (w = other.freshUntil)) this.freshUntil = w;
if (this.urllength < (v = other.urllength)) this.urllength = v;
if (this.urlcomps < (v = other.urlcomps)) this.urlcomps = v;
if (this.wordsintitle < (v = other.wordsintitle)) this.wordsintitle = v;
if (this.termFrequency < (d = other.termFrequency)) this.termFrequency = d;
}
public static void join(indexRWIVarEntry ie1, indexRWIEntry ie2) {
// returns a modified entry of the first argument
public void join(indexRWIEntry oe) {
// joins two entries into one entry
// combine the distance
ie1.worddistance = ie1.worddistance + ((ie2 instanceof indexRWIVarEntry) ? ((indexRWIVarEntry) ie2).worddistance() : 0) + Math.abs(ie1.posintext() - ie2.posintext());
ie1.posintext = Math.min(ie1.posintext(), ie2.posintext());
ie1.posinphrase = (ie1.posofphrase() == ie2.posofphrase()) ? Math.min(ie1.posinphrase(), ie2.posinphrase()) : 0;
ie1.posofphrase = Math.min(ie1.posofphrase(), ie2.posofphrase());
this.worddistance = this.worddistance + ((oe instanceof indexRWIVarEntry) ? ((indexRWIVarEntry) oe).worddistance : 0) + Math.abs(this.posintext() - oe.posintext());
this.posintext = Math.min(this.posintext, oe.posintext());
this.posinphrase = (this.posofphrase == oe.posofphrase()) ? Math.min(this.posinphrase, oe.posinphrase()) : 0;
this.posofphrase = Math.min(this.posofphrase, oe.posofphrase());
// combine term frequency
ie1.termFrequency = ie1.termFrequency + ie2.termFrequency();
ie1.wordsintext = ie1.wordsintext() + ie2.wordsintext();
}
public void join(indexRWIEntry oe) {
join(this, oe);
this.termFrequency = this.termFrequency + oe.termFrequency();
this.wordsintext = this.wordsintext + oe.wordsintext();
}
// Identity is derived from the URL hash so that var-entries for the same URL collide
// in hash-based collections; used as the double-check handle in kelondroSortStack.
public int hashCode() {
return this.urlHash.hashCode();
}
}

View File

@ -0,0 +1,147 @@
// kelondroSortStack.java
// (C) 2008 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany
// first published 20.02.2008 on http://yacy.net
//
// This is a part of YaCy, a peer-to-peer based web search engine
//
// $LastChangedDate: 2006-04-02 22:40:07 +0200 (So, 02 Apr 2006) $
// $LastChangedRevision: 1986 $
// $LastChangedBy: orbiter $
//
// LICENSE
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
package de.anomic.kelondro;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Map;
import java.util.TreeMap;
/**
 * A stack where elements 'float' on top according to a weight value: the element with the
 * smallest weight is on top. Objects pushed on the stack must implement hashCode() to
 * provide a handle for a double-check (an element with a hashCode that was ever pushed
 * is never pushed again, even after it was trimmed or popped).
 * All stack-mutating methods are synchronized; not otherwise thread-safe.
 */
public class kelondroSortStack<E> {

    protected TreeMap<Long, E> onstack;  // elements currently on the stack, keyed by (de-conflicted) weight; firstKey = top
    protected HashSet<Integer> instack;  // hashCodes of every element that is or ever was on the stack (double-check)
    protected int maxsize;               // maximum number of entries; <= 0 means unlimited

    /**
     * @param maxsize the maximum number of entries in the stack; set to -1 for unlimited size
     */
    public kelondroSortStack(int maxsize) {
        this.onstack = new TreeMap<Long, E>();
        this.instack = new HashSet<Integer>();
        this.maxsize = maxsize;
    }

    /** @return the number of elements currently on the stack */
    public int size() {
        return this.onstack.size();
    }

    /** Pushes a previously popped element back, re-using its recorded weight. */
    public synchronized void push(stackElement se) {
        push(se.element, se.weight);
    }

    /**
     * Pushes an element with the given weight; smaller weights float to the top.
     * Silently ignored if an element with the same hashCode was ever pushed before.
     * If the stack exceeds maxsize, the bottom (largest-weight) elements are dropped.
     */
    public synchronized void push(E element, long weight) {
        if (exists(element)) return;
        // bump the weight until it is unique so TreeMap keys never conflict
        Long w = Long.valueOf(weight);
        while (this.onstack.containsKey(w)) w = Long.valueOf(w.longValue() + 1);
        // put the element on the stack
        this.onstack.put(w, element);
        // register it for the double-check (stays registered even if trimmed below)
        this.instack.add(element.hashCode());
        // enforce the maximum size by removing bottom elements
        if (this.maxsize <= 0) return;
        while ((this.onstack.size() > 0) && (this.onstack.size() > this.maxsize)) {
            this.onstack.remove(this.onstack.lastKey());
        }
    }

    /** @return the element currently on top of the stack (not removed), or null if empty */
    public synchronized stackElement top() {
        if (this.onstack.size() == 0) return null;
        Long w = this.onstack.firstKey();
        E element = this.onstack.get(w);
        return new stackElement(element, w.longValue());
    }

    /** @return the element currently on top of the stack, removed; null if empty */
    public synchronized stackElement pop() {
        if (this.onstack.size() == 0) return null;
        Long w = this.onstack.firstKey();
        E element = this.onstack.remove(w);
        return new stackElement(element, w.longValue());
    }

    /** Uses the element's hashCode to check whether it is or ever was on the stack. */
    public boolean exists(E element) {
        return this.instack.contains(element.hashCode());
    }

    /** Uses a hashCode to check whether a matching element is or ever was on the stack. */
    public boolean exists(int hashcode) {
        return this.instack.contains(hashcode);
    }

    /** @return the on-stack element with the given hashCode (not removed), or null */
    public stackElement get(int hashcode) {
        Iterator<Map.Entry<Long, E>> i = this.onstack.entrySet().iterator();
        Map.Entry<Long, E> entry;
        while (i.hasNext()) {
            entry = i.next();
            if (entry.getValue().hashCode() == hashcode) return new stackElement(entry.getValue(), entry.getKey().longValue());
        }
        return null;
    }

    /** Removes and returns the on-stack element with the given hashCode, or null. */
    public stackElement remove(int hashcode) {
        Iterator<Map.Entry<Long, E>> i = this.onstack.entrySet().iterator();
        Map.Entry<Long, E> entry;
        stackElement se;
        while (i.hasNext()) {
            entry = i.next();
            if (entry.getValue().hashCode() == hashcode) {
                se = new stackElement(entry.getValue(), entry.getKey().longValue());
                this.onstack.remove(se.weight); // safe: we return immediately, no further iteration
                return se;
            }
        }
        return null;
    }

    /**
     * @return true if an element with that weight would land on the bottom of the stack
     *         after inserting; trivially true for an empty stack.
     *         (fixed: lastKey() on an empty TreeMap threw NoSuchElementException)
     */
    public boolean bottom(long weight) {
        if (this.onstack.size() == 0) return true;
        return weight > this.onstack.lastKey().longValue();
    }

    /** An element paired with the weight it carried on the stack. */
    public class stackElement {
        public long weight;
        public E element;
        public stackElement(E element, long weight) {
            this.element = element;
            this.weight = weight;
        }
    }
}

View File

@ -0,0 +1,135 @@
// kelondroSortStore.java
// (C) 2008 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany
// first published 20.02.2008 on http://yacy.net
//
// This is a part of YaCy, a peer-to-peer based web search engine
//
// $LastChangedDate: 2006-04-02 22:40:07 +0200 (So, 02 Apr 2006) $
// $LastChangedRevision: 1986 $
// $LastChangedBy: orbiter $
//
// LICENSE
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
package de.anomic.kelondro;
import java.util.ArrayList;
import java.util.Iterator;
/**
 * Extends the sortStack in such a way that it adds a list where objects that had been
 * pulled from the stack with pop are stored (the 'offstack'). Provides access methods
 * to address specific elements in that list by position or hashCode.
 */
public class kelondroSortStore<E> extends kelondroSortStack<E> {

    private ArrayList<stackElement> offstack; // objects that had been on the stack but had been removed, in pop order

    public kelondroSortStore(int maxsize) {
        super(maxsize);
        this.offstack = new ArrayList<stackElement>();
    }

    /** @return the combined number of elements on the stack and in the off-stack store */
    public int size() {
        return super.onstack.size() + this.offstack.size();
    }

    /** @return the number of elements that have already been moved to the off-stack store */
    public int sizeStore() {
        return this.offstack.size();
    }

    /**
     * Pushes an element; maxsize is enforced against the COMBINED size of stack and store,
     * trimming bottom (largest-weight) elements from the on-stack part only.
     */
    public synchronized void push(E element, long weight) {
        super.push(element, weight);
        if (this.maxsize <= 0) return;
        while ((this.onstack.size() > 0) && (super.onstack.size() + this.offstack.size() > this.maxsize)) {
            this.onstack.remove(this.onstack.lastKey());
        }
    }

    /**
     * Removes and returns the element currently on top of the stack and appends it to the
     * off-stack list; equivalent to element(offstack.size()). Returns null if empty.
     */
    public synchronized stackElement pop() {
        stackElement se = super.pop();
        if (se == null) return null;
        this.offstack.add(se);
        return se;
    }

    /**
     * Returns the element at a specific position. It is either taken from the offstack,
     * or shifted over from the onstack (the offstack grows as needed). Returns null if
     * the position is beyond the total number of elements.
     */
    public synchronized stackElement element(int position) {
        if (position < this.offstack.size()) {
            return this.offstack.get(position);
        }
        if (position >= size()) return null; // we don't have that element
        while (position >= this.offstack.size()) {
            Long w = this.onstack.firstKey();
            E element = this.onstack.remove(w);
            this.offstack.add(new stackElement(element, w.longValue()));
        }
        return this.offstack.get(position);
    }

    /**
     * Returns the given number of entries in ranked order. Entries not yet present in the
     * offstack are shifted there from the onstack. If count is negative, all elements are
     * shifted and returned. The returned list is the internal list (read-only, do not modify).
     * @throws RuntimeException if count exceeds the total number of available elements
     */
    public ArrayList<stackElement> list(int count) {
        if (count < 0) {
            // shift all remaining elements to the offstack
            while (this.onstack.size() > 0) {
                Long w = this.onstack.firstKey();
                E element = this.onstack.remove(w);
                this.offstack.add(new stackElement(element, w.longValue()));
            }
            return this.offstack;
        }
        if (size() < count) throw new RuntimeException("list(" + count + ") exceeded available number of elements (" + size() + ")");
        // fixed: loop condition checked onstack.size() instead of offstack.size(), which
        // drained the onstack to empty (NoSuchElementException from firstKey()) or
        // returned fewer than count elements
        while (this.offstack.size() < count) {
            Long w = this.onstack.firstKey();
            E element = this.onstack.remove(w);
            this.offstack.add(new stackElement(element, w.longValue()));
        }
        return this.offstack;
    }

    /** @return the element with the given hashCode from stack or store (not removed), or null */
    public stackElement get(int hashcode) {
        stackElement se = super.get(hashcode);
        if (se != null) return se;
        Iterator<stackElement> j = this.offstack.iterator();
        while (j.hasNext()) {
            se = j.next();
            if (se.element.hashCode() == hashcode) return se;
        }
        return null;
    }

    /** Removes and returns the element with the given hashCode from stack or store, or null. */
    public stackElement remove(int hashcode) {
        stackElement se = super.remove(hashcode);
        if (se != null) return se;
        for (int j = 0; j < this.offstack.size(); j++) {
            se = this.offstack.get(j);
            if (se.element.hashCode() == hashcode) {
                this.offstack.remove(j);
                return se;
            }
        }
        return null;
    }
}

View File

@ -38,9 +38,12 @@ import java.util.TreeSet;
import de.anomic.index.indexContainer;
import de.anomic.index.indexRWIEntry;
import de.anomic.index.indexRWIVarEntry;
import de.anomic.index.indexURLEntry;
import de.anomic.kelondro.kelondroBitfield;
import de.anomic.kelondro.kelondroMSetTools;
import de.anomic.kelondro.kelondroSortStack;
import de.anomic.kelondro.kelondroSortStore;
import de.anomic.plasma.plasmaSnippetCache.MediaSnippet;
import de.anomic.server.serverProfiling;
import de.anomic.server.logging.serverLog;
@ -77,8 +80,7 @@ public final class plasmaSearchEvent {
public TreeMap<String, Integer> IACount;
public String IAmaxcounthash, IAneardhthash;
private resultWorker[] workerThreads;
private ArrayList<ResultEntry> resultList;
//private int resultListLock; // a pointer that shows that all elements below this pointer are fixed and may not be changed again
private kelondroSortStore<ResultEntry> result;
private HashMap<String, String> failedURLs; // a mapping from a urlhash to a fail reason string
TreeSet<String> snippetFetchWordHashes; // a set of word hashes that are used to match with the snippets
private long urlRetrievalAllTime;
@ -104,8 +106,7 @@ public final class plasmaSearchEvent {
this.snippetComputationAllTime = 0;
this.workerThreads = null;
this.localSearchThread = null;
this.resultList = new ArrayList<ResultEntry>(10); // this is the result set which is filled up with search results, enriched with snippets
//this.resultListLock = 0; // no locked elements until now
this.result = new kelondroSortStore<ResultEntry>(-1); // this is the result, enriched with snippets, ranked and ordered by ranking
this.failedURLs = new HashMap<String, String>(); // a map of urls to reason strings where a worker thread tried to work on, but failed.
// snippets do not need to match with the complete query hashes,
@ -202,7 +203,7 @@ public final class plasmaSearchEvent {
ResultEntry resultEntry;
yacyURL url;
synchronized (rankedCache) {
while ((rankedCache.size() > 0) && ((uentry = rankedCache.bestURL(true)) != null) && (resultList.size() < (query.neededResults()))) {
while ((rankedCache.size() > 0) && ((uentry = rankedCache.bestURL(true)) != null) && (result.size() < (query.neededResults()))) {
url = uentry.comp().url();
if (url == null) continue;
//System.out.println("***DEBUG*** SEARCH RESULT URL=" + url.toNormalform(false, false));
@ -213,9 +214,7 @@ public final class plasmaSearchEvent {
snippetComputationAllTime += resultEntry.snippetComputationTime;
// place the result to the result vector
synchronized (resultList) {
resultList.add(resultEntry);
}
result.push(resultEntry, rankedCache.getOrder().cardinal(resultEntry.word()));
// add references
synchronized (rankedCache) {
@ -223,7 +222,7 @@ public final class plasmaSearchEvent {
}
}
}
serverProfiling.update("SEARCH", new plasmaProfiling.searchEvent(query.id(true), "offline snippet fetch", resultList.size(), System.currentTimeMillis() - timer));
serverProfiling.update("SEARCH", new plasmaProfiling.searchEvent(query.id(true), "offline snippet fetch", result.size(), System.currentTimeMillis() - timer));
}
// clean up events
@ -466,8 +465,8 @@ public final class plasmaSearchEvent {
// if worker threads had been alive, but did not succeed, start them again to fetch missing links
if ((query.onlineSnippetFetch) &&
(!event.anyWorkerAlive()) &&
(event.resultList.size() < query.neededResults() + 10) &&
((event.getRankingResult().getLocalResourceSize() + event.getRankingResult().getRemoteResourceSize()) > event.resultList.size())) {
(event.result.size() < query.neededResults() + 10) &&
(event.getRankingResult().getLocalResourceSize() + event.getRankingResult().getRemoteResourceSize() > event.result.size())) {
// set new timeout
event.eventTime = System.currentTimeMillis();
// start worker threads to fetch urls and snippets
@ -508,7 +507,7 @@ public final class plasmaSearchEvent {
while (System.currentTimeMillis() < this.timeout) {
this.lastLifeSign = System.currentTimeMillis();
if (resultList.size() >= query.neededResults() /*+ query.displayResults()*/) break; // we have enough
if (result.size() >= query.neededResults() /*+ query.displayResults()*/) break; // we have enough
// get next entry
page = rankedCache.bestURL(true);
@ -531,21 +530,8 @@ public final class plasmaSearchEvent {
//System.out.println("+++DEBUG-resultWorker+++ fetched " + resultEntry.urlstring());
// place the result to the result vector
boolean d = false;
synchronized (resultList) {
doublecheck: for (int i = 0; i < resultList.size(); i++) {
if (resultList.get(i).urlcomps.url().hash().equals(resultEntry.urlcomps.url().hash())) {
d = true;
break doublecheck;
}
}
if (!d) {
resultList.add(resultEntry);
}
}
// add references
if (!d) synchronized (rankedCache) {
if (!result.exists(resultEntry)) {
result.push(resultEntry, rankedCache.getOrder().cardinal(resultEntry.word()));
rankedCache.addReferences(resultEntry);
}
//System.out.println("DEBUG SNIPPET_LOADING: thread " + id + " got " + resultEntry.url());
@ -554,10 +540,7 @@ public final class plasmaSearchEvent {
}
private boolean anyResultWith(String urlhash) {
for (int i = 0; i < resultList.size(); i++) {
if (((ResultEntry) resultList.get(i)).urlentry.hash().equals(urlhash)) return true;
}
return false;
return result.exists(urlhash.hashCode());
}
private boolean anyFailureWith(String urlhash) {
@ -576,6 +559,11 @@ public final class plasmaSearchEvent {
public ResultEntry oneResult(int item) {
// first sleep a while to give accumulation threads a chance to work
if (this.result.sizeStore() > item) {
// we have the wanted result already in the result array .. return that
return this.result.element(item).element;
}
if ((query.domType == plasmaSearchQuery.SEARCHDOM_GLOBALDHT) ||
(query.domType == plasmaSearchQuery.SEARCHDOM_CLUSTERALL)) {
// this is a search using remote search threads. Also the local search thread is started as background process
@ -586,45 +574,28 @@ public final class plasmaSearchEvent {
}
// now wait until as many remote worker threads have finished, as we want to display results
while ((this.primarySearchThreads != null) && (this.primarySearchThreads.length > item) && (anyWorkerAlive()) &&
((this.resultList.size() <= item) || (countFinishedRemoteSearch() <= item))) {
((result.size() <= item) || (countFinishedRemoteSearch() <= item))) {
try {Thread.sleep(100);} catch (InterruptedException e) {}
}
}
// finally wait until enough results are there produced from the snippet fetch process
while ((anyWorkerAlive()) && (this.resultList.size() <= item)) {
while ((anyWorkerAlive()) && (result.size() <= item)) {
try {Thread.sleep(100);} catch (InterruptedException e) {}
}
// finally, if there is something, return the result
synchronized (this.resultList) {
// check if we have enough entries
if (this.resultList.size() <= item) return null;
// fetch the best entry from the resultList, not the entry from item position
// whenever a specific entry was switched in its position and was returned here
// a moving pointer is set to assign that item position as not changeable
int bestpick = item; //postRankingFavourite(item);
if (bestpick != item) {
// switch the elements
ResultEntry buf = (ResultEntry) this.resultList.get(bestpick);
serverLog.logInfo("SEARCH_POSTRANKING", "prefering [" + bestpick + "] " + buf.urlstring() + " over [" + item + "] " + ((ResultEntry) this.resultList.get(item)).urlstring());
this.resultList.set(bestpick, (ResultEntry) this.resultList.get(item));
this.resultList.set(item, buf);
}
//this.resultListLock = item; // lock the element; be prepared to return it
return (ResultEntry) this.resultList.get(item);
}
if (this.result.size() <= item) return null;
return this.result.element(item).element;
}
public ArrayList<ResultEntry> completeResults(long waitingtime) {
public ArrayList<kelondroSortStack<ResultEntry>.stackElement> completeResults(long waitingtime) {
long timeout = System.currentTimeMillis() + waitingtime;
while ((this.resultList.size() < query.neededResults()) && (anyWorkerAlive()) && (System.currentTimeMillis() < timeout)) {
while ((result.size() < query.neededResults()) && (anyWorkerAlive()) && (System.currentTimeMillis() < timeout)) {
try {Thread.sleep(100);} catch (InterruptedException e) {}
//System.out.println("+++DEBUG-completeResults+++ sleeping " + 200);
}
return this.resultList;
return this.result.list(this.result.size());
}
boolean secondarySearchStartet = false;
@ -789,7 +760,9 @@ public final class plasmaSearchEvent {
if ((p = alternative_urlname.indexOf("?")) > 0) alternative_urlname = alternative_urlname.substring(0, p);
}
}
public int hashCode() {
return urlentry.hash().hashCode();
}
public String hash() {
return urlentry.hash();
}
@ -832,8 +805,10 @@ public final class plasmaSearchEvent {
public int lapp() {
return urlentry.lapp();
}
public indexRWIEntry word() {
return urlentry.word();
public indexRWIVarEntry word() {
indexRWIEntry word = urlentry.word();
assert word instanceof indexRWIVarEntry;
return (indexRWIVarEntry) word;
}
public boolean hasTextSnippet() {
return (this.textSnippet != null) && (this.textSnippet.getErrorCode() < 11);

View File

@ -33,7 +33,6 @@ import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;
import java.util.Set;
import java.util.TreeMap;
import java.util.TreeSet;
import de.anomic.htmlFilter.htmlFilterContentScraper;
@ -45,6 +44,7 @@ import de.anomic.index.indexRWIVarEntry;
import de.anomic.index.indexURLEntry;
import de.anomic.kelondro.kelondroBinSearch;
import de.anomic.kelondro.kelondroMScoreCluster;
import de.anomic.kelondro.kelondroSortStack;
import de.anomic.server.serverCodings;
import de.anomic.server.serverFileUtils;
import de.anomic.server.serverProfiling;
@ -54,15 +54,15 @@ public final class plasmaSearchRankingProcess {
public static kelondroBinSearch[] ybrTables = null; // block-rank tables
private static boolean useYBR = true;
private TreeMap<Object, indexRWIVarEntry> sortedRWIEntries; // key = ranking (Long); value = indexRWIEntry; if sortorder < 2 then key is instance of String
private HashMap<String, TreeMap<Object, indexRWIVarEntry>> doubleDomCache; // key = domhash (6 bytes); value = TreeMap like sortedRWIEntries
private kelondroSortStack<indexRWIVarEntry> stack;
private HashMap<String, kelondroSortStack<indexRWIVarEntry>> doubleDomCache; // key = domhash (6 bytes); value = like stack
private HashMap<String, String> handover; // key = urlhash, value = urlstring; used for double-check of urls that had been handed over to search process
private plasmaSearchQuery query;
private int sortorder;
private int maxentries;
private int remote_peerCount, remote_indexCount, remote_resourceSize, local_resourceSize;
private indexRWIEntryOrder order;
private HashMap<String, Object> urlhashes; // map for double-check; String/Long relation, addresses ranking number (backreference for deletion)
private HashMap<String, Integer> urlhashes; // map for double-check; String/Long relation, addresses ranking number (backreference for deletion)
private kelondroMScoreCluster<String> ref; // reference score computation for the commonSense heuristic
private int[] flagcount; // flag counter
private TreeSet<String> misses; // contains url-hashes that could not been found in the LURL-DB
@ -74,17 +74,17 @@ public final class plasmaSearchRankingProcess {
// attention: if minEntries is too high, this method will not terminate within the maxTime
// sortorder: 0 = hash, 1 = url, 2 = ranking
this.localSearchContainerMaps = null;
this.sortedRWIEntries = new TreeMap<Object, indexRWIVarEntry>();
this.doubleDomCache = new HashMap<String, TreeMap<Object, indexRWIVarEntry>>();
this.stack = new kelondroSortStack<indexRWIVarEntry>(maxentries);
this.doubleDomCache = new HashMap<String, kelondroSortStack<indexRWIVarEntry>>();
this.handover = new HashMap<String, String>();
this.order = null;
this.order = (query == null) ? null : new indexRWIEntryOrder(query.ranking);
this.query = query;
this.maxentries = maxentries;
this.remote_peerCount = 0;
this.remote_indexCount = 0;
this.remote_resourceSize = 0;
this.local_resourceSize = 0;
this.urlhashes = new HashMap<String, Object>();
this.urlhashes = new HashMap<String, Integer>();
this.ref = new kelondroMScoreCluster<String>();
this.misses = new TreeSet<String>();
this.wordIndex = wordIndex;
@ -93,6 +93,10 @@ public final class plasmaSearchRankingProcess {
for (int i = 0; i < 32; i++) {this.flagcount[i] = 0;}
}
public long ranking(indexRWIVarEntry word) {
return order.cardinal(word);
}
public void execQuery() {
long timer = System.currentTimeMillis();
@ -150,21 +154,21 @@ public final class plasmaSearchRankingProcess {
// load url
if (sortorder == 0) {
this.sortedRWIEntries.put(ientry.urlHash(), ientry);
this.urlhashes.put(ientry.urlHash(), ientry.urlHash());
this.stack.push(ientry, ientry.urlHash().hashCode());
this.urlhashes.put(ientry.urlHash(), ientry.urlHash().hashCode());
} else {
uentry = wordIndex.loadedURL.load(ientry.urlHash(), ientry, 0);
if (uentry == null) {
this.misses.add(ientry.urlHash());
} else {
u = uentry.comp().url().toNormalform(false, true);
this.sortedRWIEntries.put(u, ientry);
this.urlhashes.put(ientry.urlHash(), u);
this.stack.push(ientry, u.hashCode());
this.urlhashes.put(ientry.urlHash(), u.hashCode());
}
}
// interrupt if we have enough
if ((query.neededResults() > 0) && (this.misses.size() + this.sortedRWIEntries.size() > query.neededResults())) break loop;
if ((query.neededResults() > 0) && (this.misses.size() + this.stack.size() > query.neededResults())) break loop;
} // end loop
}
@ -182,22 +186,20 @@ public final class plasmaSearchRankingProcess {
}
long timer = System.currentTimeMillis();
if (this.order == null) {
this.order = new indexRWIEntryOrder(query.ranking);
}
// normalize entries
ArrayList<indexRWIVarEntry> decodedEntries = this.order.normalizeWith(index);
serverProfiling.update("SEARCH", new plasmaProfiling.searchEvent(query.id(true), plasmaSearchEvent.NORMALIZING, index.size(), System.currentTimeMillis() - timer));
// normalize entries and get ranking
// iterate over normalized entries and select some that are better than currently stored
timer = System.currentTimeMillis();
Iterator<indexRWIVarEntry> i = decodedEntries.iterator();
indexRWIVarEntry iEntry, l;
long biggestEntry = 0;
//long s0 = System.currentTimeMillis();
indexRWIVarEntry iEntry;
Long r;
while (i.hasNext()) {
iEntry = i.next();
if (iEntry.urlHash().length() != index.row().primaryKeyLength) continue;
assert (iEntry.urlHash().length() == index.row().primaryKeyLength);
//if (iEntry.urlHash().length() != index.row().primaryKeyLength) continue;
// increase flag counts
for (int j = 0; j < 32; j++) {
@ -206,31 +208,32 @@ public final class plasmaSearchRankingProcess {
// kick out entries that are too bad according to current findings
r = new Long(order.cardinal(iEntry));
if ((maxentries >= 0) && (sortedRWIEntries.size() >= maxentries) && (r.longValue() > biggestEntry)) continue;
if ((maxentries >= 0) && (stack.size() >= maxentries) && (stack.bottom(r.longValue()))) continue;
// check constraints
if (!testFlags(iEntry)) continue;
// check document domain
if (query.contentdom != plasmaSearchQuery.CONTENTDOM_TEXT) {
if ((query.contentdom == plasmaSearchQuery.CONTENTDOM_AUDIO) && (!(iEntry.flags().get(plasmaCondenser.flag_cat_hasaudio)))) continue;
if ((query.contentdom == plasmaSearchQuery.CONTENTDOM_VIDEO) && (!(iEntry.flags().get(plasmaCondenser.flag_cat_hasvideo)))) continue;
if ((query.contentdom == plasmaSearchQuery.CONTENTDOM_IMAGE) && (!(iEntry.flags().get(plasmaCondenser.flag_cat_hasimage)))) continue;
if ((query.contentdom == plasmaSearchQuery.CONTENTDOM_APP ) && (!(iEntry.flags().get(plasmaCondenser.flag_cat_hasapp )))) continue;
}
if ((maxentries < 0) || (sortedRWIEntries.size() < maxentries)) {
// insert
if ((maxentries < 0) || (stack.size() < maxentries)) {
// in case that we don't have enough yet, accept any new entry
if (urlhashes.containsKey(iEntry.urlHash())) continue;
while (sortedRWIEntries.containsKey(r)) r = new Long(r.longValue() + 1);
sortedRWIEntries.put(r, iEntry);
stack.push(iEntry, r);
} else {
if (r.longValue() > biggestEntry) {
// if we already have enough entries, insert only such that are necessary to get a better result
if (stack.bottom(r.longValue())) {
continue;
} else {
// double-check
if (urlhashes.containsKey(iEntry.urlHash())) continue;
l = sortedRWIEntries.remove((Long) sortedRWIEntries.lastKey());
urlhashes.remove(l.urlHash());
while (sortedRWIEntries.containsKey(r)) r = new Long(r.longValue() + 1);
sortedRWIEntries.put(r, iEntry);
biggestEntry = order.cardinal(sortedRWIEntries.get(sortedRWIEntries.lastKey()));
stack.push(iEntry, r);
}
}
@ -271,85 +274,69 @@ public final class plasmaSearchRankingProcess {
// - root-domain guessing to prefer the root domain over other urls if search word appears in domain name
private synchronized Object[] /*{Object, indexRWIEntry}*/ bestRWI(boolean skipDoubleDom) {
private synchronized kelondroSortStack<indexRWIVarEntry>.stackElement bestRWI(boolean skipDoubleDom) {
// returns from the current RWI list the best entry and removed this entry from the list
Object bestEntry;
TreeMap<Object, indexRWIVarEntry> m;
indexRWIVarEntry rwi;
while (sortedRWIEntries.size() > 0) {
bestEntry = sortedRWIEntries.firstKey();
rwi = sortedRWIEntries.remove(bestEntry);
if (!skipDoubleDom) return new Object[]{bestEntry, rwi};
kelondroSortStack<indexRWIVarEntry> m;
kelondroSortStack<indexRWIVarEntry>.stackElement rwi;
while (stack.size() > 0) {
rwi = stack.pop();
if (!skipDoubleDom) return rwi;
// check doubledom
String domhash = rwi.urlHash().substring(6);
String domhash = rwi.element.urlHash().substring(6);
m = this.doubleDomCache.get(domhash);
if (m == null) {
// first appearance of dom
m = new TreeMap<Object, indexRWIVarEntry>();
m = new kelondroSortStack<indexRWIVarEntry>(-1);
this.doubleDomCache.put(domhash, m);
return new Object[]{bestEntry, rwi};
return rwi;
}
// second appearances of dom
m.put(bestEntry, rwi);
m.push(rwi);
}
// no more entries in sorted RWI entries. Now take Elements from the doubleDomCache
// find best entry from all caches
Iterator<TreeMap<Object, indexRWIVarEntry>> i = this.doubleDomCache.values().iterator();
bestEntry = null;
Object o;
indexRWIVarEntry bestrwi = null;
Iterator<kelondroSortStack<indexRWIVarEntry>> i = this.doubleDomCache.values().iterator();
kelondroSortStack<indexRWIVarEntry>.stackElement bestEntry = null;
kelondroSortStack<indexRWIVarEntry>.stackElement o;
while (i.hasNext()) {
m = i.next();
if (m.size() == 0) continue;
if (bestEntry == null) {
bestEntry = m.firstKey();
bestrwi = m.remove(bestEntry);
bestEntry = m.top();
continue;
}
o = m.firstKey();
rwi = m.remove(o);
if (o instanceof Long) {
if (((Long) o).longValue() < ((Long) bestEntry).longValue()) {
bestEntry = o;
bestrwi = rwi;
}
}
if (o instanceof String) {
if (((String) o).compareTo((String) bestEntry) < 0) {
bestEntry = o;
bestrwi = rwi;
}
o = m.top();
if (o.weight < bestEntry.weight) {
bestEntry = o;
}
}
if (bestrwi == null) return null;
if (bestEntry == null) return null;
// finally remove the best entry from the doubledom cache
m = this.doubleDomCache.get(bestrwi.urlHash().substring(6));
m.remove(bestEntry);
return new Object[]{bestEntry, bestrwi};
m = this.doubleDomCache.get(bestEntry.element.urlHash().substring(6));
o = m.pop();
assert o.element.urlHash().equals(bestEntry.element.urlHash());
return bestEntry;
}
public synchronized indexURLEntry bestURL(boolean skipDoubleDom) {
// returns from the current RWI list the best URL entry and removed this entry from the list
while ((sortedRWIEntries.size() > 0) || (size() > 0)) {
Object[] obrwi = bestRWI(skipDoubleDom);
Object bestEntry = obrwi[0];
indexRWIVarEntry ientry = (indexRWIVarEntry) obrwi[1];
long ranking = (bestEntry instanceof Long) ? ((Long) bestEntry).longValue() : 0;
indexURLEntry u = wordIndex.loadedURL.load(ientry.urlHash(), ientry, ranking);
while ((stack.size() > 0) || (size() > 0)) {
kelondroSortStack<indexRWIVarEntry>.stackElement obrwi = bestRWI(skipDoubleDom);
indexURLEntry u = wordIndex.loadedURL.load(obrwi.element.urlHash(), obrwi.element, obrwi.weight);
if (u != null) {
indexURLEntry.Components comp = u.comp();
if (comp.url() != null) this.handover.put(u.hash(), comp.url().toNormalform(true, false)); // remember that we handed over this url
return u;
}
misses.add(ientry.urlHash());
misses.add(obrwi.element.urlHash());
}
return null;
}
public synchronized int size() {
//assert sortedRWIEntries.size() == urlhashes.size() : "sortedRWIEntries.size() = " + sortedRWIEntries.size() + ", urlhashes.size() = " + urlhashes.size();
int c = sortedRWIEntries.size();
Iterator<TreeMap<Object, indexRWIVarEntry>> i = this.doubleDomCache.values().iterator();
int c = stack.size();
Iterator<kelondroSortStack<indexRWIVarEntry>> i = this.doubleDomCache.values().iterator();
while (i.hasNext()) c += i.next().size();
return c;
}
@ -362,7 +349,7 @@ public final class plasmaSearchRankingProcess {
public int filteredCount() {
// the number of index entries that are considered as result set
return this.sortedRWIEntries.size();
return this.stack.size();
}
public int getRemoteIndexCount() {
@ -385,14 +372,11 @@ public final class plasmaSearchRankingProcess {
return this.local_resourceSize;
}
public indexRWIEntry remove(String urlHash) {
Object r = (Long) urlhashes.get(urlHash);
if (r == null) return null;
assert sortedRWIEntries.containsKey(r);
indexRWIEntry iEntry = (indexRWIEntry) sortedRWIEntries.remove(r);
kelondroSortStack<indexRWIVarEntry>.stackElement se = stack.remove(urlHash.hashCode());
if (se == null) return null;
urlhashes.remove(urlHash);
return iEntry;
return se.element;
}
public Iterator<String> miss() {

View File

@ -1,7 +1,7 @@
# YaCy Network Group Definition
# -----------------------------
# This is an addition to the yacy.network.unit configuration file.
# This file is adressed by the network.group.definition property in yacy.init
# This file is addressed by the network.group.definition property in yacy.init
# The purpose of a group within a network is that some parts of a network may be managed independently,
# while the content of the network stays private for the whole network, mostly for a special purpose.
# This file needs to be configured if someone wants to participate with several peers to the network,