mirror of
https://github.com/yacy/yacy_search_server.git
synced 2024-09-19 00:01:41 +02:00
- fixed some bugs in ranking computation
- introduced a generalized method to organize ranked results (2 new classes)
- added a post-ranking step after snippet-fetch (before: results were only listed) using the new ranking data structures
- fixed some missing data fields in RWI ranking attributes and corrected the hand-over between data structures

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@4498 6c8d7289-2bf4-0310-a012-ef5d649a1542
This commit is contained in:
parent
f4c73d8c68
commit
727feb4358
|
@ -38,6 +38,7 @@ import de.anomic.http.httpHeader;
|
|||
import de.anomic.index.indexContainer;
|
||||
import de.anomic.kelondro.kelondroBase64Order;
|
||||
import de.anomic.kelondro.kelondroBitfield;
|
||||
import de.anomic.kelondro.kelondroSortStack;
|
||||
import de.anomic.net.natLib;
|
||||
import de.anomic.plasma.plasmaProfiling;
|
||||
import de.anomic.plasma.plasmaSearchEvent;
|
||||
|
@ -148,7 +149,7 @@ public final class search {
|
|||
int indexabstractContainercount = 0;
|
||||
int joincount = 0;
|
||||
plasmaSearchQuery theQuery = null;
|
||||
ArrayList<ResultEntry> accu = null;
|
||||
ArrayList<kelondroSortStack<ResultEntry>.stackElement> accu = null;
|
||||
plasmaSearchEvent theSearch = null;
|
||||
if ((query.length() == 0) && (abstractSet != null)) {
|
||||
// this is _not_ a normal search, only a request for index abstracts
|
||||
|
@ -258,10 +259,10 @@ public final class search {
|
|||
long timer = System.currentTimeMillis();
|
||||
StringBuffer links = new StringBuffer();
|
||||
String resource = null;
|
||||
plasmaSearchEvent.ResultEntry entry;
|
||||
kelondroSortStack<plasmaSearchEvent.ResultEntry>.stackElement entry;
|
||||
for (int i = 0; i < accu.size(); i++) {
|
||||
entry = (plasmaSearchEvent.ResultEntry) accu.get(i);
|
||||
resource = entry.resource();
|
||||
entry = accu.get(i);
|
||||
resource = entry.element.resource();
|
||||
if (resource != null) {
|
||||
links.append("resource").append(i).append('=').append(resource).append(serverCore.CRLF_STRING);
|
||||
}
|
||||
|
|
|
@ -86,4 +86,5 @@ public interface indexRWIEntry {
|
|||
|
||||
public boolean isOlder(indexRWIEntry other);
|
||||
|
||||
public int hashCode();
|
||||
}
|
|
@ -31,16 +31,14 @@ import java.util.HashMap;
|
|||
import java.util.Iterator;
|
||||
import java.util.Map;
|
||||
|
||||
import de.anomic.kelondro.kelondroAbstractOrder;
|
||||
import de.anomic.kelondro.kelondroBitfield;
|
||||
import de.anomic.kelondro.kelondroMScoreCluster;
|
||||
import de.anomic.kelondro.kelondroOrder;
|
||||
import de.anomic.plasma.plasmaCondenser;
|
||||
import de.anomic.plasma.plasmaSearchRankingProcess;
|
||||
import de.anomic.plasma.plasmaSearchRankingProfile;
|
||||
import de.anomic.yacy.yacyURL;
|
||||
|
||||
public class indexRWIEntryOrder extends kelondroAbstractOrder<indexRWIVarEntry> implements kelondroOrder<indexRWIVarEntry> {
|
||||
public class indexRWIEntryOrder {
|
||||
private indexRWIVarEntry min, max;
|
||||
private plasmaSearchRankingProfile ranking;
|
||||
private kelondroMScoreCluster<String> doms; // collected for "authority" heuristic
|
||||
|
@ -69,8 +67,8 @@ public class indexRWIEntryOrder extends kelondroAbstractOrder<indexRWIVarEntry>
|
|||
mmf0.start(); // fork here
|
||||
minmaxfinder mmf1 = new minmaxfinder(container, middle, container.size());
|
||||
mmf1.run(); // execute other fork in this thread
|
||||
if (this.min == null) this.min = mmf1.entryMin; else indexRWIVarEntry.min(this.min, mmf1.entryMin);
|
||||
if (this.max == null) this.max = mmf1.entryMax; else indexRWIVarEntry.max(this.max, mmf1.entryMax);
|
||||
if (this.min == null) this.min = mmf1.entryMin.clone(); else this.min.min(mmf1.entryMin);
|
||||
if (this.max == null) this.max = mmf1.entryMax.clone(); else this.max.max(mmf1.entryMax);
|
||||
Map.Entry<String, Integer> entry;
|
||||
Iterator<Map.Entry<String, Integer>> di = mmf1.domcount().entrySet().iterator();
|
||||
while (di.hasNext()) {
|
||||
|
@ -78,8 +76,8 @@ public class indexRWIEntryOrder extends kelondroAbstractOrder<indexRWIVarEntry>
|
|||
this.doms.addScore(entry.getKey(), ((Integer) entry.getValue()).intValue());
|
||||
}
|
||||
try {mmf0.join();} catch (InterruptedException e) {} // wait for fork thread to finish
|
||||
if (this.min == null) this.min = mmf0.entryMin; else indexRWIVarEntry.min(this.min, mmf0.entryMin);
|
||||
if (this.max == null) this.max = mmf0.entryMax; else indexRWIVarEntry.max(this.max, mmf0.entryMax);
|
||||
if (this.min == null) this.min = mmf0.entryMin.clone(); else this.min.min(mmf0.entryMin);
|
||||
if (this.max == null) this.max = mmf0.entryMax.clone(); else this.max.max(mmf0.entryMax);
|
||||
di = mmf0.domcount().entrySet().iterator();
|
||||
while (di.hasNext()) {
|
||||
entry = di.next();
|
||||
|
@ -93,8 +91,8 @@ public class indexRWIEntryOrder extends kelondroAbstractOrder<indexRWIVarEntry>
|
|||
// run minmax in one thread
|
||||
minmaxfinder mmf = new minmaxfinder(container, 0, container.size());
|
||||
mmf.run(); // execute without multi-threading
|
||||
if (this.min == null) this.min = mmf.entryMin; else indexRWIVarEntry.min(this.min, mmf.entryMin);
|
||||
if (this.max == null) this.max = mmf.entryMax; else indexRWIVarEntry.max(this.max, mmf.entryMax);
|
||||
if (this.min == null) this.min = mmf.entryMin.clone(); else this.min.min(mmf.entryMin);
|
||||
if (this.max == null) this.max = mmf.entryMax.clone(); else this.max.max(mmf.entryMax);
|
||||
Map.Entry<String, Integer> entry;
|
||||
Iterator<Map.Entry<String, Integer>> di = mmf.domcount().entrySet().iterator();
|
||||
while (di.hasNext()) {
|
||||
|
@ -109,44 +107,34 @@ public class indexRWIEntryOrder extends kelondroAbstractOrder<indexRWIVarEntry>
|
|||
return result;
|
||||
}
|
||||
|
||||
public kelondroOrder<indexRWIVarEntry> clone() {
|
||||
return null;
|
||||
}
|
||||
|
||||
public int authority(String urlHash) {
|
||||
return (doms.getScore(urlHash.substring(6)) << 8) / (1 + this.maxdomcount);
|
||||
}
|
||||
|
||||
public long cardinal(byte[] key) {
|
||||
return cardinal(new indexRWIVarEntry(new indexRWIRowEntry(key)));
|
||||
}
|
||||
|
||||
public long cardinal(indexRWIRowEntry t) {
|
||||
return cardinal(new indexRWIVarEntry(t));
|
||||
}
|
||||
|
||||
public long cardinal(indexRWIVarEntry t) {
|
||||
//return Long.MAX_VALUE - preRanking(ranking, iEntry, this.entryMin, this.entryMax, this.searchWords);
|
||||
// the normalizedEntry must be a normalized indexEntry
|
||||
kelondroBitfield flags = t.flags();
|
||||
long tf = ((max.termFrequency() == min.termFrequency()) ? 0 : (((int)(((t.termFrequency()-min.termFrequency())*256.0)/(max.termFrequency() - min.termFrequency())))) << ranking.coeff_termfrequency);
|
||||
//System.out.println("tf(" + t.urlHash + ") = " + Math.floor(1000 * t.termFrequency()) + ", min = " + Math.floor(1000 * min.termFrequency()) + ", max = " + Math.floor(1000 * max.termFrequency()) + ", tf-normed = " + tf);
|
||||
long r =
|
||||
((256 - yacyURL.domLengthNormalized(t.urlHash())) << ranking.coeff_domlength)
|
||||
((256 - yacyURL.domLengthNormalized(t.urlHash())) << ranking.coeff_domlength)
|
||||
+ ((256 - (plasmaSearchRankingProcess.ybr(t.urlHash()) << 4)) << ranking.coeff_ybr)
|
||||
+ ((t.urlcomps() == 0) ? 0 : ((256 - (((t.urlcomps() - min.urlcomps() ) << 8) / (1 + max.urlcomps() - min.urlcomps()) )) << ranking.coeff_urlcomps))
|
||||
+ ((t.urllength() == 0) ? 0 : ((256 - (((t.urllength() - min.urllength() ) << 8) / (1 + max.urllength() - min.urllength()) )) << ranking.coeff_urllength))
|
||||
+ ((t.posintext() == 0) ? 0 : ((256 - (((t.posintext() - min.posintext() ) << 8) / (1 + max.posintext() - min.posintext()) )) << ranking.coeff_posintext))
|
||||
+ ((t.posofphrase() == 0) ? 0 : ((256 - (((t.posofphrase() - min.posofphrase() ) << 8) / (1 + max.posofphrase() - min.posofphrase()) )) << ranking.coeff_posofphrase))
|
||||
+ ((t.posinphrase() == 0) ? 0 : ((256 - (((t.posinphrase() - min.posinphrase() ) << 8) / (1 + max.posinphrase() - min.posinphrase()) )) << ranking.coeff_posinphrase))
|
||||
+ ((256 - (((t.worddistance() - min.worddistance() ) << 8) / (1 + max.worddistance() - min.worddistance()) )) << ranking.coeff_worddistance)
|
||||
+ ( (((t.virtualAge() - min.virtualAge() ) << 8) / (1 + max.virtualAge() - min.virtualAge()) ) << ranking.coeff_date)
|
||||
+ ( (((t.wordsintitle() - min.wordsintitle() ) << 8) / (1 + max.wordsintitle() - min.wordsintitle()) ) << ranking.coeff_wordsintitle)
|
||||
+ ( (((t.wordsintext() - min.wordsintext() ) << 8) / (1 + max.wordsintext() - min.wordsintext()) ) << ranking.coeff_wordsintext)
|
||||
+ ( (((t.phrasesintext()- min.phrasesintext() ) << 8) / (1 + max.phrasesintext()- min.phrasesintext()) ) << ranking.coeff_phrasesintext)
|
||||
+ ( (((t.llocal() - min.llocal() ) << 8) / (1 + max.llocal() - min.llocal()) ) << ranking.coeff_llocal)
|
||||
+ ( (((t.lother() - min.lother() ) << 8) / (1 + max.lother() - min.lother()) ) << ranking.coeff_lother)
|
||||
+ ( (((t.hitcount() - min.hitcount() ) << 8) / (1 + max.hitcount() - min.hitcount()) ) << ranking.coeff_hitcount)
|
||||
+ (((int)((((t.termFrequency()- min.termFrequency() )*256.0)/ (1 + max.termFrequency()- min.termFrequency()))))<< ranking.coeff_termfrequency)
|
||||
+ ( authority(t.urlHash()) << ranking.coeff_authority)
|
||||
+ ((max.urlcomps() == min.urlcomps() ) ? 0 : (256 - (((t.urlcomps() - min.urlcomps() ) << 8) / (max.urlcomps() - min.urlcomps()) )) << ranking.coeff_urlcomps)
|
||||
+ ((max.urllength() == min.urllength() ) ? 0 : (256 - (((t.urllength() - min.urllength() ) << 8) / (max.urllength() - min.urllength()) )) << ranking.coeff_urllength)
|
||||
+ ((max.posintext() == min.posintext() ) ? 0 : (256 - (((t.posintext() - min.posintext() ) << 8) / (max.posintext() - min.posintext()) )) << ranking.coeff_posintext)
|
||||
+ ((max.posofphrase() == min.posofphrase()) ? 0 : (256 - (((t.posofphrase() - min.posofphrase() ) << 8) / (max.posofphrase() - min.posofphrase()) )) << ranking.coeff_posofphrase)
|
||||
+ ((max.posinphrase() == min.posinphrase()) ? 0 : (256 - (((t.posinphrase() - min.posinphrase() ) << 8) / (max.posinphrase() - min.posinphrase()) )) << ranking.coeff_posinphrase)
|
||||
+ ((max.worddistance() == min.worddistance()) ? 0 : (256 - (((t.worddistance() - min.worddistance() ) << 8) / (max.worddistance() - min.worddistance()) )) << ranking.coeff_worddistance)
|
||||
+ ((max.virtualAge() == min.virtualAge()) ? 0 : (((t.virtualAge() - min.virtualAge() ) << 8) / (max.virtualAge() - min.virtualAge()) ) << ranking.coeff_date)
|
||||
+ ((max.wordsintitle() == min.wordsintitle()) ? 0 : (((t.wordsintitle() - min.wordsintitle() ) << 8) / (max.wordsintitle() - min.wordsintitle()) ) << ranking.coeff_wordsintitle)
|
||||
+ ((max.wordsintext() == min.wordsintext()) ? 0 : (((t.wordsintext() - min.wordsintext() ) << 8) / (max.wordsintext() - min.wordsintext()) ) << ranking.coeff_wordsintext)
|
||||
+ ((max.phrasesintext() == min.phrasesintext()) ? 0 : (((t.phrasesintext()- min.phrasesintext() ) << 8) / (max.phrasesintext()- min.phrasesintext()) ) << ranking.coeff_phrasesintext)
|
||||
+ ((max.llocal() == min.llocal()) ? 0 : (((t.llocal() - min.llocal() ) << 8) / (max.llocal() - min.llocal()) ) << ranking.coeff_llocal)
|
||||
+ ((max.lother() == min.lother()) ? 0 : (((t.lother() - min.lother() ) << 8) / (max.lother() - min.lother()) ) << ranking.coeff_lother)
|
||||
+ ((max.hitcount() == min.hitcount()) ? 0 : (((t.hitcount() - min.hitcount() ) << 8) / (max.hitcount() - min.hitcount()) ) << ranking.coeff_hitcount)
|
||||
+ tf
|
||||
+ (authority(t.urlHash()) << ranking.coeff_authority)
|
||||
+ (((flags.get(indexRWIEntry.flag_app_dc_identifier)) ? 255 << ranking.coeff_appurl : 0))
|
||||
+ (((flags.get(indexRWIEntry.flag_app_dc_title)) ? 255 << ranking.coeff_app_dc_title : 0))
|
||||
+ (((flags.get(indexRWIEntry.flag_app_dc_creator)) ? 255 << ranking.coeff_app_dc_creator : 0))
|
||||
|
@ -163,20 +151,6 @@ public class indexRWIEntryOrder extends kelondroAbstractOrder<indexRWIVarEntry>
|
|||
|
||||
return Long.MAX_VALUE - r; // returns a reversed number: the lower the number the better the ranking. This is used for simple sorting with a TreeMap
|
||||
}
|
||||
|
||||
public int compare(indexRWIVarEntry a, indexRWIVarEntry b) {
|
||||
long ca = cardinal(a);
|
||||
long cb = cardinal(b);
|
||||
return (ca > cb) ? 1 : (ca < cb) ? -1 : 0;
|
||||
}
|
||||
|
||||
public String signature() {
|
||||
return "rx";
|
||||
}
|
||||
|
||||
public boolean wellformed(indexRWIVarEntry a) {
|
||||
return true;
|
||||
}
|
||||
|
||||
public static class minmaxfinder extends Thread {
|
||||
|
||||
|
@ -208,8 +182,8 @@ public class indexRWIEntryOrder extends kelondroAbstractOrder<indexRWIVarEntry>
|
|||
iEntry = new indexRWIVarEntry(new indexRWIRowEntry(container.get(p++)));
|
||||
this.decodedEntries.add(iEntry);
|
||||
// find min/max
|
||||
if (this.entryMin == null) this.entryMin = new indexRWIVarEntry(iEntry); else indexRWIVarEntry.min(this.entryMin, iEntry);
|
||||
if (this.entryMax == null) this.entryMax = new indexRWIVarEntry(iEntry); else indexRWIVarEntry.max(this.entryMax, iEntry);
|
||||
if (this.entryMin == null) this.entryMin = iEntry.clone(); else this.entryMin.min(iEntry);
|
||||
if (this.entryMax == null) this.entryMax = iEntry.clone(); else this.entryMax.max(iEntry);
|
||||
// update domcount
|
||||
dom = iEntry.urlHash().substring(6);
|
||||
count = (Integer) doms.get(dom);
|
||||
|
|
|
@ -269,4 +269,7 @@ public final class indexRWIRowEntry implements indexRWIEntry {
|
|||
return false;
|
||||
}
|
||||
|
||||
public int hashCode() {
|
||||
return this.urlHash().hashCode();
|
||||
}
|
||||
}
|
|
@ -27,6 +27,7 @@
|
|||
package de.anomic.index;
|
||||
|
||||
import de.anomic.kelondro.kelondroBitfield;
|
||||
import de.anomic.plasma.plasmaWordIndex;
|
||||
|
||||
public class indexRWIVarEntry implements indexRWIEntry {
|
||||
|
||||
|
@ -40,7 +41,52 @@ public class indexRWIVarEntry implements indexRWIEntry {
|
|||
worddistance, wordsintext, wordsintitle;
|
||||
public double termFrequency;
|
||||
|
||||
public indexRWIVarEntry(indexRWIEntry e) {
|
||||
public indexRWIVarEntry(String urlHash,
|
||||
int urlLength, // byte-length of complete URL
|
||||
int urlComps, // number of path components
|
||||
int titleLength, // length of description/length (longer are better?)
|
||||
int hitcount, // how often appears this word in the text
|
||||
int wordcount, // total number of words
|
||||
int phrasecount, // total number of phrases
|
||||
int posintext, // position of word in all words
|
||||
int posinphrase, // position of word in its phrase
|
||||
int posofphrase, // number of the phrase where word appears
|
||||
long lastmodified, // last-modified time of the document where word appears
|
||||
long updatetime, // update time; this is needed to compute a TTL for the word, so it can be removed easily if the TTL is short
|
||||
String language, // (guessed) language of document
|
||||
char doctype, // type of document
|
||||
int outlinksSame, // outlinks to same domain
|
||||
int outlinksOther, // outlinks to other domain
|
||||
kelondroBitfield flags, // attributes to the url and to the word according the url
|
||||
int worddistance,
|
||||
double termfrequency
|
||||
) {
|
||||
if ((language == null) || (language.length() != 2)) language = "uk";
|
||||
int mddlm = plasmaWordIndex.microDateDays(lastmodified);
|
||||
int mddct = plasmaWordIndex.microDateDays(updatetime);
|
||||
this.flags = flags;
|
||||
this.freshUntil = Math.max(0, mddlm + (mddct - mddlm) * 2);
|
||||
this.lastModified = lastmodified;
|
||||
this.language = language;
|
||||
this.urlHash = urlHash;
|
||||
this.type = doctype;
|
||||
this.hitcount = hitcount;
|
||||
this.llocal = outlinksSame;
|
||||
this.lother = outlinksOther;
|
||||
this.phrasesintext = outlinksOther;
|
||||
this.posintext = posintext;
|
||||
this.posinphrase = posinphrase;
|
||||
this.posofphrase = posofphrase;
|
||||
this.urlcomps = urlComps;
|
||||
this.urllength = urlLength;
|
||||
this.virtualAge = mddlm;
|
||||
this.worddistance = worddistance;
|
||||
this.wordsintext = wordcount;
|
||||
this.wordsintitle = titleLength;
|
||||
this.termFrequency = termfrequency;
|
||||
}
|
||||
|
||||
public indexRWIVarEntry(indexRWIRowEntry e) {
|
||||
this.flags = e.flags();
|
||||
this.freshUntil = e.freshUntil();
|
||||
this.lastModified = e.lastModified();
|
||||
|
@ -60,18 +106,43 @@ public class indexRWIVarEntry implements indexRWIEntry {
|
|||
this.worddistance = 0;
|
||||
this.wordsintext = e.wordsintext();
|
||||
this.wordsintitle = e.wordsintitle();
|
||||
this.termFrequency = 0.0;
|
||||
this.termFrequency = e.termFrequency();
|
||||
}
|
||||
|
||||
public indexRWIVarEntry clone() {
|
||||
indexRWIVarEntry c = new indexRWIVarEntry(
|
||||
this.urlHash,
|
||||
this.urllength,
|
||||
this.urlcomps,
|
||||
this.wordsintitle,
|
||||
this.hitcount,
|
||||
this.wordsintext,
|
||||
this.phrasesintext,
|
||||
this.posintext,
|
||||
this.posinphrase,
|
||||
this.posofphrase,
|
||||
this.lastModified,
|
||||
System.currentTimeMillis(),
|
||||
this.language,
|
||||
this.type,
|
||||
this.llocal,
|
||||
this.lother,
|
||||
this.flags,
|
||||
this.worddistance,
|
||||
this.termFrequency);
|
||||
return c;
|
||||
}
|
||||
|
||||
public void join(indexRWIVarEntry oe) {
|
||||
// combine the distance
|
||||
this.worddistance = this.worddistance() + oe.worddistance() + Math.abs(this.posintext() - oe.posintext());
|
||||
this.posintext = Math.min(this.posintext(), oe.posintext());
|
||||
this.posinphrase = (this.posofphrase() == oe.posofphrase()) ? Math.min(this.posinphrase(), oe.posinphrase()) : 0;
|
||||
this.posofphrase = Math.min(this.posofphrase(), oe.posofphrase());
|
||||
this.worddistance = this.worddistance + oe.worddistance + Math.abs(this.posintext - oe.posintext);
|
||||
this.posintext = Math.min(this.posintext, oe.posintext);
|
||||
this.posinphrase = (this.posofphrase == oe.posofphrase) ? Math.min(this.posinphrase, oe.posinphrase) : 0;
|
||||
this.posofphrase = Math.min(this.posofphrase, oe.posofphrase);
|
||||
|
||||
// combine term frequency
|
||||
this.wordsintext = this.wordsintext() + oe.wordsintext();
|
||||
this.wordsintext = this.wordsintext + oe.wordsintext;
|
||||
this.termFrequency = this.termFrequency + oe.termFrequency;
|
||||
}
|
||||
|
||||
public kelondroBitfield flags() {
|
||||
|
@ -191,66 +262,65 @@ public class indexRWIVarEntry implements indexRWIEntry {
|
|||
return this.termFrequency;
|
||||
}
|
||||
|
||||
public static final void min(indexRWIVarEntry t, indexRWIVarEntry other) {
|
||||
public final void min(indexRWIVarEntry other) {
|
||||
int v;
|
||||
long w;
|
||||
double d;
|
||||
if (t.hitcount() > (v = other.hitcount())) t.hitcount = v;
|
||||
if (t.llocal() > (v = other.llocal())) t.llocal = v;
|
||||
if (t.lother() > (v = other.lother())) t.lother = v;
|
||||
if (t.virtualAge() > (v = other.virtualAge())) t.virtualAge = v;
|
||||
if (t.wordsintext() > (v = other.wordsintext())) t.wordsintext = v;
|
||||
if (t.phrasesintext() > (v = other.phrasesintext())) t.phrasesintext = v;
|
||||
if (t.posintext() > (v = other.posintext())) t.posintext = v;
|
||||
if (t.posinphrase() > (v = other.posinphrase())) t.posinphrase = v;
|
||||
if (t.posofphrase() > (v = other.posofphrase())) t.posofphrase = v;
|
||||
if (t.worddistance() > (v = other.worddistance())) t.worddistance = v;
|
||||
if (t.lastModified() > (w = other.lastModified())) t.lastModified = w;
|
||||
if (t.freshUntil() > (w = other.freshUntil())) t.freshUntil = w;
|
||||
if (t.urllength() > (v = other.urllength())) t.urllength = v;
|
||||
if (t.urlcomps() > (v = other.urlcomps())) t.urlcomps = v;
|
||||
if (t.wordsintitle() > (v = other.wordsintitle())) t.wordsintitle = v;
|
||||
if (t.termFrequency > (d = other.termFrequency())) t.termFrequency = d;
|
||||
if (this.hitcount > (v = other.hitcount)) this.hitcount = v;
|
||||
if (this.llocal > (v = other.llocal)) this.llocal = v;
|
||||
if (this.lother > (v = other.lother)) this.lother = v;
|
||||
if (this.virtualAge > (v = other.virtualAge)) this.virtualAge = v;
|
||||
if (this.wordsintext > (v = other.wordsintext)) this.wordsintext = v;
|
||||
if (this.phrasesintext > (v = other.phrasesintext)) this.phrasesintext = v;
|
||||
if (this.posintext > (v = other.posintext)) this.posintext = v;
|
||||
if (this.posinphrase > (v = other.posinphrase)) this.posinphrase = v;
|
||||
if (this.posofphrase > (v = other.posofphrase)) this.posofphrase = v;
|
||||
if (this.worddistance > (v = other.worddistance)) this.worddistance = v;
|
||||
if (this.lastModified > (w = other.lastModified)) this.lastModified = w;
|
||||
if (this.freshUntil > (w = other.freshUntil)) this.freshUntil = w;
|
||||
if (this.urllength > (v = other.urllength)) this.urllength = v;
|
||||
if (this.urlcomps > (v = other.urlcomps)) this.urlcomps = v;
|
||||
if (this.wordsintitle > (v = other.wordsintitle)) this.wordsintitle = v;
|
||||
if (this.termFrequency > (d = other.termFrequency)) this.termFrequency = d;
|
||||
}
|
||||
|
||||
public static final void max(indexRWIVarEntry t, indexRWIVarEntry other) {
|
||||
public final void max(indexRWIVarEntry other) {
|
||||
int v;
|
||||
long w;
|
||||
double d;
|
||||
if (t.hitcount() < (v = other.hitcount())) t.hitcount = v;
|
||||
if (t.llocal() < (v = other.llocal())) t.llocal = v;
|
||||
if (t.lother() < (v = other.lother())) t.lother = v;
|
||||
if (t.virtualAge() < (v = other.virtualAge())) t.virtualAge = v;
|
||||
if (t.wordsintext() < (v = other.wordsintext())) t.wordsintext = v;
|
||||
if (t.phrasesintext() < (v = other.phrasesintext())) t.phrasesintext = v;
|
||||
if (t.posintext() < (v = other.posintext())) t.posintext = v;
|
||||
if (t.posinphrase() < (v = other.posinphrase())) t.posinphrase = v;
|
||||
if (t.posofphrase() < (v = other.posofphrase())) t.posofphrase = v;
|
||||
if (t.worddistance() < (v = other.worddistance())) t.worddistance = v;
|
||||
if (t.lastModified() < (w = other.lastModified())) t.lastModified = w;
|
||||
if (t.freshUntil() < (w = other.freshUntil())) t.freshUntil = w;
|
||||
if (t.urllength() < (v = other.urllength())) t.urllength = v;
|
||||
if (t.urlcomps() < (v = other.urlcomps())) t.urlcomps = v;
|
||||
if (t.wordsintitle() < (v = other.wordsintitle())) t.wordsintitle = v;
|
||||
if (t.termFrequency < (d = other.termFrequency())) t.termFrequency = d;
|
||||
if (this.hitcount < (v = other.hitcount)) this.hitcount = v;
|
||||
if (this.llocal < (v = other.llocal)) this.llocal = v;
|
||||
if (this.lother < (v = other.lother)) this.lother = v;
|
||||
if (this.virtualAge < (v = other.virtualAge)) this.virtualAge = v;
|
||||
if (this.wordsintext < (v = other.wordsintext)) this.wordsintext = v;
|
||||
if (this.phrasesintext < (v = other.phrasesintext)) this.phrasesintext = v;
|
||||
if (this.posintext < (v = other.posintext)) this.posintext = v;
|
||||
if (this.posinphrase < (v = other.posinphrase)) this.posinphrase = v;
|
||||
if (this.posofphrase < (v = other.posofphrase)) this.posofphrase = v;
|
||||
if (this.worddistance < (v = other.worddistance)) this.worddistance = v;
|
||||
if (this.lastModified < (w = other.lastModified)) this.lastModified = w;
|
||||
if (this.freshUntil < (w = other.freshUntil)) this.freshUntil = w;
|
||||
if (this.urllength < (v = other.urllength)) this.urllength = v;
|
||||
if (this.urlcomps < (v = other.urlcomps)) this.urlcomps = v;
|
||||
if (this.wordsintitle < (v = other.wordsintitle)) this.wordsintitle = v;
|
||||
if (this.termFrequency < (d = other.termFrequency)) this.termFrequency = d;
|
||||
}
|
||||
|
||||
public static void join(indexRWIVarEntry ie1, indexRWIEntry ie2) {
|
||||
// returns a modified entry of the first argument
|
||||
public void join(indexRWIEntry oe) {
|
||||
// joins two entries into one entry
|
||||
|
||||
// combine the distance
|
||||
ie1.worddistance = ie1.worddistance + ((ie2 instanceof indexRWIVarEntry) ? ((indexRWIVarEntry) ie2).worddistance() : 0) + Math.abs(ie1.posintext() - ie2.posintext());
|
||||
ie1.posintext = Math.min(ie1.posintext(), ie2.posintext());
|
||||
ie1.posinphrase = (ie1.posofphrase() == ie2.posofphrase()) ? Math.min(ie1.posinphrase(), ie2.posinphrase()) : 0;
|
||||
ie1.posofphrase = Math.min(ie1.posofphrase(), ie2.posofphrase());
|
||||
this.worddistance = this.worddistance + ((oe instanceof indexRWIVarEntry) ? ((indexRWIVarEntry) oe).worddistance : 0) + Math.abs(this.posintext() - oe.posintext());
|
||||
this.posintext = Math.min(this.posintext, oe.posintext());
|
||||
this.posinphrase = (this.posofphrase == oe.posofphrase()) ? Math.min(this.posinphrase, oe.posinphrase()) : 0;
|
||||
this.posofphrase = Math.min(this.posofphrase, oe.posofphrase());
|
||||
|
||||
// combine term frequency
|
||||
ie1.termFrequency = ie1.termFrequency + ie2.termFrequency();
|
||||
ie1.wordsintext = ie1.wordsintext() + ie2.wordsintext();
|
||||
}
|
||||
|
||||
public void join(indexRWIEntry oe) {
|
||||
join(this, oe);
|
||||
this.termFrequency = this.termFrequency + oe.termFrequency();
|
||||
this.wordsintext = this.wordsintext + oe.wordsintext();
|
||||
}
|
||||
|
||||
public int hashCode() {
|
||||
return this.urlHash.hashCode();
|
||||
}
|
||||
}
|
||||
|
|
147
source/de/anomic/kelondro/kelondroSortStack.java
Normal file
147
source/de/anomic/kelondro/kelondroSortStack.java
Normal file
|
@ -0,0 +1,147 @@
|
|||
// kelondroSortStack.java
|
||||
// (C) 2008 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany
|
||||
// first published 20.02.2008 on http://yacy.net
|
||||
//
|
||||
// This is a part of YaCy, a peer-to-peer based web search engine
|
||||
//
|
||||
// $LastChangedDate: 2006-04-02 22:40:07 +0200 (So, 02 Apr 2006) $
|
||||
// $LastChangedRevision: 1986 $
|
||||
// $LastChangedBy: orbiter $
|
||||
//
|
||||
// LICENSE
|
||||
//
|
||||
// This program is free software; you can redistribute it and/or modify
|
||||
// it under the terms of the GNU General Public License as published by
|
||||
// the Free Software Foundation; either version 2 of the License, or
|
||||
// (at your option) any later version.
|
||||
//
|
||||
// This program is distributed in the hope that it will be useful,
|
||||
// but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
// GNU General Public License for more details.
|
||||
//
|
||||
// You should have received a copy of the GNU General Public License
|
||||
// along with this program; if not, write to the Free Software
|
||||
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
||||
|
||||
package de.anomic.kelondro;
|
||||
|
||||
import java.util.HashSet;
|
||||
import java.util.Iterator;
|
||||
import java.util.Map;
|
||||
import java.util.TreeMap;
|
||||
|
||||
/**
 * A stack where elements 'float' on top of the stack according to a weight
 * value: the element with the lowest weight is on top, the element with the
 * highest weight is at the bottom.
 *
 * Objects pushed on the stack must implement the hashCode() method to provide
 * a handle for a double-check: an element whose hash code was seen before is
 * never accepted a second time, even after it has left the stack.
 *
 * All methods synchronize on the stack instance.
 */
public class kelondroSortStack<E> {

    protected TreeMap<Long, E> onstack; // objects within the stack, ordered by weight
    protected HashSet<Integer> instack; // keeps track which element has been on the stack or is now in the offstack
    protected int maxsize;

    /**
     * @param maxsize the maximum number of entries in the stack; any value
     *                less than or equal to zero (e.g. -1) means unlimited
     */
    public kelondroSortStack(int maxsize) {
        this.onstack = new TreeMap<Long, E>();
        this.instack = new HashSet<Integer>();
        this.maxsize = maxsize;
    }

    /**
     * @return the number of elements currently on the stack
     */
    public synchronized int size() {
        return this.onstack.size();
    }

    /**
     * Pushes the element of the given stack element with its weight.
     */
    public synchronized void push(stackElement se) {
        push(se.element, se.weight);
    }

    /**
     * Pushes an element with the given weight. The call is ignored if an
     * element with the same hash code was pushed before. If the weight is
     * already taken, the element is stored under the nearest free weight.
     * If the stack then exceeds its maximum size, elements are dropped from
     * the bottom (highest weight first).
     */
    public synchronized void push(E element, long weight) {
        if (exists(element)) return;

        // manipulate weight in such a way that it has no conflicts
        Long w = Long.valueOf(freeWeight(weight));

        // put the element on the stack
        this.onstack.put(w, element);

        // register it for double-check
        this.instack.add(Integer.valueOf(element.hashCode()));

        // check maximum size of the stack and remove elements if the stack gets too large
        if (this.maxsize <= 0) return;
        while (this.onstack.size() > this.maxsize) {
            this.onstack.remove(this.onstack.lastKey());
        }
    }

    // finds the unused key closest to the wanted weight: probes upwards first and,
    // if Long.MAX_VALUE is reached, downwards, so that the probe can never wrap
    // around to Long.MIN_VALUE and lift a bottom element to the top of the stack
    private long freeWeight(long weight) {
        long w = weight;
        while (this.onstack.containsKey(Long.valueOf(w))) {
            if (w == Long.MAX_VALUE) {
                w = weight;
                while (this.onstack.containsKey(Long.valueOf(w))) w--;
                return w;
            }
            w++;
        }
        return w;
    }

    /**
     * @return the element that is currently on top of the stack (lowest
     *         weight) without removing it, or null if the stack is empty
     */
    public synchronized stackElement top() {
        if (this.onstack.size() == 0) return null;
        Long w = this.onstack.firstKey();
        E element = this.onstack.get(w);
        return new stackElement(element, w.longValue());
    }

    /**
     * @return the element that is currently on top of the stack (lowest
     *         weight) after removing it, or null if the stack is empty
     */
    public synchronized stackElement pop() {
        if (this.onstack.size() == 0) return null;
        Long w = this.onstack.firstKey();
        E element = this.onstack.remove(w);
        return new stackElement(element, w.longValue());
    }

    /**
     * Uses the hashCode of the element to find out if the element had been
     * pushed before, whether or not it is still on the stack.
     */
    public synchronized boolean exists(E element) {
        return this.instack.contains(Integer.valueOf(element.hashCode()));
    }

    /**
     * Same as {@link #exists(Object)}, but takes the hash code directly.
     */
    public synchronized boolean exists(int hashcode) {
        return this.instack.contains(Integer.valueOf(hashcode));
    }

    /**
     * @return the first element on the stack (in weight order) with the given
     *         hash code together with its weight, or null if there is none
     */
    public synchronized stackElement get(int hashcode) {
        for (Map.Entry<Long, E> entry : this.onstack.entrySet()) {
            if (entry.getValue().hashCode() == hashcode) {
                return new stackElement(entry.getValue(), entry.getKey().longValue());
            }
        }
        return null;
    }

    /**
     * Removes the first element on the stack (in weight order) with the given
     * hash code. The hash code stays registered, so the element is not
     * accepted by push again.
     *
     * @return the removed element with its weight, or null if there is none
     */
    public synchronized stackElement remove(int hashcode) {
        Iterator<Map.Entry<Long, E>> i = this.onstack.entrySet().iterator();
        Map.Entry<Long, E> entry;
        while (i.hasNext()) {
            entry = i.next();
            if (entry.getValue().hashCode() == hashcode) {
                // read key and value before the entry is detached from the map
                stackElement se = new stackElement(entry.getValue(), entry.getKey().longValue());
                i.remove();
                return se;
            }
        }
        return null;
    }

    /**
     * @return true if an element with that weight would be on the bottom of
     *         the stack after inserting; an element inserted into an empty
     *         stack is trivially at the bottom
     */
    public synchronized boolean bottom(long weight) {
        if (this.onstack.isEmpty()) return true; // lastKey() would throw on an empty map
        return weight > this.onstack.lastKey().longValue();
    }

    /**
     * An element together with the weight under which it is stored.
     */
    public class stackElement {
        public long weight;
        public E element;
        public stackElement(E element, long weight) {
            this.element = element;
            this.weight = weight;
        }
    }
}
|
135
source/de/anomic/kelondro/kelondroSortStore.java
Normal file
135
source/de/anomic/kelondro/kelondroSortStore.java
Normal file
|
@ -0,0 +1,135 @@
|
|||
// kelondroSortStore.java
|
||||
// (C) 2008 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany
|
||||
// first published 20.02.2008 on http://yacy.net
|
||||
//
|
||||
// This is a part of YaCy, a peer-to-peer based web search engine
|
||||
//
|
||||
// $LastChangedDate: 2006-04-02 22:40:07 +0200 (So, 02 Apr 2006) $
|
||||
// $LastChangedRevision: 1986 $
|
||||
// $LastChangedBy: orbiter $
|
||||
//
|
||||
// LICENSE
|
||||
//
|
||||
// This program is free software; you can redistribute it and/or modify
|
||||
// it under the terms of the GNU General Public License as published by
|
||||
// the Free Software Foundation; either version 2 of the License, or
|
||||
// (at your option) any later version.
|
||||
//
|
||||
// This program is distributed in the hope that it will be useful,
|
||||
// but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
// GNU General Public License for more details.
|
||||
//
|
||||
// You should have received a copy of the GNU General Public License
|
||||
// along with this program; if not, write to the Free Software
|
||||
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
||||
|
||||
package de.anomic.kelondro;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Iterator;
|
||||
|
||||
public class kelondroSortStore<E> extends kelondroSortStack<E> {
|
||||
|
||||
// extends the sortStack in such a way that it adds a list where objects, that had
|
||||
// been pulled from the stack with pop are listed. Provides access methods to address
|
||||
// specific elements in the list.
|
||||
|
||||
private ArrayList<stackElement> offstack; // objects that had been on the stack but had been removed
|
||||
|
||||
public kelondroSortStore(int maxsize) {
|
||||
super(maxsize);
|
||||
this.offstack = new ArrayList<stackElement>();
|
||||
}
|
||||
|
||||
public int size() {
|
||||
return super.onstack.size() + this.offstack.size();
|
||||
}
|
||||
|
||||
public int sizeStore() {
|
||||
return this.offstack.size();
|
||||
}
|
||||
|
||||
public synchronized void push(E element, long weight) {
|
||||
super.push(element, weight);
|
||||
if (this.maxsize <= 0) return;
|
||||
while ((this.onstack.size() > 0) && (super.onstack.size() + this.offstack.size() > this.maxsize)) {
|
||||
this.onstack.remove(this.onstack.lastKey());
|
||||
}
|
||||
}
|
||||
|
||||
public synchronized stackElement pop() {
|
||||
// returns the element that is currently on top of the stack
|
||||
// it is removed and added to the offstack list
|
||||
// this is exactly the same as element(offstack.size())
|
||||
stackElement se = super.pop();
|
||||
if (se == null) return null;
|
||||
this.offstack.add(se);
|
||||
return se;
|
||||
}
|
||||
|
||||
public synchronized stackElement element(int position) {
|
||||
// returns an element from a specific position. It is either taken from the offstack,
|
||||
// or removed from the onstack.
|
||||
// The offstack will grow if elements are not from the offstack and present at the onstack.
|
||||
if (position < this.offstack.size()) {
|
||||
return this.offstack.get(position);
|
||||
}
|
||||
if (position >= size()) return null; // we don't have that element
|
||||
while (position >= this.offstack.size()) {
|
||||
Long w = this.onstack.firstKey();
|
||||
E element = this.onstack.remove(w);
|
||||
stackElement se = new stackElement(element, w.longValue());
|
||||
this.offstack.add(se);
|
||||
}
|
||||
return this.offstack.get(position);
|
||||
}
|
||||
|
||||
public ArrayList<stackElement> list(int count) {
|
||||
// returns the specific amount of entries. If they are not yet present in the offstack, they are shifted there from the onstack
|
||||
// if count is < 0 then all elements are taken
|
||||
// the returned list is not cloned from the internal list and shall not be modified in any way (read-only)
|
||||
if (count < 0) {
|
||||
// shift all elements
|
||||
while (this.onstack.size() > 0) {
|
||||
Long w = this.onstack.firstKey();
|
||||
E element = this.onstack.remove(w);
|
||||
stackElement se = new stackElement(element, w.longValue());
|
||||
this.offstack.add(se);
|
||||
}
|
||||
return this.offstack;
|
||||
}
|
||||
if (size() < count) throw new RuntimeException("list(" + count + ") exceeded avaiable number of elements (" + size() + ")");
|
||||
while (this.onstack.size() < count) {
|
||||
Long w = this.onstack.firstKey();
|
||||
E element = this.onstack.remove(w);
|
||||
stackElement se = new stackElement(element, w.longValue());
|
||||
this.offstack.add(se);
|
||||
}
|
||||
return this.offstack;
|
||||
}
|
||||
|
||||
public stackElement get(int hashcode) {
|
||||
stackElement se = super.get(hashcode);
|
||||
if (se != null) return se;
|
||||
Iterator<stackElement> j = this.offstack.iterator();
|
||||
while (j.hasNext()) {
|
||||
se = j.next();
|
||||
if (se.element.hashCode() == hashcode) return se;
|
||||
}
|
||||
return null;
|
||||
}
|
||||
|
||||
public stackElement remove(int hashcode) {
|
||||
stackElement se = super.remove(hashcode);
|
||||
if (se != null) return se;
|
||||
for (int j = 0; j < this.offstack.size(); j++) {
|
||||
se = this.offstack.get(j);
|
||||
if (se.element.hashCode() == hashcode) {
|
||||
this.offstack.remove(j);
|
||||
return se;
|
||||
}
|
||||
}
|
||||
return null;
|
||||
}
|
||||
}
|
|
@ -38,9 +38,12 @@ import java.util.TreeSet;
|
|||
|
||||
import de.anomic.index.indexContainer;
|
||||
import de.anomic.index.indexRWIEntry;
|
||||
import de.anomic.index.indexRWIVarEntry;
|
||||
import de.anomic.index.indexURLEntry;
|
||||
import de.anomic.kelondro.kelondroBitfield;
|
||||
import de.anomic.kelondro.kelondroMSetTools;
|
||||
import de.anomic.kelondro.kelondroSortStack;
|
||||
import de.anomic.kelondro.kelondroSortStore;
|
||||
import de.anomic.plasma.plasmaSnippetCache.MediaSnippet;
|
||||
import de.anomic.server.serverProfiling;
|
||||
import de.anomic.server.logging.serverLog;
|
||||
|
@ -77,8 +80,7 @@ public final class plasmaSearchEvent {
|
|||
public TreeMap<String, Integer> IACount;
|
||||
public String IAmaxcounthash, IAneardhthash;
|
||||
private resultWorker[] workerThreads;
|
||||
private ArrayList<ResultEntry> resultList;
|
||||
//private int resultListLock; // a pointer that shows that all elements below this pointer are fixed and may not be changed again
|
||||
private kelondroSortStore<ResultEntry> result;
|
||||
private HashMap<String, String> failedURLs; // a mapping from a urlhash to a fail reason string
|
||||
TreeSet<String> snippetFetchWordHashes; // a set of word hashes that are used to match with the snippets
|
||||
private long urlRetrievalAllTime;
|
||||
|
@ -104,8 +106,7 @@ public final class plasmaSearchEvent {
|
|||
this.snippetComputationAllTime = 0;
|
||||
this.workerThreads = null;
|
||||
this.localSearchThread = null;
|
||||
this.resultList = new ArrayList<ResultEntry>(10); // this is the result set which is filled up with search results, enriched with snippets
|
||||
//this.resultListLock = 0; // no locked elements until now
|
||||
this.result = new kelondroSortStore<ResultEntry>(-1); // this is the result, enriched with snippets, ranked and ordered by ranking
|
||||
this.failedURLs = new HashMap<String, String>(); // a map of urls to reason strings where a worker thread tried to work on, but failed.
|
||||
|
||||
// snippets do not need to match with the complete query hashes,
|
||||
|
@ -202,7 +203,7 @@ public final class plasmaSearchEvent {
|
|||
ResultEntry resultEntry;
|
||||
yacyURL url;
|
||||
synchronized (rankedCache) {
|
||||
while ((rankedCache.size() > 0) && ((uentry = rankedCache.bestURL(true)) != null) && (resultList.size() < (query.neededResults()))) {
|
||||
while ((rankedCache.size() > 0) && ((uentry = rankedCache.bestURL(true)) != null) && (result.size() < (query.neededResults()))) {
|
||||
url = uentry.comp().url();
|
||||
if (url == null) continue;
|
||||
//System.out.println("***DEBUG*** SEARCH RESULT URL=" + url.toNormalform(false, false));
|
||||
|
@ -213,9 +214,7 @@ public final class plasmaSearchEvent {
|
|||
snippetComputationAllTime += resultEntry.snippetComputationTime;
|
||||
|
||||
// place the result to the result vector
|
||||
synchronized (resultList) {
|
||||
resultList.add(resultEntry);
|
||||
}
|
||||
result.push(resultEntry, rankedCache.getOrder().cardinal(resultEntry.word()));
|
||||
|
||||
// add references
|
||||
synchronized (rankedCache) {
|
||||
|
@ -223,7 +222,7 @@ public final class plasmaSearchEvent {
|
|||
}
|
||||
}
|
||||
}
|
||||
serverProfiling.update("SEARCH", new plasmaProfiling.searchEvent(query.id(true), "offline snippet fetch", resultList.size(), System.currentTimeMillis() - timer));
|
||||
serverProfiling.update("SEARCH", new plasmaProfiling.searchEvent(query.id(true), "offline snippet fetch", result.size(), System.currentTimeMillis() - timer));
|
||||
}
|
||||
|
||||
// clean up events
|
||||
|
@ -466,8 +465,8 @@ public final class plasmaSearchEvent {
|
|||
// if worker threads had been alive, but did not succeed, start them again to fetch missing links
|
||||
if ((query.onlineSnippetFetch) &&
|
||||
(!event.anyWorkerAlive()) &&
|
||||
(event.resultList.size() < query.neededResults() + 10) &&
|
||||
((event.getRankingResult().getLocalResourceSize() + event.getRankingResult().getRemoteResourceSize()) > event.resultList.size())) {
|
||||
(event.result.size() < query.neededResults() + 10) &&
|
||||
(event.getRankingResult().getLocalResourceSize() + event.getRankingResult().getRemoteResourceSize() > event.result.size())) {
|
||||
// set new timeout
|
||||
event.eventTime = System.currentTimeMillis();
|
||||
// start worker threads to fetch urls and snippets
|
||||
|
@ -508,7 +507,7 @@ public final class plasmaSearchEvent {
|
|||
while (System.currentTimeMillis() < this.timeout) {
|
||||
this.lastLifeSign = System.currentTimeMillis();
|
||||
|
||||
if (resultList.size() >= query.neededResults() /*+ query.displayResults()*/) break; // we have enough
|
||||
if (result.size() >= query.neededResults() /*+ query.displayResults()*/) break; // we have enough
|
||||
|
||||
// get next entry
|
||||
page = rankedCache.bestURL(true);
|
||||
|
@ -531,21 +530,8 @@ public final class plasmaSearchEvent {
|
|||
//System.out.println("+++DEBUG-resultWorker+++ fetched " + resultEntry.urlstring());
|
||||
|
||||
// place the result to the result vector
|
||||
boolean d = false;
|
||||
synchronized (resultList) {
|
||||
doublecheck: for (int i = 0; i < resultList.size(); i++) {
|
||||
if (resultList.get(i).urlcomps.url().hash().equals(resultEntry.urlcomps.url().hash())) {
|
||||
d = true;
|
||||
break doublecheck;
|
||||
}
|
||||
}
|
||||
if (!d) {
|
||||
resultList.add(resultEntry);
|
||||
}
|
||||
}
|
||||
|
||||
// add references
|
||||
if (!d) synchronized (rankedCache) {
|
||||
if (!result.exists(resultEntry)) {
|
||||
result.push(resultEntry, rankedCache.getOrder().cardinal(resultEntry.word()));
|
||||
rankedCache.addReferences(resultEntry);
|
||||
}
|
||||
//System.out.println("DEBUG SNIPPET_LOADING: thread " + id + " got " + resultEntry.url());
|
||||
|
@ -554,10 +540,7 @@ public final class plasmaSearchEvent {
|
|||
}
|
||||
|
||||
private boolean anyResultWith(String urlhash) {
|
||||
for (int i = 0; i < resultList.size(); i++) {
|
||||
if (((ResultEntry) resultList.get(i)).urlentry.hash().equals(urlhash)) return true;
|
||||
}
|
||||
return false;
|
||||
return result.exists(urlhash.hashCode());
|
||||
}
|
||||
|
||||
private boolean anyFailureWith(String urlhash) {
|
||||
|
@ -576,6 +559,11 @@ public final class plasmaSearchEvent {
|
|||
|
||||
public ResultEntry oneResult(int item) {
|
||||
// first sleep a while to give accumulation threads a chance to work
|
||||
if (this.result.sizeStore() > item) {
|
||||
// we have the wanted result already in the result array .. return that
|
||||
return this.result.element(item).element;
|
||||
}
|
||||
|
||||
if ((query.domType == plasmaSearchQuery.SEARCHDOM_GLOBALDHT) ||
|
||||
(query.domType == plasmaSearchQuery.SEARCHDOM_CLUSTERALL)) {
|
||||
// this is a search using remote search threads. Also the local search thread is started as background process
|
||||
|
@ -586,45 +574,28 @@ public final class plasmaSearchEvent {
|
|||
}
|
||||
// now wait until as many remote worker threads have finished, as we want to display results
|
||||
while ((this.primarySearchThreads != null) && (this.primarySearchThreads.length > item) && (anyWorkerAlive()) &&
|
||||
((this.resultList.size() <= item) || (countFinishedRemoteSearch() <= item))) {
|
||||
((result.size() <= item) || (countFinishedRemoteSearch() <= item))) {
|
||||
try {Thread.sleep(100);} catch (InterruptedException e) {}
|
||||
}
|
||||
|
||||
}
|
||||
// finally wait until enough results are there produced from the snippet fetch process
|
||||
while ((anyWorkerAlive()) && (this.resultList.size() <= item)) {
|
||||
while ((anyWorkerAlive()) && (result.size() <= item)) {
|
||||
try {Thread.sleep(100);} catch (InterruptedException e) {}
|
||||
}
|
||||
|
||||
// finally, if there is something, return the result
|
||||
synchronized (this.resultList) {
|
||||
// check if we have enough entries
|
||||
if (this.resultList.size() <= item) return null;
|
||||
|
||||
// fetch the best entry from the resultList, not the entry from item position
|
||||
// whenever a specific entry was switched in its position and was returned here
|
||||
// a moving pointer is set to assign that item position as not changeable
|
||||
int bestpick = item; //postRankingFavourite(item);
|
||||
if (bestpick != item) {
|
||||
// switch the elements
|
||||
ResultEntry buf = (ResultEntry) this.resultList.get(bestpick);
|
||||
serverLog.logInfo("SEARCH_POSTRANKING", "prefering [" + bestpick + "] " + buf.urlstring() + " over [" + item + "] " + ((ResultEntry) this.resultList.get(item)).urlstring());
|
||||
this.resultList.set(bestpick, (ResultEntry) this.resultList.get(item));
|
||||
this.resultList.set(item, buf);
|
||||
}
|
||||
|
||||
//this.resultListLock = item; // lock the element; be prepared to return it
|
||||
return (ResultEntry) this.resultList.get(item);
|
||||
}
|
||||
if (this.result.size() <= item) return null;
|
||||
return this.result.element(item).element;
|
||||
}
|
||||
|
||||
public ArrayList<ResultEntry> completeResults(long waitingtime) {
|
||||
|
||||
public ArrayList<kelondroSortStack<ResultEntry>.stackElement> completeResults(long waitingtime) {
|
||||
long timeout = System.currentTimeMillis() + waitingtime;
|
||||
while ((this.resultList.size() < query.neededResults()) && (anyWorkerAlive()) && (System.currentTimeMillis() < timeout)) {
|
||||
while ((result.size() < query.neededResults()) && (anyWorkerAlive()) && (System.currentTimeMillis() < timeout)) {
|
||||
try {Thread.sleep(100);} catch (InterruptedException e) {}
|
||||
//System.out.println("+++DEBUG-completeResults+++ sleeping " + 200);
|
||||
}
|
||||
return this.resultList;
|
||||
return this.result.list(this.result.size());
|
||||
}
|
||||
|
||||
boolean secondarySearchStartet = false;
|
||||
|
@ -789,7 +760,9 @@ public final class plasmaSearchEvent {
|
|||
if ((p = alternative_urlname.indexOf("?")) > 0) alternative_urlname = alternative_urlname.substring(0, p);
|
||||
}
|
||||
}
|
||||
|
||||
public int hashCode() {
|
||||
return urlentry.hash().hashCode();
|
||||
}
|
||||
public String hash() {
|
||||
return urlentry.hash();
|
||||
}
|
||||
|
@ -832,8 +805,10 @@ public final class plasmaSearchEvent {
|
|||
public int lapp() {
|
||||
return urlentry.lapp();
|
||||
}
|
||||
public indexRWIEntry word() {
|
||||
return urlentry.word();
|
||||
public indexRWIVarEntry word() {
|
||||
indexRWIEntry word = urlentry.word();
|
||||
assert word instanceof indexRWIVarEntry;
|
||||
return (indexRWIVarEntry) word;
|
||||
}
|
||||
public boolean hasTextSnippet() {
|
||||
return (this.textSnippet != null) && (this.textSnippet.getErrorCode() < 11);
|
||||
|
|
|
@ -33,7 +33,6 @@ import java.util.HashMap;
|
|||
import java.util.Iterator;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
import java.util.TreeMap;
|
||||
import java.util.TreeSet;
|
||||
|
||||
import de.anomic.htmlFilter.htmlFilterContentScraper;
|
||||
|
@ -45,6 +44,7 @@ import de.anomic.index.indexRWIVarEntry;
|
|||
import de.anomic.index.indexURLEntry;
|
||||
import de.anomic.kelondro.kelondroBinSearch;
|
||||
import de.anomic.kelondro.kelondroMScoreCluster;
|
||||
import de.anomic.kelondro.kelondroSortStack;
|
||||
import de.anomic.server.serverCodings;
|
||||
import de.anomic.server.serverFileUtils;
|
||||
import de.anomic.server.serverProfiling;
|
||||
|
@ -54,15 +54,15 @@ public final class plasmaSearchRankingProcess {
|
|||
public static kelondroBinSearch[] ybrTables = null; // block-rank tables
|
||||
private static boolean useYBR = true;
|
||||
|
||||
private TreeMap<Object, indexRWIVarEntry> sortedRWIEntries; // key = ranking (Long); value = indexRWIEntry; if sortorder < 2 then key is instance of String
|
||||
private HashMap<String, TreeMap<Object, indexRWIVarEntry>> doubleDomCache; // key = domhash (6 bytes); value = TreeMap like sortedRWIEntries
|
||||
private kelondroSortStack<indexRWIVarEntry> stack;
|
||||
private HashMap<String, kelondroSortStack<indexRWIVarEntry>> doubleDomCache; // key = domhash (6 bytes); value = like stack
|
||||
private HashMap<String, String> handover; // key = urlhash, value = urlstring; used for double-check of urls that had been handed over to search process
|
||||
private plasmaSearchQuery query;
|
||||
private int sortorder;
|
||||
private int maxentries;
|
||||
private int remote_peerCount, remote_indexCount, remote_resourceSize, local_resourceSize;
|
||||
private indexRWIEntryOrder order;
|
||||
private HashMap<String, Object> urlhashes; // map for double-check; String/Long relation, addresses ranking number (backreference for deletion)
|
||||
private HashMap<String, Integer> urlhashes; // map for double-check; String/Long relation, addresses ranking number (backreference for deletion)
|
||||
private kelondroMScoreCluster<String> ref; // reference score computation for the commonSense heuristic
|
||||
private int[] flagcount; // flag counter
|
||||
private TreeSet<String> misses; // contains url-hashes that could not been found in the LURL-DB
|
||||
|
@ -74,17 +74,17 @@ public final class plasmaSearchRankingProcess {
|
|||
// attention: if minEntries is too high, this method will not terminate within the maxTime
|
||||
// sortorder: 0 = hash, 1 = url, 2 = ranking
|
||||
this.localSearchContainerMaps = null;
|
||||
this.sortedRWIEntries = new TreeMap<Object, indexRWIVarEntry>();
|
||||
this.doubleDomCache = new HashMap<String, TreeMap<Object, indexRWIVarEntry>>();
|
||||
this.stack = new kelondroSortStack<indexRWIVarEntry>(maxentries);
|
||||
this.doubleDomCache = new HashMap<String, kelondroSortStack<indexRWIVarEntry>>();
|
||||
this.handover = new HashMap<String, String>();
|
||||
this.order = null;
|
||||
this.order = (query == null) ? null : new indexRWIEntryOrder(query.ranking);
|
||||
this.query = query;
|
||||
this.maxentries = maxentries;
|
||||
this.remote_peerCount = 0;
|
||||
this.remote_indexCount = 0;
|
||||
this.remote_resourceSize = 0;
|
||||
this.local_resourceSize = 0;
|
||||
this.urlhashes = new HashMap<String, Object>();
|
||||
this.urlhashes = new HashMap<String, Integer>();
|
||||
this.ref = new kelondroMScoreCluster<String>();
|
||||
this.misses = new TreeSet<String>();
|
||||
this.wordIndex = wordIndex;
|
||||
|
@ -93,6 +93,10 @@ public final class plasmaSearchRankingProcess {
|
|||
for (int i = 0; i < 32; i++) {this.flagcount[i] = 0;}
|
||||
}
|
||||
|
||||
public long ranking(indexRWIVarEntry word) {
|
||||
return order.cardinal(word);
|
||||
}
|
||||
|
||||
public void execQuery() {
|
||||
|
||||
long timer = System.currentTimeMillis();
|
||||
|
@ -150,21 +154,21 @@ public final class plasmaSearchRankingProcess {
|
|||
|
||||
// load url
|
||||
if (sortorder == 0) {
|
||||
this.sortedRWIEntries.put(ientry.urlHash(), ientry);
|
||||
this.urlhashes.put(ientry.urlHash(), ientry.urlHash());
|
||||
this.stack.push(ientry, ientry.urlHash().hashCode());
|
||||
this.urlhashes.put(ientry.urlHash(), ientry.urlHash().hashCode());
|
||||
} else {
|
||||
uentry = wordIndex.loadedURL.load(ientry.urlHash(), ientry, 0);
|
||||
if (uentry == null) {
|
||||
this.misses.add(ientry.urlHash());
|
||||
} else {
|
||||
u = uentry.comp().url().toNormalform(false, true);
|
||||
this.sortedRWIEntries.put(u, ientry);
|
||||
this.urlhashes.put(ientry.urlHash(), u);
|
||||
this.stack.push(ientry, u.hashCode());
|
||||
this.urlhashes.put(ientry.urlHash(), u.hashCode());
|
||||
}
|
||||
}
|
||||
|
||||
// interrupt if we have enough
|
||||
if ((query.neededResults() > 0) && (this.misses.size() + this.sortedRWIEntries.size() > query.neededResults())) break loop;
|
||||
if ((query.neededResults() > 0) && (this.misses.size() + this.stack.size() > query.neededResults())) break loop;
|
||||
} // end loop
|
||||
}
|
||||
|
||||
|
@ -182,22 +186,20 @@ public final class plasmaSearchRankingProcess {
|
|||
}
|
||||
|
||||
long timer = System.currentTimeMillis();
|
||||
if (this.order == null) {
|
||||
this.order = new indexRWIEntryOrder(query.ranking);
|
||||
}
|
||||
|
||||
// normalize entries
|
||||
ArrayList<indexRWIVarEntry> decodedEntries = this.order.normalizeWith(index);
|
||||
serverProfiling.update("SEARCH", new plasmaProfiling.searchEvent(query.id(true), plasmaSearchEvent.NORMALIZING, index.size(), System.currentTimeMillis() - timer));
|
||||
|
||||
// normalize entries and get ranking
|
||||
// iterate over normalized entries and select some that are better than currently stored
|
||||
timer = System.currentTimeMillis();
|
||||
Iterator<indexRWIVarEntry> i = decodedEntries.iterator();
|
||||
indexRWIVarEntry iEntry, l;
|
||||
long biggestEntry = 0;
|
||||
//long s0 = System.currentTimeMillis();
|
||||
indexRWIVarEntry iEntry;
|
||||
Long r;
|
||||
while (i.hasNext()) {
|
||||
iEntry = i.next();
|
||||
if (iEntry.urlHash().length() != index.row().primaryKeyLength) continue;
|
||||
assert (iEntry.urlHash().length() == index.row().primaryKeyLength);
|
||||
//if (iEntry.urlHash().length() != index.row().primaryKeyLength) continue;
|
||||
|
||||
// increase flag counts
|
||||
for (int j = 0; j < 32; j++) {
|
||||
|
@ -206,31 +208,32 @@ public final class plasmaSearchRankingProcess {
|
|||
|
||||
// kick out entries that are too bad according to current findings
|
||||
r = new Long(order.cardinal(iEntry));
|
||||
if ((maxentries >= 0) && (sortedRWIEntries.size() >= maxentries) && (r.longValue() > biggestEntry)) continue;
|
||||
if ((maxentries >= 0) && (stack.size() >= maxentries) && (stack.bottom(r.longValue()))) continue;
|
||||
|
||||
// check constraints
|
||||
if (!testFlags(iEntry)) continue;
|
||||
|
||||
// check document domain
|
||||
if (query.contentdom != plasmaSearchQuery.CONTENTDOM_TEXT) {
|
||||
if ((query.contentdom == plasmaSearchQuery.CONTENTDOM_AUDIO) && (!(iEntry.flags().get(plasmaCondenser.flag_cat_hasaudio)))) continue;
|
||||
if ((query.contentdom == plasmaSearchQuery.CONTENTDOM_VIDEO) && (!(iEntry.flags().get(plasmaCondenser.flag_cat_hasvideo)))) continue;
|
||||
if ((query.contentdom == plasmaSearchQuery.CONTENTDOM_IMAGE) && (!(iEntry.flags().get(plasmaCondenser.flag_cat_hasimage)))) continue;
|
||||
if ((query.contentdom == plasmaSearchQuery.CONTENTDOM_APP ) && (!(iEntry.flags().get(plasmaCondenser.flag_cat_hasapp )))) continue;
|
||||
}
|
||||
if ((maxentries < 0) || (sortedRWIEntries.size() < maxentries)) {
|
||||
|
||||
// insert
|
||||
if ((maxentries < 0) || (stack.size() < maxentries)) {
|
||||
// in case that we don't have enough yet, accept any new entry
|
||||
if (urlhashes.containsKey(iEntry.urlHash())) continue;
|
||||
while (sortedRWIEntries.containsKey(r)) r = new Long(r.longValue() + 1);
|
||||
sortedRWIEntries.put(r, iEntry);
|
||||
stack.push(iEntry, r);
|
||||
} else {
|
||||
if (r.longValue() > biggestEntry) {
|
||||
// if we already have enough entries, insert only such that are necessary to get a better result
|
||||
if (stack.bottom(r.longValue())) {
|
||||
continue;
|
||||
} else {
|
||||
// double-check
|
||||
if (urlhashes.containsKey(iEntry.urlHash())) continue;
|
||||
l = sortedRWIEntries.remove((Long) sortedRWIEntries.lastKey());
|
||||
urlhashes.remove(l.urlHash());
|
||||
while (sortedRWIEntries.containsKey(r)) r = new Long(r.longValue() + 1);
|
||||
sortedRWIEntries.put(r, iEntry);
|
||||
biggestEntry = order.cardinal(sortedRWIEntries.get(sortedRWIEntries.lastKey()));
|
||||
stack.push(iEntry, r);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -271,85 +274,69 @@ public final class plasmaSearchRankingProcess {
|
|||
// - root-domain guessing to prefer the root domain over other urls if search word appears in domain name
|
||||
|
||||
|
||||
private synchronized Object[] /*{Object, indexRWIEntry}*/ bestRWI(boolean skipDoubleDom) {
|
||||
private synchronized kelondroSortStack<indexRWIVarEntry>.stackElement bestRWI(boolean skipDoubleDom) {
|
||||
// returns from the current RWI list the best entry and removed this entry from the list
|
||||
Object bestEntry;
|
||||
TreeMap<Object, indexRWIVarEntry> m;
|
||||
indexRWIVarEntry rwi;
|
||||
while (sortedRWIEntries.size() > 0) {
|
||||
bestEntry = sortedRWIEntries.firstKey();
|
||||
rwi = sortedRWIEntries.remove(bestEntry);
|
||||
if (!skipDoubleDom) return new Object[]{bestEntry, rwi};
|
||||
kelondroSortStack<indexRWIVarEntry> m;
|
||||
kelondroSortStack<indexRWIVarEntry>.stackElement rwi;
|
||||
while (stack.size() > 0) {
|
||||
rwi = stack.pop();
|
||||
if (!skipDoubleDom) return rwi;
|
||||
// check doubledom
|
||||
String domhash = rwi.urlHash().substring(6);
|
||||
String domhash = rwi.element.urlHash().substring(6);
|
||||
m = this.doubleDomCache.get(domhash);
|
||||
if (m == null) {
|
||||
// first appearance of dom
|
||||
m = new TreeMap<Object, indexRWIVarEntry>();
|
||||
m = new kelondroSortStack<indexRWIVarEntry>(-1);
|
||||
this.doubleDomCache.put(domhash, m);
|
||||
return new Object[]{bestEntry, rwi};
|
||||
return rwi;
|
||||
}
|
||||
// second appearances of dom
|
||||
m.put(bestEntry, rwi);
|
||||
m.push(rwi);
|
||||
}
|
||||
// no more entries in sorted RWI entries. Now take Elements from the doubleDomCache
|
||||
// find best entry from all caches
|
||||
Iterator<TreeMap<Object, indexRWIVarEntry>> i = this.doubleDomCache.values().iterator();
|
||||
bestEntry = null;
|
||||
Object o;
|
||||
indexRWIVarEntry bestrwi = null;
|
||||
Iterator<kelondroSortStack<indexRWIVarEntry>> i = this.doubleDomCache.values().iterator();
|
||||
kelondroSortStack<indexRWIVarEntry>.stackElement bestEntry = null;
|
||||
kelondroSortStack<indexRWIVarEntry>.stackElement o;
|
||||
while (i.hasNext()) {
|
||||
m = i.next();
|
||||
if (m.size() == 0) continue;
|
||||
if (bestEntry == null) {
|
||||
bestEntry = m.firstKey();
|
||||
bestrwi = m.remove(bestEntry);
|
||||
bestEntry = m.top();
|
||||
continue;
|
||||
}
|
||||
o = m.firstKey();
|
||||
rwi = m.remove(o);
|
||||
if (o instanceof Long) {
|
||||
if (((Long) o).longValue() < ((Long) bestEntry).longValue()) {
|
||||
bestEntry = o;
|
||||
bestrwi = rwi;
|
||||
}
|
||||
}
|
||||
if (o instanceof String) {
|
||||
if (((String) o).compareTo((String) bestEntry) < 0) {
|
||||
bestEntry = o;
|
||||
bestrwi = rwi;
|
||||
}
|
||||
o = m.top();
|
||||
if (o.weight < bestEntry.weight) {
|
||||
bestEntry = o;
|
||||
}
|
||||
}
|
||||
if (bestrwi == null) return null;
|
||||
if (bestEntry == null) return null;
|
||||
// finally remove the best entry from the doubledom cache
|
||||
m = this.doubleDomCache.get(bestrwi.urlHash().substring(6));
|
||||
m.remove(bestEntry);
|
||||
return new Object[]{bestEntry, bestrwi};
|
||||
m = this.doubleDomCache.get(bestEntry.element.urlHash().substring(6));
|
||||
o = m.pop();
|
||||
assert o.element.urlHash().equals(bestEntry.element.urlHash());
|
||||
return bestEntry;
|
||||
}
|
||||
|
||||
public synchronized indexURLEntry bestURL(boolean skipDoubleDom) {
|
||||
// returns from the current RWI list the best URL entry and removed this entry from the list
|
||||
while ((sortedRWIEntries.size() > 0) || (size() > 0)) {
|
||||
Object[] obrwi = bestRWI(skipDoubleDom);
|
||||
Object bestEntry = obrwi[0];
|
||||
indexRWIVarEntry ientry = (indexRWIVarEntry) obrwi[1];
|
||||
long ranking = (bestEntry instanceof Long) ? ((Long) bestEntry).longValue() : 0;
|
||||
indexURLEntry u = wordIndex.loadedURL.load(ientry.urlHash(), ientry, ranking);
|
||||
while ((stack.size() > 0) || (size() > 0)) {
|
||||
kelondroSortStack<indexRWIVarEntry>.stackElement obrwi = bestRWI(skipDoubleDom);
|
||||
indexURLEntry u = wordIndex.loadedURL.load(obrwi.element.urlHash(), obrwi.element, obrwi.weight);
|
||||
if (u != null) {
|
||||
indexURLEntry.Components comp = u.comp();
|
||||
if (comp.url() != null) this.handover.put(u.hash(), comp.url().toNormalform(true, false)); // remember that we handed over this url
|
||||
return u;
|
||||
}
|
||||
misses.add(ientry.urlHash());
|
||||
misses.add(obrwi.element.urlHash());
|
||||
}
|
||||
return null;
|
||||
}
|
||||
|
||||
public synchronized int size() {
|
||||
//assert sortedRWIEntries.size() == urlhashes.size() : "sortedRWIEntries.size() = " + sortedRWIEntries.size() + ", urlhashes.size() = " + urlhashes.size();
|
||||
int c = sortedRWIEntries.size();
|
||||
Iterator<TreeMap<Object, indexRWIVarEntry>> i = this.doubleDomCache.values().iterator();
|
||||
int c = stack.size();
|
||||
Iterator<kelondroSortStack<indexRWIVarEntry>> i = this.doubleDomCache.values().iterator();
|
||||
while (i.hasNext()) c += i.next().size();
|
||||
return c;
|
||||
}
|
||||
|
@ -362,7 +349,7 @@ public final class plasmaSearchRankingProcess {
|
|||
|
||||
public int filteredCount() {
|
||||
// the number of index entries that are considered as result set
|
||||
return this.sortedRWIEntries.size();
|
||||
return this.stack.size();
|
||||
}
|
||||
|
||||
public int getRemoteIndexCount() {
|
||||
|
@ -385,14 +372,11 @@ public final class plasmaSearchRankingProcess {
|
|||
return this.local_resourceSize;
|
||||
}
|
||||
|
||||
|
||||
public indexRWIEntry remove(String urlHash) {
|
||||
Object r = (Long) urlhashes.get(urlHash);
|
||||
if (r == null) return null;
|
||||
assert sortedRWIEntries.containsKey(r);
|
||||
indexRWIEntry iEntry = (indexRWIEntry) sortedRWIEntries.remove(r);
|
||||
kelondroSortStack<indexRWIVarEntry>.stackElement se = stack.remove(urlHash.hashCode());
|
||||
if (se == null) return null;
|
||||
urlhashes.remove(urlHash);
|
||||
return iEntry;
|
||||
return se.element;
|
||||
}
|
||||
|
||||
public Iterator<String> miss() {
|
||||
|
|
|
@ -1,7 +1,7 @@
|
|||
# YaCy Network Group Definition
|
||||
# -----------------------------
|
||||
# This is an addition to the yacy.network.unit configuration file.
|
||||
# This file is adressed by the network.group.definition property in yacy.init
|
||||
# This file is addressed by the network.group.definition property in yacy.init
|
||||
# The purpose of a group within a network is that some parts of a network may be managed independently,
|
||||
# while the content of the network stays private for the whole network, mostly for a special purpose.
|
||||
# This file needs to be configured if someone wants to participate with several peers to the network,
|
||||
|
|
Loading…
Reference in New Issue
Block a user