mirror of
https://github.com/yacy/yacy_search_server.git
synced 2024-09-21 00:00:13 +02:00
added word-position to ranking (this is only a first step)
git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@1395 6c8d7289-2bf4-0310-a012-ef5d649a1542
This commit is contained in:
parent
bb2095fe39
commit
fc4ae899f7
|
@ -114,40 +114,6 @@ public final class plasmaCondenser {
|
|||
return words.entrySet().iterator();
|
||||
}
|
||||
|
||||
/*
|
||||
public int wordCount(String word) {
|
||||
// number of occurrences of one word
|
||||
// if the word did not occur, this simply returns 0
|
||||
wordStatProp sp = (wordStatProp) words.get(word);
|
||||
if (sp == null) return 0;
|
||||
return sp.count;
|
||||
}
|
||||
|
||||
public int wordPositionInText(String word) {
|
||||
// position of word in text
|
||||
// if unknown and word does not exist, the method returns 0
|
||||
wordStatProp sp = (wordStatProp) words.get(word);
|
||||
if (sp == null) return 0;
|
||||
return sp.posInText;
|
||||
}
|
||||
|
||||
public int wordPositionInPhrase(String word) {
|
||||
// position of the word within its phrase
|
||||
// if unknown and word does not exist, the method returns 0
|
||||
wordStatProp sp = (wordStatProp) words.get(word);
|
||||
if (sp == null) return 0;
|
||||
return sp.posInPhrase;
|
||||
}
|
||||
|
||||
public int wordNumberOfPhrase(String word) {
|
||||
// number of the phrase in which the word occurs
|
||||
// if unknown and word does not exist, the method returns 0
|
||||
wordStatProp sp = (wordStatProp) words.get(word);
|
||||
if (sp == null) return 0;
|
||||
return sp.numOfPhrase;
|
||||
}
|
||||
*/
|
||||
|
||||
public static class wordStatProp {
|
||||
// object carries statistics for words and sentences
|
||||
|
||||
|
|
|
@ -137,7 +137,9 @@ public final class plasmaSearchPreOrder {
|
|||
else if (query.order[i].equals(plasmaSearchQuery.ORDER_YBR)) ranking = factor * ybr_p(indexEntry.getUrlHash());
|
||||
factor = factor / 4096L;
|
||||
}
|
||||
|
||||
int wordpos = indexEntry.posintext();
|
||||
if (wordpos == 0) wordpos = 1000;
|
||||
ranking = ranking + 1000 - wordpos + indexEntry.hitcount();
|
||||
pageAcc.put(serverCodings.encodeHex(ranking, 16) + indexEntry.getUrlHash(), indexEntry);
|
||||
}
|
||||
|
||||
|
|
|
@ -157,6 +157,9 @@ public final class plasmaSearchResult {
|
|||
else if (query.order[j].equals(plasmaSearchQuery.ORDER_YBR)) ranking += factor * plasmaSearchPreOrder.ybr_p(indexEntry.getUrlHash());
|
||||
factor = factor / 4096L;
|
||||
}
|
||||
int wordpos = indexEntry.posintext();
|
||||
if (wordpos == 0) wordpos = 1000;
|
||||
ranking = ranking + 1000 - wordpos + indexEntry.hitcount();
|
||||
|
||||
// apply 'common-sense' heuristic using references
|
||||
for (int j = 0; j < urlcomps.length; j++) if (commonSense.contains(urlcomps[j])) ranking += 10L*4096L*4096L / urlcomps.length;
|
||||
|
|
|
@ -73,13 +73,13 @@ public final class plasmaWordIndexEntry {
|
|||
private final String urlHash;
|
||||
|
||||
// discrete values
|
||||
private int hitcount; // words in file
|
||||
private int wordcount;
|
||||
private int phrasecount;
|
||||
private int hitcount; // number of occurrences of this word in the file
|
||||
private int wordcount; // number of all words in the file
|
||||
private int phrasecount; // number of all phrases in the file
|
||||
private int posintext; // first position of the word in text as number of word; 0=unknown or irrelevant position
|
||||
private int posinphrase; // position within a phrase of the word
|
||||
private int posofphrase; // position of the phrase in the text as count of sentences; 0=unknown; 1=path; 2=keywords; 3=headline; >4: in text (NOTE(review): the value 4 itself is not given a meaning here -- confirm whether >=4 was intended)
|
||||
private int worddistance;
|
||||
private int worddistance;// distance between the words, only used if the index is artificial (from a conjunction)
|
||||
private long lastModified;// calculated by using last-modified
|
||||
private int quality; // result of a heuristic on the source file
|
||||
private byte[] language; // essentially the country code (the TLD as heuristic), two letters lowercase only
|
||||
|
@ -101,12 +101,12 @@ public final class plasmaWordIndexEntry {
|
|||
|
||||
// appearance locations: (used for flags)
|
||||
public static final int AP_TITLE = 0; // title tag from html header
|
||||
public static final int AP_H1 = 1; // h0-tag
|
||||
public static final int AP_H2 = 2;
|
||||
public static final int AP_H3 = 3;
|
||||
public static final int AP_H4 = 4;
|
||||
public static final int AP_H5 = 5;
|
||||
public static final int AP_H6 = 6;
|
||||
public static final int AP_H1 = 1; // h1-tag
|
||||
public static final int AP_H2 = 2; // h2-tag
|
||||
public static final int AP_H3 = 3; // h3-tag
|
||||
public static final int AP_H4 = 4; // h4-tag
|
||||
public static final int AP_H5 = 5; // h5-tag
|
||||
public static final int AP_H6 = 6; // h6-tag
|
||||
public static final int AP_ANCHOR = 7; // anchor description
|
||||
public static final int AP_URL = 8; // word inside an url
|
||||
public static final int AP_IMG = 9; // tag inside image references
|
||||
|
@ -254,6 +254,9 @@ public final class plasmaWordIndexEntry {
|
|||
this.worddistance = (code.length() >= 19) ? (int) kelondroBase64Order.enhancedCoder.decodeLong(code.substring(18, 20)) : 0;
|
||||
this.wordcount = (code.length() >= 21) ? (int) kelondroBase64Order.enhancedCoder.decodeLong(code.substring(20, 22)) : 0;
|
||||
this.phrasecount = (code.length() >= 23) ? (int) kelondroBase64Order.enhancedCoder.decodeLong(code.substring(22, 24)) : 0;
|
||||
if (hitcount == 0) hitcount = 1;
|
||||
if (wordcount == 0) wordcount = 1000;
|
||||
if (phrasecount == 0) phrasecount = 100;
|
||||
}
|
||||
|
||||
public plasmaWordIndexEntry(String external) {
|
||||
|
@ -335,7 +338,7 @@ public final class plasmaWordIndexEntry {
|
|||
public int getQuality() { return quality; }
|
||||
public int getVirtualAge() { return plasmaWordIndex.microDateDays(lastModified); }
|
||||
public long getLastModified() { return lastModified; }
|
||||
public int getCount() { return hitcount; }
|
||||
public int hitcount() { return hitcount; }
|
||||
public int posintext() { return posintext; }
|
||||
public int posinphrase() { return posinphrase; }
|
||||
public int posofphrase() { return posofphrase; }
|
||||
|
|
Loading…
Reference in New Issue
Block a user