mirror of
https://github.com/yacy/yacy_search_server.git
synced 2024-09-19 00:01:41 +02:00
enhancements to ranking evaluation
git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@2523 6c8d7289-2bf4-0310-a012-ef5d649a1542
This commit is contained in:
parent
a82e926c5d
commit
a8bc768206
|
@ -563,14 +563,46 @@ public class indexURL {
|
|||
byte flagbyte = (byte) (((isHTTP) ? 0 : 32) | (id << 2) | domlengthKey);
|
||||
// form the 'local' part of the hash
|
||||
String hash3 = kelondroBase64Order.enhancedCoder.encode(serverCodings.encodeMD5Raw(url.toNormalform())).substring(0, 5);
|
||||
char hash2 = kelondroBase64Order.enhancedCoder.encode(serverCodings.encodeMD5Raw(subdom + ":" + port + ":" + rootpath)).charAt(0);
|
||||
char hash2 = subdomPortPath(subdom, port, rootpath);
|
||||
// form the 'global' part of the hash
|
||||
String hash1 = kelondroBase64Order.enhancedCoder.encode(serverCodings.encodeMD5Raw(url.getProtocol() + ":" + host + ":" + port)).substring(0, 5);
|
||||
String hash1 = protocolHostPort(url.getProtocol(), host, port);
|
||||
char hash0 = kelondroBase64Order.enhancedCoder.encodeByte(flagbyte);
|
||||
// combine the hashes
|
||||
return hash3 + hash2 + hash1 + hash0;
|
||||
}
|
||||
|
||||
private static final char[] rootURLFlags = new char[] {
|
||||
subdomPortPath("www", 80, ""),
|
||||
subdomPortPath("", 80, "")
|
||||
};
|
||||
|
||||
private static char subdomPortPath(String subdom, int port, String rootpath) {
|
||||
return kelondroBase64Order.enhancedCoder.encode(serverCodings.encodeMD5Raw(subdom + ":" + port + ":" + rootpath)).charAt(0);
|
||||
}
|
||||
|
||||
public static final boolean probablyRootURL(String urlHash) {
|
||||
for (int i = 0; i < rootURLFlags.length; i++) if (urlHash.charAt(6) == rootURLFlags[i]) return true;
|
||||
return false;
|
||||
}
|
||||
|
||||
private static String protocolHostPort(String protocol, String host, int port) {
|
||||
return kelondroBase64Order.enhancedCoder.encode(serverCodings.encodeMD5Raw(protocol + ":" + host + ":" + port)).substring(0, 5);
|
||||
}
|
||||
|
||||
public static final boolean probablyWordURL(String urlHash, String word) {
|
||||
if (word == null) return false;
|
||||
String pattern = urlHash.substring(6, 11);
|
||||
if (pattern.equals(protocolHostPort("http", "www." + word.toLowerCase() + ".com", 80))) return true;
|
||||
if (pattern.equals(protocolHostPort("http", "www." + word.toLowerCase() + ".net", 80))) return true;
|
||||
if (pattern.equals(protocolHostPort("http", "www." + word.toLowerCase() + ".org", 80))) return true;
|
||||
if (pattern.equals(protocolHostPort("http", "www." + word.toLowerCase() + ".uk", 80))) return true;
|
||||
if (pattern.equals(protocolHostPort("http", "www." + word.toLowerCase() + ".fr", 80))) return true;
|
||||
if (pattern.equals(protocolHostPort("http", "www." + word.toLowerCase() + ".de", 80))) return true;
|
||||
if (pattern.equals(protocolHostPort("http", "www." + word.toLowerCase() + ".es", 80))) return true;
|
||||
if (pattern.equals(protocolHostPort("http", "www." + word.toLowerCase() + ".it", 80))) return true;
|
||||
return false;
|
||||
}
|
||||
|
||||
public static final int domLengthEstimation(String urlHash) {
|
||||
// generates an estimation of the original domain length
|
||||
int flagbyte = kelondroBase64Order.enhancedCoder.decodeByte(urlHash.charAt(11));
|
||||
|
|
|
@ -148,7 +148,7 @@ public class indexURLEntry implements Cloneable, indexEntry {
|
|||
}
|
||||
|
||||
public String toPropertyForm(boolean displayFormat) {
|
||||
return entry.toPropertyForm(true, displayFormat, displayFormat);
|
||||
return entry.toPropertyForm(false, displayFormat, displayFormat);
|
||||
}
|
||||
|
||||
public Entry toKelondroEntry() {
|
||||
|
|
|
@ -242,8 +242,8 @@ public final class plasmaSearchEvent extends Thread implements Runnable {
|
|||
try {
|
||||
while (preorder.hasNext()) {
|
||||
//if ((acc.sizeFetched() >= 50) && ((acc.sizeFetched() >= minEntries) || (System.currentTimeMillis() >= postorderLimitTime))) break;
|
||||
if (acc.sizeFetched() >= minEntries) break;
|
||||
if (System.currentTimeMillis() >= postorderLimitTime) break;
|
||||
//if (acc.sizeFetched() >= minEntries) break;
|
||||
if ((System.currentTimeMillis() >= postorderLimitTime) && (acc.sizeFetched() >= minEntries)) break;
|
||||
preorderEntry = preorder.next();
|
||||
entry = (indexEntry) preorderEntry[0];
|
||||
preranking = (Long) preorderEntry[1];
|
||||
|
@ -298,8 +298,8 @@ public final class plasmaSearchEvent extends Thread implements Runnable {
|
|||
try {
|
||||
while (preorder.hasNext()) {
|
||||
//if ((acc.sizeFetched() >= 50) && ((acc.sizeFetched() >= minEntries) || (System.currentTimeMillis() >= postorderLimitTime))) break;
|
||||
if (acc.sizeFetched() >= minEntries) break;
|
||||
if (System.currentTimeMillis() >= postorderLimitTime) break;
|
||||
//if (acc.sizeFetched() >= minEntries) break;
|
||||
if ((System.currentTimeMillis() >= postorderLimitTime) && (acc.sizeFetched() >= minEntries)) break;
|
||||
preorderEntry = preorder.next();
|
||||
entry = (indexEntry) preorderEntry[0];
|
||||
preranking = (Long) preorderEntry[1];
|
||||
|
|
|
@ -96,7 +96,7 @@ public final class plasmaSearchPreOrder {
|
|||
this.pageAcc = new TreeMap();
|
||||
for (int j = 0; j < count; j++) {
|
||||
iEntry = (indexEntry) i.next();
|
||||
pageAcc.put(serverCodings.encodeHex(this.ranking.preRanking(iEntry.generateNormalized(this.entryMin, this.entryMax)), 16) + iEntry.urlHash(), iEntry);
|
||||
pageAcc.put(serverCodings.encodeHex(this.ranking.preRanking(iEntry.generateNormalized(this.entryMin, this.entryMax), query.words("")), 16) + iEntry.urlHash(), iEntry);
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -164,7 +164,7 @@ public class plasmaSearchRankingProfile {
|
|||
return new String(ext);
|
||||
}
|
||||
|
||||
public long preRanking(indexEntry normalizedEntry) {
|
||||
public long preRanking(indexEntry normalizedEntry, String searchedWord) {
|
||||
// the normalizedEntry must be a normalized indexEntry
|
||||
long ranking = 0;
|
||||
ranking += normalizedEntry.quality() << ((Integer) coeff.get(ENTROPY)).intValue();
|
||||
|
@ -174,6 +174,12 @@ public class plasmaSearchRankingProfile {
|
|||
ranking += (normalizedEntry.worddistance() == 0) ? 0 : (255 - normalizedEntry.worddistance()) << ((Integer) coeff.get(WORDDISTANCE)).intValue();
|
||||
ranking += (normalizedEntry.hitcount() == 0) ? 0 : normalizedEntry.hitcount() << ((Integer) coeff.get(HITCOUNT)).intValue();
|
||||
ranking += (255 - indexURL.domLengthNormalized(normalizedEntry.urlHash())) << ((Integer) coeff.get(DOMLENGTH)).intValue();
|
||||
ranking += (indexURL.probablyRootURL(normalizedEntry.urlHash())) ? 16 << ((Integer) coeff.get(URLLENGTH)).intValue() : 0;
|
||||
ranking += (indexURL.probablyWordURL(normalizedEntry.urlHash(), searchedWord)) ? 256 << ((Integer) coeff.get(QUERYINURL)).intValue() : 0;
|
||||
if (indexURL.probablyWordURL(normalizedEntry.urlHash(), searchedWord))
|
||||
System.out.println("DEBUG - hash " + normalizedEntry.urlHash() + " contains word " + searchedWord + ", weighted " + ((Integer) coeff.get(QUERYINURL)).intValue() + ", ranking = " + ranking);
|
||||
else
|
||||
System.out.println("DEBUG - hash " + normalizedEntry.urlHash() + " contains not word " + searchedWord + ", ranking = " + ranking);
|
||||
return ranking;
|
||||
}
|
||||
|
||||
|
@ -219,7 +225,6 @@ public class plasmaSearchRankingProfile {
|
|||
ranking += (255 * page.descr().length() / 80) << ((Integer) coeff.get(DESCRLENGTH)).intValue();
|
||||
ranking += (255 * (12 - Math.abs(12 - Math.min(12, descrcomps.length))) / 12) << ((Integer) coeff.get(DESCRCOMPS)).intValue();
|
||||
|
||||
|
||||
return ranking;
|
||||
}
|
||||
|
||||
|
|
|
@ -2084,7 +2084,9 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
|
|||
prop.put("type_results_" + i + "_size", Long.toString(urlentry.size()));
|
||||
prop.put("type_results_" + i + "_words", URLEncoder.encode(query.queryWords.toString(),"UTF-8"));
|
||||
prop.put("type_results_" + i + "_former", formerSearch);
|
||||
prop.put("type_results_" + i + "_rankingprops", urlentry.word().toPropertyForm(true));
|
||||
prop.put("type_results_" + i + "_rankingprops", urlentry.word().toPropertyForm(true) + ", domLengthEstimated=" + indexURL.domLengthEstimation(urlhash) +
|
||||
((indexURL.probablyRootURL(urlhash)) ? ", probablyRootURL" : "") +
|
||||
((indexURL.probablyWordURL(urlhash, query.words(""))) ? ", probablyWordURL" : ""));
|
||||
// adding snippet if available
|
||||
if (snippet.exists()) {
|
||||
prop.put("type_results_" + i + "_snippet", 1);
|
||||
|
|
Loading…
Reference in New Issue
Block a user