mirror of
https://github.com/yacy/yacy_search_server.git
synced 2024-09-21 00:00:13 +02:00
30888e7a2f
Such constraints may formulate specific restrictions to web searches This is implemented by scraping information for constraints from a web page during parsing, and storing flags to the pages within the web index. In this first step, only information for index pages ("index of", directory listings) are scraped and stored in flags - added new flag class kelondroBitfield - added scraper method in condenser - added bitfield structure for all scrape types (see also condenser) - added bitfield structure for appearance locations (see RWIEntry) - added handover protocol for remote search and index distribution - extended kelondroColumn class to hold bitfield types - added another search attribute on search page (index.html) - extended search-filter to enable filtering of non-matching constraints - set all new database types to be default - refactoring: moved word hash generation to condenser class git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@2999 6c8d7289-2bf4-0310-a012-ef5d649a1542
236 lines
11 KiB
Java
236 lines
11 KiB
Java
// plasmaSearchRankingProfile.java
|
|
// -------------------------------
|
|
// part of YACY
|
|
// (C) by Michael Peter Christen; mc@anomic.de
|
|
// first published on http://www.anomic.de
|
|
// Frankfurt, Germany, 2006
|
|
// Created: 05.02.2006
|
|
//
|
|
// This program is free software; you can redistribute it and/or modify
|
|
// it under the terms of the GNU General Public License as published by
|
|
// the Free Software Foundation; either version 2 of the License, or
|
|
// (at your option) any later version.
|
|
//
|
|
// This program is distributed in the hope that it will be useful,
|
|
// but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
// GNU General Public License for more details.
|
|
//
|
|
// You should have received a copy of the GNU General Public License
|
|
// along with this program; if not, write to the Free Software
|
|
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
|
//
|
|
// Using this software in any meaning (reading, learning, copying, compiling,
|
|
// running) means that you agree that the Author(s) is (are) not responsible
|
|
// for cost, loss of data or any harm that may be caused directly or indirectly
|
|
// by usage of this softare or this documentation. The usage of this software
|
|
// is on your own risk. The installation and usage (starting/running) of this
|
|
// software may allow other people or application to access your computer and
|
|
// any attached devices and is highly dependent on the configuration of the
|
|
// software which must be done by the user of the software; the author(s) is
|
|
// (are) also not responsible for proper configuration and usage of the
|
|
// software, even if provoked by documentation provided together with
|
|
// the software.
|
|
//
|
|
// Any changes to this file according to the GPL as documented in the file
|
|
// gpl.txt aside this file in the shipment you received can be done to the
|
|
// lines that follows this copyright notice here, but changes must not be
|
|
// done inside the copyright notive above. A re-distribution must contain
|
|
// the intact and unchanged copyright notice.
|
|
// Contributions and changes to the program code must be marked as such.
|
|
|
|
package de.anomic.plasma;
|
|
|
|
import java.util.HashMap;
|
|
import java.util.Iterator;
|
|
import java.util.Map;
|
|
import java.util.Set;
|
|
|
|
import de.anomic.index.indexRWIEntry;
|
|
import de.anomic.plasma.plasmaURL;
|
|
import de.anomic.index.indexURLEntry;
|
|
|
|
public class plasmaSearchRankingProfile {
|
|
|
|
// old parameters for ordering
|
|
public static final String ORDER_QUALITY = "Quality";
|
|
public static final String ORDER_DATE = "Date";
|
|
public static final String ORDER_YBR = "YBR";
|
|
|
|
// pre-sort attributes
|
|
public static final String ENTROPY = "entropy";
|
|
public static final String DATE = "date";
|
|
public static final String YBR = "ybr";
|
|
public static final String POSINTEXT = "posintext";
|
|
public static final String WORDDISTANCE = "worddistance";
|
|
public static final String HITCOUNT = "hitcount";
|
|
public static final String DOMLENGTH = "domlength";
|
|
|
|
// post-sort attributes
|
|
public static final String URLLENGTH = "urllength";
|
|
public static final String URLCOMPS = "urlcomps";
|
|
public static final String DESCRLENGTH = "descrlength";
|
|
public static final String DESCRCOMPS = "descrcomps";
|
|
|
|
// post-sort predicates
|
|
public static final String QUERYINURL = "queryinurl";
|
|
public static final String QUERYINDESCR = "queryindescr";
|
|
public static final String URLCOMPINTOPLIST = "urlcompintoplist";
|
|
public static final String DESCRCOMPINTOPLIST = "descrcompintoplist";
|
|
public static final String PREFER = "prefer";
|
|
|
|
public String[] order;
|
|
private HashMap coeff;
|
|
|
|
public plasmaSearchRankingProfile() {
|
|
// set some default-values
|
|
this.order = null;
|
|
this.coeff = new HashMap();
|
|
coeff.put(ENTROPY, new Integer(0));
|
|
coeff.put(DATE, new Integer(4));
|
|
coeff.put(YBR, new Integer(8));
|
|
coeff.put(POSINTEXT, new Integer(7));
|
|
coeff.put(WORDDISTANCE, new Integer(6));
|
|
coeff.put(HITCOUNT, new Integer(5));
|
|
coeff.put(DOMLENGTH, new Integer(8));
|
|
coeff.put(URLLENGTH, new Integer(15));
|
|
coeff.put(URLCOMPS, new Integer(15));
|
|
coeff.put(DESCRLENGTH, new Integer(4));
|
|
coeff.put(DESCRCOMPS, new Integer(4));
|
|
coeff.put(QUERYINURL, new Integer(13));
|
|
coeff.put(QUERYINDESCR, new Integer(8));
|
|
coeff.put(URLCOMPINTOPLIST, new Integer(3));
|
|
coeff.put(DESCRCOMPINTOPLIST, new Integer(2));
|
|
coeff.put(PREFER, new Integer(15));
|
|
}
|
|
|
|
public plasmaSearchRankingProfile(String prefix, String profile) {
|
|
this(); // set defaults
|
|
//parse external form
|
|
String[] elts = profile.substring(1, profile.length() - 1).split(",");
|
|
int p;
|
|
int s = prefix.length();
|
|
String e;
|
|
for (int i = 0; i < elts.length; i++) {
|
|
e = elts[i].trim();
|
|
if ((s == 0) || (e.startsWith(prefix))) {
|
|
coeff.put(e.substring(s, (p = e.indexOf("="))), new Integer(Integer.parseInt(e.substring(p + 1))));
|
|
}
|
|
}
|
|
}
|
|
|
|
public plasmaSearchRankingProfile(String[] order) {
|
|
this(); // set defaults
|
|
this.order = order;
|
|
// overwrite defaults with order attributes
|
|
for (int i = 0; i < 3; i++) {
|
|
if (this.order[i].equals(plasmaSearchRankingProfile.ORDER_QUALITY)) coeff.put(ENTROPY, new Integer((3 * (3 - i))));
|
|
else if (this.order[i].equals(plasmaSearchRankingProfile.ORDER_DATE)) coeff.put(DATE, new Integer((3 * (3 - i))));
|
|
else if (this.order[i].equals(plasmaSearchRankingProfile.ORDER_YBR)) coeff.put(YBR, new Integer((3 * (3 - i))));
|
|
}
|
|
}
|
|
|
|
public String orderString() {
|
|
if (order == null) return "YBR-Date-Quality";
|
|
return order[0] + "-" + order[1] + "-" + order[2];
|
|
}
|
|
|
|
public String toExternalString() {
|
|
return coeff.toString();
|
|
}
|
|
|
|
public Map toExternalMap(String prefix) {
|
|
Iterator i = this.coeff.entrySet().iterator();
|
|
Map.Entry entry;
|
|
Map ext = new HashMap();
|
|
while (i.hasNext()) {
|
|
entry = (Map.Entry) i.next();
|
|
ext.put(prefix + (String) entry.getKey(), entry.getValue());
|
|
}
|
|
return ext;
|
|
}
|
|
|
|
public String toExternalURLGet(String prefix) {
|
|
Iterator i = this.coeff.entrySet().iterator();
|
|
Map.Entry entry;
|
|
StringBuffer ext = new StringBuffer();
|
|
while (i.hasNext()) {
|
|
entry = (Map.Entry) i.next();
|
|
ext.append("&");
|
|
ext.append(prefix);
|
|
ext.append((String) entry.getKey());
|
|
ext.append("=");
|
|
ext.append(entry.getValue());
|
|
}
|
|
return new String(ext);
|
|
}
|
|
|
|
public long preRanking(indexRWIEntry normalizedEntry, String searchedWord) {
|
|
// the normalizedEntry must be a normalized indexEntry
|
|
long ranking = 0;
|
|
ranking += normalizedEntry.quality() << ((Integer) coeff.get(ENTROPY)).intValue();
|
|
ranking += normalizedEntry.virtualAge() << ((Integer) coeff.get(DATE)).intValue();
|
|
ranking += plasmaSearchPreOrder.ybr_p(normalizedEntry.urlHash()) << ((Integer) coeff.get(YBR)).intValue();
|
|
ranking += (normalizedEntry.posintext() == 0) ? 0 : (256 - normalizedEntry.posintext()) << ((Integer) coeff.get(POSINTEXT)).intValue();
|
|
ranking += (normalizedEntry.worddistance() == 0) ? 0 : (256 - normalizedEntry.worddistance()) << ((Integer) coeff.get(WORDDISTANCE)).intValue();
|
|
ranking += (normalizedEntry.hitcount() == 0) ? 0 : normalizedEntry.hitcount() << ((Integer) coeff.get(HITCOUNT)).intValue();
|
|
ranking += (256 - plasmaURL.domLengthNormalized(normalizedEntry.urlHash())) << ((Integer) coeff.get(DOMLENGTH)).intValue();
|
|
ranking += (plasmaURL.probablyRootURL(normalizedEntry.urlHash())) ? 16 << ((Integer) coeff.get(URLLENGTH)).intValue() : 0;
|
|
ranking += (plasmaURL.probablyWordURL(normalizedEntry.urlHash(), searchedWord) != null) ? 256 << ((Integer) coeff.get(QUERYINURL)).intValue() : 0;
|
|
/*
|
|
if (indexURL.probablyWordURL(normalizedEntry.urlHash(), searchedWord))
|
|
System.out.println("DEBUG - hash " + normalizedEntry.urlHash() + " contains word " + searchedWord + ", weighted " + ((Integer) coeff.get(QUERYINURL)).intValue() + ", ranking = " + ranking);
|
|
else
|
|
System.out.println("DEBUG - hash " + normalizedEntry.urlHash() + " contains not word " + searchedWord + ", ranking = " + ranking);
|
|
*/
|
|
return ranking;
|
|
}
|
|
|
|
public long postRanking(
|
|
long preranking,
|
|
plasmaSearchQuery query,
|
|
Set topwords,
|
|
String[] urlcomps,
|
|
String[] descrcomps,
|
|
indexURLEntry page) {
|
|
|
|
// apply pre-calculated order attributes
|
|
long ranking = preranking;
|
|
|
|
// prefer hit with 'prefer' pattern
|
|
indexURLEntry.Components comp = page.comp();
|
|
if (comp.url().toNormalform().matches(query.prefer)) ranking += 256 << ((Integer) coeff.get(PREFER)).intValue();
|
|
if (comp.descr().matches(query.prefer)) ranking += 256 << ((Integer) coeff.get(PREFER)).intValue();
|
|
|
|
// apply 'common-sense' heuristic using references
|
|
for (int j = 0; j < urlcomps.length; j++) {
|
|
if (topwords.contains(urlcomps[j])) ranking += 256 << ((Integer) coeff.get(URLCOMPINTOPLIST)).intValue();
|
|
}
|
|
for (int j = 0; j < descrcomps.length; j++) {
|
|
if (topwords.contains(descrcomps[j])) ranking += 256 << ((Integer) coeff.get(DESCRCOMPINTOPLIST)).intValue();
|
|
}
|
|
|
|
// apply query-in-result matching
|
|
Set urlcomph = plasmaCondenser.words2hashSet(urlcomps);
|
|
Set descrcomph = plasmaCondenser.words2hashSet(descrcomps);
|
|
Iterator shi = query.queryHashes.iterator();
|
|
String queryhash;
|
|
while (shi.hasNext()) {
|
|
queryhash = (String) shi.next();
|
|
if (urlcomph.contains(queryhash)) ranking += 256 << ((Integer) coeff.get(QUERYINURL)).intValue();
|
|
if (descrcomph.contains(queryhash)) ranking += 256 << ((Integer) coeff.get(QUERYINDESCR)).intValue();
|
|
}
|
|
|
|
// prefer short urls
|
|
ranking += (256 - comp.url().toNormalform().length()) << ((Integer) coeff.get(URLLENGTH)).intValue();
|
|
ranking += (8 * Math.max(0, 32 - urlcomps.length)) << ((Integer) coeff.get(URLCOMPS)).intValue();
|
|
|
|
// prefer long descriptions
|
|
ranking += (256 * comp.url().toNormalform().length() / 80) << ((Integer) coeff.get(DESCRLENGTH)).intValue();
|
|
ranking += (256 * (12 - Math.abs(12 - Math.min(12, descrcomps.length))) / 12) << ((Integer) coeff.get(DESCRCOMPS)).intValue();
|
|
|
|
return ranking;
|
|
}
|
|
|
|
}
|