yacy_search_server/source/de/anomic/plasma/plasmaSearchRankingProcess.java
orbiter 99bf0b8e41 refactoring of plasmaWordIndex:
divided that class into three parts:
- the peers object is now hosted by the plasmaSwitchboard
- the crawler elements are now in a new class, crawler.CrawlerSwitchboard
- the index elements are the core of the new segment data structure, which is a bundle of different indexes: the full-text index, (in the future) navigation indexes, and the metadata store. The new class is now in kelondro.text.Segment

The refactoring is inspired by the roadmap towards index segments: the option to host several different indexes on one peer.

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@5990 6c8d7289-2bf4-0310-a012-ef5d649a1542
2009-05-28 14:26:05 +00:00


// plasmaSearchRankingProcess.java
// (C) 2007 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany
// first published 07.11.2007 on http://yacy.net
//
// This is a part of YaCy, a peer-to-peer based web search engine
//
// $LastChangedDate: 2006-04-02 22:40:07 +0200 (So, 02 Apr 2006) $
// $LastChangedRevision: 1986 $
// $LastChangedBy: orbiter $
//
// LICENSE
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
package de.anomic.plasma;

import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Comparator;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;
import java.util.Set;
import java.util.TreeSet;
import java.util.concurrent.ConcurrentHashMap;
import de.anomic.htmlFilter.htmlFilterContentScraper;
import de.anomic.kelondro.index.BinSearch;
import de.anomic.kelondro.order.Digest;
import de.anomic.kelondro.text.Reference;
import de.anomic.kelondro.text.ReferenceContainer;
import de.anomic.kelondro.text.ReferenceOrder;
import de.anomic.kelondro.text.Segment;
import de.anomic.kelondro.text.metadataPrototype.URLMetadataRow;
import de.anomic.kelondro.text.referencePrototype.WordReference;
import de.anomic.kelondro.text.referencePrototype.WordReferenceVars;
import de.anomic.kelondro.util.ScoreCluster;
import de.anomic.kelondro.util.SortStack;
import de.anomic.kelondro.util.FileUtils;
import de.anomic.plasma.parser.Word;
import de.anomic.plasma.parser.Condenser;
import de.anomic.server.serverProfiling;
import de.anomic.yacy.yacyURL;
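
// The ranking process collects word references (RWI entries) for one search query, both
// from the local index segment and from remote peers, weights them with the configured
// ReferenceOrder and keeps the best entries on a SortStack. A per-domain cache
// (doubleDomCache) prevents a single domain from flooding the top of the result list,
// and accepted entries are finally resolved to URL metadata rows via the Segment's
// metadata store. Along the way the class collects statistics for the host navigator,
// the flag counters, the domain zones and the topword references.
//
// Typical use (a sketch only; it assumes an initialized Segment and plasmaSearchQuery):
//   final plasmaSearchRankingProcess rp = new plasmaSearchRankingProcess(segment, query, 1000, 2);
//   rp.execQuery();                              // search the local index and fill the ranking stack
//   final URLMetadataRow row = rp.bestURL(true); // pop the best result, skipping doubled domains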
public final class plasmaSearchRankingProcess {
public static BinSearch[] ybrTables = null; // block-rank tables
public static final int maxYBR = 3; // the lower this value, the faster the search
private static boolean useYBR = true;
private static final int maxDoubleDomAll = 20, maxDoubleDomSpecial = 10000;
private final SortStack<WordReferenceVars> stack;
private final HashMap<String, SortStack<WordReferenceVars>> doubleDomCache; // key = domhash (6 bytes); value = a stack like the main one, holding further entries of that domain
private final HashMap<String, String> handover; // key = urlhash, value = urlstring; used to double-check urls that have already been handed over to the search process
private final plasmaSearchQuery query;
private final int maxentries;
private int remote_peerCount, remote_indexCount, remote_resourceSize, local_resourceSize;
private final ReferenceOrder order;
private final ConcurrentHashMap<String, Integer> urlhashes; // map for double-check; String/Integer relation, addresses the ranking number (backreference for deletion)
private final ScoreCluster<String> ref; // reference score computation for the commonSense heuristic
private final int[] flagcount; // flag counter
private final TreeSet<String> misses; // contains url-hashes that could not be found in the LURL-DB
private final Segment indexSegment;
private HashMap<byte[], ReferenceContainer<WordReference>>[] localSearchContainerMaps;
private final int[] domZones;
private HashMap<String, hoststat> hostNavigator;
public plasmaSearchRankingProcess(
final Segment indexSegment,
final plasmaSearchQuery query,
final int maxentries,
final int concurrency) {
// we collect the urlhashes and construct a list with urlEntry objects
// attention: if minEntries is too high, this method will not terminate within the maxTime
// sortorder: 0 = hash, 1 = url, 2 = ranking
this.localSearchContainerMaps = null;
this.stack = new SortStack<WordReferenceVars>(maxentries);
this.doubleDomCache = new HashMap<String, SortStack<WordReferenceVars>>();
this.handover = new HashMap<String, String>();
this.order = (query == null) ? null : new ReferenceOrder(query.ranking, query.targetlang);
this.query = query;
this.maxentries = maxentries;
this.remote_peerCount = 0;
this.remote_indexCount = 0;
this.remote_resourceSize = 0;
this.local_resourceSize = 0;
this.urlhashes = new ConcurrentHashMap<String, Integer>(0, 0.75f, concurrency);
this.ref = new ScoreCluster<String>();
this.misses = new TreeSet<String>();
this.indexSegment = indexSegment;
this.flagcount = new int[32];
for (int i = 0; i < 32; i++) {this.flagcount[i] = 0;}
this.domZones = new int[8];
this.hostNavigator = new HashMap<String, hoststat>();
for (int i = 0; i < 8; i++) {this.domZones[i] = 0;}
}
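// returns the ranking weight that the configured ReferenceOrder assigns to a single
// reference; smaller values indicate a better ranking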
public long ranking(final WordReferenceVars word) {
return order.cardinal(word);
}
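// returns the counters of references per domain zone, collected during insertRanked
// (indexed by yacyURL.domDomain)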
public int[] zones() {
return this.domZones;
}
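// performs the local search: looks up the query hashes in the local index segment,
// joins the containers of the query words while excluding the containers of the
// exclude words, and feeds the joined result into insertRanked()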
public void execQuery() {
long timer = System.currentTimeMillis();
this.localSearchContainerMaps = indexSegment.index().searchTerm(query.queryHashes, query.excludeHashes, null);
serverProfiling.update("SEARCH", new plasmaProfiling.searchEvent(query.id(true), plasmaSearchEvent.COLLECTION, this.localSearchContainerMaps[0].size(), System.currentTimeMillis() - timer), false);
// join and exclude the local result
timer = System.currentTimeMillis();
final ReferenceContainer<WordReference> index =
ReferenceContainer.joinExcludeContainers(
Segment.wordReferenceFactory,
this.localSearchContainerMaps[0].values(),
this.localSearchContainerMaps[1].values(),
query.maxDistance);
serverProfiling.update("SEARCH", new plasmaProfiling.searchEvent(query.id(true), plasmaSearchEvent.JOIN, index.size(), System.currentTimeMillis() - timer), false);
if (index.size() == 0) {
return;
}
insertRanked(index, true, index.size());
}
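// inserts the references of one container (local or remote) into the ranking stack;
// entries are normalized, weighted and filtered by constraint flags, content domain,
// tld zone and site hash before they are pushed onto the stack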
public void insertRanked(final ReferenceContainer<WordReference> index, final boolean local, final int fullResource) {
// we collect the urlhashes and construct a list with urlEntry objects
// attention: if minEntries is too high, this method will not terminate within the maxTime
assert (index != null);
if (index.size() == 0) return;
if (local) {
this.local_resourceSize += fullResource;
} else {
this.remote_resourceSize += fullResource;
this.remote_peerCount++;
}
long timer = System.currentTimeMillis();
// normalize entries
final ArrayList<WordReferenceVars> decodedEntries = this.order.normalizeWith(index);
serverProfiling.update("SEARCH", new plasmaProfiling.searchEvent(query.id(true), plasmaSearchEvent.NORMALIZING, index.size(), System.currentTimeMillis() - timer), false);
// iterate over normalized entries and select some that are better than currently stored
timer = System.currentTimeMillis();
final Iterator<WordReferenceVars> i = decodedEntries.iterator();
WordReferenceVars iEntry;
Long r;
hoststat hs;
String domhash;
while (i.hasNext()) {
iEntry = i.next();
assert (iEntry.metadataHash().length() == index.row().primaryKeyLength);
//if (iEntry.urlHash().length() != index.row().primaryKeyLength) continue;
// increase flag counts
for (int j = 0; j < 32; j++) {
if (iEntry.flags().get(j)) {flagcount[j]++;}
}
// kick out entries that are too bad according to current findings
r = Long.valueOf(order.cardinal(iEntry));
if ((maxentries >= 0) && (stack.size() >= maxentries) && (stack.bottom(r.longValue()))) continue;
// check constraints
if (!testFlags(iEntry)) continue;
// check document domain
if (query.contentdom != plasmaSearchQuery.CONTENTDOM_TEXT) {
if ((query.contentdom == plasmaSearchQuery.CONTENTDOM_AUDIO) && (!(iEntry.flags().get(Condenser.flag_cat_hasaudio)))) continue;
if ((query.contentdom == plasmaSearchQuery.CONTENTDOM_VIDEO) && (!(iEntry.flags().get(Condenser.flag_cat_hasvideo)))) continue;
if ((query.contentdom == plasmaSearchQuery.CONTENTDOM_IMAGE) && (!(iEntry.flags().get(Condenser.flag_cat_hasimage)))) continue;
if ((query.contentdom == plasmaSearchQuery.CONTENTDOM_APP ) && (!(iEntry.flags().get(Condenser.flag_cat_hasapp )))) continue;
}
// check tld domain
if (!yacyURL.matchesAnyDomDomain(iEntry.metadataHash(), this.query.zonecode)) {
// filter out all tld that do not match with wanted tld domain
continue;
}
// check site constraints
if (query.sitehash != null && !iEntry.metadataHash().substring(6).equals(query.sitehash)) {
// filter out all domains that do not match with the site constraint
continue;
}
// count domZones
this.domZones[yacyURL.domDomain(iEntry.metadataHash())]++;
// get statistics for host navigator
domhash = iEntry.urlHash.substring(6);
hs = this.hostNavigator.get(domhash);
if (hs == null) {
this.hostNavigator.put(domhash, new hoststat(iEntry.urlHash));
} else {
hs.inc();
}
// insert
if ((maxentries < 0) || (stack.size() < maxentries)) {
// in case that we don't have enough yet, accept any new entry
if (urlhashes.containsKey(iEntry.metadataHash())) continue;
stack.push(iEntry, r);
} else {
// if we already have enough entries, insert only such that are necessary to get a better result
if (stack.bottom(r.longValue())) {
continue;
}
// double-check
if (urlhashes.containsKey(iEntry.metadataHash())) continue;
stack.push(iEntry, r);
}
// increase counter for statistics
if (!local) this.remote_indexCount++;
}
//if ((query.neededResults() > 0) && (container.size() > query.neededResults())) remove(true, true);
serverProfiling.update("SEARCH", new plasmaProfiling.searchEvent(query.id(true), plasmaSearchEvent.PRESORT, index.size(), System.currentTimeMillis() - timer), false);
}
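// counts hits per domain for the host navigator; one url hash is kept as a sample
// so that the host name can be resolved later from the metadata store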
public class hoststat {
public int count;
public String hashsample;
public hoststat(String urlhash) {
this.count = 1;
this.hashsample = urlhash;
}
public void inc() {
this.count++;
}
}
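// orders hoststat objects by descending hit count (most frequent host first)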
public static final Comparator<hoststat> hscomp = new Comparator<hoststat>() {
public int compare(hoststat o1, hoststat o2) {
if (o1.count < o2.count) return 1;
if (o2.count < o1.count) return -1;
return 0;
}
};
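// one entry of the host navigator: a host name together with its hit count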
public class hostnaventry {
public int count;
public String host;
public hostnaventry(String host, int count) {
this.host = host;
this.count = count;
}
}
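// returns at most maxentries hosts with the highest hit counts; the host name of each
// entry is resolved by loading its sample url hash from the metadata store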
public ArrayList<hostnaventry> getHostNavigator(int maxentries) {
hoststat[] hsa = this.hostNavigator.values().toArray(new hoststat[this.hostNavigator.size()]);
Arrays.sort(hsa, hscomp);
int rc = Math.min(maxentries, hsa.length);
ArrayList<hostnaventry> result = new ArrayList<hostnaventry>();
URLMetadataRow mr;
yacyURL url;
for (int i = 0; i < rc; i++) {
mr = indexSegment.metadata().load(hsa[i].hashsample, null, 0);
if (mr == null) continue;
url = mr.metadata().url();
if (url == null) continue;
result.add(new hostnaventry(url.getHost(), hsa[i].count));
}
return result;
}
private boolean testFlags(final WordReference ientry) {
if (query.constraint == null) return true;
// test if ientry matches with filter
// if all = true: let only entries pass that have all matching bits
// if all = false: let all entries pass that have at least one matching bit
if (query.allofconstraint) {
for (int i = 0; i < 32; i++) {
if ((query.constraint.get(i)) && (!ientry.flags().get(i))) return false;
}
return true;
}
for (int i = 0; i < 32; i++) {
if ((query.constraint.get(i)) && (ientry.flags().get(i))) return true;
}
return false;
}
public Map<byte[], ReferenceContainer<WordReference>>[] searchContainerMaps() {
// direct access to the result maps is needed for abstract generation
// this is only available if execQuery() was called before
return localSearchContainerMaps;
}
// todo:
// - remove redundant urls (sub-path occurred before)
// - move up shorter urls
// - root-domain guessing to prefer the root domain over other urls if search word appears in domain name
private SortStack<WordReferenceVars>.stackElement bestRWI(final boolean skipDoubleDom) {
// returns from the current RWI list the best entry and removes this entry from the list
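// if skipDoubleDom is set, the first hit of each domain is returned directly and further
// hits of the same domain are parked in the doubleDomCache; once the main stack is
// empty, the best entry among all parked per-domain stacks is returned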
SortStack<WordReferenceVars> m;
SortStack<WordReferenceVars>.stackElement rwi;
while (stack.size() > 0) {
rwi = stack.pop();
if (rwi == null) continue; // in case a synchronization problem occurred, just skip this entry
if (!skipDoubleDom) return rwi;
// check doubledom
final String domhash = rwi.element.metadataHash().substring(6);
m = this.doubleDomCache.get(domhash);
if (m == null) {
// first appearance of dom
m = new SortStack<WordReferenceVars>((query.specialRights) ? maxDoubleDomSpecial : maxDoubleDomAll);
this.doubleDomCache.put(domhash, m);
return rwi;
}
// second or later appearance of the domain: park the entry in the per-domain stack
m.push(rwi);
}
// no more entries on the sorted RWI stack; now take elements from the doubleDomCache
// find best entry from all caches
final Iterator<SortStack<WordReferenceVars>> i = this.doubleDomCache.values().iterator();
SortStack<WordReferenceVars>.stackElement bestEntry = null;
SortStack<WordReferenceVars>.stackElement o;
while (i.hasNext()) {
m = i.next();
if (m == null) continue;
if (m.size() == 0) continue;
if (bestEntry == null) {
bestEntry = m.top();
continue;
}
o = m.top();
if (o.weight.longValue() < bestEntry.weight.longValue()) {
bestEntry = o;
}
}
if (bestEntry == null) return null;
// finally remove the best entry from the doubledom cache
m = this.doubleDomCache.get(bestEntry.element.metadataHash().substring(6));
o = m.pop();
assert o == null || o.element.metadataHash().equals(bestEntry.element.metadataHash()) : "bestEntry.element.metadataHash() = " + bestEntry.element.metadataHash() + ", o.element.metadataHash() = " + o.element.metadataHash();
return bestEntry;
}
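// like bestRWI(), but resolves the winning entry to its URL metadata row; entries whose
// metadata cannot be loaded are recorded in 'misses', urls that do not match the query's
// urlMask are dropped, and handed-over urls are remembered in 'handover'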
public URLMetadataRow bestURL(final boolean skipDoubleDom) {
// returns from the current RWI list the best URL entry and removes this entry from the list
while ((stack.size() > 0) || (size() > 0)) {
if (((stack.size() == 0) && (size() == 0))) break;
final SortStack<WordReferenceVars>.stackElement obrwi = bestRWI(skipDoubleDom);
if (obrwi == null) continue; // *** ? this happened and the thread was suspended silently. cause?
final URLMetadataRow u = indexSegment.metadata().load(obrwi.element.metadataHash(), obrwi.element, obrwi.weight.longValue());
if (u != null) {
final URLMetadataRow.Components metadata = u.metadata();
if (metadata.url() != null) {
String urlstring = metadata.url().toNormalform(true, true);
if (urlstring == null || !urlstring.matches(query.urlMask)) continue;
this.handover.put(u.hash(), metadata.url().toNormalform(true, false)); // remember that we handed over this url
return u;
}
}
misses.add(obrwi.element.metadataHash());
}
return null;
}
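// returns the number of ranked entries that are still available, including those that
// are parked in the doubleDomCache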
public int size() {
//assert sortedRWIEntries.size() == urlhashes.size() : "sortedRWIEntries.size() = " + sortedRWIEntries.size() + ", urlhashes.size() = " + urlhashes.size();
int c = stack.size();
final Iterator<SortStack<WordReferenceVars>> i = this.doubleDomCache.values().iterator();
while (i.hasNext()) c += i.next().size();
return c;
}
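// returns the counters of the 32 reference flag bits as seen during insertRanked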
public int[] flagCount() {
return flagcount;
}
// "results from a total number of <remote_resourceSize + local_resourceSize> known (<local_resourceSize> local, <remote_resourceSize> remote), <remote_indexCount> links from <remote_peerCount> other YaCy peers."
public int filteredCount() {
// the number of index entries that are considered as result set
return this.stack.size();
}
public int getRemoteIndexCount() {
// the number of result contributions from all the remote peers
return this.remote_indexCount;
}
public int getRemotePeerCount() {
// the number of remote peers that have contributed
return this.remote_peerCount;
}
public int getRemoteResourceSize() {
// the number of all hits in all the remote peers
return this.remote_resourceSize;
}
public int getLocalResourceSize() {
// the number of hits in the local peer (index size, size of the collection in the own index)
return this.local_resourceSize;
}
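// removes the entry with the given url hash from the ranking stack and from the
// double-check map; returns the removed reference, or null if it was not on the stack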
public Reference remove(final String urlHash) {
final SortStack<WordReferenceVars>.stackElement se = stack.remove(urlHash.hashCode());
if (se == null) return null;
urlhashes.remove(urlHash);
return se.element;
}
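// returns an iterator over the url hashes whose metadata could not be resolved in bestURL()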
public Iterator<String> miss() {
return this.misses.iterator();
}
public Set<String> getReferences(final int count) {
// create a list of words that had been computed by statistics over all
// words that appeared in the url or the description of all urls
final Object[] refs = ref.getScores(count, false, 2, Integer.MAX_VALUE);
final TreeSet<String> s = new TreeSet<String>(String.CASE_INSENSITIVE_ORDER);
for (int i = 0; i < refs.length; i++) {
s.add((String) refs[i]);
}
return s;
}
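// feeds words into the commonSense reference counter; words of length <= 2, words from
// a small built-in stop list and the query words themselves are skipped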
public void addReferences(final String[] words) {
String word;
for (int i = 0; i < words.length; i++) {
word = words[i].toLowerCase();
if ((word.length() > 2) &&
("http_html_php_ftp_www_com_org_net_gov_edu_index_home_page_for_usage_the_and_".indexOf(word) < 0) &&
(!(query.queryHashes.contains(Word.word2hash(word)))))
ref.incScore(word);
}
}
protected void addReferences(final plasmaSearchEvent.ResultEntry resultEntry) {
// take out relevant information for reference computation
if ((resultEntry.url() == null) || (resultEntry.title() == null)) return;
final String[] urlcomps = htmlFilterContentScraper.urlComps(resultEntry.url().toNormalform(true, true)); // word components of the url
final String[] descrcomps = resultEntry.title().toLowerCase().split(htmlFilterContentScraper.splitrex); // words in the description
// add references
addReferences(urlcomps);
addReferences(descrcomps);
}
public ReferenceOrder getOrder() {
return this.order;
}
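// loads the block-rank (YBR) tables from the given path; table i contains the domain
// hashes of rank i, so a lower table index means a better rank (see ybr() below)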
public static void loadYBR(final File rankingPath, final int count) {
// load ranking tables
if (rankingPath.exists()) {
ybrTables = new BinSearch[count];
String ybrName;
File f;
try {
for (int i = 0; i < count; i++) {
ybrName = "YBR-4-" + Digest.encodeHex(i, 2) + ".idx";
f = new File(rankingPath, ybrName);
if (f.exists()) {
ybrTables[i] = new BinSearch(FileUtils.read(f), 6);
} else {
ybrTables[i] = null;
}
}
} catch (final IOException e) {
ybrTables = null;
}
} else {
ybrTables = null;
}
}
public static boolean canUseYBR() {
return ybrTables != null;
}
public static boolean isUsingYBR() {
return useYBR;
}
public static void switchYBR(final boolean usage) {
useYBR = usage;
}
public static int ybr(final String urlHash) {
// returns the YBR value in a range of 0..15, where 0 means best ranking and 15 means worst ranking
if (ybrTables == null) return 15;
if (!(useYBR)) return 15;
final String domHash = urlHash.substring(6);
final int m = Math.min(maxYBR, ybrTables.length);
for (int i = 0; i < m; i++) {
if ((ybrTables[i] != null) && (ybrTables[i].contains(domHash.getBytes()))) {
//System.out.println("YBR FOUND: " + urlHash + " (" + i + ")");
return i;
}
}
//System.out.println("NOT FOUND: " + urlHash);
return 15;
}
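// computes an additional ranking bonus for a result entry: the position in the result
// list, media link counts matching the content domain, matches of the 'prefer' pattern,
// topwords appearing in the url or title, and query words appearing in the url or title
// all add to the value, so a larger post-ranking value is better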
public long postRanking(
final Set<String> topwords,
final plasmaSearchEvent.ResultEntry rentry,
final int position) {
long r = (255 - position) << 8;
// for media search: prefer pages with many links
if (query.contentdom == plasmaSearchQuery.CONTENTDOM_IMAGE) r += rentry.limage() << query.ranking.coeff_cathasimage;
if (query.contentdom == plasmaSearchQuery.CONTENTDOM_AUDIO) r += rentry.laudio() << query.ranking.coeff_cathasaudio;
if (query.contentdom == plasmaSearchQuery.CONTENTDOM_VIDEO) r += rentry.lvideo() << query.ranking.coeff_cathasvideo;
if (query.contentdom == plasmaSearchQuery.CONTENTDOM_APP ) r += rentry.lapp() << query.ranking.coeff_cathasapp;
// prefer hit with 'prefer' pattern
if (rentry.url().toNormalform(true, true).matches(query.prefer)) r += 256 << query.ranking.coeff_prefer;
if (rentry.title().matches(query.prefer)) r += 256 << query.ranking.coeff_prefer;
// apply 'common-sense' heuristic using references
final String urlstring = rentry.url().toNormalform(true, true);
final String[] urlcomps = htmlFilterContentScraper.urlComps(urlstring);
final String[] descrcomps = rentry.title().toLowerCase().split(htmlFilterContentScraper.splitrex);
for (int j = 0; j < urlcomps.length; j++) {
if (topwords.contains(urlcomps[j])) r += Math.max(1, 256 - urlstring.length()) << query.ranking.coeff_urlcompintoplist;
}
for (int j = 0; j < descrcomps.length; j++) {
if (topwords.contains(descrcomps[j])) r += Math.max(1, 256 - rentry.title().length()) << query.ranking.coeff_descrcompintoplist;
}
// apply query-in-result matching
final Set<byte[]> urlcomph = Word.words2hashSet(urlcomps);
final Set<byte[]> descrcomph = Word.words2hashSet(descrcomps);
final Iterator<byte[]> shi = query.queryHashes.iterator();
byte[] queryhash;
while (shi.hasNext()) {
queryhash = shi.next();
if (urlcomph.contains(queryhash)) r += 256 << query.ranking.coeff_appurl;
if (descrcomph.contains(queryhash)) r += 256 << query.ranking.coeff_app_dc_title;
}
return r;
}
}