mirror of
https://github.com/yacy/yacy_search_server.git
synced 2024-09-21 00:00:13 +02:00
b45701d20f
This time it works like this:
- each peer provides its ranking information via the yacy/idx.json servlet
- peers with more than 1 GB of RAM load this information from all other peers, combine it into one ranking table and store it locally. This happens concurrently during peer start-up. The newly generated file with the ranking information is at DATA/INDEX/<network>/QUEUES/hostIndex.blob
- this index is then processed to generate a fresh ranking table. Peers that can calculate their own ranking table will do so on every start-up to get the latest feature updates until the feature is stable
- I computed new ranking tables as part of the distribution and commit them here as well
- the YBR feature must be enabled manually by setting the YBR value in the ranking servlet to level 15. A default configuration for that is also in this commit, but it does not affect your current installation, only fresh peers
- a recursive block rank refinement is implemented but disabled at this point; it needs more testing
Please play around with the ranking settings and see whether this helps to make search results better.
git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@7729 6c8d7289-2bf4-0310-a012-ef5d649a1542
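As an illustration of the aggregation step described above: each peer exposes its block-rank data under yacy/idx.json, and a peer with enough RAM fetches and merges all of them into one host-ranking table. The sketch below is only an illustration under assumptions — the flat hosthash-to-score JSON layout, the merge-by-maximum rule and the mergeRankings/BlockRankMergeSketch names are illustrative, not the actual YaCy implementation.

import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.net.URL;
import java.util.HashMap;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

public class BlockRankMergeSketch {

    // assumed payload layout: {"<hosthash>": <score>, ...}
    private static final Pattern ENTRY = Pattern.compile("\"([0-9A-Za-z_-]{6})\"\\s*:\\s*(\\d+)");

    // fetch yacy/idx.json from every reachable peer and combine the results
    public static Map<String, Integer> mergeRankings(final Iterable<String> peerAddresses) {
        final Map<String, Integer> hostRanking = new HashMap<String, Integer>();
        for (final String peer : peerAddresses) {
            try {
                final URL url = new URL("http://" + peer + "/yacy/idx.json");
                final BufferedReader in = new BufferedReader(new InputStreamReader(url.openStream(), "UTF-8"));
                final StringBuilder json = new StringBuilder();
                String line;
                while ((line = in.readLine()) != null) json.append(line);
                in.close();
                final Matcher m = ENTRY.matcher(json);
                while (m.find()) {
                    // merge rule (assumed): keep the best score seen for each host hash
                    final String hosthash = m.group(1);
                    final int score = Integer.parseInt(m.group(2));
                    final Integer known = hostRanking.get(hosthash);
                    if (known == null || score > known.intValue()) hostRanking.put(hosthash, score);
                }
            } catch (final Exception e) {
                // peers that are unreachable during start-up are simply skipped
            }
        }
        return hostRanking; // would then be written to DATA/INDEX/<network>/QUEUES/hostIndex.blob
    }
}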
690 lines
31 KiB
Java
// RankingProcess.java
// (C) 2007 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany
// first published 07.11.2007 on http://yacy.net
//
// This is a part of YaCy, a peer-to-peer based web search engine
//
// $LastChangedDate$
// $LastChangedRevision$
// $LastChangedBy$
//
// LICENSE
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA

package de.anomic.search;

import java.util.Comparator;
import java.util.ConcurrentModificationException;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;
import java.util.SortedMap;
import java.util.SortedSet;
import java.util.TreeSet;
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.TimeUnit;

import net.yacy.cora.document.MultiProtocolURI;
import net.yacy.cora.document.UTF8;
import net.yacy.cora.protocol.Scanner;
import net.yacy.cora.storage.ConcurrentScoreMap;
import net.yacy.cora.storage.ClusteredScoreMap;
import net.yacy.cora.storage.ScoreMap;
import net.yacy.cora.storage.WeakPriorityBlockingQueue;
import net.yacy.cora.storage.WeakPriorityBlockingQueue.ReverseElement;
import net.yacy.document.Condenser;
import net.yacy.kelondro.data.meta.URIMetadataRow;
import net.yacy.kelondro.data.word.Word;
import net.yacy.kelondro.data.word.WordReference;
import net.yacy.kelondro.data.word.WordReferenceVars;
import net.yacy.kelondro.logging.Log;
import net.yacy.kelondro.rwi.ReferenceContainer;
import net.yacy.kelondro.rwi.TermSearch;
import net.yacy.kelondro.util.EventTracker;

import de.anomic.yacy.graphics.ProfilingGraph;

public final class RankingProcess extends Thread {

    private static final int maxDoubleDomAll = 1000, maxDoubleDomSpecial = 10000;

    private final QueryParams query;
    private final SortedSet<byte[]> urlhashes; // map for double-check; String/Long relation, addresses ranking number (backreference for deletion)
    private final int[] flagcount; // flag counter
    private final SortedSet<byte[]> misses; // contains url-hashes that could not be found in the LURL-DB
    private int sortout; // counter for references that have been sorted out for other reasons
    //private final int[] domZones;
    private SortedMap<byte[], ReferenceContainer<WordReference>> localSearchInclusion;

    private int remote_resourceSize, remote_indexCount, remote_peerCount;
    private int local_resourceSize, local_indexCount;
    private final WeakPriorityBlockingQueue<WordReferenceVars> stack;
    private int feeders;
    private final ConcurrentHashMap<String, WeakPriorityBlockingQueue<WordReferenceVars>> doubleDomCache; // key = domhash (6 bytes); value = like stack
    //private final HandleSet handover; // key = urlhash; used for double-check of urls that had been handed over to search process

    private final ScoreMap<String> ref; // reference score computation for the commonSense heuristic
    private final ScoreMap<String> hostNavigator; // a counter for the appearance of the host hash
    private final Map<String, String> hostResolver; // a mapping from a host hash (6 bytes) to the full url hash of one of the urls that have that host hash
    private final ScoreMap<String> authorNavigator;
    private final ScoreMap<String> namespaceNavigator;
    private final ReferenceOrder order;
    private final long startTime;
    private boolean addRunning;

    public RankingProcess(final QueryParams query, final ReferenceOrder order, final int maxentries) {
        // we collect the urlhashes and construct a list with urlEntry objects
        // attention: if minEntries is too high, this method will not terminate within the maxTime
        // sortorder: 0 = hash, 1 = url, 2 = ranking
        this.addRunning = true;
        this.localSearchInclusion = null;
        this.stack = new WeakPriorityBlockingQueue<WordReferenceVars>(maxentries);
        this.doubleDomCache = new ConcurrentHashMap<String, WeakPriorityBlockingQueue<WordReferenceVars>>();
        this.query = query;
        this.order = order;
        this.remote_peerCount = 0;
        this.remote_resourceSize = 0;
        this.remote_indexCount = 0;
        this.local_resourceSize = 0;
        this.local_indexCount = 0;
        this.urlhashes = new TreeSet<byte[]>(URIMetadataRow.rowdef.objectOrder);
        //this.urlhashes = new HandleSet(URIMetadataRow.rowdef.primaryKeyLength, URIMetadataRow.rowdef.objectOrder, 0);
        this.misses = new TreeSet<byte[]>(URIMetadataRow.rowdef.objectOrder);
        //this.misses = new HandleSet(URIMetadataRow.rowdef.primaryKeyLength, URIMetadataRow.rowdef.objectOrder, 0);
        this.sortout = 0;
        this.flagcount = new int[32];
        for (int i = 0; i < 32; i++) {this.flagcount[i] = 0;}
        this.hostNavigator = new ConcurrentScoreMap<String>();
        this.hostResolver = new ConcurrentHashMap<String, String>();
        this.authorNavigator = new ConcurrentScoreMap<String>();
        this.namespaceNavigator = new ConcurrentScoreMap<String>();
        this.ref = new ConcurrentScoreMap<String>();
        this.feeders = 1;
        this.startTime = System.currentTimeMillis();
    }
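
    // The process forms a producer/consumer pipeline: feeders (the local run()
    // below and any remote search threads) push normalized word references into
    // 'stack' via add(); the stack is a weak priority queue that keeps only the
    // best 'maxentries' elements, and consumers drain ranked results through
    // takeURL(). 'feeders' starts at 1 because the local feeder always exists.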

    public QueryParams getQuery() {
        return this.query;
    }

    public ReferenceOrder getOrder() {
        return this.order;
    }

    @Override
    public void run() {
        // do a search

        // sort the local containers and truncate them to a limited count,
        // so that subsequent sortings together with the global results will be fast
        try {
            long timer = System.currentTimeMillis();
            final TermSearch<WordReference> search = this.query.getSegment().termIndex().query(
                    query.queryHashes,
                    query.excludeHashes,
                    null,
                    Segment.wordReferenceFactory,
                    query.maxDistance);
            this.localSearchInclusion = search.inclusion();
            final ReferenceContainer<WordReference> index = search.joined();
            EventTracker.update(EventTracker.EClass.SEARCH, new ProfilingGraph.searchEvent(query.id(true), SearchEvent.Type.JOIN, query.queryString, index.size(), System.currentTimeMillis() - timer), false);
            if (index.isEmpty()) {
                return;
            }

            add(index, true, "local index: " + this.query.getSegment().getLocation(), -1, true);
        } catch (final Exception e) {
            Log.logException(e);
        } finally {
            oneFeederTerminated();
        }
    }

    public void add(
            final ReferenceContainer<WordReference> index,
            final boolean local,
            String resourceName,
            final int fullResource,
            final boolean finalizeAddAtEnd) {
        // we collect the urlhashes and construct a list with urlEntry objects
        // attention: if minEntries is too high, this method will not terminate within the maxTime

        this.addRunning = true;

        assert (index != null);
        if (index.isEmpty()) return;

        if (local) {
            this.local_resourceSize += index.size();
        } else {
            assert fullResource >= 0 : "fullResource = " + fullResource;
            this.remote_resourceSize += fullResource;
            this.remote_peerCount++;
        }

        long timer = System.currentTimeMillis();

        // normalize entries
        final BlockingQueue<WordReferenceVars> decodedEntries = this.order.normalizeWith(index);
        EventTracker.update(EventTracker.EClass.SEARCH, new ProfilingGraph.searchEvent(query.id(true), SearchEvent.Type.NORMALIZING, resourceName, index.size(), System.currentTimeMillis() - timer), false);

        // iterate over normalized entries and select some that are better than currently stored
        timer = System.currentTimeMillis();
        boolean nav_hosts = this.query.navigators.equals("all") || this.query.navigators.indexOf("hosts") >= 0;

        // apply all constraints
        try {
            WordReferenceVars iEntry;
            while (true) {
                iEntry = decodedEntries.poll(1, TimeUnit.SECONDS);
                if (iEntry == null || iEntry == WordReferenceVars.poison) break;
                assert (iEntry.metadataHash().length == index.row().primaryKeyLength);
                //if (iEntry.urlHash().length() != index.row().primaryKeyLength) continue;

                // increase flag counts
                for (int j = 0; j < 32; j++) {
                    if (iEntry.flags().get(j)) {flagcount[j]++;}
                }

                // check constraints
                if (!testFlags(iEntry)) {
                    this.sortout++;
                    continue;
                }

                // check document domain
                if (query.contentdom != ContentDomain.TEXT) {
                    if ((query.contentdom == ContentDomain.AUDIO) && (!(iEntry.flags().get(Condenser.flag_cat_hasaudio)))) { this.sortout++; continue; }
                    if ((query.contentdom == ContentDomain.VIDEO) && (!(iEntry.flags().get(Condenser.flag_cat_hasvideo)))) { this.sortout++; continue; }
                    if ((query.contentdom == ContentDomain.IMAGE) && (!(iEntry.flags().get(Condenser.flag_cat_hasimage)))) { this.sortout++; continue; }
                    if ((query.contentdom == ContentDomain.APP  ) && (!(iEntry.flags().get(Condenser.flag_cat_hasapp  )))) { this.sortout++; continue; }
                }

                // check tld domain
                /*
                if ((DigestURI.domDomain(iEntry.metadataHash()) & this.query.zonecode) == 0) {
                    // filter out all tld that do not match with wanted tld domain
                    this.sortout++;
                    continue;
                }
                */

                // count domZones
                //this.domZones[DigestURI.domDomain(iEntry.metadataHash())]++;

                // check site constraints
                String domhash = UTF8.String(iEntry.metadataHash(), 6, 6);
                if (query.sitehash == null) {
                    // no site constraint there; maybe collect host navigation information
                    if (nav_hosts && query.urlMask_isCatchall) {
                        this.hostNavigator.inc(domhash);
                        this.hostResolver.put(domhash, UTF8.String(iEntry.metadataHash()));
                    }
                } else {
                    if (!domhash.equals(query.sitehash)) {
                        // filter out all domains that do not match with the site constraint
                        this.sortout++;
                        continue;
                    }
                }

                // finally make a double-check and insert the result into the stack
                if (urlhashes.add(iEntry.metadataHash())) {
                    rankingtryloop: while (true) {
                        try {
                            stack.put(new ReverseElement<WordReferenceVars>(iEntry, this.order.cardinal(iEntry))); // inserts the element and removes the worst (which is smallest)
                            break rankingtryloop;
                        } catch (ArithmeticException e) {
                            // this may happen if the concurrent normalizer changes values during cardinal computation
                            continue rankingtryloop;
                        }
                    }
                    // increase counter for statistics
                    if (local) this.local_indexCount++; else this.remote_indexCount++;
                }
            }

        } catch (InterruptedException e) {} finally {
            if (finalizeAddAtEnd) this.addRunning = false;
        }

        //if ((query.neededResults() > 0) && (container.size() > query.neededResults())) remove(true, true);
        EventTracker.update(EventTracker.EClass.SEARCH, new ProfilingGraph.searchEvent(query.id(true), SearchEvent.Type.PRESORT, resourceName, index.size(), System.currentTimeMillis() - timer), false);
    }
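
    // A minimal usage sketch for remote feeders (assumed call pattern; the names
    // 'remoteContainer', 'peerName' and 'remoteHitCount' are placeholders):
    //
    //   rankingProcess.moreFeeders(1);
    //   try {
    //       rankingProcess.add(remoteContainer, false, "remote peer " + peerName, remoteHitCount, true);
    //   } finally {
    //       rankingProcess.oneFeederTerminated();
    //   }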

    /**
     * method to signal the incoming stack that one feeder has terminated
     */
    public void oneFeederTerminated() {
        this.feeders--;
        assert this.feeders >= 0 : "feeders = " + this.feeders;
    }

    protected void moreFeeders(final int countMoreFeeders) {
        this.feeders += countMoreFeeders;
    }

    public boolean feedingIsFinished() {
        return System.currentTimeMillis() - this.startTime > 50 && this.feeders == 0;
    }
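
    // feedingIsFinished() deliberately reports false during the first 50 ms after
    // construction; this grace period keeps consumers from declaring the feed
    // finished before the first feeder had a chance to deliver anything.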

    private boolean testFlags(final WordReference ientry) {
        if (query.constraint == null) return true;
        // test if ientry matches with filter
        // if all = true: let only entries pass that have all matching bits
        // if all = false: let all entries pass that have at least one matching bit
        if (query.allofconstraint) {
            for (int i = 0; i < 32; i++) {
                if ((query.constraint.get(i)) && (!ientry.flags().get(i))) return false;
            }
            return true;
        }
        for (int i = 0; i < 32; i++) {
            if ((query.constraint.get(i)) && (ientry.flags().get(i))) return true;
        }
        return false;
    }
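
    // Worked example: if the constraint sets the two bits flag_cat_indexof and
    // flag_cat_haslocation, then allofconstraint = true accepts only entries
    // whose flags carry both bits, while allofconstraint = false already accepts
    // an entry that carries just one of them.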

    protected Map<byte[], ReferenceContainer<WordReference>> searchContainerMap() {
        // direct access to the result maps is needed for abstract generation
        // this is only available if execQuery() was called before
        return localSearchInclusion;
    }

    private WeakPriorityBlockingQueue.Element<WordReferenceVars> takeRWI(final boolean skipDoubleDom, final long waitingtime) {

        // returns from the current RWI list the best entry and removes this entry from the list
        WeakPriorityBlockingQueue<WordReferenceVars> m;
        WeakPriorityBlockingQueue.Element<WordReferenceVars> rwi = null;

        // take one entry from the stack if there are entries on that stack or the feeding is not yet finished
        try {
            //System.out.println("stack.poll: feeders = " + this.feeders + ", stack.sizeQueue = " + stack.sizeQueue());
            int loops = 0; // a loop counter to terminate the reading if all the results are from the same domain
            long timeout = System.currentTimeMillis() + waitingtime;
            while (((!feedingIsFinished() && this.addRunning) || stack.sizeQueue() > 0) &&
                   (this.query.itemsPerPage < 1 || loops++ < this.query.itemsPerPage)) {
                if (waitingtime <= 0) {
                    rwi = stack.poll();
                } else timeoutloop: while (System.currentTimeMillis() < timeout) {
                    if (feedingIsFinished() && stack.sizeQueue() == 0) break timeoutloop;
                    rwi = stack.poll(50);
                    if (rwi != null) break timeoutloop;
                }
                if (rwi == null) break;
                if (!skipDoubleDom) {
                    //System.out.println("!skipDoubleDom");
                    return rwi;
                }

                // check doubledom
                final String domhash = UTF8.String(rwi.getElement().metadataHash(), 6, 6);
                synchronized (this.doubleDomCache) {
                    m = this.doubleDomCache.get(domhash);
                    if (m == null) {
                        // first appearance of dom. we create an entry to signal that one of that domain was already returned
                        m = new WeakPriorityBlockingQueue<WordReferenceVars>((query.specialRights) ? maxDoubleDomSpecial : maxDoubleDomAll);
                        this.doubleDomCache.put(domhash, m);
                        return rwi;
                    }
                    // second appearance of dom
                    m.put(rwi);
                }
            }
        } catch (InterruptedException e1) {}
        if (this.doubleDomCache.isEmpty()) return null;

        // no more entries in sorted RWI entries. Now take Elements from the doubleDomCache
        // find the best entry from all caches
        WeakPriorityBlockingQueue.Element<WordReferenceVars> bestEntry = null;
        WeakPriorityBlockingQueue.Element<WordReferenceVars> o;
        synchronized (this.doubleDomCache) {
            final Iterator<WeakPriorityBlockingQueue<WordReferenceVars>> i = this.doubleDomCache.values().iterator();
            while (i.hasNext()) {
                try {
                    m = i.next();
                } catch (ConcurrentModificationException e) {
                    Log.logException(e);
                    continue; // not the best solution...
                }
                if (m == null) continue;
                if (m.isEmpty()) continue;
                if (bestEntry == null) {
                    bestEntry = m.peek();
                    continue;
                }
                o = m.peek();
                if (o == null) continue;
                if (o.getWeight() < bestEntry.getWeight()) {
                    bestEntry = o;
                }
            }
            if (bestEntry == null) return null;

            // finally remove the best entry from the doubledom cache
            m = this.doubleDomCache.get(UTF8.String(bestEntry.getElement().metadataHash()).substring(6));
            bestEntry = m.poll();
        }
        return bestEntry;
    }
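
    // Double-domain balancing in short: the first hit from each domain passes
    // straight through, while every further hit from an already-seen domain is
    // parked in a per-domain queue in doubleDomCache. Only when the main stack
    // runs dry are parked entries released, always taking the entry with the
    // smallest weight (which ranks best here) across all per-domain queues, so
    // a single domain cannot flood the top of the result list.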

    /**
     * get one metadata entry from the ranked results. This will be the 'best' entry so far
     * according to the applied ranking. If there are no more entries left or the timeout
     * limit is reached then null is returned. The caller may distinguish the timeout case
     * from the case where no more entries will ever arrive by calling this.feedingIsFinished()
     * @param skipDoubleDom true if double-domain entries shall be skipped
     * @param waitingtime the time this method may take for a result computation
     * @return a metadata entry for a url
     */
    public URIMetadataRow takeURL(final boolean skipDoubleDom, final long waitingtime) {
        // returns from the current RWI list the best URL entry and removes this entry from the list
        final long timeout = System.currentTimeMillis() + Math.max(10, waitingtime);
        int p = -1;
        long timeleft;
        while ((timeleft = timeout - System.currentTimeMillis()) > 0) {
            //System.out.println("timeleft = " + timeleft);
            final WeakPriorityBlockingQueue.Element<WordReferenceVars> obrwi = takeRWI(skipDoubleDom, timeleft);
            if (obrwi == null) return null; // all time was already wasted in takeRWI to get another element
            final URIMetadataRow page = this.query.getSegment().urlMetadata().load(obrwi);
            if (page == null) {
                misses.add(obrwi.getElement().metadataHash());
                continue;
            }

            // prepare values for constraint check
            final URIMetadataRow.Components metadata = page.metadata();

            // check errors
            if (metadata == null) {
                this.sortout++;
                continue; // rare case where the url is corrupted
            }

            if (!query.urlMask_isCatchall) {
                // check url mask
                if (!metadata.matches(query.urlMask)) {
                    this.sortout++;
                    continue;
                }

                // in case we do not have a catchall filter for urls
                // we must also construct the domain navigator here
                //if (query.sitehash == null) {
                //    this.hostNavigator.inc(UTF8.String(urlhash, 6, 6));
                //    this.hostResolver.put(UTF8.String(urlhash, 6, 6), UTF8.String(urlhash));
                //}
            }

            // check for more errors
            if (metadata.url() == null) {
                this.sortout++;
                continue; // rare case where the url is corrupted
            }

            final String pageurl = metadata.url().toNormalform(true, true);
            final String pageauthor = metadata.dc_creator();
            final String pagetitle = metadata.dc_title().toLowerCase();

            // check exclusion
            if ((QueryParams.anymatch(pagetitle, query.excludeHashes)) ||
                (QueryParams.anymatch(pageurl.toLowerCase(), query.excludeHashes)) ||
                (QueryParams.anymatch(pageauthor.toLowerCase(), query.excludeHashes))) {
                this.sortout++;
                continue;
            }

            // check index-of constraint
            if ((query.constraint != null) &&
                (query.constraint.get(Condenser.flag_cat_indexof)) &&
                (!(pagetitle.startsWith("index of")))) {
                final Iterator<byte[]> wi = query.queryHashes.iterator();
                while (wi.hasNext()) {
                    this.query.getSegment().termIndex().removeDelayed(wi.next(), page.hash());
                }
                this.sortout++;
                continue;
            }

            // check location constraint
            if ((query.constraint != null) &&
                (query.constraint.get(Condenser.flag_cat_haslocation)) &&
                (metadata.lat() == 0.0f || metadata.lon() == 0.0f)) {
                this.sortout++;
                continue;
            }

            // check content domain
            if ((query.contentdom == ContentDomain.AUDIO && page.laudio() == 0) ||
                (query.contentdom == ContentDomain.VIDEO && page.lvideo() == 0) ||
                (query.contentdom == ContentDomain.IMAGE && page.limage() == 0) ||
                (query.contentdom == ContentDomain.APP && page.lapp() == 0)) {
                this.sortout++;
                continue;
            }

            // evaluate information of metadata for navigation
            // author navigation:
            if (pageauthor != null && pageauthor.length() > 0) {
                // compute the author hash for the author filter check
                String authorhash = UTF8.String(Word.word2hash(pageauthor));

                // check if we already are filtering for authors
                if (this.query.authorhash != null && !this.query.authorhash.equals(authorhash)) {
                    this.sortout++;
                    continue;
                }

                // add author to the author navigator
                this.authorNavigator.inc(pageauthor);
            } else if (this.query.authorhash != null) {
                this.sortout++;
                continue;
            }

            // namespace navigation
            String pagepath = metadata.url().getPath();
            if ((p = pagepath.indexOf(':')) >= 0) {
                pagepath = pagepath.substring(0, p);
                p = pagepath.lastIndexOf('/');
                if (p >= 0) {
                    pagepath = pagepath.substring(p + 1);
                    this.namespaceNavigator.inc(pagepath);
                }
            }

            // check Scanner
            if (!Scanner.acceptURL(metadata.url())) {
                this.sortout++;
                continue;
            }

            // accept url
            return page;
        }
        return null;
    }
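
    // A minimal consumer sketch (assumed call pattern; 'maxResults' is a
    // placeholder): drain ranked URLs until the budget is reached or null
    // signals timeout/exhaustion, which can be told apart via feedingIsFinished().
    //
    //   URIMetadataRow page = null;
    //   int found = 0;
    //   while (found < maxResults && (page = rankingProcess.takeURL(true, 1000)) != null) {
    //       found++; // hand 'page' over to snippet fetching / result rendering
    //   }
    //   if (page == null && !rankingProcess.feedingIsFinished()) {
    //       // timeout: more results may still arrive from remote peers
    //   }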

    public int sizeQueue() {
        int c = stack.sizeQueue();
        for (WeakPriorityBlockingQueue<WordReferenceVars> s: this.doubleDomCache.values()) {
            c += s.sizeQueue();
        }
        return c;
    }

    public int sizeAvailable() {
        int c = stack.sizeAvailable();
        for (WeakPriorityBlockingQueue<WordReferenceVars> s: this.doubleDomCache.values()) {
            c += s.sizeAvailable();
        }
        return c;
    }

    public boolean isEmpty() {
        if (!stack.isEmpty()) return false;
        for (WeakPriorityBlockingQueue<WordReferenceVars> s: this.doubleDomCache.values()) {
            if (!s.isEmpty()) return false;
        }
        return true;
    }

    public int[] flagCount() {
        return flagcount;
    }

    // "results from a total number of <remote_resourceSize + local_resourceSize> known (<local_resourceSize> local, <remote_resourceSize> remote), <remote_indexCount> links from <remote_peerCount> other YaCy peers."

    public int filteredCount() {
        // the number of index entries that are considered as result set
        return this.stack.sizeAvailable();
    }

    public int getLocalIndexCount() {
        // the number of results in the local peer after filtering
        return this.local_indexCount;
    }

    public int getRemoteIndexCount() {
        // the number of result contributions from all the remote peers
        return this.remote_indexCount;
    }

    public int getRemoteResourceSize() {
        // the number of all hits in all the remote peers
        return Math.max(this.remote_resourceSize, this.remote_indexCount);
    }

    public int getRemotePeerCount() {
        // the number of remote peers that have contributed
        return this.remote_peerCount;
    }

    public Iterator<byte[]> miss() {
        return this.misses.iterator();
    }

    public int getMissCount() {
        return this.misses.size();
    }

    public int getSortOutCount() {
        return this.sortout;
    }

    public ScoreMap<String> getNamespaceNavigator() {
        if (!this.query.navigators.equals("all") && this.query.navigators.indexOf("namespace") < 0) return new ClusteredScoreMap<String>();
        if (this.namespaceNavigator.sizeSmaller(2)) this.namespaceNavigator.clear(); // navigators with one entry are not useful
        return this.namespaceNavigator;
    }

    public ScoreMap<String> getHostNavigator() {
        ScoreMap<String> result = new ConcurrentScoreMap<String>();
        if (!this.query.navigators.equals("all") && this.query.navigators.indexOf("hosts") < 0) return result;

        final Iterator<String> domhashs = this.hostNavigator.keys(false);
        URIMetadataRow row;
        String domhash, urlhash, hostname;
        if (this.hostResolver != null) while (domhashs.hasNext() && result.sizeSmaller(30)) {
            domhash = domhashs.next();
            if (domhash == null) continue;
            urlhash = this.hostResolver.get(domhash);
            row = urlhash == null ? null : this.query.getSegment().urlMetadata().load(UTF8.getBytes(urlhash));
            hostname = row == null ? null : row.metadata().url().getHost();
            if (hostname != null) {
                result.set(hostname, this.hostNavigator.get(domhash));
            }
        }
        if (result.sizeSmaller(2)) result.clear(); // navigators with one entry are not useful
        return result;
    }

    public static final Comparator<Map.Entry<String, Integer>> mecomp = new Comparator<Map.Entry<String, Integer>>() {
        public int compare(Map.Entry<String, Integer> o1, Map.Entry<String, Integer> o2) {
            if (o1.getValue().intValue() < o2.getValue().intValue()) return 1;
            if (o2.getValue().intValue() < o1.getValue().intValue()) return -1;
            return 0;
        }
    };
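
    // mecomp orders map entries by descending integer value, so a sort with this
    // comparator puts the highest-scoring entry first.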

    public ScoreMap<String> getTopicNavigator(int count) {
        // create a list of words computed by statistics over all
        // words that appeared in the url or the description of all urls
        final ScoreMap<String> result = new ConcurrentScoreMap<String>();
        if (!this.query.navigators.equals("all") && this.query.navigators.indexOf("topics") < 0) return result;
        if (this.ref.sizeSmaller(2)) this.ref.clear(); // navigators with one entry are not useful
        final Map<String, Float> counts = new HashMap<String, Float>();
        final Iterator<String> i = this.ref.keys(false);
        String word;
        byte[] termHash;
        int c;
        float q, min = Float.MAX_VALUE, max = Float.MIN_VALUE;
        int ic = count;
        while (ic-- > 0 && i.hasNext()) {
            word = i.next();
            if (word == null) continue;
            termHash = Word.word2hash(word);
            c = this.query.getSegment().termIndex().count(termHash);
            if (c > 0) {
                q = ((float) this.ref.get(word)) / ((float) c);
                min = Math.min(min, q);
                max = Math.max(max, q);
                counts.put(word, q);
            }
        }
        if (max > min) for (Map.Entry<String, Float> ce: counts.entrySet()) {
            result.set(ce.getKey(), (int) (((double) count) * (ce.getValue() - min) / (max - min)));
        }
        return result; // was 'return this.ref;', which discarded the normalized scores computed above
    }
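
    // Topic score in short: for each candidate word, q = ref(word) / count(word),
    // where ref(word) counts appearances in the accepted result titles and
    // count(word) is the word's frequency in the local term index; q is then
    // linearly rescaled to the integer range [0, count], so words that are rare
    // in the index but frequent in the results score highest.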

    public void addTopic(final String[] words) {
        String word;
        for (final String w : words) {
            word = w.toLowerCase();
            if (word.length() > 2 &&
                "http_html_php_ftp_www_com_org_net_gov_edu_index_home_page_for_usage_the_and_zum_der_die_das_und_the_zur_bzw_mit_blog_wiki_aus_bei_off".indexOf(word) < 0 &&
                !query.queryHashes.has(Word.word2hash(word)) &&
                word.matches("[a-z]+") &&
                !Switchboard.badwords.contains(word) &&
                !Switchboard.stopwords.contains(word)) {
                ref.inc(word);
            }
        }
    }
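
    // Note that the underscore-joined word list above is checked with indexOf,
    // i.e. as a substring test: a word is rejected if it occurs anywhere inside
    // that string, which also catches fragments such as "tml" (inside "html"),
    // not only the listed words themselves.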

    protected void addTopics(final ResultEntry resultEntry) {
        // take out relevant information for reference computation
        if ((resultEntry.url() == null) || (resultEntry.title() == null)) return;
        //final String[] urlcomps = htmlFilterContentScraper.urlComps(resultEntry.url().toNormalform(true, true)); // word components of the url
        final String[] descrcomps = MultiProtocolURI.splitpattern.split(resultEntry.title().toLowerCase()); // words in the description

        // add references
        //addTopic(urlcomps);
        addTopic(descrcomps);
    }

    public ScoreMap<String> getAuthorNavigator() {
        // returns the author navigator, counted over all accepted result entries
        if (!this.query.navigators.equals("all") && this.query.navigators.indexOf("authors") < 0) return new ConcurrentScoreMap<String>();
        if (this.authorNavigator.sizeSmaller(2)) this.authorNavigator.clear(); // navigators with one entry are not useful
        return this.authorNavigator;
    }

}