mirror of
https://github.com/yacy/yacy_search_server.git
synced 2024-09-21 00:00:13 +02:00
99bf0b8e41
divided that class into three parts: - the peers object is now hosted by the plasmaSwitchboard - the crawler elements are now in a new class, crawler.CrawlerSwitchboard - the index elements are core of the new segment data structure, which is a bundle of different indexes for the full text and (in the future) navigation indexes and the metadata store. The new class is now in kelondro.text.Segment The refactoring is inspired by the roadmap to create index segments, the option to host different indexes on one peer. git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@5990 6c8d7289-2bf4-0310-a012-ef5d649a1542
563 lines
25 KiB
Java
563 lines
25 KiB
Java
// plasmaSearchRankingProcess.java
|
|
// (C) 2007 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany
|
|
// first published 07.11.2007 on http://yacy.net
|
|
//
|
|
// This is a part of YaCy, a peer-to-peer based web search engine
|
|
//
|
|
// $LastChangedDate: 2006-04-02 22:40:07 +0200 (So, 02 Apr 2006) $
|
|
// $LastChangedRevision: 1986 $
|
|
// $LastChangedBy: orbiter $
|
|
//
|
|
// LICENSE
|
|
//
|
|
// This program is free software; you can redistribute it and/or modify
|
|
// it under the terms of the GNU General Public License as published by
|
|
// the Free Software Foundation; either version 2 of the License, or
|
|
// (at your option) any later version.
|
|
//
|
|
// This program is distributed in the hope that it will be useful,
|
|
// but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
// GNU General Public License for more details.
|
|
//
|
|
// You should have received a copy of the GNU General Public License
|
|
// along with this program; if not, write to the Free Software
|
|
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
|
|
|
package de.anomic.plasma;
|
|
|
|
import java.io.File;
|
|
import java.io.IOException;
|
|
import java.util.ArrayList;
|
|
import java.util.Arrays;
|
|
import java.util.Comparator;
|
|
import java.util.HashMap;
|
|
import java.util.Iterator;
|
|
import java.util.Map;
|
|
import java.util.Set;
|
|
import java.util.TreeSet;
|
|
import java.util.concurrent.ConcurrentHashMap;
|
|
|
|
import de.anomic.htmlFilter.htmlFilterContentScraper;
|
|
import de.anomic.kelondro.index.BinSearch;
|
|
import de.anomic.kelondro.order.Digest;
|
|
import de.anomic.kelondro.text.Reference;
|
|
import de.anomic.kelondro.text.ReferenceContainer;
|
|
import de.anomic.kelondro.text.ReferenceOrder;
|
|
import de.anomic.kelondro.text.Segment;
|
|
import de.anomic.kelondro.text.metadataPrototype.URLMetadataRow;
|
|
import de.anomic.kelondro.text.referencePrototype.WordReference;
|
|
import de.anomic.kelondro.text.referencePrototype.WordReferenceVars;
|
|
import de.anomic.kelondro.util.ScoreCluster;
|
|
import de.anomic.kelondro.util.SortStack;
|
|
import de.anomic.kelondro.util.FileUtils;
|
|
import de.anomic.plasma.parser.Word;
|
|
import de.anomic.plasma.parser.Condenser;
|
|
import de.anomic.server.serverProfiling;
|
|
import de.anomic.yacy.yacyURL;
|
|
|
|
public final class plasmaSearchRankingProcess {
|
|
|
|
public static BinSearch[] ybrTables = null; // block-rank tables
|
|
public static final int maxYBR = 3; // the lower this value, the faster the search
|
|
private static boolean useYBR = true;
|
|
private static final int maxDoubleDomAll = 20, maxDoubleDomSpecial = 10000;
|
|
|
|
private final SortStack<WordReferenceVars> stack;
|
|
private final HashMap<String, SortStack<WordReferenceVars>> doubleDomCache; // key = domhash (6 bytes); value = like stack
|
|
private final HashMap<String, String> handover; // key = urlhash, value = urlstring; used for double-check of urls that had been handed over to search process
|
|
private final plasmaSearchQuery query;
|
|
private final int maxentries;
|
|
private int remote_peerCount, remote_indexCount, remote_resourceSize, local_resourceSize;
|
|
private final ReferenceOrder order;
|
|
private final ConcurrentHashMap<String, Integer> urlhashes; // map for double-check; String/Long relation, addresses ranking number (backreference for deletion)
|
|
private final ScoreCluster<String> ref; // reference score computation for the commonSense heuristic
|
|
private final int[] flagcount; // flag counter
|
|
private final TreeSet<String> misses; // contains url-hashes that could not been found in the LURL-DB
|
|
private final Segment indexSegment;
|
|
private HashMap<byte[], ReferenceContainer<WordReference>>[] localSearchContainerMaps;
|
|
private final int[] domZones;
|
|
private HashMap<String, hoststat> hostNavigator;
|
|
|
|
/**
 * Creates an empty ranking process for one query.
 *
 * @param indexSegment the segment that is searched and that supplies url metadata
 * @param query        the query to rank for; if {@code null}, no ranking order is created
 * @param maxentries   size argument of the result stack and entry limit used during insertion
 * @param concurrency  concurrency level handed to the url-hash map
 */
public plasmaSearchRankingProcess(
        final Segment indexSegment,
        final plasmaSearchQuery query,
        final int maxentries,
        final int concurrency) {
    // what we search in and what we search for
    this.indexSegment = indexSegment;
    this.query = query;
    this.maxentries = maxentries;
    if (query == null) {
        this.order = null;
    } else {
        this.order = new ReferenceOrder(query.ranking, query.targetlang);
    }

    // result containers
    this.localSearchContainerMaps = null;
    this.stack = new SortStack<WordReferenceVars>(maxentries);
    this.doubleDomCache = new HashMap<String, SortStack<WordReferenceVars>>();
    this.handover = new HashMap<String, String>();
    this.urlhashes = new ConcurrentHashMap<String, Integer>(0, 0.75f, concurrency);
    this.misses = new TreeSet<String>();

    // statistics; freshly allocated int arrays are already filled with zeros
    this.remote_peerCount = 0;
    this.remote_indexCount = 0;
    this.remote_resourceSize = 0;
    this.local_resourceSize = 0;
    this.ref = new ScoreCluster<String>();
    this.flagcount = new int[32];
    this.domZones = new int[8];
    this.hostNavigator = new HashMap<String, hoststat>();
}
|
|
|
|
public long ranking(final WordReferenceVars word) {
|
|
return order.cardinal(word);
|
|
}
|
|
|
|
public int[] zones() {
|
|
return this.domZones;
|
|
}
|
|
|
|
public void execQuery() {
|
|
|
|
long timer = System.currentTimeMillis();
|
|
this.localSearchContainerMaps = indexSegment.index().searchTerm(query.queryHashes, query.excludeHashes, null);
|
|
serverProfiling.update("SEARCH", new plasmaProfiling.searchEvent(query.id(true), plasmaSearchEvent.COLLECTION, this.localSearchContainerMaps[0].size(), System.currentTimeMillis() - timer), false);
|
|
|
|
// join and exclude the local result
|
|
timer = System.currentTimeMillis();
|
|
final ReferenceContainer<WordReference> index =
|
|
ReferenceContainer.joinExcludeContainers(
|
|
Segment.wordReferenceFactory,
|
|
this.localSearchContainerMaps[0].values(),
|
|
this.localSearchContainerMaps[1].values(),
|
|
query.maxDistance);
|
|
serverProfiling.update("SEARCH", new plasmaProfiling.searchEvent(query.id(true), plasmaSearchEvent.JOIN, index.size(), System.currentTimeMillis() - timer), false);
|
|
if (index.size() == 0) {
|
|
return;
|
|
}
|
|
|
|
insertRanked(index, true, index.size());
|
|
}
|
|
|
|
public void insertRanked(final ReferenceContainer<WordReference> index, final boolean local, final int fullResource) {
|
|
// we collect the urlhashes and construct a list with urlEntry objects
|
|
// attention: if minEntries is too high, this method will not terminate within the maxTime
|
|
|
|
assert (index != null);
|
|
if (index.size() == 0) return;
|
|
if (local) {
|
|
this.local_resourceSize += fullResource;
|
|
} else {
|
|
this.remote_resourceSize += fullResource;
|
|
this.remote_peerCount++;
|
|
}
|
|
|
|
long timer = System.currentTimeMillis();
|
|
|
|
// normalize entries
|
|
final ArrayList<WordReferenceVars> decodedEntries = this.order.normalizeWith(index);
|
|
serverProfiling.update("SEARCH", new plasmaProfiling.searchEvent(query.id(true), plasmaSearchEvent.NORMALIZING, index.size(), System.currentTimeMillis() - timer), false);
|
|
|
|
// iterate over normalized entries and select some that are better than currently stored
|
|
timer = System.currentTimeMillis();
|
|
final Iterator<WordReferenceVars> i = decodedEntries.iterator();
|
|
WordReferenceVars iEntry;
|
|
Long r;
|
|
hoststat hs;
|
|
String domhash;
|
|
while (i.hasNext()) {
|
|
iEntry = i.next();
|
|
assert (iEntry.metadataHash().length() == index.row().primaryKeyLength);
|
|
//if (iEntry.urlHash().length() != index.row().primaryKeyLength) continue;
|
|
|
|
// increase flag counts
|
|
for (int j = 0; j < 32; j++) {
|
|
if (iEntry.flags().get(j)) {flagcount[j]++;}
|
|
}
|
|
|
|
// kick out entries that are too bad according to current findings
|
|
r = Long.valueOf(order.cardinal(iEntry));
|
|
if ((maxentries >= 0) && (stack.size() >= maxentries) && (stack.bottom(r.longValue()))) continue;
|
|
|
|
// check constraints
|
|
if (!testFlags(iEntry)) continue;
|
|
|
|
// check document domain
|
|
if (query.contentdom != plasmaSearchQuery.CONTENTDOM_TEXT) {
|
|
if ((query.contentdom == plasmaSearchQuery.CONTENTDOM_AUDIO) && (!(iEntry.flags().get(Condenser.flag_cat_hasaudio)))) continue;
|
|
if ((query.contentdom == plasmaSearchQuery.CONTENTDOM_VIDEO) && (!(iEntry.flags().get(Condenser.flag_cat_hasvideo)))) continue;
|
|
if ((query.contentdom == plasmaSearchQuery.CONTENTDOM_IMAGE) && (!(iEntry.flags().get(Condenser.flag_cat_hasimage)))) continue;
|
|
if ((query.contentdom == plasmaSearchQuery.CONTENTDOM_APP ) && (!(iEntry.flags().get(Condenser.flag_cat_hasapp )))) continue;
|
|
}
|
|
|
|
// check tld domain
|
|
if (!yacyURL.matchesAnyDomDomain(iEntry.metadataHash(), this.query.zonecode)) {
|
|
// filter out all tld that do not match with wanted tld domain
|
|
continue;
|
|
}
|
|
|
|
// check site constraints
|
|
if (query.sitehash != null && !iEntry.metadataHash().substring(6).equals(query.sitehash)) {
|
|
// filter out all domains that do not match with the site constraint
|
|
continue;
|
|
}
|
|
|
|
// count domZones
|
|
this.domZones[yacyURL.domDomain(iEntry.metadataHash())]++;
|
|
|
|
// get statistics for host navigator
|
|
domhash = iEntry.urlHash.substring(6);
|
|
hs = this.hostNavigator.get(domhash);
|
|
if (hs == null) {
|
|
this.hostNavigator.put(domhash, new hoststat(iEntry.urlHash));
|
|
} else {
|
|
hs.inc();
|
|
}
|
|
|
|
// insert
|
|
if ((maxentries < 0) || (stack.size() < maxentries)) {
|
|
// in case that we don't have enough yet, accept any new entry
|
|
if (urlhashes.containsKey(iEntry.metadataHash())) continue;
|
|
stack.push(iEntry, r);
|
|
} else {
|
|
// if we already have enough entries, insert only such that are necessary to get a better result
|
|
if (stack.bottom(r.longValue())) {
|
|
continue;
|
|
}
|
|
// double-check
|
|
if (urlhashes.containsKey(iEntry.metadataHash())) continue;
|
|
stack.push(iEntry, r);
|
|
}
|
|
|
|
// increase counter for statistics
|
|
if (!local) this.remote_indexCount++;
|
|
}
|
|
|
|
//if ((query.neededResults() > 0) && (container.size() > query.neededResults())) remove(true, true);
|
|
serverProfiling.update("SEARCH", new plasmaProfiling.searchEvent(query.id(true), plasmaSearchEvent.PRESORT, index.size(), System.currentTimeMillis() - timer), false);
|
|
}
|
|
|
|
/**
 * Hit counter for one host together with one url hash that was seen for that host.
 */
public class hoststat {

    /** Number of hits; a new instance starts with one hit. */
    public int count = 1;

    /** The url hash given at construction time. */
    public String hashsample;

    /**
     * @param urlhash the url hash to remember as sample
     */
    public hoststat(final String urlhash) {
        this.hashsample = urlhash;
    }

    /** Counts one more hit. */
    public void inc() {
        this.count += 1;
    }
}
|
|
|
|
public static final Comparator<hoststat> hscomp = new Comparator<hoststat>() {
|
|
public int compare(hoststat o1, hoststat o2) {
|
|
if (o1.count < o2.count) return 1;
|
|
if (o2.count < o1.count) return -1;
|
|
return 0;
|
|
}
|
|
};
|
|
|
|
/**
 * One line of the host navigator: a host name and its hit count.
 */
public class hostnaventry {

    /** Number of hits for the host. */
    public int count;

    /** The host name. */
    public String host;

    /**
     * @param host  the host name
     * @param count the number of hits for that host
     */
    public hostnaventry(final String host, final int count) {
        this.count = count;
        this.host = host;
    }
}
|
|
|
|
public ArrayList<hostnaventry> getHostNavigator(int maxentries) {
|
|
hoststat[] hsa = this.hostNavigator.values().toArray(new hoststat[this.hostNavigator.size()]);
|
|
Arrays.sort(hsa, hscomp);
|
|
int rc = Math.min(maxentries, hsa.length);
|
|
ArrayList<hostnaventry> result = new ArrayList<hostnaventry>();
|
|
URLMetadataRow mr;
|
|
yacyURL url;
|
|
for (int i = 0; i < rc; i++) {
|
|
mr = indexSegment.metadata().load(hsa[i].hashsample, null, 0);
|
|
if (mr == null) continue;
|
|
url = mr.metadata().url();
|
|
if (url == null) continue;
|
|
result.add(new hostnaventry(url.getHost(), hsa[i].count));
|
|
}
|
|
return result;
|
|
}
|
|
|
|
private boolean testFlags(final WordReference ientry) {
|
|
if (query.constraint == null) return true;
|
|
// test if ientry matches with filter
|
|
// if all = true: let only entries pass that has all matching bits
|
|
// if all = false: let all entries pass that has at least one matching bit
|
|
if (query.allofconstraint) {
|
|
for (int i = 0; i < 32; i++) {
|
|
if ((query.constraint.get(i)) && (!ientry.flags().get(i))) return false;
|
|
}
|
|
return true;
|
|
}
|
|
for (int i = 0; i < 32; i++) {
|
|
if ((query.constraint.get(i)) && (ientry.flags().get(i))) return true;
|
|
}
|
|
return false;
|
|
}
|
|
|
|
public Map<byte[], ReferenceContainer<WordReference>>[] searchContainerMaps() {
|
|
// direct access to the result maps is needed for abstract generation
|
|
// this is only available if execQuery() was called before
|
|
return localSearchContainerMaps;
|
|
}
|
|
|
|
// todo:
|
|
// - remove redundant urls (sub-path occurred before)
|
|
// - move up shorter urls
|
|
// - root-domain guessing to prefer the root domain over other urls if search word appears in domain name
|
|
|
|
|
|
private SortStack<WordReferenceVars>.stackElement bestRWI(final boolean skipDoubleDom) {
|
|
// returns from the current RWI list the best entry and removes this entry from the list
|
|
SortStack<WordReferenceVars> m;
|
|
SortStack<WordReferenceVars>.stackElement rwi;
|
|
while (stack.size() > 0) {
|
|
rwi = stack.pop();
|
|
if (rwi == null) continue; // in case that a synchronization problem occurred just go lazy over it
|
|
if (!skipDoubleDom) return rwi;
|
|
// check doubledom
|
|
final String domhash = rwi.element.metadataHash().substring(6);
|
|
m = this.doubleDomCache.get(domhash);
|
|
if (m == null) {
|
|
// first appearance of dom
|
|
m = new SortStack<WordReferenceVars>((query.specialRights) ? maxDoubleDomSpecial : maxDoubleDomAll);
|
|
this.doubleDomCache.put(domhash, m);
|
|
return rwi;
|
|
}
|
|
// second appearances of dom
|
|
m.push(rwi);
|
|
}
|
|
// no more entries in sorted RWI entries. Now take Elements from the doubleDomCache
|
|
// find best entry from all caches
|
|
final Iterator<SortStack<WordReferenceVars>> i = this.doubleDomCache.values().iterator();
|
|
SortStack<WordReferenceVars>.stackElement bestEntry = null;
|
|
SortStack<WordReferenceVars>.stackElement o;
|
|
while (i.hasNext()) {
|
|
m = i.next();
|
|
if (m == null) continue;
|
|
if (m.size() == 0) continue;
|
|
if (bestEntry == null) {
|
|
bestEntry = m.top();
|
|
continue;
|
|
}
|
|
o = m.top();
|
|
if (o.weight.longValue() < bestEntry.weight.longValue()) {
|
|
bestEntry = o;
|
|
}
|
|
}
|
|
if (bestEntry == null) return null;
|
|
// finally remove the best entry from the doubledom cache
|
|
m = this.doubleDomCache.get(bestEntry.element.metadataHash().substring(6));
|
|
o = m.pop();
|
|
assert o == null || o.element.metadataHash().equals(bestEntry.element.metadataHash()) : "bestEntry.element.metadataHash() = " + bestEntry.element.metadataHash() + ", o.element.metadataHash() = " + o.element.metadataHash();
|
|
return bestEntry;
|
|
}
|
|
|
|
public URLMetadataRow bestURL(final boolean skipDoubleDom) {
|
|
// returns from the current RWI list the best URL entry and removes this entry from the list
|
|
while ((stack.size() > 0) || (size() > 0)) {
|
|
if (((stack.size() == 0) && (size() == 0))) break;
|
|
final SortStack<WordReferenceVars>.stackElement obrwi = bestRWI(skipDoubleDom);
|
|
if (obrwi == null) continue; // *** ? this happened and the thread was suspended silently. cause?
|
|
final URLMetadataRow u = indexSegment.metadata().load(obrwi.element.metadataHash(), obrwi.element, obrwi.weight.longValue());
|
|
if (u != null) {
|
|
final URLMetadataRow.Components metadata = u.metadata();
|
|
if (metadata.url() != null) {
|
|
String urlstring = metadata.url().toNormalform(true, true);
|
|
if (urlstring == null || !urlstring.matches(query.urlMask)) continue;
|
|
this.handover.put(u.hash(), metadata.url().toNormalform(true, false)); // remember that we handed over this url
|
|
return u;
|
|
}
|
|
}
|
|
misses.add(obrwi.element.metadataHash());
|
|
}
|
|
return null;
|
|
}
|
|
|
|
public int size() {
|
|
//assert sortedRWIEntries.size() == urlhashes.size() : "sortedRWIEntries.size() = " + sortedRWIEntries.size() + ", urlhashes.size() = " + urlhashes.size();
|
|
int c = stack.size();
|
|
final Iterator<SortStack<WordReferenceVars>> i = this.doubleDomCache.values().iterator();
|
|
while (i.hasNext()) c += i.next().size();
|
|
return c;
|
|
}
|
|
|
|
public int[] flagCount() {
|
|
return flagcount;
|
|
}
|
|
|
|
// "results from a total number of <remote_resourceSize + local_resourceSize> known (<local_resourceSize> local, <remote_resourceSize> remote), <remote_indexCount> links from <remote_peerCount> other YaCy peers."
|
|
|
|
public int filteredCount() {
|
|
// the number of index entries that are considered as result set
|
|
return this.stack.size();
|
|
}
|
|
|
|
public int getRemoteIndexCount() {
|
|
// the number of result contributions from all the remote peers
|
|
return this.remote_indexCount;
|
|
}
|
|
|
|
public int getRemotePeerCount() {
|
|
// the number of remote peers that have contributed
|
|
return this.remote_peerCount;
|
|
}
|
|
|
|
public int getRemoteResourceSize() {
|
|
// the number of all hits in all the remote peers
|
|
return this.remote_resourceSize;
|
|
}
|
|
|
|
public int getLocalResourceSize() {
|
|
// the number of hits in the local peer (index size, size of the collection in the own index)
|
|
return this.local_resourceSize;
|
|
}
|
|
|
|
public Reference remove(final String urlHash) {
|
|
final SortStack<WordReferenceVars>.stackElement se = stack.remove(urlHash.hashCode());
|
|
if (se == null) return null;
|
|
urlhashes.remove(urlHash);
|
|
return se.element;
|
|
}
|
|
|
|
public Iterator<String> miss() {
|
|
return this.misses.iterator();
|
|
}
|
|
|
|
public Set<String> getReferences(final int count) {
|
|
// create a list of words that had been computed by statistics over all
|
|
// words that appeared in the url or the description of all urls
|
|
final Object[] refs = ref.getScores(count, false, 2, Integer.MAX_VALUE);
|
|
final TreeSet<String> s = new TreeSet<String>(String.CASE_INSENSITIVE_ORDER);
|
|
for (int i = 0; i < refs.length; i++) {
|
|
s.add((String) refs[i]);
|
|
}
|
|
return s;
|
|
}
|
|
|
|
public void addReferences(final String[] words) {
|
|
String word;
|
|
for (int i = 0; i < words.length; i++) {
|
|
word = words[i].toLowerCase();
|
|
if ((word.length() > 2) &&
|
|
("http_html_php_ftp_www_com_org_net_gov_edu_index_home_page_for_usage_the_and_".indexOf(word) < 0) &&
|
|
(!(query.queryHashes.contains(Word.word2hash(word)))))
|
|
ref.incScore(word);
|
|
}
|
|
}
|
|
|
|
protected void addReferences(final plasmaSearchEvent.ResultEntry resultEntry) {
|
|
// take out relevant information for reference computation
|
|
if ((resultEntry.url() == null) || (resultEntry.title() == null)) return;
|
|
final String[] urlcomps = htmlFilterContentScraper.urlComps(resultEntry.url().toNormalform(true, true)); // word components of the url
|
|
final String[] descrcomps = resultEntry.title().toLowerCase().split(htmlFilterContentScraper.splitrex); // words in the description
|
|
|
|
// add references
|
|
addReferences(urlcomps);
|
|
addReferences(descrcomps);
|
|
}
|
|
|
|
public ReferenceOrder getOrder() {
|
|
return this.order;
|
|
}
|
|
|
|
public static void loadYBR(final File rankingPath, final int count) {
|
|
// load ranking tables
|
|
if (rankingPath.exists()) {
|
|
ybrTables = new BinSearch[count];
|
|
String ybrName;
|
|
File f;
|
|
try {
|
|
for (int i = 0; i < count; i++) {
|
|
ybrName = "YBR-4-" + Digest.encodeHex(i, 2) + ".idx";
|
|
f = new File(rankingPath, ybrName);
|
|
if (f.exists()) {
|
|
ybrTables[i] = new BinSearch(FileUtils.read(f), 6);
|
|
} else {
|
|
ybrTables[i] = null;
|
|
}
|
|
}
|
|
} catch (final IOException e) {
|
|
ybrTables = null;
|
|
}
|
|
} else {
|
|
ybrTables = null;
|
|
}
|
|
}
|
|
|
|
public static boolean canUseYBR() {
|
|
return ybrTables != null;
|
|
}
|
|
|
|
public static boolean isUsingYBR() {
|
|
return useYBR;
|
|
}
|
|
|
|
public static void switchYBR(final boolean usage) {
|
|
useYBR = usage;
|
|
}
|
|
|
|
public static int ybr(final String urlHash) {
|
|
// returns the YBR value in a range of 0..15, where 0 means best ranking and 15 means worst ranking
|
|
if (ybrTables == null) return 15;
|
|
if (!(useYBR)) return 15;
|
|
final String domHash = urlHash.substring(6);
|
|
final int m = Math.min(maxYBR, ybrTables.length);
|
|
for (int i = 0; i < m; i++) {
|
|
if ((ybrTables[i] != null) && (ybrTables[i].contains(domHash.getBytes()))) {
|
|
//System.out.println("YBR FOUND: " + urlHash + " (" + i + ")");
|
|
return i;
|
|
}
|
|
}
|
|
//System.out.println("NOT FOUND: " + urlHash);
|
|
return 15;
|
|
}
|
|
|
|
public long postRanking(
|
|
final Set<String> topwords,
|
|
final plasmaSearchEvent.ResultEntry rentry,
|
|
final int position) {
|
|
|
|
long r = (255 - position) << 8;
|
|
|
|
// for media search: prefer pages with many links
|
|
if (query.contentdom == plasmaSearchQuery.CONTENTDOM_IMAGE) r += rentry.limage() << query.ranking.coeff_cathasimage;
|
|
if (query.contentdom == plasmaSearchQuery.CONTENTDOM_AUDIO) r += rentry.laudio() << query.ranking.coeff_cathasaudio;
|
|
if (query.contentdom == plasmaSearchQuery.CONTENTDOM_VIDEO) r += rentry.lvideo() << query.ranking.coeff_cathasvideo;
|
|
if (query.contentdom == plasmaSearchQuery.CONTENTDOM_APP ) r += rentry.lapp() << query.ranking.coeff_cathasapp;
|
|
|
|
// prefer hit with 'prefer' pattern
|
|
if (rentry.url().toNormalform(true, true).matches(query.prefer)) r += 256 << query.ranking.coeff_prefer;
|
|
if (rentry.title().matches(query.prefer)) r += 256 << query.ranking.coeff_prefer;
|
|
|
|
// apply 'common-sense' heuristic using references
|
|
final String urlstring = rentry.url().toNormalform(true, true);
|
|
final String[] urlcomps = htmlFilterContentScraper.urlComps(urlstring);
|
|
final String[] descrcomps = rentry.title().toLowerCase().split(htmlFilterContentScraper.splitrex);
|
|
for (int j = 0; j < urlcomps.length; j++) {
|
|
if (topwords.contains(urlcomps[j])) r += Math.max(1, 256 - urlstring.length()) << query.ranking.coeff_urlcompintoplist;
|
|
}
|
|
for (int j = 0; j < descrcomps.length; j++) {
|
|
if (topwords.contains(descrcomps[j])) r += Math.max(1, 256 - rentry.title().length()) << query.ranking.coeff_descrcompintoplist;
|
|
}
|
|
|
|
// apply query-in-result matching
|
|
final Set<byte[]> urlcomph = Word.words2hashSet(urlcomps);
|
|
final Set<byte[]> descrcomph = Word.words2hashSet(descrcomps);
|
|
final Iterator<byte[]> shi = query.queryHashes.iterator();
|
|
byte[] queryhash;
|
|
while (shi.hasNext()) {
|
|
queryhash = shi.next();
|
|
if (urlcomph.contains(queryhash)) r += 256 << query.ranking.coeff_appurl;
|
|
if (descrcomph.contains(queryhash)) r += 256 << query.ranking.coeff_app_dc_title;
|
|
}
|
|
|
|
return r;
|
|
}
|
|
}
|