yacy_search_server/source/net/yacy/search/ranking/ReferenceOrder.java
Michael Peter Christen e5b3c172ff removed the hack which translated Solr documents to virtual RWI entries
that were then mixed with remote RWIs. Now these Solr documents are
fed into the result set as they appear during local and remote
search. That makes the search much faster.
2012-10-17 17:45:41 +02:00


// ReferenceOrder.java
// (C) 2007 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany
// first published 07.11.2007 on http://yacy.net
//
// This is a part of YaCy, a peer-to-peer based web search engine
//
// $LastChangedDate$
// $LastChangedRevision$
// $LastChangedBy$
//
// LICENSE
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
package net.yacy.search.ranking;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.LinkedBlockingQueue;
import java.util.concurrent.Semaphore;
import net.yacy.cora.sorting.ConcurrentScoreMap;
import net.yacy.document.Condenser;
import net.yacy.document.LargeNumberCache;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.data.meta.URIMetadataNode;
import net.yacy.kelondro.data.word.WordReference;
import net.yacy.kelondro.data.word.WordReferenceRow;
import net.yacy.kelondro.data.word.WordReferenceVars;
import net.yacy.kelondro.logging.Log;
import net.yacy.kelondro.rwi.ReferenceContainer;
import net.yacy.kelondro.util.Bitfield;
import net.yacy.kelondro.util.ByteBuffer;
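/**
 * ReferenceOrder computes ranking values for search index entries. During a search,
 * normalizeWith() streams the word references of a ReferenceContainer through a set of
 * worker threads that collect the minimum and maximum of each ranking attribute plus a
 * per-host reference count (the "authority" heuristic). The cardinal() methods then map
 * each attribute of an entry into the range 0..255 relative to these bounds, shift it by
 * the coefficient from the configured RankingProfile and sum the results; higher values
 * mean better rankings.
 *
 * A minimal usage sketch; the names profile, container and the language bytes are
 * assumptions supplied by the caller, not part of this class:
 * <pre>{@code
 * ReferenceOrder order = new ReferenceOrder(profile, "en".getBytes());
 * BlockingQueue<WordReferenceVars> queue = order.normalizeWith(container, 1000);
 * WordReferenceVars entry;
 * while ((entry = queue.take()) != WordReferenceVars.poison) { // InterruptedException handling omitted in this sketch
 *     long ranking = order.cardinal(entry);
 *     // collect (entry, ranking) pairs in a result sorter of your choice
 * }
 * }</pre>
 */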
public class ReferenceOrder {
private static int cores = Runtime.getRuntime().availableProcessors();
private int maxdomcount;
private WordReferenceVars min, max;
private final ConcurrentScoreMap<String> doms; // collected for "authority" heuristic
private final RankingProfile ranking;
private final byte[] language;
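    /**
     * @param profile the ranking profile that supplies the weighting coefficients
     * @param language the preferred language as language-code bytes; entries in a matching language get a bonus in cardinal()
     */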
public ReferenceOrder(final RankingProfile profile, final byte[] language) {
this.min = null;
this.max = null;
this.ranking = profile;
this.doms = new ConcurrentScoreMap<String>();
this.maxdomcount = 0;
this.language = language;
}
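    /**
     * Decode and normalize all references of a container concurrently. A distributor thread
     * hands the decoded entries to several NormalizeWorker threads which update the min/max
     * accumulators and the per-host counts of this ReferenceOrder instance. The returned queue
     * is filled while the workers are still running and is terminated with WordReferenceVars.poison.
     * @param container the reference container to be normalized
     * @param maxtime the maximum processing time in milliseconds before decoding and normalization are cut off
     * @return a queue of normalized entries, terminated by WordReferenceVars.poison
     */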
public BlockingQueue<WordReferenceVars> normalizeWith(final ReferenceContainer<WordReference> container, long maxtime) {
final LinkedBlockingQueue<WordReferenceVars> out = new LinkedBlockingQueue<WordReferenceVars>();
int threads = cores;
if (container.size() < 100) threads = 2;
final Thread distributor = new NormalizeDistributor(container, out, threads, maxtime);
distributor.start();
// return the resulting queue while the processing queues are still working
return out;
}
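    /**
     * The NormalizeDistributor decodes the reference container into WordReferenceVars entries
     * and distributes them round-robin to a pool of NormalizeWorker threads.
     */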
private final class NormalizeDistributor extends Thread {
ReferenceContainer<WordReference> container;
LinkedBlockingQueue<WordReferenceVars> out;
private final int threads;
private final long maxtime;
public NormalizeDistributor(final ReferenceContainer<WordReference> container, final LinkedBlockingQueue<WordReferenceVars> out, final int threads, final long maxtime) {
this.container = container;
this.out = out;
this.threads = threads;
this.maxtime = maxtime;
}
@Override
public void run() {
// transform the reference container into a stream of parsed entries
final BlockingQueue<WordReferenceVars> vars = WordReferenceVars.transform(this.container, this.maxtime);
// start the transformation threads
final Semaphore termination = new Semaphore(this.threads);
final NormalizeWorker[] worker = new NormalizeWorker[this.threads];
for (int i = 0; i < this.threads; i++) {
worker[i] = new NormalizeWorker(this.out, termination, this.maxtime);
worker[i].start();
}
// fill the queue
WordReferenceVars iEntry;
int p = 0;
long timeout = System.currentTimeMillis() + this.maxtime;
try {
while ((iEntry = vars.take()) != WordReferenceVars.poison) {
worker[p % this.threads].add(iEntry);
p++;
                    if (System.currentTimeMillis() > timeout) {
                        Log.logWarning("NormalizeDistributor", "adding of decoded rows to workers ended with timeout = " + this.maxtime);
                        break;
                    }
}
} catch (final InterruptedException e) {
}
// insert poison to stop the queues
for (int i = 0; i < this.threads; i++) worker[i].add(WordReferenceVars.poison);
            // wait for termination, but not for too long, so that a caller joining this
            // thread from outside can pick up some normalization results before going on
for (int i = 0; i < this.threads; i++) try {worker[i].join(100);} catch (final InterruptedException e) {}
}
}
    /**
     * normalize ranking: find the minimum and maximum of the separate ranking criteria
     * over all entries handed to this worker and count the references per host for the
     * "authority" heuristic. Every entry is forwarded to the output queue after the
     * min/max accumulators have been updated.
     */
private class NormalizeWorker extends Thread {
private final BlockingQueue<WordReferenceVars> out;
private final Semaphore termination;
private final BlockingQueue<WordReferenceVars> decodedEntries;
private final long maxtime;
public NormalizeWorker(final BlockingQueue<WordReferenceVars> out, final Semaphore termination, long maxtime) {
this.out = out;
this.termination = termination;
this.decodedEntries = new LinkedBlockingQueue<WordReferenceVars>();
this.maxtime = maxtime;
}
public void add(final WordReferenceVars entry) {
try {
this.decodedEntries.put(entry);
} catch (final InterruptedException e) {
}
}
@Override
public void run() {
try {
WordReferenceVars iEntry;
final Map<String, Integer> doms0 = new HashMap<String, Integer>();
String dom;
Integer count;
final Integer int1 = 1;
long timeout = System.currentTimeMillis() + this.maxtime;
while ((iEntry = this.decodedEntries.take()) != WordReferenceVars.poison) {
// find min/max
if (ReferenceOrder.this.min == null) ReferenceOrder.this.min = iEntry.clone(); else ReferenceOrder.this.min.min(iEntry);
if (ReferenceOrder.this.max == null) ReferenceOrder.this.max = iEntry.clone(); else ReferenceOrder.this.max.max(iEntry);
                    this.out.put(iEntry); // must happen after the min/max update to prevent min/max from being null in cardinal()
// update domcount
dom = iEntry.hosthash();
count = doms0.get(dom);
if (count == null) {
doms0.put(dom, int1);
} else {
doms0.put(dom, LargeNumberCache.valueOf(count.intValue() + 1));
}
if (System.currentTimeMillis() > timeout) {
Log.logWarning("NormalizeWorker", "normlization of decoded rows ended with timeout = " + this.maxtime);
break;
}
}
// update domain score
Map.Entry<String, Integer> entry;
final Iterator<Map.Entry<String, Integer>> di = doms0.entrySet().iterator();
while (di.hasNext()) {
entry = di.next();
ReferenceOrder.this.doms.inc(entry.getKey(), (entry.getValue()).intValue());
}
if (!ReferenceOrder.this.doms.isEmpty()) ReferenceOrder.this.maxdomcount = ReferenceOrder.this.doms.getMaxScore();
} catch (final InterruptedException e) {
Log.logException(e);
} catch (final Exception e) {
Log.logException(e);
} finally {
                // the last worker to finish posts the poison pill to signal termination to the consumer of the output queue
try {
this.termination.acquire();
if (this.termination.availablePermits() == 0) this.out.put(WordReferenceVars.poison);
} catch (final InterruptedException e) {}
}
}
}
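    /**
     * compute an "authority" score for a host: the number of references collected for this
     * host, scaled to 0..255 relative to the highest per-host count seen so far.
     * Example with hypothetical numbers: a host counted 3 times while the busiest host was
     * counted 10 times yields (3 << 8) / (1 + 10) = 69.
     * @param hostHash the 6-character host hash of the domain
     * @return an authority value in the range 0..255
     */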
public int authority(final String hostHash) {
assert hostHash.length() == 6;
return (this.doms.get(hostHash) << 8) / (1 + this.maxdomcount);
}
    /**
     * return the ranking of a given word entry
     * @param t the word reference to be ranked
     * @return a ranking value: the higher the number, the better the ranking
     */
public long cardinal(final WordReference t) {
//return Long.MAX_VALUE - preRanking(ranking, iEntry, this.entryMin, this.entryMax, this.searchWords);
// the normalizedEntry must be a normalized indexEntry
        assert t != null;
        assert this.min != null;
        assert this.max != null;
        assert this.ranking != null;
        final Bitfield flags = t.flags();
final long tf = ((this.max.termFrequency() == this.min.termFrequency()) ? 0 : (((int)(((t.termFrequency()-this.min.termFrequency())*256.0)/(this.max.termFrequency() - this.min.termFrequency())))) << this.ranking.coeff_termfrequency);
//System.out.println("tf(" + t.urlHash + ") = " + Math.floor(1000 * t.termFrequency()) + ", min = " + Math.floor(1000 * min.termFrequency()) + ", max = " + Math.floor(1000 * max.termFrequency()) + ", tf-normed = " + tf);
final int maxmaxpos = this.max.maxposition();
final int minminpos = this.min.minposition();
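        // each attribute is mapped into 0..255 relative to the min/max collected during
        // normalization and then shifted left by its coefficient from the ranking profile.
        // Worked example with hypothetical numbers for the urllength term: urllength 40 with
        // min 10 and max 70 gives ((40 - 10) << 8) / (70 - 10) = 128, so the term becomes
        // (256 - 128) << coeff_urllength (shorter URLs rank higher).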
final long r =
((256 - DigestURI.domLengthNormalized(t.urlhash())) << this.ranking.coeff_domlength)
+ ((this.ranking.coeff_ybr > 12) ? ((256 - (BlockRank.ranking(t.urlhash()) << 4)) << this.ranking.coeff_ybr) : 0)
+ ((this.max.urlcomps() == this.min.urlcomps() ) ? 0 : (256 - (((t.urlcomps() - this.min.urlcomps() ) << 8) / (this.max.urlcomps() - this.min.urlcomps()) )) << this.ranking.coeff_urlcomps)
+ ((this.max.urllength() == this.min.urllength() ) ? 0 : (256 - (((t.urllength() - this.min.urllength() ) << 8) / (this.max.urllength() - this.min.urllength()) )) << this.ranking.coeff_urllength)
+ ((maxmaxpos == minminpos) ? 0 : (256 - (((t.minposition() - minminpos) << 8) / (maxmaxpos - minminpos))) << this.ranking.coeff_posintext)
+ ((this.max.posofphrase() == this.min.posofphrase()) ? 0 : (256 - (((t.posofphrase() - this.min.posofphrase() ) << 8) / (this.max.posofphrase() - this.min.posofphrase()) )) << this.ranking.coeff_posofphrase)
+ ((this.max.posinphrase() == this.min.posinphrase()) ? 0 : (256 - (((t.posinphrase() - this.min.posinphrase() ) << 8) / (this.max.posinphrase() - this.min.posinphrase()) )) << this.ranking.coeff_posinphrase)
+ ((this.max.distance() == this.min.distance() ) ? 0 : (256 - (((t.distance() - this.min.distance() ) << 8) / (this.max.distance() - this.min.distance()) )) << this.ranking.coeff_worddistance)
+ ((this.max.virtualAge() == this.min.virtualAge()) ? 0 : (((t.virtualAge() - this.min.virtualAge() ) << 8) / (this.max.virtualAge() - this.min.virtualAge()) ) << this.ranking.coeff_date)
+ ((this.max.wordsintitle() == this.min.wordsintitle()) ? 0 : (((t.wordsintitle() - this.min.wordsintitle() ) << 8) / (this.max.wordsintitle() - this.min.wordsintitle()) ) << this.ranking.coeff_wordsintitle)
+ ((this.max.wordsintext() == this.min.wordsintext()) ? 0 : (((t.wordsintext() - this.min.wordsintext() ) << 8) / (this.max.wordsintext() - this.min.wordsintext()) ) << this.ranking.coeff_wordsintext)
+ ((this.max.phrasesintext() == this.min.phrasesintext()) ? 0 : (((t.phrasesintext()- this.min.phrasesintext() ) << 8) / (this.max.phrasesintext()- this.min.phrasesintext()) ) << this.ranking.coeff_phrasesintext)
+ ((this.max.llocal() == this.min.llocal()) ? 0 : (((t.llocal() - this.min.llocal() ) << 8) / (this.max.llocal() - this.min.llocal()) ) << this.ranking.coeff_llocal)
+ ((this.max.lother() == this.min.lother()) ? 0 : (((t.lother() - this.min.lother() ) << 8) / (this.max.lother() - this.min.lother()) ) << this.ranking.coeff_lother)
+ ((this.max.hitcount() == this.min.hitcount()) ? 0 : (((t.hitcount() - this.min.hitcount() ) << 8) / (this.max.hitcount() - this.min.hitcount()) ) << this.ranking.coeff_hitcount)
+ tf
+ ((this.ranking.coeff_authority > 12) ? (authority(t.hosthash()) << this.ranking.coeff_authority) : 0)
+ ((flags.get(WordReferenceRow.flag_app_dc_identifier)) ? 255 << this.ranking.coeff_appurl : 0)
+ ((flags.get(WordReferenceRow.flag_app_dc_title)) ? 255 << this.ranking.coeff_app_dc_title : 0)
+ ((flags.get(WordReferenceRow.flag_app_dc_creator)) ? 255 << this.ranking.coeff_app_dc_creator : 0)
+ ((flags.get(WordReferenceRow.flag_app_dc_subject)) ? 255 << this.ranking.coeff_app_dc_subject : 0)
+ ((flags.get(WordReferenceRow.flag_app_dc_description)) ? 255 << this.ranking.coeff_app_dc_description : 0)
+ ((flags.get(WordReferenceRow.flag_app_emphasized)) ? 255 << this.ranking.coeff_appemph : 0)
+ ((flags.get(Condenser.flag_cat_indexof)) ? 255 << this.ranking.coeff_catindexof : 0)
+ ((flags.get(Condenser.flag_cat_hasimage)) ? 255 << this.ranking.coeff_cathasimage : 0)
+ ((flags.get(Condenser.flag_cat_hasaudio)) ? 255 << this.ranking.coeff_cathasaudio : 0)
+ ((flags.get(Condenser.flag_cat_hasvideo)) ? 255 << this.ranking.coeff_cathasvideo : 0)
+ ((flags.get(Condenser.flag_cat_hasapp)) ? 255 << this.ranking.coeff_cathasapp : 0)
+ ((ByteBuffer.equals(t.getLanguage(), this.language)) ? 255 << this.ranking.coeff_language : 0)
+ ((DigestURI.probablyRootURL(t.urlhash())) ? 15 << this.ranking.coeff_urllength : 0);
//if (searchWords != null) r += (yacyURL.probablyWordURL(t.urlHash(), searchWords) != null) ? 256 << ranking.coeff_appurl : 0;
return r; // the higher the number the better the ranking.
}
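    /**
     * return the ranking of a metadata entry, e.g. a document coming from the Solr index.
     * Unlike cardinal(WordReference), this variant has no per-word min/max statistics
     * available, so the document attributes are weighted directly with the ranking
     * coefficients.
     * @param t the metadata node to be ranked
     * @return a ranking value: the higher the number, the better the ranking
     */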
public long cardinal(final URIMetadataNode t) {
//return Long.MAX_VALUE - preRanking(ranking, iEntry, this.entryMin, this.entryMax, this.searchWords);
// the normalizedEntry must be a normalized indexEntry
        assert t != null;
        assert this.ranking != null;
        final Bitfield flags = t.flags();
final long r =
((256 - DigestURI.domLengthNormalized(t.hash())) << this.ranking.coeff_domlength)
+ ((this.ranking.coeff_ybr > 12) ? ((256 - (BlockRank.ranking(t.hash()) << 4)) << this.ranking.coeff_ybr) : 0)
+ ((256 - (t.urllength() << 8)) << this.ranking.coeff_urllength)
+ (t.virtualAge() << this.ranking.coeff_date)
+ (t.wordsintitle()<< this.ranking.coeff_wordsintitle)
+ (t.wordCount() << this.ranking.coeff_wordsintext)
+ (t.llocal() << this.ranking.coeff_llocal)
+ (t.lother() << this.ranking.coeff_lother)
+ ((this.ranking.coeff_authority > 12) ? (authority(t.hosthash()) << this.ranking.coeff_authority) : 0)
+ ((flags.get(WordReferenceRow.flag_app_dc_identifier)) ? 255 << this.ranking.coeff_appurl : 0)
+ ((flags.get(WordReferenceRow.flag_app_dc_title)) ? 255 << this.ranking.coeff_app_dc_title : 0)
+ ((flags.get(WordReferenceRow.flag_app_dc_creator)) ? 255 << this.ranking.coeff_app_dc_creator : 0)
+ ((flags.get(WordReferenceRow.flag_app_dc_subject)) ? 255 << this.ranking.coeff_app_dc_subject : 0)
+ ((flags.get(WordReferenceRow.flag_app_dc_description)) ? 255 << this.ranking.coeff_app_dc_description : 0)
+ ((flags.get(WordReferenceRow.flag_app_emphasized)) ? 255 << this.ranking.coeff_appemph : 0)
+ ((flags.get(Condenser.flag_cat_indexof)) ? 255 << this.ranking.coeff_catindexof : 0)
+ ((flags.get(Condenser.flag_cat_hasimage)) ? 255 << this.ranking.coeff_cathasimage : 0)
+ ((flags.get(Condenser.flag_cat_hasaudio)) ? 255 << this.ranking.coeff_cathasaudio : 0)
+ ((flags.get(Condenser.flag_cat_hasvideo)) ? 255 << this.ranking.coeff_cathasvideo : 0)
+ ((flags.get(Condenser.flag_cat_hasapp)) ? 255 << this.ranking.coeff_cathasapp : 0)
+ ((ByteBuffer.equals(t.language(), this.language)) ? 255 << this.ranking.coeff_language : 0)
+ ((DigestURI.probablyRootURL(t.hash())) ? 15 << this.ranking.coeff_urllength : 0);
return r; // the higher the number the better the ranking.
}
}