removed hack which translated Solr documents to virtual RWI entries

which had then been mixed with remote RWIs. Now these Solr documents are
fed into the result set as they appear during local and remote
search. That makes the search much faster.
This commit is contained in:
Michael Peter Christen 2012-10-17 17:45:41 +02:00
parent 6017691522
commit e5b3c172ff
12 changed files with 322 additions and 223 deletions

View File

@ -34,7 +34,6 @@ import net.yacy.cora.document.RSSMessage;
import net.yacy.cora.protocol.RequestHeader;
import net.yacy.crawler.data.ResultURLs;
import net.yacy.crawler.data.ResultURLs.EventOrigin;
import net.yacy.kelondro.data.meta.URIMetadata;
import net.yacy.kelondro.data.meta.URIMetadataRow;
import net.yacy.kelondro.logging.Log;
import net.yacy.peers.EventChannel;
@ -87,7 +86,7 @@ public final class transferURL {
int doublecheck = 0;
// read the urls from the other properties and store
String urls;
URIMetadata lEntry;
URIMetadataRow lEntry;
for (int i = 0; i < urlc; i++) {
serverCore.checkInterruption();

View File

@ -183,6 +183,7 @@ public abstract class SolrServerConnector extends AbstractSolrConnector implemen
@Override
public void add(final SolrInputDocument solrdoc) throws IOException, SolrException {
if (this.server == null) return;
try {
synchronized (this.server) {
this.server.add(solrdoc, this.commitWithinMs);

View File

@ -97,17 +97,17 @@ public final class ResultURLs {
}
public static void stack(
final URIMetadata e,
final URIMetadata urlEntry,
final byte[] initiatorHash,
final byte[] executorHash,
final EventOrigin stackType) {
// assert initiatorHash != null; // null == proxy !
assert executorHash != null;
if (e == null) { return; }
if (urlEntry == null) { return; }
try {
final Map<String, InitExecEntry> resultStack = getStack(stackType);
if (resultStack != null) {
resultStack.put(ASCII.String(e.hash()), new InitExecEntry(initiatorHash, executorHash));
resultStack.put(ASCII.String(urlEntry.hash()), new InitExecEntry(initiatorHash, executorHash));
}
} catch (final Exception ex) {
System.out.println("INTERNAL ERROR in newEntry/2: " + ex.toString());
@ -116,7 +116,7 @@ public final class ResultURLs {
try {
final ScoreMap<String> domains = getDomains(stackType);
if (domains != null) {
domains.inc(e.url().getHost());
domains.inc(urlEntry.url().getHost());
}
} catch (final Exception ex) {
System.out.println("INTERNAL ERROR in newEntry/3: " + ex.toString());

View File

@ -28,6 +28,7 @@ import java.util.Date;
import java.util.regex.Pattern;
import net.yacy.cora.date.GenericFormatter;
import net.yacy.cora.date.MicroDate;
import net.yacy.cora.document.ASCII;
import net.yacy.cora.document.UTF8;
import net.yacy.cora.federate.solr.SolrType;
@ -123,7 +124,7 @@ public class URIMetadataNode implements URIMetadata {
}
@SuppressWarnings("unchecked")
private ArrayList<String> getArrayList(YaCySchema field) {
private ArrayList<String> getStringList(YaCySchema field) {
assert field.isMultiValued();
assert field.getType() == SolrType.string || field.getType() == SolrType.text_general;
Object r = this.doc.getFieldValue(field.name());
@ -136,6 +137,20 @@ public class URIMetadataNode implements URIMetadata {
return a;
}
@SuppressWarnings("unchecked")
private ArrayList<Integer> getIntList(YaCySchema field) {
assert field.isMultiValued();
assert field.getType() == SolrType.integer;
Object r = this.doc.getFieldValue(field.name());
if (r == null) return new ArrayList<Integer>(0);
if (r instanceof ArrayList) {
return (ArrayList<Integer>) r;
}
ArrayList<Integer> a = new ArrayList<Integer>(1);
a.add((Integer) r);
return a;
}
@Override
public byte[] hash() {
return this.hash;
@ -165,7 +180,7 @@ public class URIMetadataNode implements URIMetadata {
@Override
public String dc_title() {
ArrayList<String> a = getArrayList(YaCySchema.title);
ArrayList<String> a = getStringList(YaCySchema.title);
if (a == null || a.size() == 0) return "";
return a.get(0);
}
@ -233,7 +248,7 @@ public class URIMetadataNode implements URIMetadata {
@Override
public char doctype() {
ArrayList<String> a = getArrayList(YaCySchema.content_type);
ArrayList<String> a = getStringList(YaCySchema.content_type);
if (a == null || a.size() == 0) return Response.docType(url());
return Response.docType(a.get(0));
}
@ -248,7 +263,7 @@ public class URIMetadataNode implements URIMetadata {
@Override
public byte[] referrerHash() {
ArrayList<String> referrer = getArrayList(YaCySchema.referrer_id_txt);
ArrayList<String> referrer = getStringList(YaCySchema.referrer_id_txt);
if (referrer == null || referrer.size() == 0) return null;
return ASCII.getBytes(referrer.get(0));
}
@ -319,6 +334,20 @@ public class URIMetadataNode implements URIMetadata {
return this.appc;
}
public int virtualAge() {
return MicroDate.microDateDays(moddate());
}
public int wordsintitle() {
ArrayList<Integer> x = getIntList(YaCySchema.title_words_val);
if (x == null || x.size() == 0) return 0;
return x.get(0).intValue();
}
public int urllength() {
return getInt(YaCySchema.url_chars_i);
}
@Override
public String snippet() {
return this.snippet;
@ -326,7 +355,7 @@ public class URIMetadataNode implements URIMetadata {
@Override
public String[] collections() {
ArrayList<String> a = getArrayList(YaCySchema.collection_sxt);
ArrayList<String> a = getStringList(YaCySchema.collection_sxt);
return a.toArray(new String[a.size()]);
}

View File

@ -76,7 +76,7 @@ public final class WordReferenceRow extends AbstractReference implements WordRef
/**
* object for termination of concurrent blocking queue processing
*/
public static final Row.Entry poisonRowEntry = urlEntryRow.newEntry();
protected static final Row.Entry poisonRowEntry = urlEntryRow.newEntry();
// static properties
private static final int col_urlhash = 0; // h 12 the url hash b64-encoded
@ -114,7 +114,7 @@ public final class WordReferenceRow extends AbstractReference implements WordRef
private final Row.Entry entry;
public WordReferenceRow(
protected WordReferenceRow(
final byte[] urlHash,
final int urlLength, // byte-length of complete URL
final int urlComps, // number of path components
@ -206,13 +206,11 @@ public final class WordReferenceRow extends AbstractReference implements WordRef
this.entry = urlEntryRow.newEntry(external, true);
}
public WordReferenceRow(final byte[] row) {
private WordReferenceRow(final byte[] row) {
this.entry = urlEntryRow.newEntry(row);
}
public WordReferenceRow(final Row.Entry rentry) {
protected WordReferenceRow(final Row.Entry rentry) {
// no cloning is necessary since there is no further manipulation after this initial instantiation
this.entry = rentry;
}
@ -249,10 +247,6 @@ public final class WordReferenceRow extends AbstractReference implements WordRef
return MicroDate.reverseMicroDateDays((int) this.entry.getColLong(col_lastModified));
}
public long freshUntil() {
return MicroDate.reverseMicroDateDays((int) this.entry.getColLong(col_freshUntil));
}
@Override
public int hitcount() {
return (0xff & this.entry.getColByte(col_hitcount));
@ -263,11 +257,6 @@ public final class WordReferenceRow extends AbstractReference implements WordRef
return new ArrayList<Integer>(0);
}
public int position(final int p) {
assert p == 0 : "p = " + p;
return (int) this.entry.getColLong(col_posintext);
}
@Override
public int posinphrase() {
return (0xff & this.entry.getColByte(col_posinphrase));

View File

@ -78,7 +78,6 @@ import net.yacy.cora.order.Digest;
import net.yacy.cora.protocol.ClientIdentification;
import net.yacy.cora.protocol.Domains;
import net.yacy.cora.protocol.http.HTTPClient;
import net.yacy.cora.storage.HandleSet;
import net.yacy.cora.util.SpaceExceededException;
import net.yacy.crawler.data.ResultURLs;
import net.yacy.crawler.data.ResultURLs.EventOrigin;
@ -88,7 +87,6 @@ import net.yacy.kelondro.data.meta.URIMetadataRow;
import net.yacy.kelondro.data.word.Word;
import net.yacy.kelondro.data.word.WordReference;
import net.yacy.kelondro.data.word.WordReferenceFactory;
import net.yacy.kelondro.data.word.WordReferenceVars;
import net.yacy.kelondro.logging.Log;
import net.yacy.kelondro.rwi.Reference;
import net.yacy.kelondro.rwi.ReferenceContainer;
@ -767,7 +765,7 @@ public final class Protocol
// insert results to containers
int term = count;
for ( final URIMetadata urlEntry : result.links ) {
for ( final URIMetadataRow urlEntry : result.links ) {
if ( term-- <= 0 ) {
break; // do not process more that requested (in case that evil peers fill us up with rubbish)
}
@ -883,7 +881,7 @@ public final class Protocol
public Map<byte[], Integer> indexcount; //
public long searchtime; // time that the peer actually spent to create the result
public String[] references; // search hints, the top-words
public List<URIMetadata> links; // LURLs of search
public List<URIMetadataRow> links; // LURLs of search
public Map<byte[], String> indexabstract; // index abstracts, a collection of url-hashes per word
public SearchResult(
@ -1003,14 +1001,14 @@ public final class Protocol
}
}
this.references = resultMap.get("references").split(",");
this.links = new ArrayList<URIMetadata>(this.urlcount);
this.links = new ArrayList<URIMetadataRow>(this.urlcount);
for ( int n = 0; n < this.urlcount; n++ ) {
// get one single search result
final String resultLine = resultMap.get("resource" + n);
if ( resultLine == null ) {
continue;
}
final URIMetadata urlEntry = URIMetadataRow.importEntry(resultLine);
final URIMetadataRow urlEntry = URIMetadataRow.importEntry(resultLine);
if ( urlEntry == null ) {
continue;
}
@ -1027,8 +1025,6 @@ public final class Protocol
final Seed target,
final Blacklist blacklist) {
final HandleSet wordhashes = event.getQuery().query_include_hashes;
if (event.getQuery().queryString == null || event.getQuery().queryString.length() == 0) {
return -1; // we cannot query solr only with word hashes, there is no clear text string
}
@ -1064,14 +1060,9 @@ public final class Protocol
}
// evaluate result
List<URIMetadataNode> container = new ArrayList<URIMetadataNode>();
if (docList.size() > 0) {// create containers
Network.log.logInfo("SEARCH (solr), returned " + docList.size() + " documents from " + (target == null ? "shard" : ("peer " + target.hash + ":" + target.getName()))) ;
final List<ReferenceContainer<WordReference>> container = new ArrayList<ReferenceContainer<WordReference>>(wordhashes.size());
for (byte[] hash: wordhashes) {
try {
container.add(ReferenceContainer.emptyContainer(Segment.wordReferenceFactory, hash, count));
} catch (SpaceExceededException e) {} // throws SpaceExceededException
}
int term = count;
for (final SolrDocument doc: docList) {
@ -1122,27 +1113,17 @@ public final class Protocol
}
}
// we create virtual word references here which are necessary to feed search results into retrieval process
Reference entry = new WordReferenceVars(urlEntry);
// add the url entry to the word indexes
for ( final ReferenceContainer<WordReference> c : container ) {
try {
c.add(entry);
} catch ( final SpaceExceededException e ) {
Log.logException(e);
break;
}
}
container.add(urlEntry);
}
if (localsearch) {
event.rankingProcess.add(container.get(0), true, "localpeer", docList.size(), time);
event.rankingProcess.add(container, true, "localpeer", docList.size());
event.rankingProcess.addFinalize();
event.rankingProcess.addExpectedRemoteReferences(-count);
Network.log.logInfo("local search (solr): localpeer sent " + container.get(0).size() + "/" + docList.size() + " references");
} else {
event.rankingProcess.add(container.get(0), false, target.getName() + "/" + target.hash, docList.size(), time);
event.rankingProcess.add(container, false, target.getName() + "/" + target.hash, docList.size());
event.rankingProcess.addFinalize();
event.rankingProcess.addExpectedRemoteReferences(-count);
Network.log.logInfo("remote search (solr): peer " + target.getName() + " sent " + container.get(0).size() + "/" + docList.size() + " references");

View File

@ -148,6 +148,7 @@ import net.yacy.kelondro.blob.Tables;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.data.meta.URIMetadata;
import net.yacy.kelondro.data.meta.URIMetadataNode;
import net.yacy.kelondro.data.meta.URIMetadataRow;
import net.yacy.kelondro.data.word.Word;
import net.yacy.kelondro.logging.Log;
import net.yacy.kelondro.rwi.ReferenceContainer;
@ -2588,8 +2589,7 @@ public final class Switchboard extends serverSwitch
this.log.logInfo("Excluded " + condenser.excludeWords(stopwords) + " words in URL " + url);
// STORE WORD INDEX
URIMetadata newEntry = null;
newEntry =
URIMetadataRow newEntry =
this.index.storeDocument(
url,
referrerURL,

View File

@ -56,7 +56,6 @@ import net.yacy.document.Parser;
import net.yacy.kelondro.data.citation.CitationReference;
import net.yacy.kelondro.data.citation.CitationReferenceFactory;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.data.meta.URIMetadata;
import net.yacy.kelondro.data.meta.URIMetadataNode;
import net.yacy.kelondro.data.meta.URIMetadataRow;
import net.yacy.kelondro.data.word.Word;
@ -341,7 +340,7 @@ public class Segment {
if (this.termIndex != null) this.termIndex.add(termHash, entry);
}
public URIMetadata storeDocument(
public URIMetadataRow storeDocument(
final DigestURI url,
final DigestURI referrerURL,
Date modDate,
@ -368,7 +367,7 @@ public class Segment {
// STORE URL TO LOADED-URL-DB
if (modDate.getTime() > loadDate.getTime()) modDate = loadDate; // TODO: compare with modTime from responseHeader
char docType = Response.docType(document.dc_format());
final URIMetadata metadata = new URIMetadataRow(
final URIMetadataRow metadata = new URIMetadataRow(
url, // URL
dc_title, // document description
document.dc_creator(), // author

View File

@ -30,6 +30,7 @@ import java.util.Comparator;
import java.util.ConcurrentModificationException;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.SortedMap;
import java.util.concurrent.BlockingQueue;
@ -67,6 +68,7 @@ import net.yacy.kelondro.index.RowHandleSet;
import net.yacy.kelondro.logging.Log;
import net.yacy.kelondro.rwi.ReferenceContainer;
import net.yacy.kelondro.rwi.TermSearch;
import net.yacy.kelondro.util.Bitfield;
import net.yacy.peers.graphics.ProfilingGraph;
import net.yacy.repository.Blacklist.BlacklistType;
import net.yacy.repository.FilterEngine;
@ -97,9 +99,9 @@ public final class RWIProcess extends Thread
private int remote_indexCount;
private int remote_peerCount;
private int local_indexCount;
private final AtomicInteger maxExpectedRemoteReferences, expectedRemoteReferences,
receivedRemoteReferences;
private final AtomicInteger maxExpectedRemoteReferences, expectedRemoteReferences, receivedRemoteReferences;
private final WeakPriorityBlockingQueue<WordReferenceVars> stack;
private final WeakPriorityBlockingQueue<URIMetadataNode> nodeStack;
private final AtomicInteger feedersAlive, feedersTerminated;
private final ConcurrentHashMap<String, WeakPriorityBlockingQueue<WordReferenceVars>> doubleDomCache; // key = domhash (6 bytes); value = like stack
//private final HandleSet handover; // key = urlhash; used for double-check of urls that had been handed over to search process
@ -126,7 +128,9 @@ public final class RWIProcess extends Thread
// sortorder: 0 = hash, 1 = url, 2 = ranking
this.addRunning = true;
this.localSearchInclusion = null;
this.stack = new WeakPriorityBlockingQueue<WordReferenceVars>(query.snippetCacheStrategy == null || query.snippetCacheStrategy == CacheStrategy.CACHEONLY ? max_results_preparation_special : max_results_preparation, false);
int stackMaxsize = query.snippetCacheStrategy == null || query.snippetCacheStrategy == CacheStrategy.CACHEONLY ? max_results_preparation_special : max_results_preparation;
this.stack = new WeakPriorityBlockingQueue<WordReferenceVars>(stackMaxsize, false);
this.nodeStack = new WeakPriorityBlockingQueue<URIMetadataNode>(stackMaxsize, false);
this.doubleDomCache = new ConcurrentHashMap<String, WeakPriorityBlockingQueue<WordReferenceVars>>();
this.query = query;
this.order = order;
@ -240,23 +244,16 @@ public final class RWIProcess extends Thread
}
public void add(
final ReferenceContainer<WordReference> index,
final List<URIMetadataNode> index,
final boolean local,
final String resourceName,
final int fullResource,
final long maxtime) {
// we collect the urlhashes and construct a list with urlEntry objects
// attention: if minEntries is too high, this method will not terminate within the maxTime
//Log.logInfo("RWIProcess", "added a container, size = " + index.size());
final int fullResource) {
this.addRunning = true;
assert (index != null);
if ( index.isEmpty() ) {
return;
}
if (index.isEmpty()) return;
if ( !local ) {
if (!local) {
assert fullResource >= 0 : "fullResource = " + fullResource;
this.remote_resourceSize += fullResource;
this.remote_peerCount++;
@ -265,7 +262,6 @@ public final class RWIProcess extends Thread
long timer = System.currentTimeMillis();
// normalize entries
final BlockingQueue<WordReferenceVars> decodedEntries = this.order.normalizeWith(index, maxtime);
int is = index.size();
EventTracker.update(EventTracker.EClass.SEARCH, new ProfilingGraph.EventSearch(
this.query.id(true),
@ -279,77 +275,37 @@ public final class RWIProcess extends Thread
// iterate over normalized entries and select some that are better than currently stored
timer = System.currentTimeMillis();
final boolean nav_hosts =
this.query.navigators.equals("all") || this.query.navigators.indexOf("hosts", 0) >= 0;
final boolean nav_hosts = this.query.navigators.equals("all") || this.query.navigators.indexOf("hosts", 0) >= 0;
// apply all constraints
long timeout = System.currentTimeMillis() + maxtime;
try {
WordReferenceVars iEntry;
final String pattern = this.query.urlMask.pattern();
final boolean httpPattern = pattern.equals("http://.*");
final boolean noHttpButProtocolPattern =
pattern.equals("https://.*")
|| pattern.equals("ftp://.*")
|| pattern.equals("smb://.*")
|| pattern.equals("file://.*");
long remaining;
pollloop: while ( true ) {
remaining = timeout - System.currentTimeMillis();
if (remaining <= 0) {
Log.logWarning("RWIProcess", "terminated 'add' loop before poll time-out = " + remaining + ", decodedEntries.size = " + decodedEntries.size());
break;
}
iEntry = decodedEntries.poll(remaining, TimeUnit.MILLISECONDS);
if ( iEntry == null ) {
Log.logWarning("RWIProcess", "terminated 'add' loop after poll time-out = " + remaining + ", decodedEntries.size = " + decodedEntries.size());
break pollloop;
}
if ( iEntry == WordReferenceVars.poison ) {
break pollloop;
}
assert (iEntry.urlhash().length == index.row().primaryKeyLength);
//if (iEntry.urlHash().length() != index.row().primaryKeyLength) continue;
final boolean noHttpButProtocolPattern = pattern.equals("https://.*") || pattern.equals("ftp://.*") || pattern.equals("smb://.*") || pattern.equals("file://.*");
pollloop: for (URIMetadataNode iEntry: index) {
// doublecheck for urls
if (this.urlhashes.has(iEntry.urlhash())) {
if (this.urlhashes.has(iEntry.hash())) {
continue pollloop;
}
// increase flag counts
for ( int j = 0; j < 32; j++ ) {
if ( iEntry.flags().get(j) ) {
this.flagcount[j]++;
}
if (iEntry.flags().get(j)) this.flagcount[j]++;
}
// check constraints
if ( !testFlags(iEntry) ) {
continue pollloop;
}
Bitfield flags = iEntry.flags();
if (!testFlags(flags)) continue pollloop;
// check document domain
if ( this.query.contentdom.getCode() > 0 ) {
if ( (this.query.contentdom == ContentDomain.AUDIO)
&& (!(iEntry.flags().get(Condenser.flag_cat_hasaudio))) ) {
if (this.query.contentdom.getCode() > 0 &&
((this.query.contentdom == ContentDomain.AUDIO && !(flags.get(Condenser.flag_cat_hasaudio))) ||
(this.query.contentdom == ContentDomain.VIDEO && !(flags.get(Condenser.flag_cat_hasvideo))) ||
(this.query.contentdom == ContentDomain.IMAGE && !(flags.get(Condenser.flag_cat_hasimage))) ||
(this.query.contentdom == ContentDomain.APP && !(flags.get(Condenser.flag_cat_hasapp))))) {
continue pollloop;
}
if ( (this.query.contentdom == ContentDomain.VIDEO)
&& (!(iEntry.flags().get(Condenser.flag_cat_hasvideo))) ) {
continue pollloop;
}
if ( (this.query.contentdom == ContentDomain.IMAGE)
&& (!(iEntry.flags().get(Condenser.flag_cat_hasimage))) ) {
continue pollloop;
}
if ( (this.query.contentdom == ContentDomain.APP)
&& (!(iEntry.flags().get(Condenser.flag_cat_hasapp))) ) {
continue pollloop;
}
}
// count domZones
//this.domZones[DigestURI.domDomain(iEntry.metadataHash())]++;
// check site constraints
final String hosthash = iEntry.hosthash();
@ -358,10 +314,172 @@ public final class RWIProcess extends Thread
continue pollloop;
}
} else {
if ( !hosthash.equals(this.query.sitehash) ) {
// filter out all domains that do not match with the site constraint
if (!hosthash.equals(this.query.sitehash)) continue pollloop;
}
// collect host navigation information (even if we have only one; this is to provide a switch-off button)
if (this.query.navigators.isEmpty() && (nav_hosts || this.query.urlMask_isCatchall)) {
this.hostNavigator.inc(hosthash);
this.hostResolver.put(hosthash, iEntry.hash());
}
// check protocol
if ( !this.query.urlMask_isCatchall ) {
final boolean httpFlagSet = DigestURI.flag4HTTPset(iEntry.hash());
if ( httpPattern && !httpFlagSet ) {
continue pollloop;
}
if ( noHttpButProtocolPattern && httpFlagSet ) {
continue pollloop;
}
}
// check vocabulary constraint
String subject = YaCyMetadata.hashURI(iEntry.hash());
Resource resource = JenaTripleStore.getResource(subject);
if (this.query.metatags != null && !this.query.metatags.isEmpty()) {
// all metatags must appear in the tags list
for (Tagging.Metatag metatag: this.query.metatags) {
Iterator<RDFNode> ni = JenaTripleStore.getObjects(resource, metatag.getPredicate());
if (!ni.hasNext()) continue pollloop;
String tags = ni.next().toString();
if (tags.indexOf(metatag.getObject()) < 0) continue pollloop;
}
}
// add navigators using the triplestore
for (Map.Entry<String, String> v: this.taggingPredicates.entrySet()) {
Iterator<RDFNode> ni = JenaTripleStore.getObjects(resource, v.getValue());
while (ni.hasNext()) {
String[] tags = ni.next().toString().split(",");
for (String tag: tags) {
ScoreMap<String> voc = this.vocabularyNavigator.get(v.getKey());
if (voc == null) {
voc = new ConcurrentScoreMap<String>();
this.vocabularyNavigator.put(v.getKey(), voc);
}
voc.inc(tag);
}
}
}
// finally extend the double-check and insert result to stack
this.urlhashes.putUnique(iEntry.hash());
rankingtryloop: while (true) {
try {
this.nodeStack.put(new ReverseElement<URIMetadataNode>(iEntry, this.order.cardinal(iEntry))); // inserts the element and removes the worst (which is smallest)
break rankingtryloop;
} catch ( final ArithmeticException e ) {
// this may happen if the concurrent normalizer changes values during cardinal computation
continue rankingtryloop;
}
}
// increase counter for statistics
if (local) this.local_indexCount++; else this.remote_indexCount++;
}
} catch ( final SpaceExceededException e ) {
}
//if ((query.neededResults() > 0) && (container.size() > query.neededResults())) remove(true, true);
EventTracker.update(EventTracker.EClass.SEARCH, new ProfilingGraph.EventSearch(
this.query.id(true),
SearchEvent.Type.PRESORT,
resourceName,
index.size(),
System.currentTimeMillis() - timer), false);
}
public void add(
final ReferenceContainer<WordReference> index,
final boolean local,
final String resourceName,
final int fullResource,
final long maxtime) {
// we collect the urlhashes and construct a list with urlEntry objects
// attention: if minEntries is too high, this method will not terminate within the maxTime
//Log.logInfo("RWIProcess", "added a container, size = " + index.size());
this.addRunning = true;
assert (index != null);
if (index.isEmpty()) return;
if (!local) {
assert fullResource >= 0 : "fullResource = " + fullResource;
this.remote_resourceSize += fullResource;
this.remote_peerCount++;
}
long timer = System.currentTimeMillis();
// normalize entries
final BlockingQueue<WordReferenceVars> decodedEntries = this.order.normalizeWith(index, maxtime);
int is = index.size();
EventTracker.update(EventTracker.EClass.SEARCH, new ProfilingGraph.EventSearch(
this.query.id(true),
SearchEvent.Type.NORMALIZING,
resourceName,
is,
System.currentTimeMillis() - timer), false);
if (!local) this.receivedRemoteReferences.addAndGet(is);
// iterate over normalized entries and select some that are better than currently stored
timer = System.currentTimeMillis();
final boolean nav_hosts = this.query.navigators.equals("all") || this.query.navigators.indexOf("hosts", 0) >= 0;
// apply all constraints
long timeout = System.currentTimeMillis() + maxtime;
try {
WordReferenceVars iEntry;
final String pattern = this.query.urlMask.pattern();
final boolean httpPattern = pattern.equals("http://.*");
final boolean noHttpButProtocolPattern = pattern.equals("https://.*") || pattern.equals("ftp://.*") || pattern.equals("smb://.*") || pattern.equals("file://.*");
long remaining;
pollloop: while ( true ) {
remaining = timeout - System.currentTimeMillis();
if (remaining <= 0) {
Log.logWarning("RWIProcess", "terminated 'add' loop before poll time-out = " + remaining + ", decodedEntries.size = " + decodedEntries.size());
break;
}
iEntry = decodedEntries.poll(remaining, TimeUnit.MILLISECONDS);
if (iEntry == null) {
Log.logWarning("RWIProcess", "terminated 'add' loop after poll time-out = " + remaining + ", decodedEntries.size = " + decodedEntries.size());
break pollloop;
}
if (iEntry == WordReferenceVars.poison) {
break pollloop;
}
assert (iEntry.urlhash().length == index.row().primaryKeyLength);
// doublecheck for urls
if (this.urlhashes.has(iEntry.urlhash())) continue pollloop;
// increase flag counts
Bitfield flags = iEntry.flags();
for (int j = 0; j < 32; j++) {
if (flags.get(j)) this.flagcount[j]++;
}
// check constraints
if (!testFlags(flags)) continue pollloop;
// check document domain
if (this.query.contentdom.getCode() > 0 &&
((this.query.contentdom == ContentDomain.AUDIO && !(flags.get(Condenser.flag_cat_hasaudio))) ||
(this.query.contentdom == ContentDomain.VIDEO && !(flags.get(Condenser.flag_cat_hasvideo))) ||
(this.query.contentdom == ContentDomain.IMAGE && !(flags.get(Condenser.flag_cat_hasimage))) ||
(this.query.contentdom == ContentDomain.APP && !(flags.get(Condenser.flag_cat_hasapp))))) {
continue pollloop;
}
// count domZones
//this.domZones[DigestURI.domDomain(iEntry.metadataHash())]++;
// check site constraints
final String hosthash = iEntry.hosthash();
if ( this.query.sitehash == null ) {
if (this.query.siteexcludes != null && this.query.siteexcludes.contains(hosthash)) continue pollloop;
} else {
// filter out all domains that do not match with the site constraint
if (!hosthash.equals(this.query.sitehash)) continue pollloop;
}
// collect host navigation information (even if we have only one; this is to provide a switch-off button)
@ -371,14 +489,10 @@ public final class RWIProcess extends Thread
}
// check protocol
if ( !this.query.urlMask_isCatchall ) {
if (!this.query.urlMask_isCatchall) {
final boolean httpFlagSet = DigestURI.flag4HTTPset(iEntry.urlHash);
if ( httpPattern && !httpFlagSet ) {
continue pollloop;
}
if ( noHttpButProtocolPattern && httpFlagSet ) {
continue pollloop;
}
if (httpPattern && !httpFlagSet) continue pollloop;
if (noHttpButProtocolPattern && httpFlagSet) continue pollloop;
}
// check vocabulary constraint
@ -412,7 +526,7 @@ public final class RWIProcess extends Thread
// finally extend the double-check and insert result to stack
this.urlhashes.putUnique(iEntry.urlhash());
rankingtryloop: while ( true ) {
rankingtryloop: while (true) {
try {
this.stack.put(new ReverseElement<WordReferenceVars>(iEntry, this.order.cardinal(iEntry))); // inserts the element and removes the worst (which is smallest)
break rankingtryloop;
@ -422,12 +536,7 @@ public final class RWIProcess extends Thread
}
}
// increase counter for statistics
if ( local ) {
this.local_indexCount++;
} else {
this.remote_indexCount++;
//}
}
if (local) this.local_indexCount++; else this.remote_indexCount++;
}
if (System.currentTimeMillis() >= timeout) Log.logWarning("RWIProcess", "rwi normalization ended with timeout = " + maxtime);
@ -464,25 +573,19 @@ public final class RWIProcess extends Thread
//(!this.remote || this.remote_indexCount > 0);
}
private boolean testFlags(final WordReference ientry) {
if ( this.query.constraint == null ) {
return true;
}
private boolean testFlags(final Bitfield flags) {
if (this.query.constraint == null) return true;
// test if ientry matches with filter
// if all = true: let only entries pass that has all matching bits
// if all = false: let all entries pass that has at least one matching bit
if ( this.query.allofconstraint ) {
if (this.query.allofconstraint) {
for ( int i = 0; i < 32; i++ ) {
if ( (this.query.constraint.get(i)) && (!ientry.flags().get(i)) ) {
return false;
}
if ((this.query.constraint.get(i)) && (!flags.get(i))) return false;
}
return true;
}
for ( int i = 0; i < 32; i++ ) {
if ( (this.query.constraint.get(i)) && (ientry.flags().get(i)) ) {
return true;
}
for (int i = 0; i < 32; i++) {
if ((this.query.constraint.get(i)) && (flags.get(i))) return true;
}
return false;
}
@ -493,17 +596,15 @@ public final class RWIProcess extends Thread
return this.localSearchInclusion;
}
private WeakPriorityBlockingQueue.Element<WordReferenceVars> takeRWI(
final boolean skipDoubleDom,
final long waitingtime) {
private URIMetadataNode takeRWI(final boolean skipDoubleDom, final long waitingtime) {
// returns from the current RWI list the best entry and removes this entry from the list
WeakPriorityBlockingQueue<WordReferenceVars> m;
WeakPriorityBlockingQueue.Element<WordReferenceVars> rwi = null;
WeakPriorityBlockingQueue.Element<URIMetadataNode> page;
// take one entry from the stack if there are entries on that stack or the feeding is not yet finished
try {
//System.out.println("stack.poll: feeders = " + this.feeders + ", stack.sizeQueue = " + stack.sizeQueue());
int loops = 0; // a loop counter to terminate the reading if all the results are from the same domain
// wait some time if we did not get so much remote results so far to get a better ranking over remote results
// we wait at most 30 milliseconds to get a maximum total waiting time of 300 milliseconds for 10 results
@ -514,30 +615,26 @@ public final class RWIProcess extends Thread
}
// loop as long as we can expect that we should get more results
final long timeout = System.currentTimeMillis() + waitingtime;
while ( ((!feedingIsFinished() && this.addRunning) || this.stack.sizeQueue() > 0) &&
(this.query.itemsPerPage < 1 ||
loops++ < this.query.itemsPerPage ||
(loops > 1000 && !this.doubleDomCache.isEmpty())) ) {
while (((!feedingIsFinished() && this.addRunning) || this.stack.sizeQueue() > 0) &&
(this.query.itemsPerPage < 1 || loops++ < this.query.itemsPerPage || (loops > 1000 && !this.doubleDomCache.isEmpty()))) {
page = null;
rwi = null;
if ( waitingtime <= 0 ) {
rwi = this.addRunning ? this.stack.poll(waitingtime) : this.stack.poll();
page = this.addRunning ? this.nodeStack.poll(waitingtime) : this.nodeStack.poll();
if (page == null) rwi = this.addRunning ? this.stack.poll(waitingtime) : this.stack.poll();
} else {
timeoutloop: while ( System.currentTimeMillis() < timeout ) {
if ( feedingIsFinished() && this.stack.sizeQueue() == 0 ) {
break timeoutloop;
}
if (feedingIsFinished() && this.stack.sizeQueue() == 0) break timeoutloop;
page = this.nodeStack.poll(50);
if (page != null) break timeoutloop;
rwi = this.stack.poll(50);
if ( rwi != null ) {
break timeoutloop;
if (rwi != null) break timeoutloop;
}
}
}
if ( rwi == null ) {
//Log.logWarning("RWIProcess", "terminated takeRWI with rwi == null");
break;
}
if ( !skipDoubleDom ) {
//System.out.println("!skipDoubleDom");
return rwi;
if (page != null) return page.getElement();
if (rwi == null) break;
if (!skipDoubleDom) {
return this.query.getSegment().fulltext().getMetadata(rwi.getElement(), rwi.getWeight());
}
// check doubledom
@ -550,7 +647,7 @@ public final class RWIProcess extends Thread
// first appearance of dom. we create an entry to signal that one of that domain was already returned
m = new WeakPriorityBlockingQueue<WordReferenceVars>(this.query.snippetCacheStrategy == null || this.query.snippetCacheStrategy == CacheStrategy.CACHEONLY ? max_results_preparation_special : max_results_preparation, false);
this.doubleDomCache.put(hosthash, m);
return rwi;
return this.query.getSegment().fulltext().getMetadata(rwi.getElement(), rwi.getWeight());
}
// second appearances of dom
m.put(rwi);
@ -577,27 +674,17 @@ public final class RWIProcess extends Thread
Log.logException(e);
continue; // not the best solution...
}
if ( m == null ) {
continue;
}
if ( m.isEmpty() ) {
continue;
}
if ( bestEntry == null ) {
if (m == null) continue;
if (m.isEmpty()) continue;
if (bestEntry == null) {
bestEntry = m.peek();
continue;
}
o = m.peek();
if ( o == null ) {
continue;
}
if ( o.getWeight() < bestEntry.getWeight() ) {
bestEntry = o;
}
}
if ( bestEntry == null ) {
return null;
if (o == null) continue;
if (o.getWeight() < bestEntry.getWeight()) bestEntry = o;
}
if (bestEntry == null) return null;
// finally remove the best entry from the doubledom cache
m = this.doubleDomCache.get(bestEntry.getElement().hosthash());
@ -611,7 +698,8 @@ public final class RWIProcess extends Thread
}
}
}
return bestEntry;
if (bestEntry == null) return null;
return this.query.getSegment().fulltext().getMetadata(bestEntry.getElement(), bestEntry.getWeight());
}
/**
@ -631,18 +719,8 @@ public final class RWIProcess extends Thread
long timeleft;
while ( (timeleft = timeout - System.currentTimeMillis()) > 0 ) {
//System.out.println("timeleft = " + timeleft);
final WeakPriorityBlockingQueue.Element<WordReferenceVars> obrwi = takeRWI(skipDoubleDom, timeleft);
if ( obrwi == null ) {
return null; // all time was already wasted in takeRWI to get another element
}
final URIMetadataNode page = this.query.getSegment().fulltext().getMetadata(obrwi.getElement(), obrwi.getWeight());
if ( page == null ) {
try {
this.misses.putUnique(obrwi.getElement().urlhash());
} catch ( final SpaceExceededException e ) {
}
continue;
}
final URIMetadataNode page = takeRWI(skipDoubleDom, timeleft);
if (page == null) return null; // all time was already wasted in takeRWI to get another element
if ( !this.query.urlMask_isCatchall ) {
// check url mask
@ -650,13 +728,6 @@ public final class RWIProcess extends Thread
this.sortout++;
continue;
}
// in case that we do not have e catchall filter for urls
// we must also construct the domain navigator here
//if (query.sitehash == null) {
// this.hostNavigator.inc(UTF8.String(urlhash, 6, 6));
// this.hostResolver.put(UTF8.String(urlhash, 6, 6), UTF8.String(urlhash));
//}
}
// check for more errors
@ -666,16 +737,14 @@ public final class RWIProcess extends Thread
}
// check content domain
if ((this.query.contentdom.getCode() > 0 &&
page.url().getContentDomain() != this.query.contentdom) ||
(this.query.contentdom == Classification.ContentDomain.TEXT &&
page.url().getContentDomain().getCode() > 0)) {
if ((this.query.contentdom.getCode() > 0 && page.url().getContentDomain() != this.query.contentdom) ||
(this.query.contentdom == Classification.ContentDomain.TEXT && page.url().getContentDomain().getCode() > 0)) {
this.sortout++;
continue;
}
// Check for blacklist
if ( Switchboard.urlBlacklist.isListed(BlacklistType.SEARCH, page) ) {
if (Switchboard.urlBlacklist.isListed(BlacklistType.SEARCH, page)) {
this.sortout++;
continue;
}
@ -691,10 +760,8 @@ public final class RWIProcess extends Thread
.equals("")) {
FilterEngine f = ContentControlFilterUpdateThread.getNetworkFilter();
if (f != null) {
if (!f.isListed(page.url(), null)) {
this.sortout++;
continue;
}

View File

@ -517,7 +517,7 @@ public class SnippetProcess {
// place the result to the result vector
// apply post-ranking
long ranking = Long.valueOf(SnippetProcess.this.rankingProcess.getOrder().cardinal(resultEntry.word()));
long ranking = resultEntry.word() == null ? 0 : Long.valueOf(SnippetProcess.this.rankingProcess.getOrder().cardinal(resultEntry.word()));
ranking += postRanking(resultEntry, SnippetProcess.this.rankingProcess.getTopicNavigator(10));
resultEntry.ranking = ranking;
SnippetProcess.this.result.put(new ReverseElement<ResultEntry>(resultEntry, ranking)); // remove smallest in case of overflow

View File

@ -37,6 +37,7 @@ import net.yacy.cora.sorting.ConcurrentScoreMap;
import net.yacy.document.Condenser;
import net.yacy.document.LargeNumberCache;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.data.meta.URIMetadataNode;
import net.yacy.kelondro.data.word.WordReference;
import net.yacy.kelondro.data.word.WordReferenceRow;
import net.yacy.kelondro.data.word.WordReferenceVars;
@ -262,4 +263,36 @@ public class ReferenceOrder {
return r; // the higher the number the better the ranking.
}
public long cardinal(final URIMetadataNode t) {
    //return Long.MAX_VALUE - preRanking(ranking, iEntry, this.entryMin, this.entryMax, this.searchWords);
    // the normalizedEntry must be a normalized indexEntry
    // Precondition checks must precede the first dereference of t, otherwise a
    // null t would raise an NPE before the assertion could report it.
    assert t != null;
    assert this.ranking != null;
    final Bitfield flags = t.flags();
    // Compose the ranking value: each feature is scaled into [0, 255] (or a
    // boolean mapped to 255) and shifted by its configured coefficient so that
    // higher coefficients dominate. Expensive features (YBR block rank,
    // authority) are only evaluated when their coefficient is significant (> 12).
    final long r =
        ((256 - DigestURI.domLengthNormalized(t.hash())) << this.ranking.coeff_domlength)
        + ((this.ranking.coeff_ybr > 12) ? ((256 - (BlockRank.ranking(t.hash()) << 4)) << this.ranking.coeff_ybr) : 0)
        + ((256 - (t.urllength() << 8)) << this.ranking.coeff_urllength)
        + (t.virtualAge() << this.ranking.coeff_date)
        + (t.wordsintitle() << this.ranking.coeff_wordsintitle)
        + (t.wordCount() << this.ranking.coeff_wordsintext)
        + (t.llocal() << this.ranking.coeff_llocal)
        + (t.lother() << this.ranking.coeff_lother)
        + ((this.ranking.coeff_authority > 12) ? (authority(t.hosthash()) << this.ranking.coeff_authority) : 0)
        + ((flags.get(WordReferenceRow.flag_app_dc_identifier)) ? 255 << this.ranking.coeff_appurl : 0)
        + ((flags.get(WordReferenceRow.flag_app_dc_title)) ? 255 << this.ranking.coeff_app_dc_title : 0)
        + ((flags.get(WordReferenceRow.flag_app_dc_creator)) ? 255 << this.ranking.coeff_app_dc_creator : 0)
        + ((flags.get(WordReferenceRow.flag_app_dc_subject)) ? 255 << this.ranking.coeff_app_dc_subject : 0)
        + ((flags.get(WordReferenceRow.flag_app_dc_description)) ? 255 << this.ranking.coeff_app_dc_description : 0)
        + ((flags.get(WordReferenceRow.flag_app_emphasized)) ? 255 << this.ranking.coeff_appemph : 0)
        + ((flags.get(Condenser.flag_cat_indexof)) ? 255 << this.ranking.coeff_catindexof : 0)
        + ((flags.get(Condenser.flag_cat_hasimage)) ? 255 << this.ranking.coeff_cathasimage : 0)
        + ((flags.get(Condenser.flag_cat_hasaudio)) ? 255 << this.ranking.coeff_cathasaudio : 0)
        + ((flags.get(Condenser.flag_cat_hasvideo)) ? 255 << this.ranking.coeff_cathasvideo : 0)
        + ((flags.get(Condenser.flag_cat_hasapp)) ? 255 << this.ranking.coeff_cathasapp : 0)
        + ((ByteBuffer.equals(t.language(), this.language)) ? 255 << this.ranking.coeff_language : 0)
        + ((DigestURI.probablyRootURL(t.hash())) ? 15 << this.ranking.coeff_urllength : 0);
    return r; // the higher the number the better the ranking.
}
}

View File

@ -192,6 +192,7 @@ public class ResultEntry implements Comparable<ResultEntry>, Comparator<ResultEn
}
public WordReference word() {
final Reference word = this.urlentry.word();
if (word == null) return null;
if (word instanceof WordReferenceVars) return (WordReferenceVars) word;
if (word instanceof WordReferenceRow) return (WordReferenceRow) word;
assert word instanceof WordReferenceRow || word instanceof WordReferenceVars : word == null ? "word = null" : "type = " + word.getClass().getCanonicalName();