mirror of
https://github.com/yacy/yacy_search_server.git
synced 2024-09-19 00:01:41 +02:00
Removed the hack which translated Solr documents into virtual RWI entries,
which were then mixed with remote RWIs. Now these Solr documents are fed into the result set as they appear during local and remote search. That makes the search much faster.
This commit is contained in:
parent
6017691522
commit
e5b3c172ff
|
@ -34,7 +34,6 @@ import net.yacy.cora.document.RSSMessage;
|
|||
import net.yacy.cora.protocol.RequestHeader;
|
||||
import net.yacy.crawler.data.ResultURLs;
|
||||
import net.yacy.crawler.data.ResultURLs.EventOrigin;
|
||||
import net.yacy.kelondro.data.meta.URIMetadata;
|
||||
import net.yacy.kelondro.data.meta.URIMetadataRow;
|
||||
import net.yacy.kelondro.logging.Log;
|
||||
import net.yacy.peers.EventChannel;
|
||||
|
@ -87,7 +86,7 @@ public final class transferURL {
|
|||
int doublecheck = 0;
|
||||
// read the urls from the other properties and store
|
||||
String urls;
|
||||
URIMetadata lEntry;
|
||||
URIMetadataRow lEntry;
|
||||
for (int i = 0; i < urlc; i++) {
|
||||
serverCore.checkInterruption();
|
||||
|
||||
|
|
|
@ -183,6 +183,7 @@ public abstract class SolrServerConnector extends AbstractSolrConnector implemen
|
|||
|
||||
@Override
|
||||
public void add(final SolrInputDocument solrdoc) throws IOException, SolrException {
|
||||
if (this.server == null) return;
|
||||
try {
|
||||
synchronized (this.server) {
|
||||
this.server.add(solrdoc, this.commitWithinMs);
|
||||
|
|
|
@ -97,17 +97,17 @@ public final class ResultURLs {
|
|||
}
|
||||
|
||||
public static void stack(
|
||||
final URIMetadata e,
|
||||
final URIMetadata urlEntry,
|
||||
final byte[] initiatorHash,
|
||||
final byte[] executorHash,
|
||||
final EventOrigin stackType) {
|
||||
// assert initiatorHash != null; // null == proxy !
|
||||
assert executorHash != null;
|
||||
if (e == null) { return; }
|
||||
if (urlEntry == null) { return; }
|
||||
try {
|
||||
final Map<String, InitExecEntry> resultStack = getStack(stackType);
|
||||
if (resultStack != null) {
|
||||
resultStack.put(ASCII.String(e.hash()), new InitExecEntry(initiatorHash, executorHash));
|
||||
resultStack.put(ASCII.String(urlEntry.hash()), new InitExecEntry(initiatorHash, executorHash));
|
||||
}
|
||||
} catch (final Exception ex) {
|
||||
System.out.println("INTERNAL ERROR in newEntry/2: " + ex.toString());
|
||||
|
@ -116,7 +116,7 @@ public final class ResultURLs {
|
|||
try {
|
||||
final ScoreMap<String> domains = getDomains(stackType);
|
||||
if (domains != null) {
|
||||
domains.inc(e.url().getHost());
|
||||
domains.inc(urlEntry.url().getHost());
|
||||
}
|
||||
} catch (final Exception ex) {
|
||||
System.out.println("INTERNAL ERROR in newEntry/3: " + ex.toString());
|
||||
|
|
|
@ -28,6 +28,7 @@ import java.util.Date;
|
|||
import java.util.regex.Pattern;
|
||||
|
||||
import net.yacy.cora.date.GenericFormatter;
|
||||
import net.yacy.cora.date.MicroDate;
|
||||
import net.yacy.cora.document.ASCII;
|
||||
import net.yacy.cora.document.UTF8;
|
||||
import net.yacy.cora.federate.solr.SolrType;
|
||||
|
@ -123,7 +124,7 @@ public class URIMetadataNode implements URIMetadata {
|
|||
}
|
||||
|
||||
@SuppressWarnings("unchecked")
|
||||
private ArrayList<String> getArrayList(YaCySchema field) {
|
||||
private ArrayList<String> getStringList(YaCySchema field) {
|
||||
assert field.isMultiValued();
|
||||
assert field.getType() == SolrType.string || field.getType() == SolrType.text_general;
|
||||
Object r = this.doc.getFieldValue(field.name());
|
||||
|
@ -136,6 +137,20 @@ public class URIMetadataNode implements URIMetadata {
|
|||
return a;
|
||||
}
|
||||
|
||||
/**
 * Reads a multi-valued integer field from the underlying Solr document
 * and returns its values as a list.
 *
 * The field is expected (via assertions only) to be multi-valued and of
 * type SolrType.integer.
 *
 * @param field the schema field to read from this.doc
 * @return an empty list if the document has no value for the field; the
 *         stored ArrayList itself (not a copy) if the value is an
 *         ArrayList; otherwise a new one-element list holding the value
 */
@SuppressWarnings("unchecked")
|
||||
private ArrayList<Integer> getIntList(YaCySchema field) {
|
||||
assert field.isMultiValued();
|
||||
assert field.getType() == SolrType.integer;
|
||||
Object r = this.doc.getFieldValue(field.name());
|
||||
// missing field: hand back an empty list rather than null
|
||||
if (r == null) return new ArrayList<Integer>(0);
|
||||
if (r instanceof ArrayList) {
|
||||
// unchecked cast: the element type is not verified here, and the
|
||||
// document's own list instance is returned to the caller
|
||||
return (ArrayList<Integer>) r;
|
||||
}
|
||||
// single value: wrap it in a one-element list
|
||||
// NOTE(review): any non-ArrayList value reaches the Integer cast below
|
||||
// and would throw ClassCastException if it is not an Integer - confirm
|
||||
// that Solr only ever delivers ArrayList or Integer for this field type
|
||||
ArrayList<Integer> a = new ArrayList<Integer>(1);
|
||||
a.add((Integer) r);
|
||||
return a;
|
||||
}
|
||||
|
||||
@Override
|
||||
public byte[] hash() {
|
||||
return this.hash;
|
||||
|
@ -165,7 +180,7 @@ public class URIMetadataNode implements URIMetadata {
|
|||
|
||||
@Override
|
||||
public String dc_title() {
|
||||
ArrayList<String> a = getArrayList(YaCySchema.title);
|
||||
ArrayList<String> a = getStringList(YaCySchema.title);
|
||||
if (a == null || a.size() == 0) return "";
|
||||
return a.get(0);
|
||||
}
|
||||
|
@ -233,7 +248,7 @@ public class URIMetadataNode implements URIMetadata {
|
|||
|
||||
@Override
|
||||
public char doctype() {
|
||||
ArrayList<String> a = getArrayList(YaCySchema.content_type);
|
||||
ArrayList<String> a = getStringList(YaCySchema.content_type);
|
||||
if (a == null || a.size() == 0) return Response.docType(url());
|
||||
return Response.docType(a.get(0));
|
||||
}
|
||||
|
@ -248,7 +263,7 @@ public class URIMetadataNode implements URIMetadata {
|
|||
|
||||
@Override
|
||||
public byte[] referrerHash() {
|
||||
ArrayList<String> referrer = getArrayList(YaCySchema.referrer_id_txt);
|
||||
ArrayList<String> referrer = getStringList(YaCySchema.referrer_id_txt);
|
||||
if (referrer == null || referrer.size() == 0) return null;
|
||||
return ASCII.getBytes(referrer.get(0));
|
||||
}
|
||||
|
@ -319,6 +334,20 @@ public class URIMetadataNode implements URIMetadata {
|
|||
return this.appc;
|
||||
}
|
||||
|
||||
/**
 * Returns the modification date of this entry, as given by moddate(),
 * converted with MicroDate.microDateDays.
 *
 * NOTE(review): presumably a day-granular "micro date" as used by the
 * RWI ranking - confirm against MicroDate.
 *
 * @return the value computed by MicroDate.microDateDays(moddate())
 */
public int virtualAge() {
|
||||
return MicroDate.microDateDays(moddate());
|
||||
}
|
||||
|
||||
/**
 * Returns the first value of the multi-valued field
 * YaCySchema.title_words_val of this document.
 *
 * @return the first stored value, or 0 if the field has no values
 */
public int wordsintitle() {
|
||||
ArrayList<Integer> x = getIntList(YaCySchema.title_words_val);
|
||||
// only the first entry of the list is used; no value counts as 0
|
||||
if (x == null || x.size() == 0) return 0;
|
||||
return x.get(0).intValue();
|
||||
}
|
||||
|
||||
/**
 * Returns the value of the field YaCySchema.url_chars_i of this
 * document, read through getInt.
 *
 * NOTE(review): by its name the field presumably holds the number of
 * characters in the URL - confirm against the schema definition.
 *
 * @return the value of getInt(YaCySchema.url_chars_i)
 */
public int urllength() {
|
||||
return getInt(YaCySchema.url_chars_i);
|
||||
}
|
||||
|
||||
@Override
|
||||
public String snippet() {
|
||||
return this.snippet;
|
||||
|
@ -326,7 +355,7 @@ public class URIMetadataNode implements URIMetadata {
|
|||
|
||||
@Override
|
||||
public String[] collections() {
|
||||
ArrayList<String> a = getArrayList(YaCySchema.collection_sxt);
|
||||
ArrayList<String> a = getStringList(YaCySchema.collection_sxt);
|
||||
return a.toArray(new String[a.size()]);
|
||||
}
|
||||
|
||||
|
|
|
@ -76,7 +76,7 @@ public final class WordReferenceRow extends AbstractReference implements WordRef
|
|||
/**
|
||||
* object for termination of concurrent blocking queue processing
|
||||
*/
|
||||
public static final Row.Entry poisonRowEntry = urlEntryRow.newEntry();
|
||||
protected static final Row.Entry poisonRowEntry = urlEntryRow.newEntry();
|
||||
|
||||
// static properties
|
||||
private static final int col_urlhash = 0; // h 12 the url hash b64-encoded
|
||||
|
@ -114,7 +114,7 @@ public final class WordReferenceRow extends AbstractReference implements WordRef
|
|||
|
||||
private final Row.Entry entry;
|
||||
|
||||
public WordReferenceRow(
|
||||
protected WordReferenceRow(
|
||||
final byte[] urlHash,
|
||||
final int urlLength, // byte-length of complete URL
|
||||
final int urlComps, // number of path components
|
||||
|
@ -206,13 +206,11 @@ public final class WordReferenceRow extends AbstractReference implements WordRef
|
|||
this.entry = urlEntryRow.newEntry(external, true);
|
||||
}
|
||||
|
||||
public WordReferenceRow(final byte[] row) {
|
||||
private WordReferenceRow(final byte[] row) {
|
||||
this.entry = urlEntryRow.newEntry(row);
|
||||
}
|
||||
|
||||
|
||||
|
||||
public WordReferenceRow(final Row.Entry rentry) {
|
||||
protected WordReferenceRow(final Row.Entry rentry) {
|
||||
// no cloning is necessary since there is no further manipulation after this initial instantiation
|
||||
this.entry = rentry;
|
||||
}
|
||||
|
@ -249,10 +247,6 @@ public final class WordReferenceRow extends AbstractReference implements WordRef
|
|||
return MicroDate.reverseMicroDateDays((int) this.entry.getColLong(col_lastModified));
|
||||
}
|
||||
|
||||
public long freshUntil() {
|
||||
return MicroDate.reverseMicroDateDays((int) this.entry.getColLong(col_freshUntil));
|
||||
}
|
||||
|
||||
@Override
|
||||
public int hitcount() {
|
||||
return (0xff & this.entry.getColByte(col_hitcount));
|
||||
|
@ -263,11 +257,6 @@ public final class WordReferenceRow extends AbstractReference implements WordRef
|
|||
return new ArrayList<Integer>(0);
|
||||
}
|
||||
|
||||
public int position(final int p) {
|
||||
assert p == 0 : "p = " + p;
|
||||
return (int) this.entry.getColLong(col_posintext);
|
||||
}
|
||||
|
||||
@Override
|
||||
public int posinphrase() {
|
||||
return (0xff & this.entry.getColByte(col_posinphrase));
|
||||
|
|
|
@ -78,7 +78,6 @@ import net.yacy.cora.order.Digest;
|
|||
import net.yacy.cora.protocol.ClientIdentification;
|
||||
import net.yacy.cora.protocol.Domains;
|
||||
import net.yacy.cora.protocol.http.HTTPClient;
|
||||
import net.yacy.cora.storage.HandleSet;
|
||||
import net.yacy.cora.util.SpaceExceededException;
|
||||
import net.yacy.crawler.data.ResultURLs;
|
||||
import net.yacy.crawler.data.ResultURLs.EventOrigin;
|
||||
|
@ -88,7 +87,6 @@ import net.yacy.kelondro.data.meta.URIMetadataRow;
|
|||
import net.yacy.kelondro.data.word.Word;
|
||||
import net.yacy.kelondro.data.word.WordReference;
|
||||
import net.yacy.kelondro.data.word.WordReferenceFactory;
|
||||
import net.yacy.kelondro.data.word.WordReferenceVars;
|
||||
import net.yacy.kelondro.logging.Log;
|
||||
import net.yacy.kelondro.rwi.Reference;
|
||||
import net.yacy.kelondro.rwi.ReferenceContainer;
|
||||
|
@ -767,7 +765,7 @@ public final class Protocol
|
|||
|
||||
// insert results to containers
|
||||
int term = count;
|
||||
for ( final URIMetadata urlEntry : result.links ) {
|
||||
for ( final URIMetadataRow urlEntry : result.links ) {
|
||||
if ( term-- <= 0 ) {
|
||||
break; // do not process more that requested (in case that evil peers fill us up with rubbish)
|
||||
}
|
||||
|
@ -883,7 +881,7 @@ public final class Protocol
|
|||
public Map<byte[], Integer> indexcount; //
|
||||
public long searchtime; // time that the peer actually spent to create the result
|
||||
public String[] references; // search hints, the top-words
|
||||
public List<URIMetadata> links; // LURLs of search
|
||||
public List<URIMetadataRow> links; // LURLs of search
|
||||
public Map<byte[], String> indexabstract; // index abstracts, a collection of url-hashes per word
|
||||
|
||||
public SearchResult(
|
||||
|
@ -1003,14 +1001,14 @@ public final class Protocol
|
|||
}
|
||||
}
|
||||
this.references = resultMap.get("references").split(",");
|
||||
this.links = new ArrayList<URIMetadata>(this.urlcount);
|
||||
this.links = new ArrayList<URIMetadataRow>(this.urlcount);
|
||||
for ( int n = 0; n < this.urlcount; n++ ) {
|
||||
// get one single search result
|
||||
final String resultLine = resultMap.get("resource" + n);
|
||||
if ( resultLine == null ) {
|
||||
continue;
|
||||
}
|
||||
final URIMetadata urlEntry = URIMetadataRow.importEntry(resultLine);
|
||||
final URIMetadataRow urlEntry = URIMetadataRow.importEntry(resultLine);
|
||||
if ( urlEntry == null ) {
|
||||
continue;
|
||||
}
|
||||
|
@ -1027,8 +1025,6 @@ public final class Protocol
|
|||
final Seed target,
|
||||
final Blacklist blacklist) {
|
||||
|
||||
final HandleSet wordhashes = event.getQuery().query_include_hashes;
|
||||
|
||||
if (event.getQuery().queryString == null || event.getQuery().queryString.length() == 0) {
|
||||
return -1; // we cannot query solr only with word hashes, there is no clear text string
|
||||
}
|
||||
|
@ -1064,14 +1060,9 @@ public final class Protocol
|
|||
}
|
||||
|
||||
// evaluate result
|
||||
List<URIMetadataNode> container = new ArrayList<URIMetadataNode>();
|
||||
if (docList.size() > 0) {// create containers
|
||||
Network.log.logInfo("SEARCH (solr), returned " + docList.size() + " documents from " + (target == null ? "shard" : ("peer " + target.hash + ":" + target.getName()))) ;
|
||||
final List<ReferenceContainer<WordReference>> container = new ArrayList<ReferenceContainer<WordReference>>(wordhashes.size());
|
||||
for (byte[] hash: wordhashes) {
|
||||
try {
|
||||
container.add(ReferenceContainer.emptyContainer(Segment.wordReferenceFactory, hash, count));
|
||||
} catch (SpaceExceededException e) {} // throws SpaceExceededException
|
||||
}
|
||||
|
||||
int term = count;
|
||||
for (final SolrDocument doc: docList) {
|
||||
|
@ -1122,27 +1113,17 @@ public final class Protocol
|
|||
}
|
||||
}
|
||||
|
||||
// we create virtual word references here which are necessary to feed search results into retrieval process
|
||||
Reference entry = new WordReferenceVars(urlEntry);
|
||||
|
||||
// add the url entry to the word indexes
|
||||
for ( final ReferenceContainer<WordReference> c : container ) {
|
||||
try {
|
||||
c.add(entry);
|
||||
} catch ( final SpaceExceededException e ) {
|
||||
Log.logException(e);
|
||||
break;
|
||||
}
|
||||
}
|
||||
container.add(urlEntry);
|
||||
}
|
||||
|
||||
if (localsearch) {
|
||||
event.rankingProcess.add(container.get(0), true, "localpeer", docList.size(), time);
|
||||
event.rankingProcess.add(container, true, "localpeer", docList.size());
|
||||
event.rankingProcess.addFinalize();
|
||||
event.rankingProcess.addExpectedRemoteReferences(-count);
|
||||
Network.log.logInfo("local search (solr): localpeer sent " + container.get(0).size() + "/" + docList.size() + " references");
|
||||
} else {
|
||||
event.rankingProcess.add(container.get(0), false, target.getName() + "/" + target.hash, docList.size(), time);
|
||||
event.rankingProcess.add(container, false, target.getName() + "/" + target.hash, docList.size());
|
||||
event.rankingProcess.addFinalize();
|
||||
event.rankingProcess.addExpectedRemoteReferences(-count);
|
||||
Network.log.logInfo("remote search (solr): peer " + target.getName() + " sent " + container.get(0).size() + "/" + docList.size() + " references");
|
||||
|
|
|
@ -148,6 +148,7 @@ import net.yacy.kelondro.blob.Tables;
|
|||
import net.yacy.kelondro.data.meta.DigestURI;
|
||||
import net.yacy.kelondro.data.meta.URIMetadata;
|
||||
import net.yacy.kelondro.data.meta.URIMetadataNode;
|
||||
import net.yacy.kelondro.data.meta.URIMetadataRow;
|
||||
import net.yacy.kelondro.data.word.Word;
|
||||
import net.yacy.kelondro.logging.Log;
|
||||
import net.yacy.kelondro.rwi.ReferenceContainer;
|
||||
|
@ -2588,8 +2589,7 @@ public final class Switchboard extends serverSwitch
|
|||
this.log.logInfo("Excluded " + condenser.excludeWords(stopwords) + " words in URL " + url);
|
||||
|
||||
// STORE WORD INDEX
|
||||
URIMetadata newEntry = null;
|
||||
newEntry =
|
||||
URIMetadataRow newEntry =
|
||||
this.index.storeDocument(
|
||||
url,
|
||||
referrerURL,
|
||||
|
|
|
@ -56,7 +56,6 @@ import net.yacy.document.Parser;
|
|||
import net.yacy.kelondro.data.citation.CitationReference;
|
||||
import net.yacy.kelondro.data.citation.CitationReferenceFactory;
|
||||
import net.yacy.kelondro.data.meta.DigestURI;
|
||||
import net.yacy.kelondro.data.meta.URIMetadata;
|
||||
import net.yacy.kelondro.data.meta.URIMetadataNode;
|
||||
import net.yacy.kelondro.data.meta.URIMetadataRow;
|
||||
import net.yacy.kelondro.data.word.Word;
|
||||
|
@ -341,7 +340,7 @@ public class Segment {
|
|||
if (this.termIndex != null) this.termIndex.add(termHash, entry);
|
||||
}
|
||||
|
||||
public URIMetadata storeDocument(
|
||||
public URIMetadataRow storeDocument(
|
||||
final DigestURI url,
|
||||
final DigestURI referrerURL,
|
||||
Date modDate,
|
||||
|
@ -368,7 +367,7 @@ public class Segment {
|
|||
// STORE URL TO LOADED-URL-DB
|
||||
if (modDate.getTime() > loadDate.getTime()) modDate = loadDate; // TODO: compare with modTime from responseHeader
|
||||
char docType = Response.docType(document.dc_format());
|
||||
final URIMetadata metadata = new URIMetadataRow(
|
||||
final URIMetadataRow metadata = new URIMetadataRow(
|
||||
url, // URL
|
||||
dc_title, // document description
|
||||
document.dc_creator(), // author
|
||||
|
|
|
@ -30,6 +30,7 @@ import java.util.Comparator;
|
|||
import java.util.ConcurrentModificationException;
|
||||
import java.util.HashMap;
|
||||
import java.util.Iterator;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.SortedMap;
|
||||
import java.util.concurrent.BlockingQueue;
|
||||
|
@ -67,6 +68,7 @@ import net.yacy.kelondro.index.RowHandleSet;
|
|||
import net.yacy.kelondro.logging.Log;
|
||||
import net.yacy.kelondro.rwi.ReferenceContainer;
|
||||
import net.yacy.kelondro.rwi.TermSearch;
|
||||
import net.yacy.kelondro.util.Bitfield;
|
||||
import net.yacy.peers.graphics.ProfilingGraph;
|
||||
import net.yacy.repository.Blacklist.BlacklistType;
|
||||
import net.yacy.repository.FilterEngine;
|
||||
|
@ -97,9 +99,9 @@ public final class RWIProcess extends Thread
|
|||
private int remote_indexCount;
|
||||
private int remote_peerCount;
|
||||
private int local_indexCount;
|
||||
private final AtomicInteger maxExpectedRemoteReferences, expectedRemoteReferences,
|
||||
receivedRemoteReferences;
|
||||
private final AtomicInteger maxExpectedRemoteReferences, expectedRemoteReferences, receivedRemoteReferences;
|
||||
private final WeakPriorityBlockingQueue<WordReferenceVars> stack;
|
||||
private final WeakPriorityBlockingQueue<URIMetadataNode> nodeStack;
|
||||
private final AtomicInteger feedersAlive, feedersTerminated;
|
||||
private final ConcurrentHashMap<String, WeakPriorityBlockingQueue<WordReferenceVars>> doubleDomCache; // key = domhash (6 bytes); value = like stack
|
||||
//private final HandleSet handover; // key = urlhash; used for double-check of urls that had been handed over to search process
|
||||
|
@ -126,7 +128,9 @@ public final class RWIProcess extends Thread
|
|||
// sortorder: 0 = hash, 1 = url, 2 = ranking
|
||||
this.addRunning = true;
|
||||
this.localSearchInclusion = null;
|
||||
this.stack = new WeakPriorityBlockingQueue<WordReferenceVars>(query.snippetCacheStrategy == null || query.snippetCacheStrategy == CacheStrategy.CACHEONLY ? max_results_preparation_special : max_results_preparation, false);
|
||||
int stackMaxsize = query.snippetCacheStrategy == null || query.snippetCacheStrategy == CacheStrategy.CACHEONLY ? max_results_preparation_special : max_results_preparation;
|
||||
this.stack = new WeakPriorityBlockingQueue<WordReferenceVars>(stackMaxsize, false);
|
||||
this.nodeStack = new WeakPriorityBlockingQueue<URIMetadataNode>(stackMaxsize, false);
|
||||
this.doubleDomCache = new ConcurrentHashMap<String, WeakPriorityBlockingQueue<WordReferenceVars>>();
|
||||
this.query = query;
|
||||
this.order = order;
|
||||
|
@ -240,23 +244,16 @@ public final class RWIProcess extends Thread
|
|||
}
|
||||
|
||||
public void add(
|
||||
final ReferenceContainer<WordReference> index,
|
||||
final List<URIMetadataNode> index,
|
||||
final boolean local,
|
||||
final String resourceName,
|
||||
final int fullResource,
|
||||
final long maxtime) {
|
||||
// we collect the urlhashes and construct a list with urlEntry objects
|
||||
// attention: if minEntries is too high, this method will not terminate within the maxTime
|
||||
//Log.logInfo("RWIProcess", "added a container, size = " + index.size());
|
||||
final int fullResource) {
|
||||
|
||||
this.addRunning = true;
|
||||
|
||||
assert (index != null);
|
||||
if ( index.isEmpty() ) {
|
||||
return;
|
||||
}
|
||||
if (index.isEmpty()) return;
|
||||
|
||||
if ( !local ) {
|
||||
if (!local) {
|
||||
assert fullResource >= 0 : "fullResource = " + fullResource;
|
||||
this.remote_resourceSize += fullResource;
|
||||
this.remote_peerCount++;
|
||||
|
@ -265,7 +262,6 @@ public final class RWIProcess extends Thread
|
|||
long timer = System.currentTimeMillis();
|
||||
|
||||
// normalize entries
|
||||
final BlockingQueue<WordReferenceVars> decodedEntries = this.order.normalizeWith(index, maxtime);
|
||||
int is = index.size();
|
||||
EventTracker.update(EventTracker.EClass.SEARCH, new ProfilingGraph.EventSearch(
|
||||
this.query.id(true),
|
||||
|
@ -279,77 +275,37 @@ public final class RWIProcess extends Thread
|
|||
|
||||
// iterate over normalized entries and select some that are better than currently stored
|
||||
timer = System.currentTimeMillis();
|
||||
final boolean nav_hosts =
|
||||
this.query.navigators.equals("all") || this.query.navigators.indexOf("hosts", 0) >= 0;
|
||||
final boolean nav_hosts = this.query.navigators.equals("all") || this.query.navigators.indexOf("hosts", 0) >= 0;
|
||||
|
||||
// apply all constraints
|
||||
long timeout = System.currentTimeMillis() + maxtime;
|
||||
try {
|
||||
WordReferenceVars iEntry;
|
||||
final String pattern = this.query.urlMask.pattern();
|
||||
final boolean httpPattern = pattern.equals("http://.*");
|
||||
final boolean noHttpButProtocolPattern =
|
||||
pattern.equals("https://.*")
|
||||
|| pattern.equals("ftp://.*")
|
||||
|| pattern.equals("smb://.*")
|
||||
|| pattern.equals("file://.*");
|
||||
long remaining;
|
||||
pollloop: while ( true ) {
|
||||
remaining = timeout - System.currentTimeMillis();
|
||||
if (remaining <= 0) {
|
||||
Log.logWarning("RWIProcess", "terminated 'add' loop before poll time-out = " + remaining + ", decodedEntries.size = " + decodedEntries.size());
|
||||
break;
|
||||
}
|
||||
iEntry = decodedEntries.poll(remaining, TimeUnit.MILLISECONDS);
|
||||
if ( iEntry == null ) {
|
||||
Log.logWarning("RWIProcess", "terminated 'add' loop after poll time-out = " + remaining + ", decodedEntries.size = " + decodedEntries.size());
|
||||
break pollloop;
|
||||
}
|
||||
if ( iEntry == WordReferenceVars.poison ) {
|
||||
break pollloop;
|
||||
}
|
||||
assert (iEntry.urlhash().length == index.row().primaryKeyLength);
|
||||
//if (iEntry.urlHash().length() != index.row().primaryKeyLength) continue;
|
||||
final boolean noHttpButProtocolPattern = pattern.equals("https://.*") || pattern.equals("ftp://.*") || pattern.equals("smb://.*") || pattern.equals("file://.*");
|
||||
pollloop: for (URIMetadataNode iEntry: index) {
|
||||
|
||||
// doublecheck for urls
|
||||
if (this.urlhashes.has(iEntry.urlhash())) {
|
||||
if (this.urlhashes.has(iEntry.hash())) {
|
||||
continue pollloop;
|
||||
}
|
||||
|
||||
// increase flag counts
|
||||
for ( int j = 0; j < 32; j++ ) {
|
||||
if ( iEntry.flags().get(j) ) {
|
||||
this.flagcount[j]++;
|
||||
}
|
||||
if (iEntry.flags().get(j)) this.flagcount[j]++;
|
||||
}
|
||||
|
||||
// check constraints
|
||||
if ( !testFlags(iEntry) ) {
|
||||
continue pollloop;
|
||||
}
|
||||
Bitfield flags = iEntry.flags();
|
||||
if (!testFlags(flags)) continue pollloop;
|
||||
|
||||
// check document domain
|
||||
if ( this.query.contentdom.getCode() > 0 ) {
|
||||
if ( (this.query.contentdom == ContentDomain.AUDIO)
|
||||
&& (!(iEntry.flags().get(Condenser.flag_cat_hasaudio))) ) {
|
||||
if (this.query.contentdom.getCode() > 0 &&
|
||||
((this.query.contentdom == ContentDomain.AUDIO && !(flags.get(Condenser.flag_cat_hasaudio))) ||
|
||||
(this.query.contentdom == ContentDomain.VIDEO && !(flags.get(Condenser.flag_cat_hasvideo))) ||
|
||||
(this.query.contentdom == ContentDomain.IMAGE && !(flags.get(Condenser.flag_cat_hasimage))) ||
|
||||
(this.query.contentdom == ContentDomain.APP && !(flags.get(Condenser.flag_cat_hasapp))))) {
|
||||
continue pollloop;
|
||||
}
|
||||
if ( (this.query.contentdom == ContentDomain.VIDEO)
|
||||
&& (!(iEntry.flags().get(Condenser.flag_cat_hasvideo))) ) {
|
||||
continue pollloop;
|
||||
}
|
||||
if ( (this.query.contentdom == ContentDomain.IMAGE)
|
||||
&& (!(iEntry.flags().get(Condenser.flag_cat_hasimage))) ) {
|
||||
continue pollloop;
|
||||
}
|
||||
if ( (this.query.contentdom == ContentDomain.APP)
|
||||
&& (!(iEntry.flags().get(Condenser.flag_cat_hasapp))) ) {
|
||||
continue pollloop;
|
||||
}
|
||||
}
|
||||
|
||||
// count domZones
|
||||
//this.domZones[DigestURI.domDomain(iEntry.metadataHash())]++;
|
||||
|
||||
// check site constraints
|
||||
final String hosthash = iEntry.hosthash();
|
||||
|
@ -358,10 +314,172 @@ public final class RWIProcess extends Thread
|
|||
continue pollloop;
|
||||
}
|
||||
} else {
|
||||
if ( !hosthash.equals(this.query.sitehash) ) {
|
||||
// filter out all domains that do not match with the site constraint
|
||||
if (!hosthash.equals(this.query.sitehash)) continue pollloop;
|
||||
}
|
||||
|
||||
// collect host navigation information (even if we have only one; this is to provide a switch-off button)
|
||||
if (this.query.navigators.isEmpty() && (nav_hosts || this.query.urlMask_isCatchall)) {
|
||||
this.hostNavigator.inc(hosthash);
|
||||
this.hostResolver.put(hosthash, iEntry.hash());
|
||||
}
|
||||
|
||||
// check protocol
|
||||
if ( !this.query.urlMask_isCatchall ) {
|
||||
final boolean httpFlagSet = DigestURI.flag4HTTPset(iEntry.hash());
|
||||
if ( httpPattern && !httpFlagSet ) {
|
||||
continue pollloop;
|
||||
}
|
||||
if ( noHttpButProtocolPattern && httpFlagSet ) {
|
||||
continue pollloop;
|
||||
}
|
||||
}
|
||||
|
||||
// check vocabulary constraint
|
||||
String subject = YaCyMetadata.hashURI(iEntry.hash());
|
||||
Resource resource = JenaTripleStore.getResource(subject);
|
||||
if (this.query.metatags != null && !this.query.metatags.isEmpty()) {
|
||||
// all metatags must appear in the tags list
|
||||
for (Tagging.Metatag metatag: this.query.metatags) {
|
||||
Iterator<RDFNode> ni = JenaTripleStore.getObjects(resource, metatag.getPredicate());
|
||||
if (!ni.hasNext()) continue pollloop;
|
||||
String tags = ni.next().toString();
|
||||
if (tags.indexOf(metatag.getObject()) < 0) continue pollloop;
|
||||
}
|
||||
}
|
||||
|
||||
// add navigators using the triplestore
|
||||
for (Map.Entry<String, String> v: this.taggingPredicates.entrySet()) {
|
||||
Iterator<RDFNode> ni = JenaTripleStore.getObjects(resource, v.getValue());
|
||||
while (ni.hasNext()) {
|
||||
String[] tags = ni.next().toString().split(",");
|
||||
for (String tag: tags) {
|
||||
ScoreMap<String> voc = this.vocabularyNavigator.get(v.getKey());
|
||||
if (voc == null) {
|
||||
voc = new ConcurrentScoreMap<String>();
|
||||
this.vocabularyNavigator.put(v.getKey(), voc);
|
||||
}
|
||||
voc.inc(tag);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// finally extend the double-check and insert result to stack
|
||||
this.urlhashes.putUnique(iEntry.hash());
|
||||
rankingtryloop: while (true) {
|
||||
try {
|
||||
this.nodeStack.put(new ReverseElement<URIMetadataNode>(iEntry, this.order.cardinal(iEntry))); // inserts the element and removes the worst (which is smallest)
|
||||
break rankingtryloop;
|
||||
} catch ( final ArithmeticException e ) {
|
||||
// this may happen if the concurrent normalizer changes values during cardinal computation
|
||||
continue rankingtryloop;
|
||||
}
|
||||
}
|
||||
// increase counter for statistics
|
||||
if (local) this.local_indexCount++; else this.remote_indexCount++;
|
||||
}
|
||||
} catch ( final SpaceExceededException e ) {
|
||||
}
|
||||
|
||||
//if ((query.neededResults() > 0) && (container.size() > query.neededResults())) remove(true, true);
|
||||
EventTracker.update(EventTracker.EClass.SEARCH, new ProfilingGraph.EventSearch(
|
||||
this.query.id(true),
|
||||
SearchEvent.Type.PRESORT,
|
||||
resourceName,
|
||||
index.size(),
|
||||
System.currentTimeMillis() - timer), false);
|
||||
}
|
||||
|
||||
public void add(
|
||||
final ReferenceContainer<WordReference> index,
|
||||
final boolean local,
|
||||
final String resourceName,
|
||||
final int fullResource,
|
||||
final long maxtime) {
|
||||
// we collect the urlhashes and construct a list with urlEntry objects
|
||||
// attention: if minEntries is too high, this method will not terminate within the maxTime
|
||||
//Log.logInfo("RWIProcess", "added a container, size = " + index.size());
|
||||
|
||||
this.addRunning = true;
|
||||
assert (index != null);
|
||||
if (index.isEmpty()) return;
|
||||
if (!local) {
|
||||
assert fullResource >= 0 : "fullResource = " + fullResource;
|
||||
this.remote_resourceSize += fullResource;
|
||||
this.remote_peerCount++;
|
||||
}
|
||||
long timer = System.currentTimeMillis();
|
||||
|
||||
// normalize entries
|
||||
final BlockingQueue<WordReferenceVars> decodedEntries = this.order.normalizeWith(index, maxtime);
|
||||
int is = index.size();
|
||||
EventTracker.update(EventTracker.EClass.SEARCH, new ProfilingGraph.EventSearch(
|
||||
this.query.id(true),
|
||||
SearchEvent.Type.NORMALIZING,
|
||||
resourceName,
|
||||
is,
|
||||
System.currentTimeMillis() - timer), false);
|
||||
if (!local) this.receivedRemoteReferences.addAndGet(is);
|
||||
|
||||
// iterate over normalized entries and select some that are better than currently stored
|
||||
timer = System.currentTimeMillis();
|
||||
final boolean nav_hosts = this.query.navigators.equals("all") || this.query.navigators.indexOf("hosts", 0) >= 0;
|
||||
|
||||
// apply all constraints
|
||||
long timeout = System.currentTimeMillis() + maxtime;
|
||||
try {
|
||||
WordReferenceVars iEntry;
|
||||
final String pattern = this.query.urlMask.pattern();
|
||||
final boolean httpPattern = pattern.equals("http://.*");
|
||||
final boolean noHttpButProtocolPattern = pattern.equals("https://.*") || pattern.equals("ftp://.*") || pattern.equals("smb://.*") || pattern.equals("file://.*");
|
||||
long remaining;
|
||||
pollloop: while ( true ) {
|
||||
remaining = timeout - System.currentTimeMillis();
|
||||
if (remaining <= 0) {
|
||||
Log.logWarning("RWIProcess", "terminated 'add' loop before poll time-out = " + remaining + ", decodedEntries.size = " + decodedEntries.size());
|
||||
break;
|
||||
}
|
||||
iEntry = decodedEntries.poll(remaining, TimeUnit.MILLISECONDS);
|
||||
if (iEntry == null) {
|
||||
Log.logWarning("RWIProcess", "terminated 'add' loop after poll time-out = " + remaining + ", decodedEntries.size = " + decodedEntries.size());
|
||||
break pollloop;
|
||||
}
|
||||
if (iEntry == WordReferenceVars.poison) {
|
||||
break pollloop;
|
||||
}
|
||||
assert (iEntry.urlhash().length == index.row().primaryKeyLength);
|
||||
|
||||
// doublecheck for urls
|
||||
if (this.urlhashes.has(iEntry.urlhash())) continue pollloop;
|
||||
|
||||
// increase flag counts
|
||||
Bitfield flags = iEntry.flags();
|
||||
for (int j = 0; j < 32; j++) {
|
||||
if (flags.get(j)) this.flagcount[j]++;
|
||||
}
|
||||
|
||||
// check constraints
|
||||
if (!testFlags(flags)) continue pollloop;
|
||||
|
||||
// check document domain
|
||||
if (this.query.contentdom.getCode() > 0 &&
|
||||
((this.query.contentdom == ContentDomain.AUDIO && !(flags.get(Condenser.flag_cat_hasaudio))) ||
|
||||
(this.query.contentdom == ContentDomain.VIDEO && !(flags.get(Condenser.flag_cat_hasvideo))) ||
|
||||
(this.query.contentdom == ContentDomain.IMAGE && !(flags.get(Condenser.flag_cat_hasimage))) ||
|
||||
(this.query.contentdom == ContentDomain.APP && !(flags.get(Condenser.flag_cat_hasapp))))) {
|
||||
continue pollloop;
|
||||
}
|
||||
|
||||
// count domZones
|
||||
//this.domZones[DigestURI.domDomain(iEntry.metadataHash())]++;
|
||||
|
||||
// check site constraints
|
||||
final String hosthash = iEntry.hosthash();
|
||||
if ( this.query.sitehash == null ) {
|
||||
if (this.query.siteexcludes != null && this.query.siteexcludes.contains(hosthash)) continue pollloop;
|
||||
} else {
|
||||
// filter out all domains that do not match with the site constraint
|
||||
if (!hosthash.equals(this.query.sitehash)) continue pollloop;
|
||||
}
|
||||
|
||||
// collect host navigation information (even if we have only one; this is to provide a switch-off button)
|
||||
|
@ -371,14 +489,10 @@ public final class RWIProcess extends Thread
|
|||
}
|
||||
|
||||
// check protocol
|
||||
if ( !this.query.urlMask_isCatchall ) {
|
||||
if (!this.query.urlMask_isCatchall) {
|
||||
final boolean httpFlagSet = DigestURI.flag4HTTPset(iEntry.urlHash);
|
||||
if ( httpPattern && !httpFlagSet ) {
|
||||
continue pollloop;
|
||||
}
|
||||
if ( noHttpButProtocolPattern && httpFlagSet ) {
|
||||
continue pollloop;
|
||||
}
|
||||
if (httpPattern && !httpFlagSet) continue pollloop;
|
||||
if (noHttpButProtocolPattern && httpFlagSet) continue pollloop;
|
||||
}
|
||||
|
||||
// check vocabulary constraint
|
||||
|
@ -412,7 +526,7 @@ public final class RWIProcess extends Thread
|
|||
|
||||
// finally extend the double-check and insert result to stack
|
||||
this.urlhashes.putUnique(iEntry.urlhash());
|
||||
rankingtryloop: while ( true ) {
|
||||
rankingtryloop: while (true) {
|
||||
try {
|
||||
this.stack.put(new ReverseElement<WordReferenceVars>(iEntry, this.order.cardinal(iEntry))); // inserts the element and removes the worst (which is smallest)
|
||||
break rankingtryloop;
|
||||
|
@ -422,12 +536,7 @@ public final class RWIProcess extends Thread
|
|||
}
|
||||
}
|
||||
// increase counter for statistics
|
||||
if ( local ) {
|
||||
this.local_indexCount++;
|
||||
} else {
|
||||
this.remote_indexCount++;
|
||||
//}
|
||||
}
|
||||
if (local) this.local_indexCount++; else this.remote_indexCount++;
|
||||
}
|
||||
if (System.currentTimeMillis() >= timeout) Log.logWarning("RWIProcess", "rwi normalization ended with timeout = " + maxtime);
|
||||
|
||||
|
@ -464,25 +573,19 @@ public final class RWIProcess extends Thread
|
|||
//(!this.remote || this.remote_indexCount > 0);
|
||||
}
|
||||
|
||||
private boolean testFlags(final WordReference ientry) {
|
||||
if ( this.query.constraint == null ) {
|
||||
return true;
|
||||
}
|
||||
private boolean testFlags(final Bitfield flags) {
|
||||
if (this.query.constraint == null) return true;
|
||||
// test if ientry matches with filter
|
||||
// if all = true: let only entries pass that have all matching bits
|
||||
// if all = false: let all entries pass that have at least one matching bit
|
||||
if ( this.query.allofconstraint ) {
|
||||
if (this.query.allofconstraint) {
|
||||
for ( int i = 0; i < 32; i++ ) {
|
||||
if ( (this.query.constraint.get(i)) && (!ientry.flags().get(i)) ) {
|
||||
return false;
|
||||
}
|
||||
if ((this.query.constraint.get(i)) && (!flags.get(i))) return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
for ( int i = 0; i < 32; i++ ) {
|
||||
if ( (this.query.constraint.get(i)) && (ientry.flags().get(i)) ) {
|
||||
return true;
|
||||
}
|
||||
for (int i = 0; i < 32; i++) {
|
||||
if ((this.query.constraint.get(i)) && (flags.get(i))) return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
@ -493,17 +596,15 @@ public final class RWIProcess extends Thread
|
|||
return this.localSearchInclusion;
|
||||
}
|
||||
|
||||
private WeakPriorityBlockingQueue.Element<WordReferenceVars> takeRWI(
|
||||
final boolean skipDoubleDom,
|
||||
final long waitingtime) {
|
||||
private URIMetadataNode takeRWI(final boolean skipDoubleDom, final long waitingtime) {
|
||||
|
||||
// returns from the current RWI list the best entry and removes this entry from the list
|
||||
WeakPriorityBlockingQueue<WordReferenceVars> m;
|
||||
WeakPriorityBlockingQueue.Element<WordReferenceVars> rwi = null;
|
||||
WeakPriorityBlockingQueue.Element<URIMetadataNode> page;
|
||||
|
||||
// take one entry from the stack if there are entries on that stack or the feeding is not yet finished
|
||||
try {
|
||||
//System.out.println("stack.poll: feeders = " + this.feeders + ", stack.sizeQueue = " + stack.sizeQueue());
|
||||
int loops = 0; // a loop counter to terminate the reading if all the results are from the same domain
|
||||
// wait some time if we did not get so much remote results so far to get a better ranking over remote results
|
||||
// we wait at most 30 milliseconds to get a maximum total waiting time of 300 milliseconds for 10 results
|
||||
|
@ -514,30 +615,26 @@ public final class RWIProcess extends Thread
|
|||
}
|
||||
// loop as long as we can expect that we should get more results
|
||||
final long timeout = System.currentTimeMillis() + waitingtime;
|
||||
while ( ((!feedingIsFinished() && this.addRunning) || this.stack.sizeQueue() > 0) &&
|
||||
(this.query.itemsPerPage < 1 ||
|
||||
loops++ < this.query.itemsPerPage ||
|
||||
(loops > 1000 && !this.doubleDomCache.isEmpty())) ) {
|
||||
while (((!feedingIsFinished() && this.addRunning) || this.stack.sizeQueue() > 0) &&
|
||||
(this.query.itemsPerPage < 1 || loops++ < this.query.itemsPerPage || (loops > 1000 && !this.doubleDomCache.isEmpty()))) {
|
||||
page = null;
|
||||
rwi = null;
|
||||
if ( waitingtime <= 0 ) {
|
||||
rwi = this.addRunning ? this.stack.poll(waitingtime) : this.stack.poll();
|
||||
page = this.addRunning ? this.nodeStack.poll(waitingtime) : this.nodeStack.poll();
|
||||
if (page == null) rwi = this.addRunning ? this.stack.poll(waitingtime) : this.stack.poll();
|
||||
} else {
|
||||
timeoutloop: while ( System.currentTimeMillis() < timeout ) {
|
||||
if ( feedingIsFinished() && this.stack.sizeQueue() == 0 ) {
|
||||
break timeoutloop;
|
||||
}
|
||||
if (feedingIsFinished() && this.stack.sizeQueue() == 0) break timeoutloop;
|
||||
page = this.nodeStack.poll(50);
|
||||
if (page != null) break timeoutloop;
|
||||
rwi = this.stack.poll(50);
|
||||
if ( rwi != null ) {
|
||||
break timeoutloop;
|
||||
if (rwi != null) break timeoutloop;
|
||||
}
|
||||
}
|
||||
}
|
||||
if ( rwi == null ) {
|
||||
//Log.logWarning("RWIProcess", "terminated takeRWI with rwi == null");
|
||||
break;
|
||||
}
|
||||
if ( !skipDoubleDom ) {
|
||||
//System.out.println("!skipDoubleDom");
|
||||
return rwi;
|
||||
if (page != null) return page.getElement();
|
||||
if (rwi == null) break;
|
||||
if (!skipDoubleDom) {
|
||||
return this.query.getSegment().fulltext().getMetadata(rwi.getElement(), rwi.getWeight());
|
||||
}
|
||||
|
||||
// check doubledom
|
||||
|
@ -550,7 +647,7 @@ public final class RWIProcess extends Thread
|
|||
// first appearance of dom. we create an entry to signal that one of that domain was already returned
|
||||
m = new WeakPriorityBlockingQueue<WordReferenceVars>(this.query.snippetCacheStrategy == null || this.query.snippetCacheStrategy == CacheStrategy.CACHEONLY ? max_results_preparation_special : max_results_preparation, false);
|
||||
this.doubleDomCache.put(hosthash, m);
|
||||
return rwi;
|
||||
return this.query.getSegment().fulltext().getMetadata(rwi.getElement(), rwi.getWeight());
|
||||
}
|
||||
// second appearances of dom
|
||||
m.put(rwi);
|
||||
|
@ -577,27 +674,17 @@ public final class RWIProcess extends Thread
|
|||
Log.logException(e);
|
||||
continue; // not the best solution...
|
||||
}
|
||||
if ( m == null ) {
|
||||
continue;
|
||||
}
|
||||
if ( m.isEmpty() ) {
|
||||
continue;
|
||||
}
|
||||
if ( bestEntry == null ) {
|
||||
if (m == null) continue;
|
||||
if (m.isEmpty()) continue;
|
||||
if (bestEntry == null) {
|
||||
bestEntry = m.peek();
|
||||
continue;
|
||||
}
|
||||
o = m.peek();
|
||||
if ( o == null ) {
|
||||
continue;
|
||||
}
|
||||
if ( o.getWeight() < bestEntry.getWeight() ) {
|
||||
bestEntry = o;
|
||||
}
|
||||
}
|
||||
if ( bestEntry == null ) {
|
||||
return null;
|
||||
if (o == null) continue;
|
||||
if (o.getWeight() < bestEntry.getWeight()) bestEntry = o;
|
||||
}
|
||||
if (bestEntry == null) return null;
|
||||
|
||||
// finally remove the best entry from the doubledom cache
|
||||
m = this.doubleDomCache.get(bestEntry.getElement().hosthash());
|
||||
|
@ -611,7 +698,8 @@ public final class RWIProcess extends Thread
|
|||
}
|
||||
}
|
||||
}
|
||||
return bestEntry;
|
||||
if (bestEntry == null) return null;
|
||||
return this.query.getSegment().fulltext().getMetadata(bestEntry.getElement(), bestEntry.getWeight());
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -631,18 +719,8 @@ public final class RWIProcess extends Thread
|
|||
long timeleft;
|
||||
while ( (timeleft = timeout - System.currentTimeMillis()) > 0 ) {
|
||||
//System.out.println("timeleft = " + timeleft);
|
||||
final WeakPriorityBlockingQueue.Element<WordReferenceVars> obrwi = takeRWI(skipDoubleDom, timeleft);
|
||||
if ( obrwi == null ) {
|
||||
return null; // all time was already wasted in takeRWI to get another element
|
||||
}
|
||||
final URIMetadataNode page = this.query.getSegment().fulltext().getMetadata(obrwi.getElement(), obrwi.getWeight());
|
||||
if ( page == null ) {
|
||||
try {
|
||||
this.misses.putUnique(obrwi.getElement().urlhash());
|
||||
} catch ( final SpaceExceededException e ) {
|
||||
}
|
||||
continue;
|
||||
}
|
||||
final URIMetadataNode page = takeRWI(skipDoubleDom, timeleft);
|
||||
if (page == null) return null; // all time was already wasted in takeRWI to get another element
|
||||
|
||||
if ( !this.query.urlMask_isCatchall ) {
|
||||
// check url mask
|
||||
|
@ -650,13 +728,6 @@ public final class RWIProcess extends Thread
|
|||
this.sortout++;
|
||||
continue;
|
||||
}
|
||||
|
||||
// in case that we do not have a catchall filter for urls
|
||||
// we must also construct the domain navigator here
|
||||
//if (query.sitehash == null) {
|
||||
// this.hostNavigator.inc(UTF8.String(urlhash, 6, 6));
|
||||
// this.hostResolver.put(UTF8.String(urlhash, 6, 6), UTF8.String(urlhash));
|
||||
//}
|
||||
}
|
||||
|
||||
// check for more errors
|
||||
|
@ -666,16 +737,14 @@ public final class RWIProcess extends Thread
|
|||
}
|
||||
|
||||
// check content domain
|
||||
if ((this.query.contentdom.getCode() > 0 &&
|
||||
page.url().getContentDomain() != this.query.contentdom) ||
|
||||
(this.query.contentdom == Classification.ContentDomain.TEXT &&
|
||||
page.url().getContentDomain().getCode() > 0)) {
|
||||
if ((this.query.contentdom.getCode() > 0 && page.url().getContentDomain() != this.query.contentdom) ||
|
||||
(this.query.contentdom == Classification.ContentDomain.TEXT && page.url().getContentDomain().getCode() > 0)) {
|
||||
this.sortout++;
|
||||
continue;
|
||||
}
|
||||
|
||||
// Check for blacklist
|
||||
if ( Switchboard.urlBlacklist.isListed(BlacklistType.SEARCH, page) ) {
|
||||
if (Switchboard.urlBlacklist.isListed(BlacklistType.SEARCH, page)) {
|
||||
this.sortout++;
|
||||
continue;
|
||||
}
|
||||
|
@ -691,10 +760,8 @@ public final class RWIProcess extends Thread
|
|||
.equals("")) {
|
||||
|
||||
FilterEngine f = ContentControlFilterUpdateThread.getNetworkFilter();
|
||||
|
||||
if (f != null) {
|
||||
if (!f.isListed(page.url(), null)) {
|
||||
|
||||
this.sortout++;
|
||||
continue;
|
||||
}
|
||||
|
|
|
@ -517,7 +517,7 @@ public class SnippetProcess {
|
|||
|
||||
// place the result to the result vector
|
||||
// apply post-ranking
|
||||
long ranking = Long.valueOf(SnippetProcess.this.rankingProcess.getOrder().cardinal(resultEntry.word()));
|
||||
long ranking = resultEntry.word() == null ? 0 : Long.valueOf(SnippetProcess.this.rankingProcess.getOrder().cardinal(resultEntry.word()));
|
||||
ranking += postRanking(resultEntry, SnippetProcess.this.rankingProcess.getTopicNavigator(10));
|
||||
resultEntry.ranking = ranking;
|
||||
SnippetProcess.this.result.put(new ReverseElement<ResultEntry>(resultEntry, ranking)); // remove smallest in case of overflow
|
||||
|
|
|
@ -37,6 +37,7 @@ import net.yacy.cora.sorting.ConcurrentScoreMap;
|
|||
import net.yacy.document.Condenser;
|
||||
import net.yacy.document.LargeNumberCache;
|
||||
import net.yacy.kelondro.data.meta.DigestURI;
|
||||
import net.yacy.kelondro.data.meta.URIMetadataNode;
|
||||
import net.yacy.kelondro.data.word.WordReference;
|
||||
import net.yacy.kelondro.data.word.WordReferenceRow;
|
||||
import net.yacy.kelondro.data.word.WordReferenceVars;
|
||||
|
@ -262,4 +263,36 @@ public class ReferenceOrder {
|
|||
return r; // the higher the number the better the ranking.
|
||||
}
|
||||
|
||||
public long cardinal(final URIMetadataNode t) {
|
||||
//return Long.MAX_VALUE - preRanking(ranking, iEntry, this.entryMin, this.entryMax, this.searchWords);
|
||||
// the normalizedEntry must be a normalized indexEntry
|
||||
final Bitfield flags = t.flags();
|
||||
assert t != null;
|
||||
assert this.ranking != null;
|
||||
final long r =
|
||||
((256 - DigestURI.domLengthNormalized(t.hash())) << this.ranking.coeff_domlength)
|
||||
+ ((this.ranking.coeff_ybr > 12) ? ((256 - (BlockRank.ranking(t.hash()) << 4)) << this.ranking.coeff_ybr) : 0)
|
||||
+ ((256 - (t.urllength() << 8)) << this.ranking.coeff_urllength)
|
||||
+ (t.virtualAge() << this.ranking.coeff_date)
|
||||
+ (t.wordsintitle()<< this.ranking.coeff_wordsintitle)
|
||||
+ (t.wordCount() << this.ranking.coeff_wordsintext)
|
||||
+ (t.llocal() << this.ranking.coeff_llocal)
|
||||
+ (t.lother() << this.ranking.coeff_lother)
|
||||
+ ((this.ranking.coeff_authority > 12) ? (authority(t.hosthash()) << this.ranking.coeff_authority) : 0)
|
||||
+ ((flags.get(WordReferenceRow.flag_app_dc_identifier)) ? 255 << this.ranking.coeff_appurl : 0)
|
||||
+ ((flags.get(WordReferenceRow.flag_app_dc_title)) ? 255 << this.ranking.coeff_app_dc_title : 0)
|
||||
+ ((flags.get(WordReferenceRow.flag_app_dc_creator)) ? 255 << this.ranking.coeff_app_dc_creator : 0)
|
||||
+ ((flags.get(WordReferenceRow.flag_app_dc_subject)) ? 255 << this.ranking.coeff_app_dc_subject : 0)
|
||||
+ ((flags.get(WordReferenceRow.flag_app_dc_description)) ? 255 << this.ranking.coeff_app_dc_description : 0)
|
||||
+ ((flags.get(WordReferenceRow.flag_app_emphasized)) ? 255 << this.ranking.coeff_appemph : 0)
|
||||
+ ((flags.get(Condenser.flag_cat_indexof)) ? 255 << this.ranking.coeff_catindexof : 0)
|
||||
+ ((flags.get(Condenser.flag_cat_hasimage)) ? 255 << this.ranking.coeff_cathasimage : 0)
|
||||
+ ((flags.get(Condenser.flag_cat_hasaudio)) ? 255 << this.ranking.coeff_cathasaudio : 0)
|
||||
+ ((flags.get(Condenser.flag_cat_hasvideo)) ? 255 << this.ranking.coeff_cathasvideo : 0)
|
||||
+ ((flags.get(Condenser.flag_cat_hasapp)) ? 255 << this.ranking.coeff_cathasapp : 0)
|
||||
+ ((ByteBuffer.equals(t.language(), this.language)) ? 255 << this.ranking.coeff_language : 0)
|
||||
+ ((DigestURI.probablyRootURL(t.hash())) ? 15 << this.ranking.coeff_urllength : 0);
|
||||
return r; // the higher the number the better the ranking.
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -192,6 +192,7 @@ public class ResultEntry implements Comparable<ResultEntry>, Comparator<ResultEn
|
|||
}
|
||||
public WordReference word() {
|
||||
final Reference word = this.urlentry.word();
|
||||
if (word == null) return null;
|
||||
if (word instanceof WordReferenceVars) return (WordReferenceVars) word;
|
||||
if (word instanceof WordReferenceRow) return (WordReferenceRow) word;
|
||||
assert word instanceof WordReferenceRow || word instanceof WordReferenceVars : word == null ? "word = null" : "type = " + word.getClass().getCanonicalName();
|
||||
|
|
Loading…
Reference in New Issue
Block a user