mirror of
https://github.com/yacy/yacy_search_server.git
synced 2024-09-21 00:00:13 +02:00
97e84439fb
- Since the specific Twitter & Blekko heuristics are no longer available or are redundant with OpenSearchHeuristic, adjusted ConfigHeuristic to use the OpenSearchHeuristic settings only. For this, the default OSD search target list is made available (copied) by default and the other configs are removed. - The return value of QueryGoal.getOriginalQueryString included the query modifiers, which are held separately in a modifier object, but in most (all) cases just the query term is expected; clarified this and renamed it to QueryGoal.getQueryString, which returns just the search term (if needed, a .getOriginalQueryString could be implemented in QueryParams, adding the modifiers). - Started to adjust internal HTML href references from absolute to relative (currently it is mixed). For future development we should prefer relative href targets (less trouble with context-aware servlets).
1660 lines
83 KiB
Java
1660 lines
83 KiB
Java
// SearchEvent.java
|
|
// (C) 2005 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany
|
|
// first published 10.10.2005 on http://yacy.net
|
|
//
|
|
// This is a part of YaCy, a peer-to-peer based web search engine
|
|
//
|
|
// $LastChangedDate$
|
|
// $LastChangedRevision$
|
|
// $LastChangedBy$
|
|
//
|
|
// LICENSE
|
|
//
|
|
// This program is free software; you can redistribute it and/or modify
|
|
// it under the terms of the GNU General Public License as published by
|
|
// the Free Software Foundation; either version 2 of the License, or
|
|
// (at your option) any later version.
|
|
//
|
|
// This program is distributed in the hope that it will be useful,
|
|
// but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
// GNU General Public License for more details.
|
|
//
|
|
// You should have received a copy of the GNU General Public License
|
|
// along with this program; if not, write to the Free Software
|
|
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
|
|
|
package net.yacy.search.query;
|
|
|
|
import java.net.MalformedURLException;
|
|
import java.util.ArrayList;
|
|
import java.util.Collection;
|
|
import java.util.ConcurrentModificationException;
|
|
import java.util.HashMap;
|
|
import java.util.Iterator;
|
|
import java.util.LinkedHashMap;
|
|
import java.util.List;
|
|
import java.util.Map;
|
|
import java.util.SortedMap;
|
|
import java.util.TreeMap;
|
|
import java.util.concurrent.BlockingQueue;
|
|
import java.util.concurrent.ConcurrentHashMap;
|
|
import java.util.concurrent.TimeUnit;
|
|
import java.util.concurrent.atomic.AtomicInteger;
|
|
import java.util.regex.Pattern;
|
|
|
|
import net.yacy.contentcontrol.ContentControlFilterUpdateThread;
|
|
import net.yacy.cora.document.analysis.Classification;
|
|
import net.yacy.cora.document.analysis.Classification.ContentDomain;
|
|
import net.yacy.cora.document.encoding.ASCII;
|
|
import net.yacy.cora.document.encoding.UTF8;
|
|
import net.yacy.cora.document.id.DigestURL;
|
|
import net.yacy.cora.document.id.MultiProtocolURL;
|
|
import net.yacy.cora.federate.yacy.CacheStrategy;
|
|
import net.yacy.cora.federate.yacy.Distribution;
|
|
import net.yacy.cora.lod.vocabulary.Tagging;
|
|
import net.yacy.cora.order.Base64Order;
|
|
import net.yacy.cora.protocol.Scanner;
|
|
import net.yacy.cora.sorting.ConcurrentScoreMap;
|
|
import net.yacy.cora.sorting.ReversibleScoreMap;
|
|
import net.yacy.cora.sorting.ScoreMap;
|
|
import net.yacy.cora.sorting.WeakPriorityBlockingQueue;
|
|
import net.yacy.cora.sorting.WeakPriorityBlockingQueue.Element;
|
|
import net.yacy.cora.sorting.WeakPriorityBlockingQueue.ReverseElement;
|
|
import net.yacy.cora.storage.HandleSet;
|
|
import net.yacy.cora.util.ConcurrentLog;
|
|
import net.yacy.cora.util.SpaceExceededException;
|
|
import net.yacy.data.WorkTables;
|
|
import net.yacy.document.Condenser;
|
|
import net.yacy.document.LargeNumberCache;
|
|
import net.yacy.document.LibraryProvider;
|
|
import net.yacy.document.TextParser;
|
|
import net.yacy.kelondro.data.meta.URIMetadataNode;
|
|
import net.yacy.kelondro.data.meta.URIMetadataRow;
|
|
import net.yacy.kelondro.data.word.WordReference;
|
|
import net.yacy.kelondro.data.word.WordReferenceFactory;
|
|
import net.yacy.kelondro.data.word.WordReferenceVars;
|
|
import net.yacy.kelondro.index.RowHandleSet;
|
|
import net.yacy.kelondro.rwi.ReferenceContainer;
|
|
import net.yacy.kelondro.rwi.TermSearch;
|
|
import net.yacy.kelondro.util.Bitfield;
|
|
import net.yacy.kelondro.util.MemoryControl;
|
|
import net.yacy.kelondro.util.SetTools;
|
|
import net.yacy.peers.RemoteSearch;
|
|
import net.yacy.peers.SeedDB;
|
|
import net.yacy.peers.graphics.ProfilingGraph;
|
|
import net.yacy.repository.FilterEngine;
|
|
import net.yacy.repository.LoaderDispatcher;
|
|
import net.yacy.repository.Blacklist.BlacklistType;
|
|
import net.yacy.search.EventTracker;
|
|
import net.yacy.search.Switchboard;
|
|
import net.yacy.search.SwitchboardConstants;
|
|
import net.yacy.search.index.Segment;
|
|
import net.yacy.search.ranking.ReferenceOrder;
|
|
import net.yacy.search.schema.CollectionSchema;
|
|
import net.yacy.search.snippet.ResultEntry;
|
|
import net.yacy.search.snippet.TextSnippet;
|
|
import net.yacy.search.snippet.TextSnippet.ResultClass;
|
|
|
|
import org.apache.solr.common.SolrDocument;
|
|
|
|
public final class SearchEvent {
|
|
|
|
private static final int max_results_rwi = 3000;
|
|
private static final int max_results_node = 150;
|
|
|
|
/*
|
|
private static long noRobinsonLocalRWISearch = 0;
|
|
static {
|
|
try {
|
|
noRobinsonLocalRWISearch = GenericFormatter.FORMAT_SHORT_DAY.parse("20121107").getTime();
|
|
} catch (final ParseException e) {
|
|
}
|
|
}
|
|
*/
|
|
|
|
public final static ConcurrentLog log = new ConcurrentLog("SEARCH");
|
|
|
|
public static final int SNIPPET_MAX_LENGTH = 220;
|
|
private static final int MAX_TOPWORDS = 12; // default count of words for the topic navigator
|
|
|
|
private long eventTime;
|
|
public QueryParams query;
|
|
public final SeedDB peers;
|
|
final WorkTables workTables;
|
|
public final SecondarySearchSuperviser secondarySearchSuperviser;
|
|
public final List<RemoteSearch> primarySearchThreadsL;
|
|
public final List<Thread> nodeSearchThreads;
|
|
public Thread[] secondarySearchThreads;
|
|
public final SortedMap<byte[], String> preselectedPeerHashes;
|
|
private final Thread localSearchThread;
|
|
private final SortedMap<byte[], Integer> IACount;
|
|
private final SortedMap<byte[], String> IAResults;
|
|
private final SortedMap<byte[], HeuristicResult> heuristics;
|
|
private byte[] IAmaxcounthash, IAneardhthash;
|
|
public Thread rwiProcess;
|
|
private Thread localsolrsearch;
|
|
private int localsolroffset;
|
|
private final AtomicInteger expectedRemoteReferences, maxExpectedRemoteReferences; // counter for referenced that had been sorted out for other reasons
|
|
public final ScoreMap<String> locationNavigator; // a counter for the appearance of location coordinates
|
|
public final ScoreMap<String> hostNavigator; // a counter for the appearance of host names
|
|
public final ScoreMap<String> authorNavigator; // a counter for the appearances of authors
|
|
public final ScoreMap<String> namespaceNavigator; // a counter for name spaces
|
|
public final ScoreMap<String> protocolNavigator; // a counter for protocol types
|
|
public final ScoreMap<String> filetypeNavigator; // a counter for file types
|
|
public final Map<String, ScoreMap<String>> vocabularyNavigator; // counters for Vocabularies; key is metatag.getVocabularyName()
|
|
private final int topicNavigatorCount; // if 0 no topicNavigator, holds expected number of terms for the topicNavigator
|
|
private final LoaderDispatcher loader;
|
|
private final HandleSet snippetFetchWordHashes; // a set of word hashes that are used to match with the snippets
|
|
private final boolean deleteIfSnippetFail;
|
|
private long urlRetrievalAllTime;
|
|
private long snippetComputationAllTime;
|
|
private ConcurrentHashMap<String, String> snippets;
|
|
private final boolean remote;
|
|
private SortedMap<byte[], ReferenceContainer<WordReference>> localSearchInclusion;
|
|
private final ScoreMap<String> ref; // reference score computation for the commonSense heuristic
|
|
private final long maxtime;
|
|
private final ConcurrentHashMap<String, WeakPriorityBlockingQueue<WordReferenceVars>> doubleDomCache; // key = domhash (6 bytes); value = like stack
|
|
private final int[] flagcount; // flag counter
|
|
private final AtomicInteger feedersAlive, feedersTerminated, snippetFetchAlive;
|
|
private boolean addRunning;
|
|
private final AtomicInteger receivedRemoteReferences;
|
|
private final ReferenceOrder order;
|
|
private final HandleSet urlhashes; // map for double-check; String/Long relation, addresses ranking number (backreference for deletion)
|
|
private final Map<String, String> taggingPredicates; // a map from tagging vocabulary names to tagging predicate uris
|
|
private final WeakPriorityBlockingQueue<WordReferenceVars> rwiStack; // thats the bag where the RWI search process writes to
|
|
private final WeakPriorityBlockingQueue<URIMetadataNode> nodeStack; // thats the bag where the solr results are written to
|
|
private final WeakPriorityBlockingQueue<ResultEntry> resultList; // thats the result list where the actual search result is waiting to be displayed
|
|
private final boolean pollImmediately; // if this is true, then every entry in the result list is polled immediately to prevent a re-ranking in the resultList. This is useful if there is only one index source.
|
|
public final boolean excludeintext_image;
|
|
|
|
// the following values are filled during the search process as statistics for the search
|
|
public final AtomicInteger local_rwi_available; // the number of hits generated/ranked by the local search in rwi index
|
|
public final AtomicInteger local_rwi_stored; // the number of existing hits by the local search in rwi index
|
|
public final AtomicInteger remote_rwi_available; // the number of hits imported from remote peers (rwi/solr mixed)
|
|
public final AtomicInteger remote_rwi_stored; // the number of existing hits at remote site
|
|
public final AtomicInteger remote_rwi_peerCount; // the number of peers which contributed to the remote search result
|
|
public final AtomicInteger local_solr_available; // the number of hits generated/ranked by the local search in solr
|
|
public final AtomicInteger local_solr_stored; // the number of existing hits by the local search in solr
|
|
public final AtomicInteger remote_solr_available;// the number of hits imported from remote peers (rwi/solr mixed)
|
|
public final AtomicInteger remote_solr_stored; // the number of existing hits at remote site
|
|
public final AtomicInteger remote_solr_peerCount;// the number of peers which contributed to the remote search result
|
|
|
|
public int getResultCount() {
|
|
return this.local_rwi_available.get() + this.remote_rwi_available.get() +
|
|
this.remote_solr_available.get() + this.local_solr_stored.get();
|
|
}
|
|
|
|
/**
 * Creates a search event and starts the search processes that feed it.
 * <p>
 * The constructor first trims the {@link SearchEventCache}, initializes all
 * counters, navigators and result stacks, then starts a local Solr search and
 * (if the segment has a connected RWI index) a local RWI search. If remote
 * search is possible, the primary remote searches are started concurrently;
 * otherwise the constructor waits for the local RWI process, either completely
 * (to compute index abstracts) or for at most 100 milliseconds. Finally the
 * event registers itself in the {@link SearchEventCache}.
 *
 * @param query the query to be processed by this event
 * @param peers the seed database; remote search is only attempted when this is
 *        not null and has connected peers
 * @param workTables the work tables, stored for later use
 * @param preselectedPeerHashes peers handed to the primary remote search unless
 *        the search domain is GLOBAL
 * @param generateAbstracts if true and no remote search is done, wait for the
 *        local RWI search to finish and compute the index abstracts
 * @param loader the loader dispatcher, stored for later use
 * @param remote_maxcount count limit handed to the primary remote searches
 * @param remote_maxtime time limit handed to the primary remote searches
 * @param deleteIfSnippetFail stored flag for the snippet processing
 */
protected SearchEvent(
    final QueryParams query,
    final SeedDB peers,
    final WorkTables workTables,
    final SortedMap<byte[], String> preselectedPeerHashes,
    final boolean generateAbstracts,
    final LoaderDispatcher loader,
    final int remote_maxcount,
    final long remote_maxtime,
    final boolean deleteIfSnippetFail) {

    // if memory is short, clean up the event cache first
    long ab = MemoryControl.available();
    if (ab < 1024 * 1024 * 200) {
        final int eb = SearchEventCache.size();
        SearchEventCache.cleanupEvents(false);
        final int en = SearchEventCache.size();
        if (en < eb) {
            // freed memory = available after cleanup minus available before cleanup
            log.info("Cleaned up search event cache (1) " + eb + "->" + en + ", " + (MemoryControl.available() - ab) / 1024 / 1024 + " MB freed");
        }
    }
    // limit the number of cached events depending on the available memory
    ab = MemoryControl.available();
    final int eb = SearchEventCache.size();
    SearchEventCache.cleanupEvents(Math.max(1, (int) (MemoryControl.available() / (1024 * 1024 * 120))));
    final int en = SearchEventCache.size();
    if (en < eb) {
        log.info("Cleaned up search event cache (2) " + eb + "->" + en + ", " + (MemoryControl.available() - ab) / 1024 / 1024 + " MB freed");
    }

    this.eventTime = System.currentTimeMillis(); // for lifetime check
    this.peers = peers;
    this.workTables = workTables;
    this.query = query;
    this.loader = loader;
    this.nodeStack = new WeakPriorityBlockingQueue<URIMetadataNode>(max_results_node, false);
    this.maxExpectedRemoteReferences = new AtomicInteger(0);
    this.expectedRemoteReferences = new AtomicInteger(0);
    this.excludeintext_image = Switchboard.getSwitchboard().getConfigBool("search.excludeintext.image", true);

    // prepare configured search navigation; a navigator that is not configured stays null
    final String navcfg = Switchboard.getSwitchboard().getConfig("search.navigation", "");
    this.locationNavigator = navcfg.contains("location") ? new ConcurrentScoreMap<String>() : null;
    this.authorNavigator = navcfg.contains("authors") ? new ConcurrentScoreMap<String>() : null;
    this.namespaceNavigator = navcfg.contains("namespace") ? new ConcurrentScoreMap<String>() : null;
    this.hostNavigator = navcfg.contains("hosts") ? new ConcurrentScoreMap<String>() : null;
    this.protocolNavigator = navcfg.contains("protocol") ? new ConcurrentScoreMap<String>() : null;
    this.filetypeNavigator = navcfg.contains("filetype") ? new ConcurrentScoreMap<String>() : null;
    this.topicNavigatorCount = navcfg.contains("topics") ? MAX_TOPWORDS : 0;
    this.vocabularyNavigator = new ConcurrentHashMap<String, ScoreMap<String>>();
    this.snippets = new ConcurrentHashMap<String, String>();

    // generate abstracts only for combined searches (more than one include hash)
    this.secondarySearchSuperviser = (this.query.getQueryGoal().getIncludeHashes().size() > 1) ? new SecondarySearchSuperviser(this) : null;
    if (this.secondarySearchSuperviser != null) this.secondarySearchSuperviser.start();
    this.secondarySearchThreads = null;
    this.preselectedPeerHashes = preselectedPeerHashes;
    this.IAResults = new TreeMap<byte[], String>(Base64Order.enhancedCoder);
    this.IACount = new TreeMap<byte[], Integer>(Base64Order.enhancedCoder);
    this.heuristics = new TreeMap<byte[], HeuristicResult>(Base64Order.enhancedCoder);
    this.IAmaxcounthash = null;
    this.IAneardhthash = null;
    this.localSearchThread = null;
    this.remote = (peers != null && peers.sizeConnected() > 0) && (this.query.domType == QueryParams.Searchdom.CLUSTER || (this.query.domType == QueryParams.Searchdom.GLOBAL && Switchboard.getSwitchboard().getConfigBool(SwitchboardConstants.INDEX_RECEIVE_ALLOW_SEARCH, false)));
    this.local_rwi_available = new AtomicInteger(0); // the number of results in the local peer after filtering
    this.local_rwi_stored = new AtomicInteger(0);
    this.local_solr_available = new AtomicInteger(0);
    this.local_solr_stored = new AtomicInteger(0);
    this.remote_rwi_stored = new AtomicInteger(0);
    this.remote_rwi_available = new AtomicInteger(0); // the number of result contributions from all the remote dht peers
    this.remote_rwi_peerCount = new AtomicInteger(0); // the number of remote dht peers that have contributed
    this.remote_solr_stored = new AtomicInteger(0);
    this.remote_solr_available = new AtomicInteger(0); // the number of result contributions from all the remote solr peers
    this.remote_solr_peerCount = new AtomicInteger(0); // the number of remote solr peers that have contributed
    final long start = System.currentTimeMillis();

    // do a soft commit for fresh results
    //query.getSegment().fulltext().commit(true);

    // we collect the urlhashes and construct a list with urlEntry objects
    // attention: if minEntries is too high, this method will not terminate within the maxTime
    // sortorder: 0 = hash, 1 = url, 2 = ranking
    this.localSearchInclusion = null;
    this.ref = new ConcurrentScoreMap<String>();
    this.maxtime = query.maxtime;
    this.rwiStack = new WeakPriorityBlockingQueue<WordReferenceVars>(max_results_rwi, false);
    this.doubleDomCache = new ConcurrentHashMap<String, WeakPriorityBlockingQueue<WordReferenceVars>>();
    this.flagcount = new int[32]; // a new int array is already filled with zeros
    this.feedersAlive = new AtomicInteger(0);
    this.feedersTerminated = new AtomicInteger(0);
    this.snippetFetchAlive = new AtomicInteger(0);
    this.addRunning = true;
    this.receivedRemoteReferences = new AtomicInteger(0);
    this.order = new ReferenceOrder(this.query.ranking, UTF8.getBytes(this.query.targetlang));
    this.urlhashes = new RowHandleSet(URIMetadataRow.rowdef.primaryKeyLength, URIMetadataRow.rowdef.objectOrder, 100);
    this.taggingPredicates = new HashMap<String, String>();
    for (final Tagging t : LibraryProvider.autotagging.getVocabularies()) {
        this.taggingPredicates.put(t.getName(), t.getPredicate());
    }

    // start a local solr search (unless switched off for debugging)
    if (!Switchboard.getSwitchboard().getConfigBool(SwitchboardConstants.DEBUG_SEARCH_LOCAL_SOLR_OFF, false)) {
        this.localsolrsearch = RemoteSearch.solrRemoteSearch(this, this.query.solrQuery(this.query.contentdom, true, this.excludeintext_image), 0, this.query.itemsPerPage, null /*this peer*/, 0, Switchboard.urlBlacklist);
    }
    this.localsolroffset = this.query.itemsPerPage;

    // start a local RWI search concurrently, but only if the segment has a
    // connected RWI index and the local DHT search is not switched off for debugging
    this.rwiProcess = null;
    if (query.getSegment().connectedRWI() && !Switchboard.getSwitchboard().getConfigBool(SwitchboardConstants.DEBUG_SEARCH_LOCAL_DHT_OFF, false)) {
        this.rwiProcess = new RWIProcess();
        this.rwiProcess.start();
    }

    if (this.remote) {
        // start global searches
        this.pollImmediately = false;
        final long timer = System.currentTimeMillis();
        if (this.query.getQueryGoal().getIncludeHashes().isEmpty()) {
            this.primarySearchThreadsL = null;
            this.nodeSearchThreads = null;
            // no search since query is empty, user might have entered no data or filters have removed all search words
            ConcurrentLog.fine("SEARCH_EVENT", "NO SEARCH STARTED DUE TO EMPTY SEARCH REQUEST.");
        } else {
            this.primarySearchThreadsL = new ArrayList<RemoteSearch>();
            this.nodeSearchThreads = new ArrayList<Thread>();
            // start this concurrently because the remote search needs an enumeration
            // of the remote peers which may block in some cases when i.e. DHT is active
            // at the same time.
            new Thread() {
                @Override
                public void run() {
                    Thread.currentThread().setName("SearchEvent.primaryRemoteSearches");
                    RemoteSearch.primaryRemoteSearches(
                        SearchEvent.this,
                        0, remote_maxcount,
                        remote_maxtime,
                        Switchboard.urlBlacklist,
                        (SearchEvent.this.query.domType == QueryParams.Searchdom.GLOBAL) ? null : preselectedPeerHashes);
                }
            }.start();
            ConcurrentLog.fine("SEARCH_EVENT", "STARTING "
                + this.primarySearchThreadsL.size()
                + " THREADS TO CATCH EACH "
                + remote_maxcount
                + " URLs");
            EventTracker.update(EventTracker.EClass.SEARCH, new ProfilingGraph.EventSearch(this.query.id(true), SearchEventType.REMOTESEARCH_START, "", this.primarySearchThreadsL.size(), System.currentTimeMillis() - timer), false);
            // finished searching
            ConcurrentLog.fine("SEARCH_EVENT", "SEARCH TIME AFTER GLOBAL-TRIGGER TO " + this.primarySearchThreadsL.size() + " PEERS: " + ((System.currentTimeMillis() - start) / 1000) + " seconds");
        }
    } else {
        this.primarySearchThreadsL = null;
        this.nodeSearchThreads = null;
        this.pollImmediately = !query.getSegment().connectedRWI() || !Switchboard.getSwitchboard().getConfigBool(SwitchboardConstants.INDEX_RECEIVE_ALLOW_SEARCH, false);
        if (generateAbstracts) {
            // we need the results now
            try {
                if (this.rwiProcess != null && query.getSegment().connectedRWI()) this.rwiProcess.join();
            } catch (final InterruptedException e) {
                // do not swallow the interruption: keep the flag set for the caller
                Thread.currentThread().interrupt();
            }
            // compute index abstracts
            final long timer = System.currentTimeMillis();
            int maxcount = -1;
            long mindhtdistance = Long.MAX_VALUE, l;
            byte[] wordhash;
            assert !query.getSegment().connectedRWI() || this.searchContainerMap() != null;
            if (this.searchContainerMap() != null) {
                for (final Map.Entry<byte[], ReferenceContainer<WordReference>> entry : this.searchContainerMap().entrySet()) {
                    wordhash = entry.getKey();
                    final ReferenceContainer<WordReference> container = entry.getValue();
                    assert (Base64Order.enhancedCoder.equal(container.getTermHash(), wordhash)) : "container.getTermHash() = " + ASCII.String(container.getTermHash()) + ", wordhash = " + ASCII.String(wordhash);
                    if (container.size() > maxcount) {
                        // remember the word hash with the largest container
                        this.IAmaxcounthash = wordhash;
                        maxcount = container.size();
                    }
                    l = Distribution.horizontalDHTDistance(wordhash, ASCII.getBytes(peers.mySeed().hash));
                    if (l < mindhtdistance) {
                        // calculate the word hash that is closest to our dht position
                        mindhtdistance = l;
                        this.IAneardhthash = wordhash;
                    }
                    this.IACount.put(wordhash, LargeNumberCache.valueOf(container.size()));
                    this.IAResults.put(wordhash, WordReferenceFactory.compressIndex(container, null, 1000).toString());
                }
            }
            EventTracker.update(EventTracker.EClass.SEARCH, new ProfilingGraph.EventSearch(this.query.id(true), SearchEventType.ABSTRACTS, "", this.searchContainerMap() == null ? 0 : this.searchContainerMap().size(), System.currentTimeMillis() - timer), false);
        } else {
            // give process time to accumulate a certain amount of data
            // before a reading process wants to get results from it;
            // this limits the waiting time until results are available to 100 milliseconds
            try {
                if (this.rwiProcess != null && query.getSegment().connectedRWI()) this.rwiProcess.join(100);
            } catch (final InterruptedException e) {
                // do not swallow the interruption: keep the flag set for the caller
                Thread.currentThread().interrupt();
            }
        }
    }

    // prepare the data structures for fetching urls and snippets
    this.deleteIfSnippetFail = deleteIfSnippetFail;
    this.urlRetrievalAllTime = 0;
    this.snippetComputationAllTime = 0;
    this.resultList = new WeakPriorityBlockingQueue<ResultEntry>(Math.max(max_results_node, 10 * query.itemsPerPage()), true); // this is the result, enriched with snippets, ranked and ordered by ranking

    // snippets do not need to match with the complete query hashes,
    // only with the query minus the stopwords which had not been used for the search
    boolean filtered = false;
    // check if query contains stopword
    if (Switchboard.stopwordHashes != null) {
        final Iterator<byte[]> it = query.getQueryGoal().getIncludeHashes().iterator();
        while (it.hasNext()) {
            if (Switchboard.stopwordHashes.contains((it.next()))) {
                filtered = true;
                break;
            }
        }
    }
    this.snippetFetchWordHashes = query.getQueryGoal().getIncludeHashes().clone();
    if (filtered) { // remove stopwords
        this.snippetFetchWordHashes.excludeDestructive(Switchboard.stopwordHashes);
    }

    // clean up events
    SearchEventCache.cleanupEvents(false);
    EventTracker.update(EventTracker.EClass.SEARCH, new ProfilingGraph.EventSearch(this.query.id(true), SearchEventType.CLEANUP, "", 0, 0), false);

    // store this search to a cache so it can be re-used
    if (MemoryControl.available() < 1024 * 1024 * 100) {
        SearchEventCache.cleanupEvents(false);
    }
    SearchEventCache.put(this.query.id(false), this);
}
|
|
|
|
private class RWIProcess extends Thread {
|
|
|
|
public RWIProcess() {
|
|
super();
|
|
}
|
|
|
|
@Override
|
|
public void run() {
|
|
|
|
if (query.getSegment().termIndex() == null) return; // nothing to do; this index is not used
|
|
|
|
// do a search
|
|
oneFeederStarted();
|
|
|
|
// sort the local containers and truncate it to a limited count,
|
|
// so following sortings together with the global results will be fast
|
|
try {
|
|
final long timer = System.currentTimeMillis();
|
|
final TermSearch<WordReference> search =
|
|
SearchEvent.this.query
|
|
.getSegment()
|
|
.termIndex()
|
|
.query(
|
|
SearchEvent.this.query.getQueryGoal().getIncludeHashes(),
|
|
SearchEvent.this.query.getQueryGoal().getExcludeHashes(),
|
|
null,
|
|
Segment.wordReferenceFactory,
|
|
SearchEvent.this.query.maxDistance);
|
|
SearchEvent.this.localSearchInclusion = search.inclusion();
|
|
final ReferenceContainer<WordReference> index = search.joined();
|
|
EventTracker.update(
|
|
EventTracker.EClass.SEARCH,
|
|
new ProfilingGraph.EventSearch(
|
|
SearchEvent.this.query.id(true),
|
|
SearchEventType.JOIN,
|
|
SearchEvent.this.query.getQueryGoal().getQueryString(false),
|
|
index.size(),
|
|
System.currentTimeMillis() - timer),
|
|
false);
|
|
if ( !index.isEmpty() ) {
|
|
addRWIs(index, true, "local index: " + SearchEvent.this.query.getSegment().getLocation(), index.size(), SearchEvent.this.maxtime);
|
|
SearchEvent.this.addFinalize();
|
|
}
|
|
} catch (final Exception e ) {
|
|
ConcurrentLog.logException(e);
|
|
} finally {
|
|
oneFeederTerminated();
|
|
}
|
|
}
|
|
}
|
|
|
|
public void addRWIs(
|
|
final ReferenceContainer<WordReference> index,
|
|
final boolean local,
|
|
final String resourceName,
|
|
final int fullResource,
|
|
final long maxtime) {
|
|
// we collect the urlhashes and construct a list with urlEntry objects
|
|
// attention: if minEntries is too high, this method will not terminate within the maxTime
|
|
//Log.logInfo("SearchEvent", "added a container, size = " + index.size());
|
|
|
|
this.addRunning = true;
|
|
assert (index != null);
|
|
if (index.isEmpty()) return;
|
|
if (local) {
|
|
assert fullResource >= 0 : "fullResource = " + fullResource;
|
|
this.local_rwi_stored.addAndGet(fullResource);
|
|
} else {
|
|
assert fullResource >= 0 : "fullResource = " + fullResource;
|
|
this.remote_rwi_stored.addAndGet(fullResource);
|
|
this.remote_rwi_peerCount.incrementAndGet();
|
|
}
|
|
long timer = System.currentTimeMillis();
|
|
|
|
// normalize entries
|
|
final BlockingQueue<WordReferenceVars> decodedEntries = this.order.normalizeWith(index, maxtime, local);
|
|
int is = index.size();
|
|
EventTracker.update(EventTracker.EClass.SEARCH, new ProfilingGraph.EventSearch(
|
|
this.query.id(true),
|
|
SearchEventType.NORMALIZING,
|
|
resourceName,
|
|
is,
|
|
System.currentTimeMillis() - timer), false);
|
|
if (!local) this.receivedRemoteReferences.addAndGet(is);
|
|
|
|
// iterate over normalized entries and select some that are better than currently stored
|
|
timer = System.currentTimeMillis();
|
|
|
|
// apply all constraints
|
|
long timeout = maxtime == Long.MAX_VALUE ? Long.MAX_VALUE : System.currentTimeMillis() + maxtime;
|
|
try {
|
|
WordReferenceVars iEntry;
|
|
long remaining;
|
|
pollloop: while ( true ) {
|
|
remaining = timeout - System.currentTimeMillis();
|
|
if (remaining <= 0) {
|
|
ConcurrentLog.warn("SearchEvent", "terminated 'add' loop before poll time-out = " + remaining + ", decodedEntries.size = " + decodedEntries.size());
|
|
break;
|
|
}
|
|
iEntry = decodedEntries.poll(remaining, TimeUnit.MILLISECONDS);
|
|
if (iEntry == null) {
|
|
ConcurrentLog.warn("SearchEvent", "terminated 'add' loop after poll time-out = " + remaining + ", decodedEntries.size = " + decodedEntries.size());
|
|
break pollloop;
|
|
}
|
|
if (iEntry == WordReferenceVars.poison) {
|
|
break pollloop;
|
|
}
|
|
assert (iEntry.urlhash().length == index.row().primaryKeyLength);
|
|
|
|
// doublecheck for urls
|
|
if (this.urlhashes.has(iEntry.urlhash())) {
|
|
if (log.isFine()) log.fine("dropped RWI: doublecheck");
|
|
continue pollloop;
|
|
}
|
|
|
|
// increase flag counts
|
|
Bitfield flags = iEntry.flags();
|
|
for (int j = 0; j < 32; j++) {
|
|
if (flags.get(j)) this.flagcount[j]++;
|
|
}
|
|
|
|
// check constraints
|
|
if (!this.testFlags(flags)) {
|
|
if (log.isFine()) log.fine("dropped RWI: flag test failed");
|
|
continue pollloop;
|
|
}
|
|
|
|
// check document domain
|
|
if (this.query.contentdom.getCode() > 0 &&
|
|
((this.query.contentdom == ContentDomain.AUDIO && !(flags.get(Condenser.flag_cat_hasaudio))) ||
|
|
(this.query.contentdom == ContentDomain.VIDEO && !(flags.get(Condenser.flag_cat_hasvideo))) ||
|
|
(this.query.contentdom == ContentDomain.IMAGE && !(flags.get(Condenser.flag_cat_hasimage))) ||
|
|
(this.query.contentdom == ContentDomain.APP && !(flags.get(Condenser.flag_cat_hasapp))))) {
|
|
if (log.isFine()) log.fine("dropped RWI: contentdom fail");
|
|
continue pollloop;
|
|
}
|
|
|
|
// count domZones
|
|
//this.domZones[DigestURI.domDomain(iEntry.metadataHash())]++;
|
|
|
|
// check site constraints
|
|
final String hosthash = iEntry.hosthash();
|
|
if ( this.query.modifier.sitehash == null ) {
|
|
if (this.query.siteexcludes != null && this.query.siteexcludes.contains(hosthash)) {
|
|
if (log.isFine()) log.fine("dropped RWI: siteexcludes");
|
|
continue pollloop;
|
|
}
|
|
} else {
|
|
// filter out all domains that do not match with the site constraint
|
|
if (!hosthash.equals(this.query.modifier.sitehash)) {
|
|
if (log.isFine()) log.fine("dropped RWI: modifier.sitehash");
|
|
continue pollloop;
|
|
}
|
|
}
|
|
|
|
// finally extend the double-check and insert result to stack
|
|
this.urlhashes.putUnique(iEntry.urlhash());
|
|
rankingtryloop: while (true) {
|
|
try {
|
|
this.rwiStack.put(new ReverseElement<WordReferenceVars>(iEntry, this.order.cardinal(iEntry))); // inserts the element and removes the worst (which is smallest)
|
|
break rankingtryloop;
|
|
} catch (final ArithmeticException e ) {
|
|
// this may happen if the concurrent normalizer changes values during cardinal computation
|
|
if (log.isFine()) log.fine("dropped RWI: arithmetic exception");
|
|
continue rankingtryloop;
|
|
}
|
|
}
|
|
// increase counter for statistics
|
|
if (local) this.local_rwi_available.incrementAndGet(); else this.remote_rwi_available.incrementAndGet();
|
|
}
|
|
if (System.currentTimeMillis() >= timeout) ConcurrentLog.warn("SearchEvent", "rwi normalization ended with timeout = " + maxtime);
|
|
|
|
} catch (final InterruptedException e ) {
|
|
} catch (final SpaceExceededException e ) {
|
|
}
|
|
|
|
//if ((query.neededResults() > 0) && (container.size() > query.neededResults())) remove(true, true);
|
|
EventTracker.update(EventTracker.EClass.SEARCH, new ProfilingGraph.EventSearch(
|
|
this.query.id(true),
|
|
SearchEventType.PRESORT,
|
|
resourceName,
|
|
index.size(),
|
|
System.currentTimeMillis() - timer), false);
|
|
}
|
|
|
|
public long getEventTime() {
|
|
return this.eventTime;
|
|
}
|
|
|
|
protected void resetEventTime() {
|
|
this.eventTime = System.currentTimeMillis();
|
|
}
|
|
|
|
protected void cleanup() {
|
|
|
|
// stop all threads
|
|
if (this.localsolrsearch != null) {
|
|
if (localsolrsearch.isAlive()) synchronized (this.localsolrsearch) {this.localsolrsearch.interrupt();}
|
|
}
|
|
if (this.nodeSearchThreads != null) {
|
|
for (final Thread search : this.nodeSearchThreads) {
|
|
if (search != null) {
|
|
synchronized (search) {if (search.isAlive()) {search.interrupt();}}
|
|
}
|
|
}
|
|
}
|
|
if (this.primarySearchThreadsL != null) {
|
|
for (final RemoteSearch search : this.primarySearchThreadsL) {
|
|
if (search != null) {
|
|
synchronized (search) {if (search.isAlive()) {search.interrupt();}}
|
|
}
|
|
}
|
|
}
|
|
if (this.secondarySearchThreads != null) {
|
|
for (final Thread search : this.secondarySearchThreads ) {
|
|
if (search != null) {
|
|
synchronized (search) {if (search.isAlive()) {search.interrupt();}}
|
|
}
|
|
}
|
|
}
|
|
|
|
// clear all data structures
|
|
if (this.preselectedPeerHashes != null) this.preselectedPeerHashes.clear();
|
|
if (this.localSearchThread != null && this.localSearchThread.isAlive()) this.localSearchThread.interrupt();
|
|
if (this.IACount != null) this.IACount.clear();
|
|
if (this.IAResults != null) this.IAResults.clear();
|
|
if (this.heuristics != null) this.heuristics.clear();
|
|
this.rwiStack.clear();
|
|
this.nodeStack.clear();
|
|
this.resultList.clear();
|
|
}
|
|
|
|
public String abstractsString(final byte[] hash) {
|
|
return this.IAResults.get(hash);
|
|
}
|
|
|
|
public Iterator<Map.Entry<byte[], Integer>> abstractsCount() {
|
|
return this.IACount.entrySet().iterator();
|
|
}
|
|
|
|
public int abstractsCount(final byte[] hash) {
|
|
final Integer i = this.IACount.get(hash);
|
|
if ( i == null ) {
|
|
return -1;
|
|
}
|
|
return i.intValue();
|
|
}
|
|
|
|
public byte[] getAbstractsMaxCountHash() {
|
|
return this.IAmaxcounthash;
|
|
}
|
|
|
|
public byte[] getAbstractsNearDHTHash() {
|
|
return this.IAneardhthash;
|
|
}
|
|
|
|
public List<RemoteSearch> getPrimarySearchThreads() {
|
|
return this.primarySearchThreadsL;
|
|
}
|
|
|
|
public Thread[] getSecondarySearchThreads() {
|
|
return this.secondarySearchThreads;
|
|
}
|
|
|
|
public void addHeuristic(final byte[] urlhash, final String heuristicName, final boolean redundant) {
|
|
synchronized ( this.heuristics ) {
|
|
this.heuristics.put(urlhash, new HeuristicResult(urlhash, heuristicName, redundant));
|
|
}
|
|
}
|
|
|
|
public HeuristicResult getHeuristic(final byte[] urlhash) {
|
|
synchronized ( this.heuristics ) {
|
|
return this.heuristics.get(urlhash);
|
|
}
|
|
}
|
|
|
|
public void addNodes(
|
|
final List<URIMetadataNode> nodeList,
|
|
final Map<String, ReversibleScoreMap<String>> facets, // a map from a field name to scored values
|
|
final Map<String, String> solrsnippets, // a map from urlhash to snippet text
|
|
final boolean local,
|
|
final String resourceName,
|
|
final int fullResource) {
|
|
|
|
this.addBegin();
|
|
|
|
// check if all results have snippets
|
|
/*
|
|
for (URIMetadataNode node: nodeList) {
|
|
if (!facets.containsKey(ASCII.String(node.hash()))) {
|
|
log.logInfo("no snippet from Solr for " + node.url().toNormalform(true));
|
|
}
|
|
}
|
|
*/
|
|
this.snippets.putAll(solrsnippets);
|
|
assert (nodeList != null);
|
|
if (nodeList.isEmpty()) return;
|
|
|
|
if (local) {
|
|
this.local_solr_stored.set(fullResource);
|
|
} else {
|
|
assert fullResource >= 0 : "fullResource = " + fullResource;
|
|
this.remote_solr_stored.addAndGet(fullResource);
|
|
this.remote_solr_peerCount.incrementAndGet();
|
|
}
|
|
|
|
long timer = System.currentTimeMillis();
|
|
|
|
// normalize entries
|
|
int is = nodeList.size();
|
|
EventTracker.update(EventTracker.EClass.SEARCH, new ProfilingGraph.EventSearch(this.query.id(true), SearchEventType.NORMALIZING, resourceName, is, System.currentTimeMillis() - timer), false);
|
|
if (!local) {
|
|
this.receivedRemoteReferences.addAndGet(is);
|
|
}
|
|
|
|
// iterate over normalized entries and select some that are better than currently stored
|
|
timer = System.currentTimeMillis();
|
|
|
|
// collect navigation information
|
|
ReversibleScoreMap<String> fcts;
|
|
if (this.locationNavigator != null) {
|
|
fcts = facets.get(CollectionSchema.coordinate_p.getSolrFieldName());
|
|
if (fcts != null) {
|
|
for (String coordinate: fcts) {
|
|
int hc = fcts.get(coordinate);
|
|
if (hc == 0) continue;
|
|
this.locationNavigator.inc(coordinate, hc);
|
|
}
|
|
}
|
|
}
|
|
|
|
if (this.hostNavigator != null) {
|
|
fcts = facets.get(CollectionSchema.host_s.getSolrFieldName());
|
|
if (fcts != null) {
|
|
for (String host: fcts) {
|
|
int hc = fcts.get(host);
|
|
if (hc == 0) continue;
|
|
if (host.startsWith("www.")) host = host.substring(4);
|
|
this.hostNavigator.inc(host, hc);
|
|
}
|
|
//this.hostNavigator.inc(fcts);
|
|
}
|
|
}
|
|
|
|
if (this.filetypeNavigator != null) {
|
|
fcts = facets.get(CollectionSchema.url_file_ext_s.getSolrFieldName());
|
|
if (fcts != null) {
|
|
// remove all filetypes that we don't know
|
|
Iterator<String> i = fcts.iterator();
|
|
while (i.hasNext()) {
|
|
String ext = i.next();
|
|
if (this.query.contentdom == ContentDomain.TEXT) {
|
|
if ((Classification.isImageExtension(ext) && this.excludeintext_image) ||
|
|
(TextParser.supportsExtension(ext) != null && !Classification.isAnyKnownExtension(ext))) {
|
|
//Log.logInfo("SearchEvent", "removed unknown extension " + ext + " from navigation.");
|
|
i.remove();
|
|
}
|
|
}
|
|
}
|
|
this.filetypeNavigator.inc(fcts);
|
|
}
|
|
}
|
|
|
|
if (this.authorNavigator != null) {
|
|
fcts = facets.get(CollectionSchema.author_sxt.getSolrFieldName());
|
|
if (fcts != null) this.authorNavigator.inc(fcts);
|
|
}
|
|
|
|
if (this.protocolNavigator != null) {
|
|
fcts = facets.get(CollectionSchema.url_protocol_s.getSolrFieldName());
|
|
if (fcts != null) {
|
|
// remove all protocols that we don't know
|
|
Iterator<String> i = fcts.iterator();
|
|
while (i.hasNext()) {
|
|
String protocol = i.next();
|
|
if ("http,https,smb,ftp,file".indexOf(protocol) < 0) i.remove();
|
|
}
|
|
this.protocolNavigator.inc(fcts);
|
|
}
|
|
}
|
|
|
|
// get the vocabulary navigation
|
|
for (Tagging v: LibraryProvider.autotagging.getVocabularies()) {
|
|
fcts = facets.get(CollectionSchema.VOCABULARY_PREFIX + v.getName() + CollectionSchema.VOCABULARY_SUFFIX);
|
|
if (fcts != null) {
|
|
ScoreMap<String> vocNav = this.vocabularyNavigator.get(v.getName());
|
|
if (vocNav == null) {
|
|
vocNav = new ConcurrentScoreMap<String>();
|
|
this.vocabularyNavigator.put(v.getName(), vocNav);
|
|
}
|
|
vocNav.inc(fcts);
|
|
}
|
|
}
|
|
|
|
// apply all constraints
|
|
try {
|
|
pollloop: for (URIMetadataNode iEntry: nodeList) {
|
|
|
|
if ( !this.query.urlMask_isCatchall ) {
|
|
// check url mask
|
|
if (!iEntry.matches(this.query.urlMask)) {
|
|
if (log.isFine()) log.fine("dropped Node: url mask does not match");
|
|
continue pollloop;
|
|
}
|
|
}
|
|
|
|
// doublecheck for urls
|
|
if (this.urlhashes.has(iEntry.hash())) {
|
|
if (log.isFine()) log.fine("dropped Node: double check");
|
|
continue pollloop;
|
|
}
|
|
|
|
// increase flag counts
|
|
for ( int j = 0; j < 32; j++ ) {
|
|
if (iEntry.flags().get(j)) this.flagCount()[j]++;
|
|
}
|
|
|
|
// check constraints
|
|
Bitfield flags = iEntry.flags();
|
|
if (!this.testFlags(flags)) {
|
|
if (log.isFine()) log.fine("dropped Node: flag test");
|
|
continue pollloop;
|
|
}
|
|
|
|
// check document domain
|
|
if (this.query.contentdom.getCode() > 0 &&
|
|
((this.query.contentdom == ContentDomain.AUDIO && !(flags.get(Condenser.flag_cat_hasaudio))) ||
|
|
(this.query.contentdom == ContentDomain.VIDEO && !(flags.get(Condenser.flag_cat_hasvideo))) ||
|
|
(this.query.contentdom == ContentDomain.IMAGE && !(flags.get(Condenser.flag_cat_hasimage))) ||
|
|
(this.query.contentdom == ContentDomain.APP && !(flags.get(Condenser.flag_cat_hasapp))))) {
|
|
if (log.isFine()) log.fine("dropped Node: content domain does not match");
|
|
continue pollloop;
|
|
}
|
|
|
|
// filter out media links in text search, if wanted
|
|
String ext = MultiProtocolURL.getFileExtension(iEntry.url().getFileName());
|
|
if (this.query.contentdom == ContentDomain.TEXT && Classification.isImageExtension(ext) && this.excludeintext_image) {
|
|
if (log.isFine()) log.fine("dropped Node: file name domain does not match");
|
|
continue pollloop;
|
|
}
|
|
|
|
// check site constraints
|
|
final String hosthash = iEntry.hosthash();
|
|
if ( this.query.modifier.sitehash == null ) {
|
|
if (this.query.siteexcludes != null && this.query.siteexcludes.contains(hosthash)) {
|
|
if (log.isFine()) log.fine("dropped Node: siteexclude");
|
|
continue pollloop;
|
|
}
|
|
} else {
|
|
// filter out all domains that do not match with the site constraint
|
|
if (iEntry.url().getHost().indexOf(this.query.modifier.sitehost) < 0) {
|
|
if (log.isFine()) log.fine("dropped Node: sitehost");
|
|
continue pollloop;
|
|
}
|
|
}
|
|
|
|
// finally extend the double-check and insert result to stack
|
|
this.urlhashes.putUnique(iEntry.hash());
|
|
rankingtryloop: while (true) {
|
|
try {
|
|
long score = iEntry.ranking();
|
|
this.nodeStack.put(new ReverseElement<URIMetadataNode>(iEntry, score == 0 ? this.order.cardinal(iEntry) : score)); // inserts the element and removes the worst (which is smallest)
|
|
break rankingtryloop;
|
|
} catch (final ArithmeticException e ) {
|
|
// this may happen if the concurrent normalizer changes values during cardinal computation
|
|
continue rankingtryloop;
|
|
}
|
|
}
|
|
// increase counter for statistics
|
|
if (local) this.local_solr_available.incrementAndGet(); else this.remote_solr_available.incrementAndGet();
|
|
}
|
|
} catch (final SpaceExceededException e ) {
|
|
}
|
|
EventTracker.update(EventTracker.EClass.SEARCH, new ProfilingGraph.EventSearch(this.query.id(true), SearchEventType.PRESORT, resourceName, nodeList.size(), System.currentTimeMillis() - timer), false);
|
|
}
|
|
|
|
public void addExpectedRemoteReferences(int x) {
|
|
if ( x > 0 ) {
|
|
this.maxExpectedRemoteReferences.addAndGet(x);
|
|
}
|
|
this.expectedRemoteReferences.addAndGet(x);
|
|
}
|
|
|
|
/**
|
|
* Take one best entry from the rwiStack and create a node entry out of it.
|
|
* There is no waiting or blocking; if no entry is available this just returns null
|
|
* If the sjupDoubleDom option is selected, only different hosts are returned until no such rwi exists.
|
|
* Then the best entry from domain stacks are returned.
|
|
* @param skipDoubleDom
|
|
* @return a node from a rwi entry if one exist or null if not
|
|
*/
|
|
private URIMetadataNode pullOneRWI(final boolean skipDoubleDom) {
|
|
|
|
// returns from the current RWI list the best entry and removes this entry from the list
|
|
WeakPriorityBlockingQueue<WordReferenceVars> m;
|
|
WeakPriorityBlockingQueue.Element<WordReferenceVars> rwi = null;
|
|
|
|
mainloop: while (true) {
|
|
int c = 0;
|
|
pollloop: while (this.rwiStack.sizeQueue() > 0 && c++ < 10) {
|
|
rwi = this.rwiStack.poll();
|
|
if (rwi == null) return null;
|
|
if (!skipDoubleDom) {
|
|
URIMetadataNode node = this.query.getSegment().fulltext().getMetadata(rwi);
|
|
if (node == null) continue pollloop;
|
|
return node;
|
|
}
|
|
|
|
// check doubledom
|
|
final String hosthash = rwi.getElement().hosthash();
|
|
m = this.doubleDomCache.get(hosthash);
|
|
if (m == null) {
|
|
synchronized ( this.doubleDomCache ) {
|
|
m = this.doubleDomCache.get(hosthash);
|
|
if (m == null) {
|
|
// first appearance of dom. we create an entry to signal that one of that domain was already returned
|
|
m = new WeakPriorityBlockingQueue<WordReferenceVars>(max_results_rwi, false);
|
|
this.doubleDomCache.put(hosthash, m);
|
|
URIMetadataNode node = this.query.getSegment().fulltext().getMetadata(rwi);
|
|
if (node == null) continue pollloop;
|
|
return node;
|
|
}
|
|
// second appearances of dom
|
|
m.put(rwi);
|
|
}
|
|
} else {
|
|
m.put(rwi);
|
|
}
|
|
}
|
|
|
|
// no more entries in sorted RWI entries. Now take Elements from the doubleDomCache
|
|
if (this.doubleDomCache.isEmpty()) {
|
|
//Log.logWarning("SearchEvent", "doubleDomCache.isEmpty");
|
|
return null;
|
|
}
|
|
|
|
// find best entry from all caches
|
|
WeakPriorityBlockingQueue.Element<WordReferenceVars> bestEntry = null;
|
|
WeakPriorityBlockingQueue.Element<WordReferenceVars> o;
|
|
final Iterator<WeakPriorityBlockingQueue<WordReferenceVars>> i = this.doubleDomCache.values().iterator();
|
|
doubleloop: while (i.hasNext()) {
|
|
try {
|
|
m = i.next();
|
|
} catch (final ConcurrentModificationException e) {
|
|
ConcurrentLog.logException(e);
|
|
continue mainloop; // not the best solution...
|
|
}
|
|
if (m == null) continue doubleloop;
|
|
if (m.isEmpty()) continue doubleloop;
|
|
if (bestEntry == null) {
|
|
bestEntry = m.peek();
|
|
continue doubleloop;
|
|
}
|
|
o = m.peek();
|
|
if (o == null) continue doubleloop;
|
|
if (o.getWeight() < bestEntry.getWeight()) bestEntry = o;
|
|
}
|
|
if (bestEntry == null) {
|
|
//Log.logWarning("SearchEvent", "bestEntry == null (1)");
|
|
return null;
|
|
}
|
|
|
|
// finally remove the best entry from the doubledom cache
|
|
m = this.doubleDomCache.get(bestEntry.getElement().hosthash());
|
|
if (m != null) {
|
|
bestEntry = m.poll();
|
|
if (bestEntry != null && m.sizeAvailable() == 0) {
|
|
synchronized ( this.doubleDomCache ) {
|
|
if (m.sizeAvailable() == 0) {
|
|
this.doubleDomCache.remove(bestEntry.getElement().hosthash());
|
|
}
|
|
}
|
|
}
|
|
}
|
|
if (bestEntry == null) {
|
|
//Log.logWarning("SearchEvent", "bestEntry == null (2)");
|
|
return null;
|
|
}
|
|
URIMetadataNode node = this.query.getSegment().fulltext().getMetadata(bestEntry);
|
|
if (node == null) {
|
|
if (bestEntry.getElement().local()) this.local_rwi_available.decrementAndGet(); else this.remote_rwi_available.decrementAndGet();
|
|
if (log.isFine()) log.fine("dropped RWI: hash not in metadata");
|
|
continue mainloop;
|
|
}
|
|
return node;
|
|
}
|
|
}
|
|
|
|
/**
|
|
* get one metadata entry from the ranked results. This will be the 'best' entry so far according to the
|
|
* applied ranking. If there are no more entries left or the timeout limit is reached then null is
|
|
* returned. The caller may distinguish the timeout case from the case where there will be no more also in
|
|
* the future by calling this.feedingIsFinished()
|
|
*
|
|
* @param skipDoubleDom should be true if it is wanted that double domain entries are skipped
|
|
* @return a metadata entry for a url
|
|
*/
|
|
public URIMetadataNode pullOneFilteredFromRWI(final boolean skipDoubleDom) {
|
|
// returns from the current RWI list the best URL entry and removes this entry from the list
|
|
int p = -1;
|
|
URIMetadataNode page;
|
|
mainloop: while ((page = pullOneRWI(skipDoubleDom)) != null) {
|
|
|
|
if (!this.query.urlMask_isCatchall && !page.matches(this.query.urlMask)) {
|
|
if (log.isFine()) log.fine("dropped RWI: no match with urlMask");
|
|
if (page.word().local()) this.local_rwi_available.decrementAndGet(); else this.remote_rwi_available.decrementAndGet();
|
|
continue;
|
|
}
|
|
|
|
// check for more errors
|
|
if (page.url() == null) {
|
|
if (log.isFine()) log.fine("dropped RWI: url == null");
|
|
if (page.word().local()) this.local_rwi_available.decrementAndGet(); else this.remote_rwi_available.decrementAndGet();
|
|
continue; // rare case where the url is corrupted
|
|
}
|
|
|
|
// check content domain
|
|
ContentDomain contentDomain = page.getContentDomain();
|
|
if (this.query.contentdom.getCode() > 0 && (
|
|
(this.query.contentdom == Classification.ContentDomain.IMAGE && contentDomain != Classification.ContentDomain.IMAGE) ||
|
|
(this.query.contentdom == Classification.ContentDomain.AUDIO && contentDomain != Classification.ContentDomain.AUDIO) ||
|
|
(this.query.contentdom == Classification.ContentDomain.VIDEO && contentDomain != Classification.ContentDomain.VIDEO) ||
|
|
(this.query.contentdom == Classification.ContentDomain.APP && contentDomain != Classification.ContentDomain.APP)) && this.query.urlMask_isCatchall) {
|
|
if (log.isFine()) log.fine("dropped RWI: wrong contentdom = " + this.query.contentdom + ", domain = " + contentDomain);
|
|
if (page.word().local()) this.local_rwi_available.decrementAndGet(); else this.remote_rwi_available.decrementAndGet();
|
|
continue;
|
|
}
|
|
|
|
// filter out media links in text search, if wanted
|
|
String ext = MultiProtocolURL.getFileExtension(page.url().getFileName());
|
|
if (this.query.contentdom == ContentDomain.TEXT && Classification.isImageExtension(ext) && this.excludeintext_image) {
|
|
if (log.isFine()) log.fine("dropped RWI: file name domain does not match");
|
|
continue;
|
|
}
|
|
|
|
// Check for blacklist
|
|
if (Switchboard.urlBlacklist.isListed(BlacklistType.SEARCH, page.url())) {
|
|
if (log.isFine()) log.fine("dropped RWI: url is blacklisted in url blacklist");
|
|
if (page.word().local()) this.local_rwi_available.decrementAndGet(); else this.remote_rwi_available.decrementAndGet();
|
|
continue;
|
|
}
|
|
|
|
// content control
|
|
if (Switchboard.getSwitchboard().getConfigBool("contentcontrol.enabled", false)) {
|
|
FilterEngine f = ContentControlFilterUpdateThread.getNetworkFilter();
|
|
if (f != null && !f.isListed(page.url(), null)) {
|
|
if (log.isFine()) log.fine("dropped RWI: url is blacklisted in contentcontrol");
|
|
if (page.word().local()) this.local_rwi_available.decrementAndGet(); else this.remote_rwi_available.decrementAndGet();
|
|
continue;
|
|
}
|
|
}
|
|
|
|
final String pageurl = page.url().toNormalform(true);
|
|
final String pageauthor = page.dc_creator();
|
|
final String pagetitle = page.dc_title().toLowerCase();
|
|
|
|
// check exclusion
|
|
if (this.query.getQueryGoal().getExcludeSize() != 0 &&
|
|
((QueryParams.anymatch(pagetitle, this.query.getQueryGoal().getExcludeWords()))
|
|
|| (QueryParams.anymatch(pageurl.toLowerCase(), this.query.getQueryGoal().getExcludeWords()))
|
|
|| (QueryParams.anymatch(pageauthor.toLowerCase(), this.query.getQueryGoal().getExcludeWords())))) {
|
|
if (log.isFine()) log.fine("dropped RWI: no match with query goal exclusion");
|
|
if (page.word().local()) this.local_rwi_available.decrementAndGet(); else this.remote_rwi_available.decrementAndGet();
|
|
continue;
|
|
}
|
|
|
|
// check index-of constraint
|
|
if ((this.query.constraint != null) && (this.query.constraint.get(Condenser.flag_cat_indexof)) && (!(pagetitle.startsWith("index of")))) {
|
|
final Iterator<byte[]> wi = this.query.getQueryGoal().getIncludeHashes().iterator();
|
|
if (this.query.getSegment().termIndex() != null) {
|
|
while (wi.hasNext()) {
|
|
this.query.getSegment().termIndex().removeDelayed(wi.next(), page.hash());
|
|
}
|
|
}
|
|
if (log.isFine()) log.fine("dropped RWI: url does not match index-of constraint");
|
|
if (page.word().local()) this.local_rwi_available.decrementAndGet(); else this.remote_rwi_available.decrementAndGet();
|
|
continue;
|
|
}
|
|
|
|
// check location constraint
|
|
if ((this.query.constraint != null) && (this.query.constraint.get(Condenser.flag_cat_haslocation)) && (page.lat() == 0.0 || page.lon() == 0.0)) {
|
|
if (log.isFine()) log.fine("dropped RWI: location constraint");
|
|
if (page.word().local()) this.local_rwi_available.decrementAndGet(); else this.remote_rwi_available.decrementAndGet();
|
|
continue;
|
|
}
|
|
|
|
// check geo coordinates
|
|
double lat, lon;
|
|
if (this.query.radius > 0.0d && this.query.lat != 0.0d && this.query.lon != 0.0d && (lat = page.lat()) != 0.0d && (lon = page.lon()) != 0.0d) {
|
|
double latDelta = this.query.lat - lat;
|
|
double lonDelta = this.query.lon - lon;
|
|
double distance = Math.sqrt(latDelta * latDelta + lonDelta * lonDelta); // pythagoras
|
|
if (distance > this.query.radius) {
|
|
if (log.isFine()) log.fine("dropped RWI: radius constraint");
|
|
if (page.word().local()) this.local_rwi_available.decrementAndGet(); else this.remote_rwi_available.decrementAndGet();
|
|
continue;
|
|
}
|
|
}
|
|
|
|
// check Scanner
|
|
if (this.query.filterscannerfail && !Scanner.acceptURL(page.url())) {
|
|
if (log.isFine()) log.fine("dropped RWI: url not accepted by scanner");
|
|
if (page.word().local()) this.local_rwi_available.decrementAndGet(); else this.remote_rwi_available.decrementAndGet();
|
|
continue;
|
|
}
|
|
|
|
|
|
// check vocabulary terms (metatags) {only available in Solr index as vocabulary_xxyyzzz_sxt field}
|
|
// TODO: vocabulary is only valid and available in local Solr index (considere to auto-switch to Searchdom.LOCAL)
|
|
if (this.query.metatags != null && !this.query.metatags.isEmpty()) {
|
|
tagloop: for (Tagging.Metatag tag : this.query.metatags) {
|
|
SolrDocument sdoc = page.getDocument();
|
|
if (sdoc != null) {
|
|
Collection<Object> tagvalues = sdoc.getFieldValues(CollectionSchema.VOCABULARY_PREFIX + tag.getVocabularyName() + CollectionSchema.VOCABULARY_SUFFIX);
|
|
if (tagvalues != null && tagvalues.contains(tag.getObject())) {
|
|
continue tagloop; // metatag exists check next tag (filter may consist of several tags)
|
|
}
|
|
} // if we reach this point the metatag was not found (= drop entry)
|
|
if (log.isFine()) log.fine("dropped RWI: url not tagged with vocabulary " + tag.getVocabularyName());
|
|
if (page.word().local()) this.local_rwi_available.decrementAndGet(); else this.remote_rwi_available.decrementAndGet();
|
|
continue mainloop;
|
|
}
|
|
}
|
|
|
|
// from here: collect navigation information
|
|
|
|
// namespace navigation
|
|
if (this.namespaceNavigator != null) {
|
|
String pagepath = page.url().getPath();
|
|
if ((p = pagepath.indexOf(':')) >= 0) {
|
|
pagepath = pagepath.substring(0, p);
|
|
p = pagepath.lastIndexOf('/');
|
|
if (p >= 0) {
|
|
pagepath = pagepath.substring(p + 1);
|
|
this.namespaceNavigator.inc(pagepath);
|
|
}
|
|
}
|
|
}
|
|
|
|
return page; // accept url
|
|
}
|
|
return null;
|
|
}
|
|
|
|
public long getURLRetrievalTime() {
|
|
return this.urlRetrievalAllTime;
|
|
}
|
|
|
|
public long getSnippetComputationTime() {
|
|
return this.snippetComputationAllTime;
|
|
}
|
|
|
|
public ScoreMap<String> getTopicNavigator(final int count ) {
|
|
if (this.topicNavigatorCount > 0 && count >= 0) { //topicNavigatorCount set during init, 0=no nav
|
|
return this.getTopics(count != 0 ? count : this.topicNavigatorCount, 500);
|
|
}
|
|
return null;
|
|
}
|
|
|
|
public boolean drainStacksToResult() {
|
|
// we take one entry from both stacks at the same time
|
|
boolean success = false;
|
|
Element<URIMetadataNode> localEntryElement = this.nodeStack.sizeQueue() > 0 ? this.nodeStack.poll() : null;
|
|
URIMetadataNode node = localEntryElement == null ? null : localEntryElement.getElement();
|
|
if (node != null) {
|
|
String solrsnippet = this.snippets.remove(ASCII.String(node.hash())); // we can remove this because it's used only once
|
|
if (solrsnippet != null && solrsnippet.length() > 0) {
|
|
final TextSnippet snippet = new TextSnippet(node.hash(), solrsnippet, true, ResultClass.SOURCE_CACHE, "");
|
|
ResultEntry re = new ResultEntry(node, this.query.getSegment(), this.peers, snippet, null, 0);
|
|
addResult(re);
|
|
success = true;
|
|
} else {
|
|
// we don't have a snippet from solr, try to get it in our way (by reloading, if necessary)
|
|
if (SearchEvent.this.snippetFetchAlive.get() >= 10) {
|
|
// too many concurrent processes
|
|
addResult(getSnippet(node, null));
|
|
success = true;
|
|
} else {
|
|
final URIMetadataNode node1 = node;
|
|
new Thread() {
|
|
public void run() {
|
|
SearchEvent.this.oneFeederStarted();
|
|
try {
|
|
SearchEvent.this.snippetFetchAlive.incrementAndGet();
|
|
try {
|
|
addResult(getSnippet(node1, SearchEvent.this.query.snippetCacheStrategy));
|
|
} catch (final Throwable e) {} finally {
|
|
SearchEvent.this.snippetFetchAlive.decrementAndGet();
|
|
}
|
|
} catch (final Throwable e) {} finally {
|
|
SearchEvent.this.oneFeederTerminated();
|
|
}
|
|
}
|
|
}.start();
|
|
}
|
|
}
|
|
}
|
|
if (SearchEvent.this.snippetFetchAlive.get() >= 10) {
|
|
// too many concurrent processes
|
|
node = pullOneFilteredFromRWI(true);
|
|
if (node != null) {
|
|
addResult(getSnippet(node, null));
|
|
success = true;
|
|
}
|
|
} else {
|
|
new Thread() {
|
|
public void run() {
|
|
SearchEvent.this.oneFeederStarted();
|
|
try {
|
|
final URIMetadataNode node = pullOneFilteredFromRWI(true);
|
|
if (node != null) {
|
|
SearchEvent.this.snippetFetchAlive.incrementAndGet();
|
|
try {
|
|
addResult(getSnippet(node, SearchEvent.this.query.snippetCacheStrategy));
|
|
} catch (final Throwable e) {} finally {
|
|
SearchEvent.this.snippetFetchAlive.decrementAndGet();
|
|
}
|
|
}
|
|
} catch (final Throwable e) {} finally {
|
|
SearchEvent.this.oneFeederTerminated();
|
|
}
|
|
}
|
|
}.start();
|
|
}
|
|
return success;
|
|
}
|
|
|
|
/**
|
|
* place the result to the result vector and apply post-ranking
|
|
* @param resultEntry
|
|
*/
|
|
public void addResult(ResultEntry resultEntry) {
|
|
if (resultEntry == null) return;
|
|
long ranking = resultEntry.ranking();
|
|
ranking += postRanking(resultEntry, new ConcurrentScoreMap<String>() /*this.snippetProcess.rankingProcess.getTopicNavigator(10)*/);
|
|
this.resultList.put(new ReverseElement<ResultEntry>(resultEntry, ranking)); // remove smallest in case of overflow
|
|
if (pollImmediately) this.resultList.poll(); // prevent re-ranking in case there is only a single index source which has already ranked entries.
|
|
this.addTopics(resultEntry);
|
|
}
|
|
|
|
private long postRanking(final ResultEntry rentry, final ScoreMap<String> topwords) {
|
|
long r = 0;
|
|
|
|
// for media search: prefer pages with many links
|
|
r += rentry.limage() << this.query.ranking.coeff_cathasimage;
|
|
r += rentry.laudio() << this.query.ranking.coeff_cathasaudio;
|
|
r += rentry.lvideo() << this.query.ranking.coeff_cathasvideo;
|
|
r += rentry.lapp() << this.query.ranking.coeff_cathasapp;
|
|
|
|
// apply citation count
|
|
//System.out.println("POSTRANKING CITATION: references = " + rentry.referencesCount() + ", inbound = " + rentry.llocal() + ", outbound = " + rentry.lother());
|
|
r += (128 * rentry.referencesCount() / (1 + 2 * rentry.llocal() + rentry.lother())) << this.query.ranking.coeff_citation;
|
|
|
|
// prefer hit with 'prefer' pattern
|
|
if (this.query.prefer.matcher(rentry.url().toNormalform(true)).matches()) r += 256 << this.query.ranking.coeff_prefer;
|
|
if (this.query.prefer.matcher(rentry.title()).matches()) r += 256 << this.query.ranking.coeff_prefer;
|
|
|
|
// apply 'common-sense' heuristic using references
|
|
final String urlstring = rentry.url().toNormalform(true);
|
|
final String[] urlcomps = MultiProtocolURL.urlComps(urlstring);
|
|
final String[] descrcomps = MultiProtocolURL.splitpattern.split(rentry.title().toLowerCase());
|
|
for (final String urlcomp : urlcomps) {
|
|
int tc = topwords.get(urlcomp);
|
|
if (tc > 0) r += Math.max(1, tc) << this.query.ranking.coeff_urlcompintoplist;
|
|
}
|
|
for (final String descrcomp : descrcomps) {
|
|
int tc = topwords.get(descrcomp);
|
|
if (tc > 0) r += Math.max(1, tc) << this.query.ranking.coeff_descrcompintoplist;
|
|
}
|
|
|
|
// apply query-in-result matching
|
|
final QueryGoal.NormalizedWords urlcomph = new QueryGoal.NormalizedWords(urlcomps);
|
|
final QueryGoal.NormalizedWords descrcomph = new QueryGoal.NormalizedWords(descrcomps);
|
|
final Iterator<String> shi = this.query.getQueryGoal().getIncludeWords();
|
|
String queryword;
|
|
while (shi.hasNext()) {
|
|
queryword = shi.next();
|
|
if (urlcomph.contains(queryword)) r += 256 << this.query.ranking.coeff_appurl;
|
|
if (descrcomph.contains(queryword)) r += 256 << this.query.ranking.coeff_app_dc_title;
|
|
}
|
|
return r;
|
|
}
|
|
|
|
public ResultEntry getSnippet(URIMetadataNode page, final CacheStrategy cacheStrategy) {
|
|
if (page == null) return null;
|
|
|
|
if (cacheStrategy == null) {
|
|
final TextSnippet snippet = new TextSnippet(
|
|
null,
|
|
page,
|
|
this.snippetFetchWordHashes,
|
|
null,
|
|
((this.query.constraint != null) && (this.query.constraint.get(Condenser.flag_cat_indexof))),
|
|
SearchEvent.SNIPPET_MAX_LENGTH,
|
|
!this.query.isLocal());
|
|
return new ResultEntry(page, this.query.getSegment(), this.peers, snippet, null, 0); // result without snippet
|
|
}
|
|
|
|
// load snippet
|
|
ContentDomain contentDomain = page.getContentDomain();
|
|
if (contentDomain == Classification.ContentDomain.TEXT || contentDomain == Classification.ContentDomain.ALL) {
|
|
// attach text snippet
|
|
long startTime = System.currentTimeMillis();
|
|
final TextSnippet snippet = new TextSnippet(
|
|
this.loader,
|
|
page,
|
|
this.snippetFetchWordHashes,
|
|
cacheStrategy,
|
|
((this.query.constraint != null) && (this.query.constraint.get(Condenser.flag_cat_indexof))),
|
|
180,
|
|
!this.query.isLocal());
|
|
final long snippetComputationTime = System.currentTimeMillis() - startTime;
|
|
SearchEvent.log.info("text snippet load time for " + page.url() + ": " + snippetComputationTime + ", " + (!snippet.getErrorCode().fail() ? "snippet found" : ("no snippet found (" + snippet.getError() + ")")));
|
|
|
|
if (!snippet.getErrorCode().fail()) {
|
|
// we loaded the file and found the snippet
|
|
return new ResultEntry(page, this.query.getSegment(), this.peers, snippet, null, snippetComputationTime); // result with snippet attached
|
|
} else if (cacheStrategy.mustBeOffline()) {
|
|
// we did not demand online loading, therefore a failure does not mean that the missing snippet causes a rejection of this result
|
|
// this may happen during a remote search, because snippet loading is omitted to retrieve results faster
|
|
return new ResultEntry(page, this.query.getSegment(), this.peers, null, null, snippetComputationTime); // result without snippet
|
|
} else {
|
|
// problems with snippet fetch
|
|
if (this.snippetFetchWordHashes.has(Segment.catchallHash)) {
|
|
// we accept that because the word cannot be on the page
|
|
return new ResultEntry(page, this.query.getSegment(), this.peers, null, null, 0);
|
|
}
|
|
final String reason = "no text snippet; errorCode = " + snippet.getErrorCode();
|
|
if (this.deleteIfSnippetFail) {
|
|
this.workTables.failURLsRegisterMissingWord(this.query.getSegment().termIndex(), page.url(), this.query.getQueryGoal().getIncludeHashes(), reason);
|
|
}
|
|
SearchEvent.log.info("sorted out url " + page.url().toNormalform(true) + " during search: " + reason);
|
|
return null;
|
|
}
|
|
}
|
|
return new ResultEntry(page, this.query.getSegment(), this.peers, null, null, 0); // result without snippet
|
|
}
|
|
|
|
public ResultEntry oneResult(final int item, final long timeout) {
|
|
// check if we already retrieved this item
|
|
// (happens if a search pages is accessed a second time)
|
|
final long finishTime = timeout == Long.MAX_VALUE ? Long.MAX_VALUE : System.currentTimeMillis() + timeout;
|
|
EventTracker.update(EventTracker.EClass.SEARCH, new ProfilingGraph.EventSearch(this.query.id(true), SearchEventType.ONERESULT, "started, item = " + item + ", available = " + this.getResultCount(), 0, 0), false);
|
|
|
|
// wait until a local solr is finished, we must do that to be able to check if we need more
|
|
if (this.localsolrsearch != null && this.localsolrsearch.isAlive()) {try {this.localsolrsearch.join(100);} catch (final InterruptedException e) {}}
|
|
if (item >= this.localsolroffset && this.local_solr_stored.get() == 0 && this.localsolrsearch.isAlive()) {try {this.localsolrsearch.join();} catch (final InterruptedException e) {}}
|
|
if (item >= this.localsolroffset && this.local_solr_stored.get() >= item) {
|
|
// load remaining solr results now
|
|
int nextitems = item - this.localsolroffset + this.query.itemsPerPage; // example: suddenly switch to item 60, just 10 had been shown, 20 loaded.
|
|
if (this.localsolrsearch != null && this.localsolrsearch.isAlive()) {try {this.localsolrsearch.join();} catch (final InterruptedException e) {}}
|
|
if (!Switchboard.getSwitchboard().getConfigBool(SwitchboardConstants.DEBUG_SEARCH_LOCAL_SOLR_OFF, false)) {
|
|
this.localsolrsearch = RemoteSearch.solrRemoteSearch(this, this.query.solrQuery(this.query.contentdom, false, this.excludeintext_image), this.localsolroffset, nextitems, null /*this peer*/, 0, Switchboard.urlBlacklist);
|
|
}
|
|
this.localsolroffset += nextitems;
|
|
}
|
|
|
|
// now pull results as long as needed and as long as possible
|
|
while ( this.resultList.sizeAvailable() <= item &&
|
|
(this.rwiQueueSize() > 0 || this.nodeStack.sizeQueue() > 0 ||
|
|
(!this.feedingIsFinished() && System.currentTimeMillis() < finishTime))) {
|
|
if (!drainStacksToResult()) try {Thread.sleep(10);} catch (final InterruptedException e) {ConcurrentLog.logException(e);}
|
|
}
|
|
|
|
// check if we have a success
|
|
if (this.resultList.sizeAvailable() > item) {
|
|
// we have the wanted result already in the result array .. return that
|
|
final ResultEntry re = this.resultList.element(item).getElement();
|
|
EventTracker.update(EventTracker.EClass.SEARCH, new ProfilingGraph.EventSearch(this.query.id(true), SearchEventType.ONERESULT, "fetched, item = " + item + ", available = " + this.getResultCount() + ": " + re.urlstring(), 0, 0), false);
|
|
|
|
if (this.localsolrsearch == null || !this.localsolrsearch.isAlive() && this.local_solr_stored.get() > this.localsolroffset && (item + 1) % this.query.itemsPerPage == 0) {
|
|
// at the end of a list, trigger a next solr search
|
|
if (!Switchboard.getSwitchboard().getConfigBool(SwitchboardConstants.DEBUG_SEARCH_LOCAL_SOLR_OFF, false)) {
|
|
this.localsolrsearch = RemoteSearch.solrRemoteSearch(this, this.query.solrQuery(this.query.contentdom, false, this.excludeintext_image), this.localsolroffset, this.query.itemsPerPage, null /*this peer*/, 0, Switchboard.urlBlacklist);
|
|
}
|
|
this.localsolroffset += this.query.itemsPerPage;
|
|
}
|
|
return re;
|
|
}
|
|
|
|
// no success
|
|
EventTracker.update(EventTracker.EClass.SEARCH, new ProfilingGraph.EventSearch(this.query.id(true), SearchEventType.ONERESULT, "not found, item = " + item + ", available = " + this.getResultCount(), 0, 0), false);
|
|
return null;
|
|
}
|
|
|
|
// Images that oneImageResult() has already handed out, in insertion order.
// nthImage() reads this map by position; nextSpare() adds to it.
// Keys are the ASCII string form of a URL hash (see oneImageResult()).
private LinkedHashMap<String, ImageResult> imageViewed = new LinkedHashMap<String, ImageResult>();
|
|
// Images that oneImageResult() has collected but not yet handed out, in insertion order.
// nextSpare() moves the oldest entry from here into imageViewed.
// Keys are the ASCII string form of a URL hash (see oneImageResult()).
private LinkedHashMap<String, ImageResult> imageSpare = new LinkedHashMap<String, ImageResult>();
|
|
private ImageResult nthImage(int item) {
|
|
Object o = SetTools.nth(this.imageViewed.values(), item);
|
|
if (o == null) return null;
|
|
return (ImageResult) o;
|
|
}
|
|
private ImageResult nextSpare() {
|
|
Map.Entry<String, ImageResult> next = imageSpare.entrySet().iterator().next();
|
|
imageViewed.put(next.getKey(), next.getValue());
|
|
imageSpare.remove(next.getKey());
|
|
return next.getValue();
|
|
}
|
|
|
|
public ImageResult oneImageResult(final int item, final long timeout) throws MalformedURLException {
|
|
if (item < imageViewed.size()) return nthImage(item);
|
|
|
|
ResultEntry ms = oneResult(item, timeout);
|
|
// check if the match was made in the url or in the image links
|
|
if (ms != null) {
|
|
SolrDocument doc = ms.getNode().getDocument();
|
|
Collection<Object> alt = doc.getFieldValues(CollectionSchema.images_alt_sxt.getSolrFieldName());
|
|
Collection<Object> img = doc.getFieldValues(CollectionSchema.images_urlstub_sxt.getSolrFieldName());
|
|
Collection<Object> prt = doc.getFieldValues(CollectionSchema.images_protocol_sxt.getSolrFieldName());
|
|
if (img != null) {
|
|
int c = 0;
|
|
for (Object i: img) {
|
|
String a = alt != null && alt.size() > c ? (String) SetTools.nth(alt, c) : "";
|
|
if (query.getQueryGoal().matches((String) i) || query.getQueryGoal().matches(a)) {
|
|
try {
|
|
DigestURL imageUrl = new DigestURL((prt != null && prt.size() > c ? SetTools.nth(prt, c) : "http") + "://" + i);
|
|
Object heightO = SetTools.nth(doc.getFieldValues(CollectionSchema.images_height_val.getSolrFieldName()), c);
|
|
Object widthO = SetTools.nth(doc.getFieldValues(CollectionSchema.images_width_val.getSolrFieldName()), c);
|
|
String id = ASCII.String(imageUrl.hash());
|
|
if (!imageViewed.containsKey(id) && !imageSpare.containsKey(id)) imageSpare.put(id, new ImageResult(ms.url(), imageUrl, "", a, widthO == null ? 0 : (Integer) widthO, heightO == null ? 0 : (Integer) heightO, 0));
|
|
} catch (MalformedURLException e) {
|
|
continue;
|
|
}
|
|
}
|
|
c++;
|
|
}
|
|
}
|
|
if (MultiProtocolURL.isImage(MultiProtocolURL.getFileExtension(ms.url().getFileName()))) {
|
|
String id = ASCII.String(ms.hash());
|
|
if (!imageViewed.containsKey(id) && !imageSpare.containsKey(id)) imageSpare.put(id, new ImageResult(ms.url(), ms.url(), "", ms.title(), 0, 0, 0));
|
|
}
|
|
if (img != null && img.size() > 0) {
|
|
DigestURL imageUrl = new DigestURL((prt != null && prt.size() > 0 ? SetTools.nth(prt, 0) : "http") + "://" + SetTools.nth(img, 0));
|
|
String imagetext = alt != null && alt.size() > 0 ? (String) SetTools.nth(alt, 0) : "";
|
|
String id = ASCII.String(imageUrl.hash());
|
|
if (!imageViewed.containsKey(id) && !imageSpare.containsKey(id)) imageSpare.put(id, new ImageResult(ms.url(), imageUrl, "", imagetext, 0, 0, 0));
|
|
}
|
|
}
|
|
if (imageSpare.size() > 0) return nextSpare();
|
|
throw new MalformedURLException("no image url found");
|
|
}
|
|
|
|
public class ImageResult {
|
|
public DigestURL imageUrl, sourceUrl;
|
|
public String mimetype = "", imagetext = "";
|
|
public int width = 0, height = 0, fileSize = 0;
|
|
public ImageResult(DigestURL sourceUrl, DigestURL imageUrl, String mimetype, String imagetext, int width, int height, int fileSize) {
|
|
this.sourceUrl = sourceUrl;
|
|
this.imageUrl = imageUrl;
|
|
this.mimetype = mimetype;
|
|
this.imagetext = imagetext;
|
|
this.width = width;
|
|
this.height = height;
|
|
this.fileSize = fileSize;
|
|
}
|
|
public String toString() {
|
|
return this.imageUrl.toString();
|
|
}
|
|
}
|
|
|
|
public ArrayList<WeakPriorityBlockingQueue.Element<ResultEntry>> completeResults(final long waitingtime) {
|
|
final long timeout = waitingtime == Long.MAX_VALUE ? Long.MAX_VALUE : System.currentTimeMillis() + waitingtime;
|
|
int i = 0;
|
|
while (this.resultList.sizeAvailable() < this.query.neededResults() && System.currentTimeMillis() < timeout) {
|
|
oneResult(i++, timeout - System.currentTimeMillis());
|
|
}
|
|
return this.resultList.list(Math.min(this.query.neededResults(), this.resultList.sizeAvailable()));
|
|
}
|
|
|
|
/**
|
|
* delete a specific entry from the search results
|
|
* this is used if the user clicks on a '-' sign beside the search result
|
|
* @param urlhash
|
|
* @return true if an entry was deleted, false otherwise
|
|
*/
|
|
protected boolean delete(final String urlhash) {
|
|
final Iterator<Element<ResultEntry>> i = this.resultList.iterator();
|
|
Element<ResultEntry> entry;
|
|
while (i.hasNext()) {
|
|
entry = i.next();
|
|
if (urlhash.equals(ASCII.String(entry.getElement().url().hash()))) {
|
|
i.remove();
|
|
return true;
|
|
}
|
|
}
|
|
return false;
|
|
}
|
|
|
|
public ReferenceOrder getOrder() {
|
|
return this.order;
|
|
}
|
|
|
|
protected boolean feedingIsFinished() {
|
|
return
|
|
this.feedersTerminated.intValue() > (this.remote ? 1 : 0) &&
|
|
this.feedersAlive.get() == 0;
|
|
}
|
|
|
|
/**
|
|
* method to signal the incoming stack that one feeder has terminated
|
|
*/
|
|
public void oneFeederTerminated() {
|
|
this.feedersTerminated.incrementAndGet();
|
|
final int c = this.feedersAlive.decrementAndGet();
|
|
assert c >= 0 : "feeders = " + c;
|
|
}
|
|
|
|
public void oneFeederStarted() {
|
|
this.feedersAlive.incrementAndGet();
|
|
}
|
|
|
|
public QueryParams getQuery() {
|
|
return this.query;
|
|
}
|
|
|
|
public int[] flagCount() {
|
|
return this.flagcount;
|
|
}
|
|
|
|
protected void addBegin() {
|
|
this.addRunning = true;
|
|
}
|
|
|
|
public void addFinalize() {
|
|
this.addRunning = false;
|
|
}
|
|
|
|
protected boolean addRunning() {
|
|
return this.addRunning;
|
|
}
|
|
|
|
public boolean rwiIsEmpty() {
|
|
if ( !this.rwiStack.isEmpty() ) {
|
|
return false;
|
|
}
|
|
for ( final WeakPriorityBlockingQueue<WordReferenceVars> s : this.doubleDomCache.values() ) {
|
|
if ( !s.isEmpty() ) {
|
|
return false;
|
|
}
|
|
}
|
|
return true;
|
|
}
|
|
|
|
protected int rwiQueueSize() {
|
|
int c = this.rwiStack.sizeQueue();
|
|
for ( final WeakPriorityBlockingQueue<WordReferenceVars> s : this.doubleDomCache.values() ) {
|
|
c += s.sizeQueue();
|
|
}
|
|
return c;
|
|
}
|
|
|
|
protected boolean testFlags(final Bitfield flags) {
|
|
if (this.query.constraint == null) return true;
|
|
// test if ientry matches with filter
|
|
// if all = true: let only entries pass that has all matching bits
|
|
// if all = false: let all entries pass that has at least one matching bit
|
|
if (this.query.allofconstraint) {
|
|
for ( int i = 0; i < 32; i++ ) {
|
|
if ((this.query.constraint.get(i)) && (!flags.get(i))) return false;
|
|
}
|
|
return true;
|
|
}
|
|
for (int i = 0; i < 32; i++) {
|
|
if ((this.query.constraint.get(i)) && (flags.get(i))) return true;
|
|
}
|
|
return false;
|
|
}
|
|
|
|
protected Map<byte[], ReferenceContainer<WordReference>> searchContainerMap() {
|
|
// direct access to the result maps is needed for abstract generation
|
|
// this is only available if execQuery() was called before
|
|
return this.localSearchInclusion;
|
|
}
|
|
|
|
public ScoreMap<String> getTopics(final int maxcount, final long maxtime) {
|
|
// create a list of words that had been computed by statistics over all
|
|
// words that appeared in the url or the description of all urls
|
|
final ScoreMap<String> result = new ConcurrentScoreMap<String>();
|
|
if ( this.ref.sizeSmaller(2) ) {
|
|
this.ref.clear(); // navigators with one entry are not useful
|
|
}
|
|
final Map<String, Float> counts = new HashMap<String, Float>();
|
|
final Iterator<String> i = this.ref.keys(false);
|
|
String word;
|
|
int c;
|
|
float q, min = Float.MAX_VALUE, max = Float.MIN_VALUE;
|
|
int ic = maxcount;
|
|
long timeout = maxtime == Long.MAX_VALUE ? Long.MAX_VALUE : System.currentTimeMillis() + maxtime;
|
|
while ( ic-- > 0 && i.hasNext() ) {
|
|
word = i.next();
|
|
if ( word == null ) {
|
|
continue;
|
|
}
|
|
c = this.query.getSegment().getWordCountGuess(word);
|
|
if ( c > 0 ) {
|
|
q = ((float) this.ref.get(word)) / ((float) c);
|
|
min = Math.min(min, q);
|
|
max = Math.max(max, q);
|
|
counts.put(word, q);
|
|
}
|
|
if (System.currentTimeMillis() > timeout) break;
|
|
}
|
|
if ( max > min ) {
|
|
for ( final Map.Entry<String, Float> ce : counts.entrySet() ) {
|
|
result.set(ce.getKey(), (int) (((double) maxcount) * (ce.getValue() - min) / (max - min)));
|
|
}
|
|
}
|
|
return this.ref;
|
|
}
|
|
|
|
// matches a non-empty run of the lowercase letters a-z; addTopic() uses it with
// matches() to accept only words that consist of these letters entirely
private final static Pattern lettermatch = Pattern.compile("[a-z]+");
|
|
|
|
public void addTopic(final String[] words) {
|
|
String word;
|
|
for ( final String w : words ) {
|
|
word = w.toLowerCase();
|
|
if ( word.length() > 2
|
|
&& "http_html_php_ftp_www_com_org_net_gov_edu_index_home_page_for_usage_the_and_zum_der_die_das_und_the_zur_bzw_mit_blog_wiki_aus_bei_off"
|
|
.indexOf(word) < 0
|
|
&& !this.query.getQueryGoal().containsInclude(word)
|
|
&& lettermatch.matcher(word).matches()
|
|
&& !Switchboard.badwords.contains(word)
|
|
&& !Switchboard.stopwords.contains(word) ) {
|
|
this.ref.inc(word);
|
|
}
|
|
}
|
|
}
|
|
|
|
protected void addTopics(final ResultEntry resultEntry) {
|
|
// take out relevant information for reference computation
|
|
if ((resultEntry.url() == null) || (resultEntry.title() == null)) return;
|
|
final String[] descrcomps = MultiProtocolURL.splitpattern.split(resultEntry.title().toLowerCase()); // words in the description
|
|
|
|
// add references
|
|
addTopic(descrcomps);
|
|
}
|
|
|
|
}
|