Mirror of https://github.com/yacy/yacy_search_server.git (synced 2024-09-19 00:01:41 +02:00)

Commit 535f1ebe3b: date navigation

The date is taken from the CONTENT of the documents / web pages, NOT from a date submitted in the context of metadata (i.e. http header or html head form). This makes it possible to search for documents 'in the future', i.e. for documents that contain event descriptions for future events.

The date is written to an index field which is now enabled by default. All documents are scanned for contained date mentions. To visualize the dates for a specific search result, a histogram showing the number of documents for each day is displayed. To render these histograms the morris.js library is used. Morris.js also requires raphael.js, which is now integrated in YaCy as well. The histogram is now also displayed in the index browser by default.

To select a specific range from a search result, the following modifiers have been introduced: from:<date> and to:<date>. These modifiers can be used separately (i.e. only 'from' or only 'to') to describe an open interval, or combined to form a closed interval. Both dates are inclusive. To select a specific single date only, use the 'to:' modifier.

The histogram shows blue and green lines; the green lines denote weekend days (Saturday and Sunday). Clicking on bars in the histogram has the following effect: 1st click: add a from:<date> modifier for the date of the bar; 2nd click: add a to:<date> modifier for the date of the bar; 3rd click: remove the from and to modifiers and set an on:<date> modifier for the bar. When the on:<date> modifier is used, the histogram shows an unlimited time period. This makes it possible to click again (4th click), which is then interpreted as a 1st click again (sets a from modifier).

The display feature is NOT switched on by default; to switch it on, use the /ConfigSearchPage_p.html servlet.
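To make the interval semantics concrete, here is a minimal standalone sketch of how such from:/to:/on: modifiers can be turned into an inclusive date range. This is not YaCy's actual implementation; the class name DateModifierSketch, the parseInterval method and the ISO yyyy-MM-dd date format are assumptions made only for this example.

// Minimal, self-contained sketch of the modifier semantics described in the
// commit message above. NOT YaCy's actual query parser; class name, method name
// and the ISO yyyy-MM-dd date format are assumptions for illustration only.
import java.time.LocalDate;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

public class DateModifierSketch {

    // matches modifiers such as from:2015-06-01, to:2015-06-30, on:2015-06-15
    private static final Pattern MODIFIER = Pattern.compile("\\b(from|to|on):(\\d{4}-\\d{2}-\\d{2})");

    /** Returns the inclusive interval [from, to] selected by the query modifiers. */
    public static LocalDate[] parseInterval(final String query) {
        LocalDate from = LocalDate.MIN; // no 'from' modifier: interval is open at the lower end
        LocalDate to = LocalDate.MAX;   // no 'to' modifier: interval is open at the upper end
        final Matcher m = MODIFIER.matcher(query);
        while (m.find()) {
            final LocalDate d = LocalDate.parse(m.group(2));
            switch (m.group(1)) {
                case "from": from = d; break;         // inclusive lower bound
                case "to":   to = d; break;           // inclusive upper bound
                case "on":   from = d; to = d; break; // single-day interval
            }
        }
        return new LocalDate[] { from, to };
    }

    public static void main(final String[] args) {
        final LocalDate[] range = parseInterval("concert from:2015-06-01 to:2015-06-30");
        System.out.println("select documents dated between " + range[0] + " and " + range[1] + " (inclusive)");
    }
}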
702 lines
42 KiB
Java
/**
 * HostBrowser
 * Copyright 2012 by Michael Peter Christen, mc@yacy.net, Frankfurt am Main, Germany
 * First released 27.09.2012 at http://yacy.net
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with this program in the file lgpl21.txt
 * If not, see <http://www.gnu.org/licenses/>.
 */

import java.io.IOException;
import java.net.MalformedURLException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Date;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;
import java.util.TreeMap;
import java.util.concurrent.BlockingQueue;
import java.util.regex.Pattern;

import org.apache.solr.common.SolrDocument;

import net.yacy.cora.document.encoding.ASCII;
import net.yacy.cora.document.encoding.UTF8;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.document.id.MultiProtocolURL;
import net.yacy.cora.federate.solr.FailType;
import net.yacy.cora.federate.solr.SolrType;
import net.yacy.cora.federate.solr.connector.AbstractSolrConnector;
import net.yacy.cora.protocol.RequestHeader;
import net.yacy.cora.sorting.ClusteredScoreMap;
import net.yacy.cora.sorting.ReversibleScoreMap;
import net.yacy.cora.storage.HandleSet;
import net.yacy.cora.util.ConcurrentLog;
import net.yacy.crawler.HarvestProcess;
import net.yacy.crawler.data.CrawlProfile;
import net.yacy.crawler.data.NoticedURL.StackType;
import net.yacy.crawler.retrieval.Request;
import net.yacy.kelondro.data.meta.URIMetadataNode;
import net.yacy.peers.graphics.WebStructureGraph.StructureEntry;
import net.yacy.search.Switchboard;
import net.yacy.search.SwitchboardConstants;
import net.yacy.search.index.Fulltext;
import net.yacy.search.index.Segment.ReferenceReport;
import net.yacy.search.index.Segment.ReferenceReportCache;
import net.yacy.search.query.QueryParams;
import net.yacy.search.schema.CollectionSchema;
import net.yacy.server.serverObjects;
import net.yacy.server.serverSwitch;

public class HostBrowser {

    final static long TIMEOUT = 10000L;

    public static enum StoreType {
        LINK, INDEX, EXCLUDED, FAILED, RELOAD;
    }

    @SuppressWarnings({ "unchecked" })
    public static serverObjects respond(final RequestHeader header, final serverObjects post, final serverSwitch env) {
        // return variable that accumulates replacements
        final Switchboard sb = (Switchboard) env;
        Fulltext fulltext = sb.index.fulltext();
        final boolean authorized = sb.verifyAuthentication(header);
        final boolean autoload = authorized && sb.getConfigBool("browser.autoload", true);
        final boolean load4everyone = sb.getConfigBool("browser.load4everyone", false);
        final boolean loadRight = autoload || load4everyone; // add config later
        final boolean searchAllowed = sb.getConfigBool(SwitchboardConstants.PUBLIC_SEARCHPAGE, true) || authorized;

        final serverObjects prop = new serverObjects();

        // set default values
        prop.put("path", "");
        prop.put("result", "");
        prop.put("hosts", 0);
        prop.put("files", 0);
        prop.put("hostanalysis", 0);

        prop.put("admin", "false");
        boolean admin = false;

        String referer = header.get("Referer", "");
        if ((post != null && post.getBoolean("admin")) || referer.contains("HostBrowser.html?admin=true")) {
            prop.put("topmenu", 2);
            prop.put("admin", "true");
            admin = true;
        } else if (authorized) { // show top nav to admins
            prop.put("topmenu", 1);
        } else { // for other respect setting in Search Design Configuration
            prop.put("topmenu", sb.getConfigBool("publicTopmenu", true) ? 1 : 0);
        }
        final String promoteSearchPageGreeting =
                (env.getConfigBool(SwitchboardConstants.GREETING_NETWORK_NAME, false)) ?
                env.getConfig("network.unit.description", "") :
                env.getConfig(SwitchboardConstants.GREETING, "");
        prop.put("topmenu_promoteSearchPageGreeting", promoteSearchPageGreeting);

        if (!searchAllowed) {
            prop.put("result", "You are not allowed to use this page. Please ask an administrator for permission.");
            prop.putNum("ucount", 0);
            return prop;
        }

        String path = post == null ? "" : post.get("path", "").trim();
        if (authorized) sb.index.fulltext().commit(true);
        if (post == null || env == null) {
            prop.putNum("ucount", fulltext.collectionSize());
            return prop;
        }

        int p = path.lastIndexOf('/');
        if (p < 0 && path.length() > 0) path = path + "/"; else if (p > 7) path = path.substring(0, p + 1); // the search path shall always end with "/"
        if (path.length() > 0 && (
                !path.startsWith("http://") &&
                !path.startsWith("https://") &&
                !path.startsWith("ftp://") &&
                !path.startsWith("smb://") &&
                !path.startsWith("file://"))) { path = "http://" + path; }
        prop.putHTML("path", path);
        prop.put("delete", authorized && path.length() > 0 ? 1 : 0);

        DigestURL pathURI = null;
        try {pathURI = new DigestURL(path);} catch (final MalformedURLException e) {}

        String load = post.get("load", "");
        boolean wait = false;
        try {
            if (loadRight && autoload && path.length() != 0 && pathURI != null && load.length() == 0 && sb.index.getLoadTime(ASCII.String(pathURI.hash())) < 0) {
                // in case that the url does not exist and loading is wanted turn this request into a loading request
                load = path;
                wait = true;
            }
        } catch (IOException e1) {
            load = path;
            wait = true;
        }
        if (load.length() > 0 && loadRight) {
            // stack URL
            DigestURL url;
            if (sb.crawlStacker.size() > 2) wait = false;
            try {
                url = new DigestURL(load);
                String reasonString = sb.crawlStacker.stackCrawl(new Request(
                        sb.peers.mySeed().hash.getBytes(),
                        url, null, load, new Date(),
                        sb.crawler.defaultProxyProfile.handle(),
                        0
                        ));
                prop.putHTML("result", reasonString == null ? ("added url to indexer: " + load) : ("not indexed url '" + load + "': " + reasonString));
                if (wait) waitloop: for (int i = 0; i < 30; i++) {
                    try {
                        if (sb.index.getLoadTime(ASCII.String(url.hash())) >= 0) break;
                    } catch (IOException e1) {
                        e1.printStackTrace();
                        break waitloop;
                    }
                    try {Thread.sleep(100);} catch (final InterruptedException e) {}
                }
            } catch (final MalformedURLException e) {
                prop.putHTML("result", "bad url '" + load + "'");
            }
        }

        if (authorized && post.containsKey("deleteLoadErrors")) {
            try {
                fulltext.getDefaultConnector().deleteByQuery("-" + CollectionSchema.httpstatus_i.getSolrFieldName() + ":200 AND "
                        + CollectionSchema.httpstatus_i.getSolrFieldName() + AbstractSolrConnector.CATCHALL_DTERM); // make sure field exists
                ConcurrentLog.info ("HostBrowser:", "delete documents with httpstatus_i <> 200");
                fulltext.getDefaultConnector().deleteByQuery(CollectionSchema.failtype_s.getSolrFieldName() + ":\"" + FailType.fail.name() + "\"" );
                ConcurrentLog.info ("HostBrowser:", "delete documents with failtype_s = fail");
                fulltext.getDefaultConnector().deleteByQuery(CollectionSchema.failtype_s.getSolrFieldName() + ":\"" + FailType.excl.name() + "\"" );
                ConcurrentLog.info ("HostBrowser:", "delete documents with failtype_s = excl");
                prop.putNum("ucount", fulltext.collectionSize());
                return prop;
            } catch (final IOException ex) {
                ConcurrentLog.logException(ex);
            }
        }

        if (post.containsKey("hosts")) {
            // generate host list
            try {
                boolean onlyCrawling = "crawling".equals(post.get("hosts", ""));
                boolean onlyErrors = "error".equals(post.get("hosts", ""));

                int maxcount = authorized ? 2 * 3 * 2 * 5 * 7 * 2 * 3 : 360; // which makes nice matrices for 2, 3, 4, 5, 6, 7, 8, 9 rows/columns

                // collect hosts from index
                ReversibleScoreMap<String> hostscore = fulltext.getDefaultConnector().getFacets(AbstractSolrConnector.CATCHALL_QUERY, maxcount, CollectionSchema.host_s.getSolrFieldName()).get(CollectionSchema.host_s.getSolrFieldName());
                if (hostscore == null) hostscore = new ClusteredScoreMap<String>(true);

                // collect hosts from crawler
                final Map<String, Integer[]> crawler = (authorized) ? sb.crawlQueues.noticeURL.getDomainStackHosts(StackType.LOCAL, sb.robots) : new HashMap<String, Integer[]>();

                // collect the errorurls
                Map<String, ReversibleScoreMap<String>> exclfacets = authorized ? fulltext.getDefaultConnector().getFacets(CollectionSchema.failtype_s.getSolrFieldName() + ":" + FailType.excl.name(), maxcount, CollectionSchema.host_s.getSolrFieldName()) : null;
                ReversibleScoreMap<String> exclscore = exclfacets == null ? new ClusteredScoreMap<String>(true) : exclfacets.get(CollectionSchema.host_s.getSolrFieldName());
                Map<String, ReversibleScoreMap<String>> failfacets = authorized ? fulltext.getDefaultConnector().getFacets(CollectionSchema.failtype_s.getSolrFieldName() + ":" + FailType.fail.name(), maxcount, CollectionSchema.host_s.getSolrFieldName()) : null;
                ReversibleScoreMap<String> failscore = failfacets == null ? new ClusteredScoreMap<String>(true) : failfacets.get(CollectionSchema.host_s.getSolrFieldName());

                int c = 0;
                Iterator<String> i = hostscore.keys(false);
                String host;
                while (i.hasNext() && c < maxcount) {
                    host = i.next();
                    prop.put("hosts_list_" + c + "_admin", admin ? "true" : "false");
                    prop.putHTML("hosts_list_" + c + "_host", host);
                    boolean inCrawler = crawler.containsKey(host);
                    int exclcount = exclscore.get(host);
                    int failcount = failscore.get(host);
                    int errors = exclcount + failcount;
                    prop.put("hosts_list_" + c + "_count", hostscore.get(host) - errors);
                    prop.put("hosts_list_" + c + "_crawler", inCrawler ? 1 : 0);
                    if (inCrawler) prop.put("hosts_list_" + c + "_crawler_pending", crawler.get(host)[0]);
                    prop.put("hosts_list_" + c + "_errors", errors > 0 ? 1 : 0);
                    if (errors > 0) {
                        prop.put("hosts_list_" + c + "_errors_exclcount", exclcount);
                        prop.put("hosts_list_" + c + "_errors_failcount", failcount);
                    }
                    prop.put("hosts_list_" + c + "_type", inCrawler ? 2 : errors > 0 ? 1 : 0);
                    if (onlyCrawling) {
                        if (inCrawler) c++;
                    } else if (onlyErrors) {
                        if (errors > 0) c++;
                    } else {
                        c++;
                    }
                }
                prop.put("hosts_list", c);
                prop.put("hosts", 1);
            } catch (final IOException e) {
                ConcurrentLog.logException(e);
            }
        }

        if (path.length() > 0) {
            try {
                DigestURL uri = new DigestURL(path);
                String host = uri.getHost();

                // write host analysis if path after host is empty
                if (uri.getPath().length() <= 1 && host != null && host.length() > 0 && sb.getConfigBool("decoration.hostanalysis", false)) {
                    //how many documents per crawldepth_i; get crawldepth_i facet for host
                    ArrayList<String> ff = new ArrayList<>();
                    for (CollectionSchema csf: CollectionSchema.values()) {
                        if ((csf.getType() != SolrType.num_integer && csf.getType() != SolrType.num_long) || csf.isMultiValued()) continue;
                        String facetfield = csf.getSolrFieldName();
                        if (!fulltext.getDefaultConfiguration().contains(facetfield)) continue;
                        ff.add(csf.getSolrFieldName());
                    }
                    // add also vocabulary counters
                    Map<String, ReversibleScoreMap<String>> vocabularyFacet = sb.index.fulltext().getDefaultConnector().getFacets(CollectionSchema.vocabularies_sxt.getSolrFieldName() + AbstractSolrConnector.CATCHALL_DTERM, 100, CollectionSchema.vocabularies_sxt.getSolrFieldName());
                    if (vocabularyFacet.size() > 0) {
                        Collection<String> vocnames = vocabularyFacet.values().iterator().next().keyList(true);
                        for (String vocname: vocnames) {
                            ff.add(CollectionSchema.VOCABULARY_PREFIX + vocname + CollectionSchema.VOCABULARY_LOGCOUNT_SUFFIX);
                            ff.add(CollectionSchema.VOCABULARY_PREFIX + vocname + CollectionSchema.VOCABULARY_LOGCOUNTS_SUFFIX);
                        }
                    }
                    // list the facets
                    String[] facetfields = ff.toArray(new String[ff.size()]);
                    Map<String, ReversibleScoreMap<String>> facets = fulltext.getDefaultConnector().getFacets(CollectionSchema.host_s.getSolrFieldName() + ":\"" + host + "\"", 100, facetfields);
                    int fc = 0;
                    for (Map.Entry<String, ReversibleScoreMap<String>> facetentry: facets.entrySet()) {
                        ReversibleScoreMap<String> facetfieldmap = facetentry.getValue();
                        if (facetfieldmap.size() == 0) continue;
                        TreeMap<Long, Integer> statMap = new TreeMap<>();
                        for (String k: facetfieldmap) statMap.put(Long.parseLong(k), facetfieldmap.get(k));
                        prop.put("hostanalysis_facets_" + fc + "_facetname", facetentry.getKey());
                        int c = 0; for (Entry<Long, Integer> entry: statMap.entrySet()) {
                            prop.put("hostanalysis_facets_" + fc + "_facet_" + c + "_key", entry.getKey());
                            prop.put("hostanalysis_facets_" + fc + "_facet_" + c + "_count", entry.getValue());
                            prop.put("hostanalysis_facets_" + fc + "_facet_" + c + "_a", "http://localhost:" + sb.getConfigInt("port", 8090) + "/solr/collection1/select?q=host_s:" + host + " AND " + facetentry.getKey() + ":" + entry.getKey() + "&defType=edismax&start=0&rows=1000&fl=sku,crawldepth_i");
                            c++;
                        }
                        prop.put("hostanalysis_facets_" + fc + "_facet", c);
                        fc++;
                    }
                    prop.put("hostanalysis_facets", fc);
                    prop.put("hostanalysis", 1);
                }

                // write file list for subpath
                boolean delete = false;
                boolean reload404 = false;
                if (authorized && post.containsKey("delete")) {
                    // delete the complete path!! That includes everything that matches with this prefix.
                    delete = true;
                }
                if (authorized && post.containsKey("reload404")) {
                    // try to re-load all urls that have load errors and matches with this prefix.
                    reload404 = true;
                }
                int facetcount = post.getInt("facetcount", 0);
                boolean complete = post.getBoolean("complete");
                if (complete) { // we want only root paths for complete lists
                    p = path.indexOf('/', 10);
                    if (p > 0) path = path.substring(0, p + 1);
                }
                prop.put("files_complete", complete ? 1 : 0);
                prop.put("files_complete_admin", admin ? "true" : "false");
                prop.putHTML("files_complete_path", path);
                p = path.substring(0, path.length() - 1).lastIndexOf('/');
                if (p < 8) {
                    prop.put("files_root", 1);
                } else {
                    prop.put("files_root", 0);
                    prop.putHTML("files_root_path", path.substring(0, p + 1));
                    prop.put("files_root_admin", admin ? "true" : "false");
                }
                // generate file list from path
                prop.putHTML("outbound_host", host);
                if (authorized) prop.putHTML("outbound_admin_host", host); //used for WebStructurePicture_p link
                prop.putHTML("inbound_host", host);
                String hosthash = ASCII.String(uri.hash(), 6, 6);
                String[] pathparts = uri.getPaths();

                // get all files for a specific host from the index
                StringBuilder q = new StringBuilder();
                if (host == null) {
                    if (path.startsWith("file://")) {
                        q.append(CollectionSchema.url_protocol_s.getSolrFieldName()).append(":file");
                    }
                } else {
                    q.append(CollectionSchema.host_s.getSolrFieldName()).append(":\"").append(host).append("\"");
                }
                if (pathparts.length > 0 && pathparts[0].length() > 0) {
                    for (String pe: pathparts) {
                        if (pe.length() > 0) q.append(" AND ").append(CollectionSchema.url_paths_sxt.getSolrFieldName()).append(":\"").append(pe).append('\"');
                    }
                } else {
                    if (facetcount > 1000 || post.containsKey("nepr")) {
                        q.append(" AND ").append(CollectionSchema.url_paths_sxt.getSolrFieldName()).append(AbstractSolrConnector.CATCHALL_DTERM);
                    }
                }
                BlockingQueue<SolrDocument> docs = fulltext.getDefaultConnector().concurrentDocumentsByQuery(q.toString(), CollectionSchema.url_chars_i.getSolrFieldName() + " asc", 0, 100000, TIMEOUT, 100, 1, false,
                        CollectionSchema.id.getSolrFieldName(),
                        CollectionSchema.sku.getSolrFieldName(),
                        CollectionSchema.failreason_s.getSolrFieldName(),
                        CollectionSchema.failtype_s.getSolrFieldName(),
                        CollectionSchema.inboundlinks_protocol_sxt.getSolrFieldName(),
                        CollectionSchema.inboundlinks_urlstub_sxt.getSolrFieldName(),
                        CollectionSchema.outboundlinks_protocol_sxt.getSolrFieldName(),
                        CollectionSchema.outboundlinks_urlstub_sxt.getSolrFieldName(),
                        CollectionSchema.crawldepth_i.getSolrFieldName(),
                        CollectionSchema.references_i.getSolrFieldName(),
                        CollectionSchema.references_internal_i.getSolrFieldName(),
                        CollectionSchema.references_external_i.getSolrFieldName(),
                        CollectionSchema.references_exthosts_i.getSolrFieldName(),
                        CollectionSchema.cr_host_chance_d.getSolrFieldName(),
                        CollectionSchema.cr_host_norm_i.getSolrFieldName()
                        );
                SolrDocument doc;
                Set<String> storedDocs = new HashSet<String>();
                Map<String, FailType> errorDocs = new HashMap<String, FailType>();
                Set<String> inboundLinks = new HashSet<String>();
                Map<String, ReversibleScoreMap<String>> outboundHosts = new HashMap<String, ReversibleScoreMap<String>>();
                Map<String, InfoCacheEntry> infoCache = new HashMap<String, InfoCacheEntry>();
                int hostsize = 0;
                final List<String> deleteIDs = new ArrayList<String>();
                final Collection<String> reloadURLs = new ArrayList<String>();
                final Set<String> reloadURLCollection = new HashSet<String>();
                long timeoutList = System.currentTimeMillis() + TIMEOUT;
                long timeoutReferences = System.currentTimeMillis() + 6000;
                ReferenceReportCache rrCache = sb.index.getReferenceReportCache();
                while ((doc = docs.take()) != AbstractSolrConnector.POISON_DOCUMENT) {
                    String u = (String) doc.getFieldValue(CollectionSchema.sku.getSolrFieldName());
                    String errortype = (String) doc.getFieldValue(CollectionSchema.failtype_s.getSolrFieldName());
                    FailType error = errortype == null ? null : FailType.valueOf(errortype);
                    String ids = (String) doc.getFieldValue(CollectionSchema.id.getSolrFieldName());
                    infoCache.put(ids, new InfoCacheEntry(sb.index.fulltext(), rrCache, doc, ids, System.currentTimeMillis() < timeoutReferences));
                    if (u.startsWith(path)) {
                        if (delete) {
                            deleteIDs.add(ids);
                        } else {
                            if (error == null) storedDocs.add(u); else {
                                if (reload404 && error == FailType.fail) {
                                    ArrayList<String> collections = (ArrayList<String>) doc.getFieldValue(CollectionSchema.collection_sxt.getSolrFieldName());
                                    if (collections != null) reloadURLCollection.addAll(collections);
                                    reloadURLs.add(u);
                                }
                                if (authorized) errorDocs.put(u, error);
                            }
                        }
                    } else if (complete) {
                        if (error == null) storedDocs.add(u); else {
                            if (authorized) errorDocs.put(u, error);
                        }
                    }
                    if ((complete || u.startsWith(path)) && !storedDocs.contains(u)) inboundLinks.add(u); // add the current link
                    if (error == null) {
                        hostsize++;
                        // collect inboundlinks to browse the host
                        Iterator<String> links = URIMetadataNode.getLinks(doc, true);
                        while (links.hasNext()) {
                            u = links.next();
                            if ((complete || u.startsWith(path)) && !storedDocs.contains(u)) inboundLinks.add(u);
                        }

                        // collect referrer links
                        links = URIMetadataNode.getLinks(doc, false);
                        while (links.hasNext()) {
                            u = links.next();
                            try {
                                MultiProtocolURL mu = new MultiProtocolURL(u);
                                if (mu.getHost() != null) {
                                    ReversibleScoreMap<String> lks = outboundHosts.get(mu.getHost());
                                    if (lks == null) {
                                        lks = new ClusteredScoreMap<String>(UTF8.insensitiveUTF8Comparator);
                                        outboundHosts.put(mu.getHost(), lks);
                                    }
                                    lks.set(u, u.length());
                                }
                            } catch (final MalformedURLException e) {}
                        }
                    }
                    if (System.currentTimeMillis() > timeoutList) break;
                }
                if (deleteIDs.size() > 0) sb.remove(deleteIDs);
                if (reloadURLs.size() > 0) {
                    final Map<String, Pattern> cm = new LinkedHashMap<String, Pattern>();
                    for (String collection: reloadURLCollection) cm.put(collection, QueryParams.catchall_pattern);
                    sb.reload(reloadURLs, cm.size() > 0 ? cm : CrawlProfile.collectionParser("user"), false);
                }

                // collect from crawler
                List<Request> domainStackReferences = (authorized) ? sb.crawlQueues.noticeURL.getDomainStackReferences(StackType.LOCAL, host, 1000, 3000) : new ArrayList<Request>(0);
                Set<String> loadingLinks = new HashSet<String>();
                for (Request crawlEntry: domainStackReferences) loadingLinks.add(crawlEntry.url().toNormalform(true));

                // now combine all lists into one
                Map<String, StoreType> files = new HashMap<String, StoreType>();
                for (String u: storedDocs) files.put(u, StoreType.INDEX);
                for (Map.Entry<String, FailType> e: errorDocs.entrySet()) files.put(e.getKey(), e.getValue() == FailType.fail ? StoreType.FAILED : StoreType.EXCLUDED);
                for (String u: inboundLinks) if (!files.containsKey(u)) files.put(u, StoreType.LINK);
                for (String u: loadingLinks) if (u.startsWith(path) && !files.containsKey(u)) files.put(u, StoreType.LINK);
                ConcurrentLog.info("HostBrowser", "collected " + files.size() + " urls for path " + path);

                // distinguish files and folders
                Map<String, Object> list = new TreeMap<String, Object>(); // a directory list; if the value is a StoreType, it is a file; if it is an int[], it is a folder
                int pl = path.length();
                String file;
                for (Map.Entry<String, StoreType> entry: files.entrySet()) {
                    if (entry.getKey().length() < pl) continue; // this is not inside the path
                    if (!entry.getKey().startsWith(path)) continue;
                    file = entry.getKey().substring(pl);
                    StoreType type = entry.getValue();
                    p = file.indexOf('/');
                    if (p < 0) {
                        // this is a file
                        list.put(entry.getKey(), type); // StoreType value: this is a file; INDEX -> in the index; LINK -> only linked or in the crawler; FAILED/EXCLUDED -> load error
                    } else {
                        // this is a directory path or a file in a subdirectory
                        String remainingPath = file.substring(0, p + 1);
                        if (complete && remainingPath.indexOf('.') > 0) {
                            list.put(entry.getKey(), type); // StoreType value: this is a file
                        } else {
                            String dir = path + remainingPath;
                            Object c = list.get(dir);
                            if (c == null) {
                                int[] linkedStoredIncrawlerError = new int[]{0,0,0,0,0};
                                if (type == StoreType.LINK) linkedStoredIncrawlerError[0]++;
                                if (type == StoreType.INDEX) linkedStoredIncrawlerError[1]++;
                                if (loadingLinks.contains(entry.getKey())) linkedStoredIncrawlerError[2]++;
                                if (errorDocs.containsKey(entry.getKey())) linkedStoredIncrawlerError[errorDocs.get(entry.getKey()) == FailType.excl ? 3 : 4]++;
                                list.put(dir, linkedStoredIncrawlerError);
                            } else if (c instanceof int[]) {
                                if (type == StoreType.LINK) ((int[]) c)[0]++;
                                if (type == StoreType.INDEX) ((int[]) c)[1]++;
                                if (loadingLinks.contains(entry.getKey())) ((int[]) c)[2]++;
                                if (errorDocs.containsKey(entry.getKey())) ((int[]) c)[errorDocs.get(entry.getKey()) == FailType.excl ? 3 : 4]++;
                            }
                        }
                    }
                }

                int maxcount = 1000;
                int c = 0;
                // first list only folders
                int filecounter = 0;
                for (Map.Entry<String, Object> entry: list.entrySet()) {
                    if ((entry.getValue() instanceof StoreType)) {
                        filecounter++;
                    } else {
                        // this is a folder
                        prop.put("files_list_" + c + "_type", 1);
                        prop.putHTML("files_list_" + c + "_type_url", entry.getKey());
                        prop.putHTML("files_list_" + c + "_type_admin", admin ? "true" : "false");
                        int linked = ((int[]) entry.getValue())[0];
                        int stored = ((int[]) entry.getValue())[1];
                        int crawler = ((int[]) entry.getValue())[2];
                        int excl = ((int[]) entry.getValue())[3];
                        int error = ((int[]) entry.getValue())[4];
                        prop.put("files_list_" + c + "_type_stored", stored);
                        prop.put("files_list_" + c + "_type_linked", linked);
                        prop.put("files_list_" + c + "_type_pendingVisible", crawler > 0 ? 1 : 0);
                        prop.put("files_list_" + c + "_type_pending", crawler);
                        prop.put("files_list_" + c + "_type_excludedVisible", excl > 0 ? 1 : 0);
                        prop.put("files_list_" + c + "_type_excluded", excl);
                        prop.put("files_list_" + c + "_type_failedVisible", error > 0 ? 1 : 0);
                        prop.put("files_list_" + c + "_type_failed", error);
                        if (++c >= maxcount) break;
                    }
                }
                // then list files
                for (Map.Entry<String, Object> entry: list.entrySet()) {
                    if (entry.getValue() instanceof StoreType) {
                        // this is a file
                        prop.put("files_list_" + c + "_type", 0);
                        prop.putHTML("files_list_" + c + "_type_url", entry.getKey());
                        prop.putHTML("files_list_" + c + "_type_admin", admin ? "true" : "false");
                        StoreType type = (StoreType) entry.getValue();
                        try {uri = new DigestURL(entry.getKey());} catch (final MalformedURLException e) {uri = null;}
                        HarvestProcess process = uri == null ? null : sb.crawlQueues.exists(uri.hash()); // todo: cannot identify errors
                        boolean loading = load.equals(entry.getKey()) || (process != null && process != HarvestProcess.ERRORS);
                        boolean error = process == HarvestProcess.ERRORS || type == StoreType.EXCLUDED || type == StoreType.FAILED;
                        boolean dc = type != StoreType.INDEX && !error && !loading && list.containsKey(entry.getKey() + "/");
                        if (!dc) {
                            prop.put("files_list_" + c + "_type_stored", type == StoreType.INDEX ? 1 : error ? 3 : loading ? 2 : 0 /*linked*/);
                            if (type == StoreType.INDEX) {
                                String ids = ASCII.String(uri.hash());
                                InfoCacheEntry ice = infoCache.get(ids);
                                prop.put("files_list_" + c + "_type_stored_comment", ice == null ? "" : ice.toString()); // ice.toString() contains html, therefore do not use putHTML here
                            }
                            prop.put("files_list_" + c + "_type_stored_load", loadRight ? 1 : 0);
                            if (error) {
                                FailType failType = errorDocs.get(entry.getKey());
                                if (failType == null) {
                                    // maybe this is only in the errorURL
                                    //Metadata faildoc = sb.index.fulltext().getDefaultConnector().getMetadata(ASCII.String(uri.hash()));
                                    prop.putHTML("files_list_" + c + "_type_stored_error", "unknown error");
                                } else {
                                    String ids = ASCII.String(uri.hash());
                                    InfoCacheEntry ice = infoCache.get(ids);
                                    prop.put("files_list_" + c + "_type_stored_error", failType == FailType.excl ? "excluded from indexing" : "load fail" + (ice == null ? "" : "; " + ice.toString()));
                                }
                            }
                            if (loadRight) {
                                prop.putHTML("files_list_" + c + "_type_stored_load_url", entry.getKey());
                                prop.putHTML("files_list_" + c + "_type_stored_load_path", path);
                            }
                            if (++c >= maxcount) break;
                        }
                    }
                }
                prop.put("files_list", c);
                prop.putHTML("files_path", path);
                prop.put("files_hostsize", hostsize);
                prop.put("files_subpathloadsize", storedDocs.size());
                prop.put("files_subpathdetectedsize", filecounter - storedDocs.size());
                prop.put("files", 1);
                uri = new DigestURL(path);
                if (post.containsKey("showlinkstructure")) {
                    sb.setConfig(SwitchboardConstants.DECORATION_GRAFICS_LINKSTRUCTURE, true);
                }
                prop.put("files_linkgraph", uri.getPath().length() <= 1 && hostsize > 0 && sb.getConfigBool(SwitchboardConstants.DECORATION_GRAFICS_LINKSTRUCTURE, true));
                prop.put("files_linkgraph_host", uri.getHost());

                // generate inbound-links table
                StructureEntry struct = sb.webStructure.incomingReferences(hosthash);
                if (struct != null && struct.references.size() > 0) {
                    maxcount = 200;
                    ReversibleScoreMap<String> score = new ClusteredScoreMap<String>(UTF8.insensitiveUTF8Comparator);
                    for (Map.Entry<String, Integer> entry: struct.references.entrySet()) score.set(entry.getKey(), entry.getValue());
                    c = 0;
                    Iterator<String> i = score.keys(false);
                    while (i.hasNext() && c < maxcount) {
                        host = i.next();
                        prop.put("inbound_list_" + c + "_admin", admin ? "true" : "false");
                        prop.putHTML("inbound_list_" + c + "_host", sb.webStructure.hostHash2hostName(host));
                        prop.put("inbound_list_" + c + "_count", score.get(host));
                        c++;
                    }
                    prop.put("inbound_list", c);
                    prop.put("inbound", 1);
                } else {
                    prop.put("inbound", 0);
                }

                // generate outbound-links table
                if (outboundHosts.size() > 0) {
                    maxcount = 200;
                    ReversibleScoreMap<String> score = new ClusteredScoreMap<String>(UTF8.insensitiveUTF8Comparator);
                    for (Map.Entry<String, ReversibleScoreMap<String>> entry: outboundHosts.entrySet()) score.set(entry.getKey(), entry.getValue().size());
                    c = 0;
                    Iterator<String> i = score.keys(false);
                    while (i.hasNext() && c < maxcount) {
                        host = i.next();
                        prop.putHTML("outbound_list_" + c + "_host", host);
                        prop.put("outbound_list_" + c + "_count", score.get(host));
                        prop.put("outbound_list_" + c + "_link", outboundHosts.get(host).getMinKey());
                        prop.put("outbound_list_" + c + "_admin", admin ? "true" : "false");
                        c++;
                    }
                    prop.put("outbound_list", c);
                    prop.put("outbound", 1);
                } else {
                    prop.put("outbound", 0);
                }

            } catch (final Throwable e) {
                ConcurrentLog.logException(e);
            }
        }

        // return rewrite properties
        prop.putNum("ucount", fulltext.collectionSize());
        return prop;
    }

    public static final class InfoCacheEntry {
        public Integer cr_n;
        public Double cr_c;
        public int crawldepth, references, references_internal, references_external, references_exthosts;
        public List<String> references_internal_urls, references_external_urls;
        public InfoCacheEntry(final Fulltext fulltext, final ReferenceReportCache rrCache, final SolrDocument doc, final String urlhash, boolean fetchReferences) {
            this.cr_c = (Double) doc.getFieldValue(CollectionSchema.cr_host_chance_d.getSolrFieldName());
            this.cr_n = (Integer) doc.getFieldValue(CollectionSchema.cr_host_norm_i.getSolrFieldName());
            Integer cr = (Integer) doc.getFieldValue(CollectionSchema.crawldepth_i.getSolrFieldName());
            Integer rc = (Integer) doc.getFieldValue(CollectionSchema.references_i.getSolrFieldName());
            Integer rc_internal = (Integer) doc.getFieldValue(CollectionSchema.references_internal_i.getSolrFieldName());
            Integer rc_external = (Integer) doc.getFieldValue(CollectionSchema.references_external_i.getSolrFieldName());
            Integer rc_exthosts = (Integer) doc.getFieldValue(CollectionSchema.references_exthosts_i.getSolrFieldName());
            this.crawldepth = (cr == null || cr.intValue() < 0) ? 0 : cr.intValue(); // for lazy value storage; non-existent means: stored as '0'
            this.references = (rc == null || rc.intValue() <= 0) ? 0 : rc.intValue();
            this.references_internal = (rc_internal == null || rc_internal.intValue() <= 0) ? 0 : rc_internal.intValue();
            // calculate the url reference list
            this.references_internal_urls = new ArrayList<String>();
            this.references_external_urls = new ArrayList<String>();
            if (fetchReferences) {
                // get the references from the citation index
                try {
                    ReferenceReport rr = rrCache.getReferenceReport(urlhash, false);
                    List<String> internalIDs = new ArrayList<String>();
                    List<String> externalIDs = new ArrayList<String>();
                    HandleSet iids = rr.getInternallIDs();
                    for (byte[] b: iids) internalIDs.add(ASCII.String(b));
                    HandleSet eids = rr.getExternalIDs();
                    for (byte[] b: eids) externalIDs.add(ASCII.String(b));
                    // get all urls from the index and store them here
                    for (String id: internalIDs) {
                        if (id.equals(urlhash)) continue; // no self-references
                        DigestURL u = fulltext.getURL(id);
                        if (u != null) references_internal_urls.add(u.toNormalform(true));
                    }
                    for (String id: externalIDs) {
                        if (id.equals(urlhash)) continue; // no self-references
                        DigestURL u = fulltext.getURL(id);
                        if (u != null) references_external_urls.add(u.toNormalform(true));
                    }
                } catch (final IOException e) {
                }
            }
            this.references_external = (rc_external == null || rc_external.intValue() <= 0) ? 0 : rc_external.intValue();
            this.references_exthosts = (rc_exthosts == null || rc_exthosts.intValue() <= 0) ? 0 : rc_exthosts.intValue();
        }
        @Override
        public String toString() {
            StringBuilder sbi = new StringBuilder();
            int c = 0;
            for (String s: references_internal_urls) {
                sbi.append("<a href='").append(s).append("' target='_blank'><img src='env/grafics/i16.gif' alt='info' title='" + s + "' width='12' height='12'/></a>");
                c++;
                if (c % 80 == 0) sbi.append("<br/>");
            }
            if (sbi.length() > 0) sbi.insert(0, "<br/>internal referrer:</br>");
            StringBuilder sbe = new StringBuilder();
            c = 0;
            for (String s: references_external_urls) {
                sbe.append("<a href='").append(s).append("' target='_blank'><img src='env/grafics/i16.gif' alt='info' title='" + s + "' width='12' height='12'/></a>");
                c++;
                if (c % 80 == 0) sbe.append("<br/>");
            }
            if (sbe.length() > 0) sbe.insert(0, "<br/>external referrer:</br>");
            return
                (this.crawldepth == 998 ? "unknown crawldepth" : this.crawldepth >= 0 ? "crawldepth: " + this.crawldepth : "") +
                (this.cr_c != null ? ", cr=" + (Math.round(this.cr_c * 1000.0d) / 1000.0d) : "") +
                (this.cr_n != null ? ", crn=" + this.cr_n : "") +
                (this.references >= 0 ? ", refs: " + this.references_exthosts + " hosts, " + this.references_external + " ext, " + this.references_internal + " int" + sbi.toString() + sbe.toString() : "");
        }
    }

}