yacy_search_server/htroot/CrawlStartExpert.java
Michael Peter Christen 535f1ebe3b added a new way of content browsing in search results:
- date navigation

The date is taken from the CONTENT of the documents / web pages, NOT
from a date submitted in the context of metadata (i.e. http header or
html head form). This makes it possible to search for documents in the
future, i.e. when documents contain event descriptions for future
events.

The date is written to an index field which is now enabled by default.
All documents are scanned for contained date mentions.
To visualize the dates for a specific search result, a histogram
showing the number of documents for each day is displayed. To render
these histograms the morris.js library is used. Morris.js requires also
raphael.js which is now also integrated in YaCy.

The histogram is now also displayed in the index browser by default.

To select a specific range from a search result, the following modifiers
have been introduced:
from:<date>
to:<date>
These modifiers can be used separately (i.e. only 'from' or only 'to')
to describe an open interval or combined to have a closed interval. Both
dates are inclusive. To select a specific single date only, use the
'to:' - modifier.

The histogram shows blue and green lines; the green lines denote weekend
days (Saturday and Sunday).

Clicking on bars in the histogram has the following reaction:
1st click: add a from:<date> modifier for the date of the bar
2nd click: add a to:<date> modifier for the date of the bar
3rd click: remove the from and to modifiers and set an on:<date> modifier for the bar
When the on:<date> modifier is used, the histogram shows an unlimited
time period. This makes it possible to click again (4th click) which is
then interpreted as a 1st click again (sets a from modifier).

The display feature is NOT switched on by default; to switch it on use
the /ConfigSearchPage_p.html servlet.
2015-03-02 04:30:10 +01:00

594 lines
25 KiB
Java

// CrawlStartExpert.java
// (C) 2004 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany
// first published 02.12.2004 as IndexCreate_p.java on http://yacy.net
//
// This is a part of YaCy, a peer-to-peer based web search engine
//
// $LastChangedDate: 2010-08-23 14:32:02 +0200 (Mo, 23 Aug 2010) $
// $LastChangedRevision: 7068 $
// $LastChangedBy: orbiter $
//
// LICENSE
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
import java.util.ArrayList;
import java.util.Collection;
import java.util.List;
import net.yacy.cora.lod.vocabulary.Tagging;
import net.yacy.cora.protocol.ClientIdentification;
import net.yacy.cora.protocol.RequestHeader;
import net.yacy.cora.util.Html2Image;
import net.yacy.crawler.data.CrawlProfile;
import net.yacy.document.LibraryProvider;
import net.yacy.search.Switchboard;
import net.yacy.search.schema.CollectionSchema;
import net.yacy.server.serverObjects;
import net.yacy.server.serverSwitch;
/**
 * Fills the template properties of the expert crawl start form.
 *
 * <p>For every form field the value submitted in {@code post} is echoed back
 * into the returned property set; when the request carries no value, a
 * configured or built-in default is written instead.</p>
 */
public class CrawlStartExpert {

    /** Collection name written when the request does not name one. */
    private static final String DEFAULT_COLLECTION = "user";

    /**
     * Numbers offered in the "delete/reload if older than" select boxes,
     * in list order. The list index of a number is its position here.
     */
    private static final int[] AGE_CHOICES = {
        0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 14, 21, 28, 30
    };

    /** Index into {@link #AGE_CHOICES} that is pre-selected by default (the entry "14"). */
    private static final int AGE_DEFAULT_INDEX = 13;

    /** Display names of the age units, in list order. */
    private static final String[] AGE_UNIT_NAMES = {"years", "months", "days", "hours"};

    /** Form values of the age units, parallel to {@link #AGE_UNIT_NAMES}. */
    private static final String[] AGE_UNIT_VALUES = {"year", "month", "day", "hour"};

    /** Index into {@link #AGE_UNIT_VALUES} that is pre-selected by default ("day"). */
    private static final int AGE_UNIT_DEFAULT_INDEX = 2;

    /**
     * Builds the replacement properties for the crawl start form.
     *
     * @param header request header (not used)
     * @param post   submitted form values, or {@code null} when nothing was submitted
     * @param env    server environment; cast to {@link Switchboard} internally
     * @return the property set holding all template replacements
     */
    public static serverObjects respond(@SuppressWarnings("unused") final RequestHeader header, final serverObjects post, final serverSwitch env) {
        final Switchboard sb = (Switchboard) env;
        // return variable that accumulates replacements
        final serverObjects prop = new serverObjects();

        // javascript values
        prop.put("matchAllStr", CrawlProfile.MATCH_ALL_STRING);
        prop.put("matchNoneStr", CrawlProfile.MATCH_NEVER_STRING);
        prop.put("defaultCollection", DEFAULT_COLLECTION);

        putStartPoint(prop, post);
        putCrawlingFilter(prop, post, sb);
        putDocumentFilter(prop, post);
        putCleanUp(prop, post);
        putDoubleCheckRules(prop, post);
        putDocumentCache(prop, post, sb);
        putAgentSelect(prop, post, sb);
        putVocabularySelect(prop, post);
        putSnapshotSettings(prop, post, sb);
        putIndexAdministration(prop, post, sb);

        // return rewrite properties
        return prop;
    }

    // ---------- generic helpers

    /** Tells whether a request exists and carries the given parameter. */
    private static boolean hasParam(final serverObjects post, final String key) {
        return post != null && post.containsKey(key);
    }

    /** Returns the posted value of {@code key}, or {@code fallback} when the parameter is absent. */
    private static String postedOr(final serverObjects post, final String key, final String fallback) {
        return hasParam(post, key) ? post.get(key, "") : fallback;
    }

    /**
     * Maps a posted value onto one of a fixed set of options (case-insensitive).
     *
     * @return the matching entry of {@code options}, or {@code fallback} when the
     *         parameter is absent or matches none of them
     */
    private static String choice(final serverObjects post, final String key, final String fallback, final String... options) {
        if (!hasParam(post, key)) {
            return fallback;
        }
        final String posted = post.get(key, "");
        for (final String option : options) {
            if (posted.equalsIgnoreCase(option)) {
                return option;
            }
        }
        return fallback;
    }

    /**
     * Writes a checkbox state as 1/0: taken from the configuration (default
     * {@code true}) when nothing was posted, otherwise from the posted value.
     */
    private static void putCheckbox(final serverObjects prop, final serverObjects post, final serverSwitch env,
            final String propKey, final String postKey, final String configKey) {
        final boolean checked = (post == null) ? env.getConfigBool(configKey, true) : post.getBoolean(postKey);
        prop.put(propKey, checked ? 1 : 0);
    }

    /**
     * Echoes a start-point text field and raises {@code flagKey} when the
     * posted text is not blank (the field may be present but empty).
     */
    private static void putStartField(final serverObjects prop, final serverObjects post,
            final String postKey, final String propKey, final String flagKey) {
        final String value = postedOr(post, postKey, "");
        prop.put(propKey, value);
        if (!value.trim().isEmpty()) {
            prop.put(flagKey, 1);
        }
    }

    /**
     * Fills an "older than" number select box and marks the posted number as
     * selected; anything not in the list selects the default entry.
     */
    private static void putAgeNumberSelect(final serverObjects prop, final serverObjects post,
            final String select, final String postKey) {
        prop.put(select, 1);
        for (int i = 0; i < AGE_CHOICES.length; i++) {
            prop.put(select + "_list_" + i + "_name", Integer.toString(AGE_CHOICES[i]));
        }
        prop.put(select + "_list", AGE_CHOICES.length);

        int selected = AGE_DEFAULT_INDEX;
        if (hasParam(post, postKey)) {
            final int posted = post.getInt(postKey, -1);
            // NOTE(review): the scan starts at index 1, so a posted 0 re-selects the
            // default although 0 is a listed option; kept as found - confirm intent.
            for (int i = 1; i < AGE_CHOICES.length; i++) {
                if (AGE_CHOICES[i] == posted) {
                    selected = i;
                    break;
                }
            }
        }
        prop.put(select + "_list_" + selected + "_default", 1);
    }

    /**
     * Fills an "older than" unit select box and marks the posted unit as
     * selected; an unknown or missing unit selects "day".
     */
    private static void putAgeUnitSelect(final serverObjects prop, final serverObjects post,
            final String select, final String postKey) {
        prop.put(select, 1);
        for (int i = 0; i < AGE_UNIT_VALUES.length; i++) {
            prop.put(select + "_list_" + i + "_name", AGE_UNIT_NAMES[i]);
            prop.put(select + "_list_" + i + "_value", AGE_UNIT_VALUES[i]);
        }
        prop.put(select + "_list", AGE_UNIT_VALUES.length);

        int selected = AGE_UNIT_DEFAULT_INDEX;
        if (hasParam(post, postKey)) {
            final String posted = post.get(postKey, "");
            for (int i = 0; i < AGE_UNIT_VALUES.length; i++) {
                if (posted.equalsIgnoreCase(AGE_UNIT_VALUES[i])) {
                    selected = i;
                    break;
                }
            }
        }
        prop.put(select + "_list_" + selected + "_default", 1);
    }

    // ---------- form sections

    /** Start point: start URL, sitemap URL, crawling file, crawling mode and bookmark title. */
    private static void putStartPoint(final serverObjects prop, final serverObjects post) {
        putStartField(prop, post, "crawlingURL", "starturl", "has_url");
        putStartField(prop, post, "sitemapURL", "sitemapURL", "has_sitemapURL");
        putStartField(prop, post, "crawlingFile", "crawlingFile", "has_crawlingFile");

        // Crawling mode
        String mode = "url"; // default to URL
        if (hasParam(post, "crawlingMode")) {
            final String requested = post.get("crawlingMode", "");
            final boolean hasUrl = prop.getBoolean("has_url");
            final boolean hasSitemap = prop.getBoolean("has_sitemapURL");
            final boolean hasFile = prop.getBoolean("has_crawlingFile");
            if (requested.equalsIgnoreCase("sitelist") && hasUrl) {
                // sitelist needs "crawlingURL" parameter, checked already
                mode = "sitelist";
            } else if (requested.equalsIgnoreCase("sitemap") && hasSitemap) {
                // sitemap needs "sitemapURL" parameter, checked already
                mode = "sitemap";
            } else if (requested.equalsIgnoreCase("file") && hasFile) {
                // file needs "crawlingFile" parameter, checked already
                mode = "file";
            } else if (requested.equalsIgnoreCase("url") && hasUrl) {
                // the start URL flag is stored as "has_url"
                mode = "url";
            } else if (hasUrl) {
                // requested mode unusable: guess from whichever source was filled in
                mode = "url";
            } else if (hasSitemap) {
                mode = "sitemap";
            } else if (hasFile) {
                mode = "file";
            }
        }
        prop.put("crawlingMode_" + mode, 1);

        // Bookmark title (set by script)
        prop.put("bookmarkTitle", postedOr(post, "bookmarkTitle", ""));
    }

    /** Crawling filter: depth, per-domain limits, robots switches, URL/IP/country filters. */
    private static void putCrawlingFilter(final serverObjects prop, final serverObjects post, final Switchboard sb) {
        final int crawlingDomMaxPages = sb.getConfigInt("crawlingDomMaxPages", -1);

        // crawling depth
        if (hasParam(post, "crawlingDepth")) {
            final int depth = post.getInt("crawlingDepth", -1);
            // depth is limited to two digits, zero allowed
            if (depth >= 0 && depth < 100) {
                prop.put("crawlingDepth", depth);
            }
        }
        if (!prop.containsKey("crawlingDepth")) {
            // nothing usable posted: configured depth, capped at 3
            prop.put("crawlingDepth", Math.min(3, sb.getConfigLong("crawlingDepth", 0)));
        }

        // linked non-parseable documents?
        putCheckbox(prop, post, sb, "directDocByURLChecked", "directDocByURL", "crawlingDirectDocByURL");

        // Unlimited crawl depth for URLs matching with
        prop.put("crawlingDepthExtension",
                postedOr(post, "crawlingDepthExtension", CrawlProfile.MATCH_NEVER_STRING));

        // Limit by maximum Pages per Domain?
        final boolean limitPerDomain = (post == null)
                ? crawlingDomMaxPages != -1
                : post.getBoolean("crawlingDomMaxCheck");
        prop.put("crawlingDomMaxCheck", limitPerDomain ? 1 : 0);

        // Maximum Pages per Domain
        if (hasParam(post, "crawlingDomMaxPages")) {
            final int maxPages = post.getInt("crawlingDomMaxPages", -1);
            // page count is limited to six digits, zero not allowed
            if (maxPages > 0 && maxPages < 1000000) {
                prop.put("crawlingDomMaxPages", maxPages);
            }
        }
        if (!prop.containsKey("crawlingDomMaxPages")) {
            prop.put("crawlingDomMaxPages",
                    (crawlingDomMaxPages == -1) ? 10000 : crawlingDomMaxPages);
        }

        // Accept URLs with query-part?
        putCheckbox(prop, post, sb, "crawlingQChecked", "crawlingQ", "crawlingQ");
        // Obey html-robots-noindex, nofollow?
        putCheckbox(prop, post, sb, "obeyHtmlRobotsNoindexChecked", "obeyHtmlRobotsNoindex", "obeyHtmlRobotsNoindex");
        putCheckbox(prop, post, sb, "obeyHtmlRobotsNofollowChecked", "obeyHtmlRobotsNofollow", "obeyHtmlRobotsNofollow");

        // Load Filter on URLs (range); an unrecognised value falls back to "wide"
        // so that one option is always selected
        prop.put("range_" + choice(post, "range", "wide", "domain", "subpath", "wide"), 1);

        // Load Filter on URLs: must match / must-not-match
        prop.put("mustmatch", postedOr(post, "mustmatch", CrawlProfile.MATCH_ALL_STRING));
        prop.put("mustnotmatch", postedOr(post, "mustnotmatch", CrawlProfile.MATCH_NEVER_STRING));

        // Load Filter on IPs: must match / must-not-match (configuration read only when needed)
        prop.put("ipMustmatch", hasParam(post, "ipMustmatch")
                ? post.get("ipMustmatch", "")
                : sb.getConfig("crawlingIPMustMatch", CrawlProfile.MATCH_ALL_STRING));
        prop.put("ipMustnotmatch", hasParam(post, "ipMustnotmatch")
                ? post.get("ipMustnotmatch", "")
                : sb.getConfig("crawlingIPMustNotMatch", CrawlProfile.MATCH_NEVER_STRING));

        // Use Country Codes Match-List? (unchecked when nothing was posted)
        prop.put("countryMustMatchSwitchChecked",
                (post != null && post.getBoolean("countryMustMatchSwitch")) ? 1 : 0);

        // Must-Match List for Country Codes
        prop.put("countryMustMatch", hasParam(post, "countryMustMatchList")
                ? post.get("countryMustMatchList", "")
                : sb.getConfig("crawlingCountryMustMatch", ""));
    }

    /** Document filter: indexer filters on URL and on document content. */
    private static void putDocumentFilter(final serverObjects prop, final serverObjects post) {
        // Indexer filter on URLs: must match / must-not-match
        prop.put("indexmustmatch", postedOr(post, "indexmustmatch", CrawlProfile.MATCH_ALL_STRING));
        prop.put("indexmustnotmatch", postedOr(post, "indexmustnotmatch", CrawlProfile.MATCH_NEVER_STRING));
        // Filter on Content of Document: must match / must-not-match
        prop.put("indexcontentmustmatch",
                postedOr(post, "indexcontentmustmatch", CrawlProfile.MATCH_ALL_STRING));
        prop.put("indexcontentmustnotmatch",
                postedOr(post, "indexcontentmustnotmatch", CrawlProfile.MATCH_NEVER_STRING));
    }

    /** Clean-up before crawl start: delete-if-older number and unit, and the delete mode. */
    private static void putCleanUp(final serverObjects prop, final serverObjects post) {
        putAgeNumberSelect(prop, post, "deleteIfOlderSelect", "deleteIfOlderNumber");
        putAgeUnitSelect(prop, post, "deleteIfOlderUnitSelect", "deleteIfOlderUnit");
        // delete any document before the crawl is started?
        prop.put("deleteold_" + choice(post, "deleteold", "off", "on", "age"), 1);
    }

    /** Double-check rules: reload-if-older number and unit, and the recrawl mode. */
    private static void putDoubleCheckRules(final serverObjects prop, final serverObjects post) {
        putAgeNumberSelect(prop, post, "reloadIfOlderSelect", "reloadIfOlderNumber");
        putAgeUnitSelect(prop, post, "reloadIfOlderUnitSelect", "reloadIfOlderUnit");
        prop.put("recrawl_" + choice(post, "recrawl", "nodoubles", "reload"), 1);
    }

    /** Document cache: store-to-cache switch and cache usage policy. */
    private static void putDocumentCache(final serverObjects prop, final serverObjects post, final Switchboard sb) {
        // Store to Web Cache?
        putCheckbox(prop, post, sb, "storeHTCacheChecked", "storeHTCache", "storeHTCache");
        // Policy for usage of Web Cache
        prop.put("cachePolicy_" + choice(post, "cachePolicy", "iffresh", "nocache", "ifexist", "cacheonly"), 1);
    }

    /** Agent name: the list of selectable crawler agents; no selection is offered in P2P mode. */
    private static void putAgentSelect(final serverObjects prop, final serverObjects post, final Switchboard sb) {
        if (sb.isP2PMode()) {
            prop.put("agentSelect", 0);
        } else {
            prop.put("agentSelect", 1);
            final List<String> agentNames = new ArrayList<String>();
            if (sb.isIntranetMode()) {
                agentNames.add(ClientIdentification.yacyIntranetCrawlerAgentName);
            }
            if (sb.isGlobalMode()) {
                agentNames.add(ClientIdentification.yacyInternetCrawlerAgentName);
            }
            agentNames.add(ClientIdentification.googleAgentName);
            if (sb.isAllIPMode()) {
                agentNames.add(ClientIdentification.browserAgentName);
                if (ClientIdentification.getAgent(ClientIdentification.customAgentName) != null) {
                    agentNames.add(ClientIdentification.customAgentName);
                }
            }

            // first entry is pre-selected unless a listed agent was posted
            String defaultAgentName = agentNames.get(0);
            if (hasParam(post, "agentName")) {
                final String agentName = post.get("agentName", sb.isIntranetMode()
                        ? ClientIdentification.yacyIntranetCrawlerAgentName
                        : ClientIdentification.yacyInternetCrawlerAgentName);
                if (agentNames.contains(agentName)) {
                    defaultAgentName = agentName;
                }
            }
            for (int i = 0; i < agentNames.size(); i++) {
                final String name = agentNames.get(i);
                prop.put("agentSelect_list_" + i + "_name", name);
                prop.put("agentSelect_list_" + i + "_default", name.equals(defaultAgentName) ? 1 : 0);
            }
            prop.put("agentSelect_list", agentNames.size());
        }
        prop.put("agentSelect_defaultAgentName", ClientIdentification.yacyInternetCrawlerAgentName);
    }

    /** Enrich vocabulary: one entry per available vocabulary with its posted class value. */
    private static void putVocabularySelect(final serverObjects prop, final serverObjects post) {
        final Collection<Tagging> vocs = LibraryProvider.autotagging.getVocabularies();
        if (vocs.isEmpty()) {
            prop.put("vocabularySelect", 0);
            return;
        }
        prop.put("vocabularySelect", 1);
        int count = 0;
        for (final Tagging v : vocs) {
            final String value = (post == null) ? "" : post.get("vocabulary_" + v.getName() + "_class", "");
            prop.put("vocabularySelect_vocabularyset_" + count + "_name", v.getName());
            prop.put("vocabularySelect_vocabularyset_" + count + "_value", value);
            count++;
        }
        prop.put("vocabularySelect_vocabularyset", count);
    }

    /** Snapshot generation: maximum depth and, when the tools and proxy settings allow, the image option. */
    private static void putSnapshotSettings(final serverObjects prop, final serverObjects post, final Switchboard sb) {
        final boolean wkhtmltopdfAvailable = Html2Image.wkhtmltopdfAvailable();
        final boolean convertAvailable = Html2Image.convertAvailable();
        prop.put("snapshotsMaxDepth", (post == null) ? "-1" : post.get("snapshotsMaxDepth", "-1"));
        if (sb.getConfigBool("isTransparentProxy", false)
                && sb.getConfigBool("proxyAlwaysFresh", false)
                && wkhtmltopdfAvailable && convertAvailable) {
            prop.put("snapshotEnableImages", 1);
            prop.put("snapshotEnableImages_snapshotsLoadImageChecked",
                    (post != null && post.getBoolean("snapshotsLoadImage")) ? 1 : 0);
        } else {
            prop.put("snapshotEnableImages", 0);
        }
    }

    /** Index administration: local text/media indexing, remote indexing and the target collection. */
    private static void putIndexAdministration(final serverObjects prop, final serverObjects post, final Switchboard sb) {
        // Local index text? / Local index media?
        putCheckbox(prop, post, sb, "indexingTextChecked", "indexText", "indexText");
        putCheckbox(prop, post, sb, "indexingMediaChecked", "indexMedia", "indexMedia");

        // Do Remote Indexing?
        if (sb.isP2PMode()) {
            prop.put("remoteindexing", 1);
            putCheckbox(prop, post, sb, "remoteindexing_crawlOrderChecked", "crawlOrder", "crawlOrder");
            prop.put("remoteindexing_intention", (post == null) ? "" : post.get("intention", ""));
        } else {
            prop.put("remoteindexing", 0);
        }

        // Target collection
        final boolean collectionEnabled =
                sb.index.fulltext().getDefaultConfiguration().isEmpty()
                || sb.index.fulltext().getDefaultConfiguration().contains(CollectionSchema.collection_sxt);
        prop.put("collectionEnabled", collectionEnabled ? 1 : 0);
        if (collectionEnabled) {
            prop.put("collection", postedOr(post, "collection", DEFAULT_COLLECTION));
        }
    }
}