Mirror of https://github.com/yacy/yacy_search_server.git (synced 2024-09-19 00:01:41 +02:00)
- added a new field for the regular expression in crawl start
- added the field in crawl profile
- adapted logging and error management
- adapted duplicate document detection
- added a new rule to the indexing process to reject non-matching content
- full redesign of the expert crawl start servlet

The new filter field can now be seen in /CrawlStartExpert_p.html in the section "Document Filter", subsection "Filter on Content of Document".
parent c091000165, commit 25499eead5
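
For illustration, here is a minimal, hypothetical sketch of what the new "Filter on Content of Document" field does with the value a user enters. The filter strings are made-up examples; the case-insensitive compilation and the whole-text matches() call follow the CrawlProfile and Switchboard changes in the diff below.

    import java.util.regex.Pattern;

    public class ContentFilterDemo {
        public static void main(String[] args) {
            // Example values as they could be entered in the new "Filter on Content of Document"
            // fields of /CrawlStartExpert_p.html (illustrative, not defaults).
            String mustMatch = ".*science.*";   // keep only documents whose visible text mentions 'science'
            String mustNotMatch = ".*casino.*"; // reject documents that mention 'casino'

            // The commit compiles both expressions case-insensitively and applies them with
            // matches() to the whole document text, so the surrounding '.*' is required.
            Pattern must = Pattern.compile(mustMatch, Pattern.CASE_INSENSITIVE);
            Pattern mustNot = Pattern.compile(mustNotMatch, Pattern.CASE_INSENSITIVE);

            String text = "A short page about Science experiments";
            boolean indexed = must.matcher(text).matches() && !mustNot.matcher(text).matches();
            System.out.println(indexed); // true: passes the must-match filter, misses the must-not-match filter
        }
    }

Because matches() must cover the entire text, a bare word such as 'science' would never match; the surrounding '.*' is needed, exactly as in the URL filter examples already given in the servlet help text.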
|
@@ -64,17 +64,19 @@ public class CrawlProfileEditor_p {
|
|||
|
||||
private static final List <eentry> labels = new ArrayList<eentry>();
|
||||
static {
|
||||
labels.add(new eentry(CrawlProfile.NAME, "Name", true, eentry.STRING));
|
||||
labels.add(new eentry(CrawlProfile.COLLECTIONS, "Collections (comma-separated list)", false, eentry.STRING));
|
||||
labels.add(new eentry(CrawlProfile.CRAWLER_URL_MUSTMATCH, "URL Must-Match Filter", false, eentry.STRING));
|
||||
labels.add(new eentry(CrawlProfile.CRAWLER_URL_MUSTNOTMATCH, "URL Must-Not-Match Filter", false, eentry.STRING));
|
||||
labels.add(new eentry(CrawlProfile.CRAWLER_IP_MUSTMATCH, "IP Must-Match Filter", false, eentry.STRING));
|
||||
labels.add(new eentry(CrawlProfile.CRAWLER_IP_MUSTNOTMATCH, "IP Must-Not-Match Filter", false, eentry.STRING));
|
||||
labels.add(new eentry(CrawlProfile.CRAWLER_COUNTRY_MUSTMATCH, "Country Must-Match Filter", false, eentry.STRING));
|
||||
labels.add(new eentry(CrawlProfile.CRAWLER_URL_NODEPTHLIMITMATCH, "URL No-Depth-Limit Must-Match Filter", false, eentry.STRING));
|
||||
labels.add(new eentry(CrawlProfile.INDEXING_URL_MUSTMATCH, "Indexing Must-Match Filter", false, eentry.STRING));
|
||||
labels.add(new eentry(CrawlProfile.INDEXING_URL_MUSTNOTMATCH, "Indexing Must-Not-Match Filter", false, eentry.STRING));
|
||||
labels.add(new eentry(CrawlProfile.CACHE_STRAGEGY, "Cache Strategy (NOCACHE,IFFRESH,IFEXIST,CACHEONLY)", false, eentry.STRING));
|
||||
labels.add(new eentry(CrawlProfile.NAME, "Name", true, eentry.STRING));
|
||||
labels.add(new eentry(CrawlProfile.COLLECTIONS, "Collections (comma-separated list)", false, eentry.STRING));
|
||||
labels.add(new eentry(CrawlProfile.CRAWLER_URL_MUSTMATCH, "URL Must-Match Filter", false, eentry.STRING));
|
||||
labels.add(new eentry(CrawlProfile.CRAWLER_URL_MUSTNOTMATCH, "URL Must-Not-Match Filter", false, eentry.STRING));
|
||||
labels.add(new eentry(CrawlProfile.CRAWLER_IP_MUSTMATCH, "IP Must-Match Filter", false, eentry.STRING));
|
||||
labels.add(new eentry(CrawlProfile.CRAWLER_IP_MUSTNOTMATCH, "IP Must-Not-Match Filter", false, eentry.STRING));
|
||||
labels.add(new eentry(CrawlProfile.CRAWLER_COUNTRY_MUSTMATCH, "Country Must-Match Filter", false, eentry.STRING));
|
||||
labels.add(new eentry(CrawlProfile.CRAWLER_URL_NODEPTHLIMITMATCH, "URL No-Depth-Limit Must-Match Filter", false, eentry.STRING));
|
||||
labels.add(new eentry(CrawlProfile.INDEXING_URL_MUSTMATCH, "Indexing URL Must-Match Filter", false, eentry.STRING));
|
||||
labels.add(new eentry(CrawlProfile.INDEXING_URL_MUSTNOTMATCH, "Indexing URL Must-Not-Match Filter", false, eentry.STRING));
|
||||
labels.add(new eentry(CrawlProfile.INDEXING_CONTENT_MUSTMATCH, "Indexing Content Must-Match Filter", false, eentry.STRING));
|
||||
labels.add(new eentry(CrawlProfile.INDEXING_CONTENT_MUSTNOTMATCH, "Indexing Content Must-Not-Match Filter",false, eentry.STRING));
|
||||
labels.add(new eentry(CrawlProfile.CACHE_STRAGEGY, "Cache Strategy (NOCACHE,IFFRESH,IFEXIST,CACHEONLY)", false, eentry.STRING));
|
||||
labels.add(new eentry(CrawlProfile.DEPTH, "Crawl Depth", false, eentry.INTEGER));
|
||||
labels.add(new eentry(CrawlProfile.RECRAWL_IF_OLDER, "Recrawl If Older", false, eentry.INTEGER));
|
||||
labels.add(new eentry(CrawlProfile.DOM_MAX_PAGES, "Domain Max. Pages", false, eentry.INTEGER));
|
||||
|
|
|
@@ -36,119 +36,159 @@
|
|||
</p>
|
||||
|
||||
<form id="Crawler" action="Crawler_p.html" method="post" enctype="multipart/form-data" accept-charset="UTF-8">
|
||||
<table border="0" cellpadding="5" cellspacing="1">
|
||||
<tr class="TableHeader">
|
||||
<td><strong>Attribute</strong></td>
|
||||
<td><strong>Value</strong></td>
|
||||
<td><strong>Description</strong></td>
|
||||
</tr>
|
||||
<tr valign="top" class="TableCellSummary">
|
||||
<td>Starting Point:</td>
|
||||
<td>
|
||||
<table cellpadding="0" cellspacing="0">
|
||||
<tr>
|
||||
<td width="160"><label for="url">One Start URL or a list of URLs:<br/>(must start with http:// https:// ftp:// smb:// file://)</label>:</td>
|
||||
<td><input type="radio" name="crawlingMode" id="url" value="url" checked="checked" /></td>
|
||||
<td>
|
||||
<textarea name="crawlingURL" id="crawlingURL" cols="42" rows="3" size="41" onkeypress="changed()" onfocus="check('url')" >#[starturl]#</textarea>
|
||||
|
||||
<span id="robotsOK"></span>
|
||||
<span id="title"><br/></span>
|
||||
<img id="ajax" src="/env/grafics/empty.gif" alt="empty" />
|
||||
</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td></td>
|
||||
<td></td>
|
||||
<td>
|
||||
<input name="bookmarkTitle" id="bookmarkTitle" type="text" size="46" maxlength="256" value="" readonly="readonly" style="background:transparent; border:0px"/>
|
||||
</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td><label for="url"><span class="nobr">From Link-List of URL</span></label>:</td>
|
||||
<td><input type="radio" name="crawlingMode" id="sitelist" value="sitelist" disabled="disabled" onclick="document.getElementById('Crawler').rangeDomain.checked = true;"/></td>
|
||||
<td>
|
||||
<div id="sitelistURLs"></div>
|
||||
</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td><label for="url"><span class="nobr">From Sitemap</span></label>:</td>
|
||||
<td><input type="radio" name="crawlingMode" id="sitemap" value="sitemap" disabled="disabled"/></td>
|
||||
<td>
|
||||
<input name="sitemapURL" type="text" size="48" maxlength="256" value="" readonly="readonly"/>
|
||||
</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td><label for="file"><span class="nobr">From File (enter a path<br/>within your local file system)</span></label>:</td>
|
||||
<td><input type="radio" name="crawlingMode" id="file" value="file" onclick="document.getElementById('Crawler').rangeDomain.checked = true;"/></td>
|
||||
<td><input type="text" name="crawlingFile" size="48" onfocus="check('file')"/><!--<input type="file" name="crawlingFile" size="18" onfocus="check('file')"/>--></td>
|
||||
</tr>
|
||||
</table>
|
||||
</td>
|
||||
<td colspan="3">
|
||||
Define the start-url(s) here. You can submit more than one URL, each line one URL please.
|
||||
<fieldset>
|
||||
<legend>
|
||||
<label>Crawl Job</label>
|
||||
</legend>
|
||||
<p>A Crawl Job consist of one or more start point, crawl limitations and document freshness rules.</p>
|
||||
<fieldset>
|
||||
<legend><label>Start Point</label></legend>
|
||||
<dl>
|
||||
<dt>One Start URL or a list of URLs:<br/>(must start with http:// https:// ftp:// smb:// file://)</dt>
|
||||
<dd>
|
||||
<span class="info" style="float:right"><img src="/env/grafics/i16.gif" width="16" height="16" alt="info"/><span style="right:0px;">Define the start-url(s) here. You can submit more than one URL, each line one URL please.
|
||||
Each of these URLs are the root for a crawl start, existing start URLs are always re-loaded.
|
||||
Other already visited URLs are sorted out as "double", if they are not allowed using the re-crawl option.
|
||||
</td>
|
||||
</tr>
|
||||
<tr valign="top" class="TableCellLight">
|
||||
<td>Crawling Depth:</td>
|
||||
<td>
|
||||
<input name="crawlingDepth" id="crawlingDepth" type="text" size="2" maxlength="2" value="#[crawlingDepth]#" />
|
||||
<input type="checkbox" name="directDocByURL" id="directDocByURL" #(directDocByURLChecked)#::checked="checked"#(/directDocByURLChecked)# />also all linked non-parsable documents<br/>
|
||||
Unlimited crawl depth for URLs matching with: <input name="crawlingDepthExtension" id="crawlingDepthExtension" type="text" size="40" maxlength="100" value="#[crawlingDepthExtension]#" />
|
||||
</td>
|
||||
<td>
|
||||
</span></span>
|
||||
<input type="radio" align="top" name="crawlingMode" id="url" value="url" checked="checked" />
|
||||
<textarea name="crawlingURL" id="crawlingURL" cols="64" rows="3" size="41" onkeypress="changed()" onfocus="check('url')" >#[starturl]#</textarea>
|
||||
|
||||
<span id="robotsOK"></span>
|
||||
<span id="title"><br/></span>
|
||||
<img id="ajax" src="/env/grafics/empty.gif" alt="empty" />
|
||||
</dd>
|
||||
<dt></dt>
|
||||
<dd>
|
||||
<input name="bookmarkTitle" id="bookmarkTitle" type="text" size="46" maxlength="256" value="" readonly="readonly" style="background:transparent; border:0px"/>
|
||||
</dd>
|
||||
<dt>From Link-List of URL</dt>
|
||||
<dd>
|
||||
<input type="radio" name="crawlingMode" id="sitelist" value="sitelist" disabled="disabled" onclick="document.getElementById('Crawler').rangeDomain.checked = true;"/><br />
|
||||
<div id="sitelistURLs"></div>
|
||||
</dd>
|
||||
<dt>From Sitemap</dt>
|
||||
<dd>
|
||||
<input type="radio" name="crawlingMode" id="sitemap" value="sitemap" disabled="disabled"/><input name="sitemapURL" type="text" size="71" maxlength="256" value="" readonly="readonly"/>
|
||||
</dd>
|
||||
<dt>From File (enter a path<br/>within your local file system)</dt>
|
||||
<dd>
|
||||
<input type="radio" name="crawlingMode" id="file" value="file" onclick="document.getElementById('Crawler').rangeDomain.checked = true;"/><input type="text" name="crawlingFile" size="71" maxlength="256" onfocus="check('file')"/><!--<input type="file" name="crawlingFile" size="18" onfocus="check('file')"/>-->
|
||||
</dd>
|
||||
</dl>
|
||||
</fieldset>
|
||||
<fieldset>
|
||||
<legend><label>Crawler Filter</label></legend>
|
||||
<p>These are limitations on the crawl stacker. The filters will be applied before a web page is loaded.</p>
|
||||
<dl>
|
||||
<dt>Crawling Depth</dt>
|
||||
<dd>
|
||||
<span class="info" style="float:right"><img src="/env/grafics/i16.gif" width="16" height="16" alt="info"/><span style="right:0px;">
|
||||
This defines how often the Crawler will follow links (of links..) embedded in websites.
|
||||
0 means that only the page you enter under "Starting Point" will be added
|
||||
to the index. 2-4 is good for normal indexing. Values over 8 are not useful, since a depth-8 crawl will
|
||||
index approximately 25.600.000.000 pages, maybe this is the whole WWW.
|
||||
</td>
|
||||
</tr>
|
||||
<tr valign="top" class="TableCellDark">
|
||||
<td><label for="mustmatch">Must-Match Filter</label>:</td>
|
||||
<td>
|
||||
<table border="0">
|
||||
<tr><td width="160">on URLs for Crawling:<br/>
|
||||
<input type="radio" name="range" id="rangeDomain" value="domain" onclick="document.getElementById('mustmatch').disabled=true;document.getElementById('deleteoldon').disabled=false;document.getElementById('deleteoldage').disabled=false;document.getElementById('deleteoldon').checked=true;"/>Restrict to start domain(s)<br />
|
||||
<input type="radio" name="range" id="rangeSubpath" value="subpath" onclick="document.getElementById('mustmatch').disabled=true;document.getElementById('deleteoldon').disabled=false;document.getElementById('deleteoldage').disabled=false;document.getElementById('deleteoldon').checked=true;" />Restrict to sub-path(s)<br />
|
||||
<input type="radio" name="range" id="rangeWide" value="wide" checked="checked" onclick="document.getElementById('mustmatch').disabled=false;document.getElementById('deleteoldoff').checked=true;document.getElementById('deleteoldon').disabled=true;document.getElementById('deleteoldage').disabled=true;"/>Use filter</td>
|
||||
<td valign="bottom"><input name="mustmatch" id="mustmatch" type="text" size="55" maxlength="100000" value="#[mustmatch]#" onclick="document.getElementById('deleteoldon').disabled=false;document.getElementById('deleteoldage').disabled=false"/></td></tr>
|
||||
<tr><td>on IPs for Crawling:</td><td><input name="ipMustmatch" id="ipMustmatch" type="text" size="55" maxlength="100000" value="#[ipMustmatch]#" /></td></tr>
|
||||
<tr><td>on URLs for Indexing</td><td><input name="indexmustmatch" id="indexmustmatch" type="text" size="55" maxlength="100000" value="#[indexmustmatch]#" /></td></tr>
|
||||
</table>
|
||||
</td>
|
||||
<td>
|
||||
The filter is a <b><a href="http://download.oracle.com/javase/6/docs/api/java/util/regex/Pattern.html">regular expression</a></b>
|
||||
that <b>must match</b> with the URLs which are used to be crawled; default is 'catch all'.
|
||||
Example: to allow only urls that contain the word 'science', set the filter to '.*science.*'.
|
||||
</span></span>
|
||||
<input name="crawlingDepth" id="crawlingDepth" type="text" size="2" maxlength="2" value="#[crawlingDepth]#" />
|
||||
<input type="checkbox" name="directDocByURL" id="directDocByURL" #(directDocByURLChecked)#::checked="checked"#(/directDocByURLChecked)# />also all linked non-parsable documents
|
||||
</dd>
|
||||
<dt>Unlimited crawl depth for URLs matching with</dt>
|
||||
<dd>
|
||||
<input name="crawlingDepthExtension" id="crawlingDepthExtension" type="text" size="40" maxlength="100" value="#[crawlingDepthExtension]#" />
|
||||
</dd>
|
||||
|
||||
<dt>Maximum Pages per Domain</dt>
|
||||
<dd>
|
||||
<span class="info" style="float:right"><img src="/env/grafics/i16.gif" width="16" height="16" alt="info"/><span style="right:0px;">
|
||||
You can limit the maximum number of pages that are fetched and indexed from a single domain with this option.
|
||||
You can combine this limitation with the 'Auto-Dom-Filter', so that the limit is applied to all the domains within
|
||||
the given depth. Domains outside the given depth are then sorted-out anyway.
|
||||
</span></span>
|
||||
<label for="crawlingDomMaxCheck">Use</label>:
|
||||
<input type="checkbox" name="crawlingDomMaxCheck" id="crawlingDomMaxCheck" #(crawlingDomMaxCheck)#::checked="checked"#(/crawlingDomMaxCheck)# />
|
||||
<label for="crawlingDomMaxPages">Page-Count</label>:
|
||||
<input name="crawlingDomMaxPages" id="crawlingDomMaxPages" type="text" size="6" maxlength="6" value="#[crawlingDomMaxPages]#" />
|
||||
</dd>
|
||||
|
||||
<dt><label for="crawlingQ">Accept URLs with '?' / dynamic URLs</label></dt>
|
||||
<dd>
|
||||
<span class="info" style="float:right"><img src="/env/grafics/i16.gif" width="16" height="16" alt="info"/><span style="right:0px;">
|
||||
A questionmark is usually a hint for a dynamic page. URLs pointing to dynamic content should usually not be crawled.
|
||||
However, there are sometimes web pages with static content that
|
||||
is accessed with URLs containing question marks. If you are unsure, do not check this to avoid crawl loops.
|
||||
</span></span>
|
||||
<input type="checkbox" name="crawlingQ" id="crawlingQ" #(crawlingQChecked)#::checked="checked"#(/crawlingQChecked)# />
|
||||
</dd>
|
||||
<dt>Load Filter on URLs</dt>
|
||||
<dd><span class="info" style="float:right"><img src="/env/grafics/i16.gif" width="16" height="16" alt="info"/><span style="right:0px;">
|
||||
The filter is a <b><a href="http://download.oracle.com/javase/6/docs/api/java/util/regex/Pattern.html">regular expression</a></b>.
|
||||
Example: to allow only urls that contain the word 'science', set the must-match filter to '.*science.*'.
|
||||
You can also use an automatic domain-restriction to fully crawl a single domain.
|
||||
</td>
|
||||
</tr>
|
||||
<tr valign="top" class="TableCellLight">
|
||||
<td><label for="mustnotmatch">Must-Not-Match Filter</label>:</td>
|
||||
<td>
|
||||
</span></span>
|
||||
<table border="0">
|
||||
<tr><td width="160">on URLs for Crawling:</td><td><input name="mustnotmatch" id="mustnotmatch" type="text" size="55" maxlength="100000" value="#[mustnotmatch]#" /></td></tr>
|
||||
<tr><td>on IPs for Crawling:</td><td><input name="ipMustnotmatch" id="ipMustnotmatch" type="text" size="55" maxlength="100000" value="#[ipMustnotmatch]#" /></td></tr>
|
||||
<tr><td>on URLs for Indexing:</td><td><input name="indexmustnotmatch" id="indexmustnotmatch" type="text" size="55" maxlength="100000" value="#[indexmustnotmatch]#" /></td></tr>
|
||||
<tr><td width="110"><img src="/env/grafics/plus.gif"> must-match</td><td></td></tr>
|
||||
<tr><td colspan="2"><input type="radio" name="range" id="rangeDomain" value="domain" onclick="document.getElementById('mustmatch').disabled=true;document.getElementById('deleteoldon').disabled=false;document.getElementById('deleteoldage').disabled=false;document.getElementById('deleteoldon').checked=true;"/>Restrict to start domain(s)</td></tr>
|
||||
<tr><td colspan="2"><input type="radio" name="range" id="rangeSubpath" value="subpath" onclick="document.getElementById('mustmatch').disabled=true;document.getElementById('deleteoldon').disabled=false;document.getElementById('deleteoldage').disabled=false;document.getElementById('deleteoldon').checked=true;" />Restrict to sub-path(s)</td></tr>
|
||||
<tr><td><input type="radio" name="range" id="rangeWide" value="wide" checked="checked" onclick="document.getElementById('mustmatch').disabled=false;document.getElementById('deleteoldoff').checked=true;document.getElementById('deleteoldon').disabled=true;document.getElementById('deleteoldage').disabled=true;"/>Use filter</td>
|
||||
<td valign="bottom"><input name="mustmatch" id="mustmatch" type="text" size="55" maxlength="100000" value="#[mustmatch]#" onclick="document.getElementById('deleteoldon').disabled=false;document.getElementById('deleteoldage').disabled=false"/></td></tr>
|
||||
<tr><td><img src="/env/grafics/minus.gif"> must-not-match</td><td><input name="mustnotmatch" id="mustnotmatch" type="text" size="55" maxlength="100000" value="#[mustnotmatch]#" /></td></tr>
|
||||
</table>
|
||||
</td>
|
||||
<td>
|
||||
</dd>
|
||||
<dt>Load Filter on IPs</dt>
|
||||
<dd>
|
||||
<table border="0">
|
||||
<tr><td width="110"><img src="/env/grafics/plus.gif"> must-match</td><td><input name="ipMustmatch" id="ipMustmatch" type="text" size="55" maxlength="100000" value="#[ipMustmatch]#" /></td></tr>
|
||||
<tr><td><img src="/env/grafics/minus.gif"> must-not-match</td><td><input name="ipMustnotmatch" id="ipMustnotmatch" type="text" size="55" maxlength="100000" value="#[ipMustnotmatch]#" /></td></tr>
|
||||
</table>
|
||||
</dd>
|
||||
<dt><label for="crawlingCountryMustMatch">Must-Match List for Country Codes</label>
|
||||
</dt>
|
||||
<dd><span class="info" style="float:right"><img src="/env/grafics/i16.gif" width="16" height="16" alt="info"/><span style="right:0px;">
|
||||
Crawls can be restricted to specific countries. This uses the country code that can be computed from
|
||||
the IP of the server that hosts the page. The filter is not a regular expressions but a list of country codes, separated by comma.
|
||||
</span></span>
|
||||
<input type="radio" name="countryMustMatchSwitch" id="countryMustMatchSwitch" value="false" checked="checked" />no country code restriction<br />
|
||||
<input type="radio" name="countryMustMatchSwitch" id="countryMustMatchSwitch" value="true" />Use filter
|
||||
<input name="countryMustMatchList" id="countryMustMatchList" type="text" size="60" maxlength="256" value="#[countryMustMatch]#" />
|
||||
</dd>
|
||||
</dl>
|
||||
</fieldset>
|
||||
<fieldset>
|
||||
<legend><label>Document Filter</label></legend>
|
||||
<p>These are limitations on index feeder. The filters will be applied after a web page was loaded.</p>
|
||||
<dl>
|
||||
<dt>Filter on URLs</dt>
|
||||
<dd>
|
||||
<span class="info" style="float:right"><img src="/env/grafics/i16.gif" width="16" height="16" alt="info"/><span style="right:0px;">
|
||||
The filter is a <b><a href="http://download.oracle.com/javase/6/docs/api/java/util/regex/Pattern.html">regular expression</a></b>
|
||||
that <b>must not match</b> with the URLs to allow that the content of the url is indexed.
|
||||
</td>
|
||||
</tr>
|
||||
<tr valign="top" class="TableCellDark">
|
||||
<td>Document Deletion</td>
|
||||
<td>
|
||||
<dl>
|
||||
<dt>No Deletion<input type="radio" name="deleteold" id="deleteoldoff" value="off" checked="checked"/></dt>
|
||||
<dd>Do not delete any document before the crawl is started.</dd>
|
||||
<dt>Delete sub-path<input type="radio" name="deleteold" id="deleteoldon" value="on" disabled="true"/></dt>
|
||||
<dd>For each host in the start url list, delete all documents (in the given subpath) from that host.</dd>
|
||||
<dt>Delete only old<input type="radio" name="deleteold" id="deleteoldage" value="age" disabled="true"/></dt>
|
||||
<dd>Treat documents that are loaded
|
||||
</span></span>
|
||||
<table border="0">
|
||||
<tr><td width="110"><img src="/env/grafics/plus.gif"> must-match</td><td><input name="indexmustmatch" id="indexmustmatch" type="text" size="55" maxlength="100000" value="#[indexmustmatch]#" /></td></tr>
|
||||
<tr><td><img src="/env/grafics/minus.gif"> must-not-match</td><td><input name="indexmustnotmatch" id="indexmustnotmatch" type="text" size="55" maxlength="100000" value="#[indexmustnotmatch]#" /></td></tr>
|
||||
</table>
|
||||
</dd>
|
||||
<dt>Filter on Content of Document<br/>(all visible text, including camel-case-tokenized url and title)</dt>
|
||||
<dd>
|
||||
<table border="0">
|
||||
<tr><td width="110"><img src="/env/grafics/plus.gif"> must-match</td><td><input name="indexcontentmustmatch" id="indexcontentmustmatch" type="text" size="55" maxlength="100000" value="#[indexcontentmustmatch]#" /></td></tr>
|
||||
<tr><td><img src="/env/grafics/minus.gif"> must-not-match</td><td><input name="indexcontentmustnotmatch" id="indexcontentmustnotmatch" type="text" size="55" maxlength="100000" value="#[indexcontentmustnotmatch]#" /></td></tr>
|
||||
</table>
|
||||
</dd>
|
||||
</dl>
|
||||
</fieldset>
|
||||
<fieldset>
|
||||
<legend><label>Clean-Up before Crawl Start</label></legend>
|
||||
<dl>
|
||||
<dt>No Deletion</dt>
|
||||
<dd><span class="info" style="float:right"><img src="/env/grafics/i16.gif" width="16" height="16" alt="info"/><span style="right:0px;">
|
||||
After a crawl was done in the past, document may become stale and eventually they are also deleted on the target host.
|
||||
To remove old files from the search index it is not sufficient to just consider them for re-load but it may be necessary
|
||||
to delete them because they simply do not exist any more. Use this in combination with re-crawl while this time should be longer.
|
||||
</span></span><input type="radio" name="deleteold" id="deleteoldoff" value="off" checked="checked"/>Do not delete any document before the crawl is started.</dd>
|
||||
<dt>Delete sub-path</dt>
|
||||
<dd><input type="radio" name="deleteold" id="deleteoldon" value="on" disabled="true"/>For each host in the start url list, delete all documents (in the given subpath) from that host.</dd>
|
||||
<dt>Delete only old</dt>
|
||||
<dd><input type="radio" name="deleteold" id="deleteoldage" value="age" disabled="true"/>Treat documents that are loaded
|
||||
<select name="deleteIfOlderNumber" id="deleteIfOlderNumber">
|
||||
<option value="1">1</option><option value="2">2</option><option value="3">3</option>
|
||||
<option value="4">4</option><option value="5">5</option><option value="6">6</option>
|
||||
|
@@ -164,22 +204,19 @@
|
|||
<option value="hour">hours</option>
|
||||
</select> ago as stale and delete them before the crawl is started.
|
||||
</dd>
|
||||
</dl>
|
||||
</td>
|
||||
<td>
|
||||
After a crawl was done in the past, document may become stale and eventually they are also deleted on the target host.
|
||||
To remove old files from the search index it is not sufficient to just consider them for re-load but it may be necessary
|
||||
to delete them because they simply do not exist any more. Use this in combination with re-crawl while this time should be longer.
|
||||
</td>
|
||||
</tr>
|
||||
<tr valign="top" class="TableCellLight">
|
||||
<td>Document Double-Check</td>
|
||||
<td>
|
||||
<dl>
|
||||
<dt>No Doubles<input type="radio" name="recrawl" value="nodoubles" checked="checked"/></dt>
|
||||
<dd>Never load any page that is already known.<br/>Only the start-url may be loaded again.</dd>
|
||||
<dt>Re-load<input type="radio" name="recrawl" value="reload"/></dt>
|
||||
<dd>Treat documents that are loaded
|
||||
</dl>
|
||||
</fieldset>
|
||||
<fieldset>
|
||||
<legend><label>Double-Check Rules</label></legend>
|
||||
<dl>
|
||||
<dt>No Doubles</dt>
|
||||
<dd><span class="info" style="float:right"><img src="/env/grafics/i16.gif" width="16" height="16" alt="info"/><span style="right:0px;">
|
||||
A web crawl performs a double-check on all links found in the internet against the internal database. If the same url is found again,
|
||||
then the url is treated as double when you check the 'no doubles' option. A url may be loaded again when it has reached a specific age,
|
||||
to use that check the 're-load' option.
|
||||
</span></span><input type="radio" name="recrawl" value="nodoubles" checked="checked"/>Never load any page that is already known. Only the start-url may be loaded again.</dd>
|
||||
<dt>Re-load</dt>
|
||||
<dd><input type="radio" name="recrawl" value="reload"/>Treat documents that are loaded
|
||||
<select name="reloadIfOlderNumber" id="reloadIfOlderNumber">
|
||||
<option value="1">1</option><option value="2">2</option><option value="3">3</option>
|
||||
<option value="4">4</option><option value="5">5</option><option value="6">6</option>
|
||||
|
@@ -195,87 +232,58 @@
|
|||
<option value="hour">hours</option>
|
||||
</select> ago as stale and load them again. If they are younger, they are ignored.
|
||||
</dd>
|
||||
</dl>
|
||||
</td>
|
||||
<td>
|
||||
A web crawl performs a double-check on all links found in the internet against the internal database. If the same url is found again,
|
||||
then the url is treated as double when you check the 'no doubles' option. A url may be loaded again when it has reached a specific age,
|
||||
to use that check the 're-load' option.
|
||||
</td>
|
||||
</tr>
|
||||
<tr valign="top" class="TableCellDark">
|
||||
<td><label for="crawlingCountryMustMatch">Must-Match List for Country Codes</label>:</td>
|
||||
<td>
|
||||
<input type="radio" name="countryMustMatchSwitch" id="countryMustMatchSwitch" value="true" />Use filter
|
||||
<input name="countryMustMatchList" id="countryMustMatchList" type="text" size="60" maxlength="256" value="#[countryMustMatch]#" /><br />
|
||||
<input type="radio" name="countryMustMatchSwitch" id="countryMustMatchSwitch" value="false" checked="checked" />no country code restriction
|
||||
</td>
|
||||
<td>
|
||||
Crawls can be restricted to specific countries. This uses the country code that can be computed from
|
||||
the IP of the server that hosts the page. The filter is not a regular expressions but a list of country codes, separated by comma.
|
||||
</td>
|
||||
</tr>
|
||||
<tr valign="top" class="TableCellLight">
|
||||
<td>Maximum Pages per Domain:</td>
|
||||
<td>
|
||||
<label for="crawlingDomMaxCheck">Use</label>:
|
||||
<input type="checkbox" name="crawlingDomMaxCheck" id="crawlingDomMaxCheck" #(crawlingDomMaxCheck)#::checked="checked"#(/crawlingDomMaxCheck)# />
|
||||
<label for="crawlingDomMaxPages">Page-Count</label>:
|
||||
<input name="crawlingDomMaxPages" id="crawlingDomMaxPages" type="text" size="6" maxlength="6" value="#[crawlingDomMaxPages]#" />
|
||||
</td>
|
||||
<td>
|
||||
You can limit the maximum number of pages that are fetched and indexed from a single domain with this option.
|
||||
You can combine this limitation with the 'Auto-Dom-Filter', so that the limit is applied to all the domains within
|
||||
the given depth. Domains outside the given depth are then sorted-out anyway.
|
||||
</td>
|
||||
</tr>
|
||||
<tr valign="top" class="TableCellDark">
|
||||
<td><label for="crawlingQ">Accept URLs with '?' / dynamic URLs</label>:</td>
|
||||
<td><input type="checkbox" name="crawlingQ" id="crawlingQ" #(crawlingQChecked)#::checked="checked"#(/crawlingQChecked)# /></td>
|
||||
<td>
|
||||
A questionmark is usually a hint for a dynamic page. URLs pointing to dynamic content should usually not be crawled. However, there are sometimes web pages with static content that
|
||||
is accessed with URLs containing question marks. If you are unsure, do not check this to avoid crawl loops.
|
||||
</td>
|
||||
</tr>
|
||||
<tr valign="top" class="TableCellLight">
|
||||
<td><label for="storeHTCache">Store to Web Cache</label>:</td>
|
||||
<td><input type="checkbox" name="storeHTCache" id="storeHTCache" #(storeHTCacheChecked)#::checked="checked"#(/storeHTCacheChecked)# /></td>
|
||||
<td>
|
||||
</dl>
|
||||
</fieldset>
|
||||
<fieldset>
|
||||
<legend><label>Document Cache</label></legend>
|
||||
<dl><dt><label for="storeHTCache">Store to Web Cache</label></dt>
|
||||
<dd>
|
||||
<span class="info" style="float:right"><img src="/env/grafics/i16.gif" width="16" height="16" alt="info"/><span style="right:0px;">
|
||||
This option is used by default for proxy prefetch, but is not needed for explicit crawling.
|
||||
</td>
|
||||
</tr>
|
||||
<tr valign="top" class="TableCellDark">
|
||||
<td><label for="mustmatch">Policy for usage of Web Cache</label>:</td>
|
||||
<td>
|
||||
<input type="radio" name="cachePolicy" value="nocache" />no cache
|
||||
<input type="radio" name="cachePolicy" value="iffresh" checked="checked" />if fresh
|
||||
<input type="radio" name="cachePolicy" value="ifexist" />if exist
|
||||
<input type="radio" name="cachePolicy" value="cacheonly" />cache only
|
||||
</td>
|
||||
<td>
|
||||
</span></span>
|
||||
<input type="checkbox" name="storeHTCache" id="storeHTCache" #(storeHTCacheChecked)#::checked="checked"#(/storeHTCacheChecked)# />
|
||||
</dd>
|
||||
|
||||
<dt><label for="mustmatch">Policy for usage of Web Cache</label></dt>
|
||||
<dd>
|
||||
<span class="info" style="float:right"><img src="/env/grafics/i16.gif" width="16" height="16" alt="info"/><span style="right:0px;">
|
||||
The caching policy states when to use the cache during crawling:
|
||||
<b>no cache</b>: never use the cache, all content from fresh internet source;
|
||||
<b>if fresh</b>: use the cache if the cache exists and is fresh using the proxy-fresh rules;
|
||||
<b>if exist</b>: use the cache if the cache exist. Do no check freshness. Otherwise use online source;
|
||||
<b>cache only</b>: never go online, use all content from cache. If no cache exist, treat content as unavailable
|
||||
</td>
|
||||
</tr>
|
||||
<tr valign="top" class="TableCellLight">
|
||||
<td>Do Local Indexing:</td>
|
||||
<td>
|
||||
</span></span>
|
||||
<input type="radio" name="cachePolicy" value="nocache" />no cache
|
||||
<input type="radio" name="cachePolicy" value="iffresh" checked="checked" />if fresh
|
||||
<input type="radio" name="cachePolicy" value="ifexist" />if exist
|
||||
<input type="radio" name="cachePolicy" value="cacheonly" />cache only
|
||||
</dd>
|
||||
</dl>
|
||||
</fieldset>
|
||||
<fieldset>
|
||||
<legend><label>Index Administration</label></legend>
|
||||
<dl>
|
||||
<dt>Do Local Indexing</dt>
|
||||
<dd>
|
||||
<span class="info" style="float:right"><img src="/env/grafics/i16.gif" width="16" height="16" alt="info"/><span style="right:0px;">
|
||||
This enables indexing of the wepages the crawler will download. This should be switched on by default, unless you want to crawl only to fill the
|
||||
Document Cache without indexing.
|
||||
</span></span>
|
||||
<label for="indexText">index text</label>:
|
||||
<input type="checkbox" name="indexText" id="indexText" #(indexingTextChecked)#::checked="checked"#(/indexingTextChecked)# />
|
||||
<label for="indexMedia">index media</label>:
|
||||
<input type="checkbox" name="indexMedia" id="indexMedia" #(indexingMediaChecked)#::checked="checked"#(/indexingMediaChecked)# />
|
||||
</td>
|
||||
<td>
|
||||
This enables indexing of the wepages the crawler will download. This should be switched on by default, unless you want to crawl only to fill the
|
||||
Document Cache without indexing.
|
||||
</td>
|
||||
</tr>
|
||||
<tr valign="top" class="TableCellDark">
|
||||
<td><label for="crawlOrder">Do Remote Indexing</label>:</td>
|
||||
<td>
|
||||
</dd>
|
||||
|
||||
<dt><label for="crawlOrder">Do Remote Indexing</label></dt>
|
||||
<dd>
|
||||
<span class="info" style="float:right"><img src="/env/grafics/i16.gif" width="16" height="16" alt="info"/><span style="right:0px;">
|
||||
If checked, the crawler will contact other peers and use them as remote indexers for your crawl.
|
||||
If you need your crawling results locally, you should switch this off.
|
||||
Only senior and principal peers can initiate or receive remote crawls.
|
||||
<strong>A YaCyNews message will be created to inform all peers about a global crawl</strong>,
|
||||
so they can omit starting a crawl with the same start point.
|
||||
</span></span>
|
||||
<table border="0" cellpadding="2" cellspacing="0">
|
||||
<tr>
|
||||
<td>
|
||||
|
@@ -288,28 +296,23 @@
|
|||
</td>
|
||||
</tr>
|
||||
</table>
|
||||
</td>
|
||||
<td>
|
||||
If checked, the crawler will contact other peers and use them as remote indexers for your crawl.
|
||||
If you need your crawling results locally, you should switch this off.
|
||||
Only senior and principal peers can initiate or receive remote crawls.
|
||||
<strong>A YaCyNews message will be created to inform all peers about a global crawl</strong>,
|
||||
so they can omit starting a crawl with the same start point.
|
||||
</td>
|
||||
</tr>
|
||||
<tr valign="top" class="TableCellLight">
|
||||
<td><label for="collection">Add Crawl result to collection(s)</label>:</td>
|
||||
<td>
|
||||
</dd>
|
||||
|
||||
<dt><label for="collection">Add Crawl result to collection(s)</label></dt>
|
||||
<dd>
|
||||
<span class="info" style="float:right"><img src="/env/grafics/i16.gif" width="16" height="16" alt="info"/><span style="right:0px;">
|
||||
A crawl result can be tagged with names which are candidates for a collection request.
|
||||
These tags can be selected with the <a href="/gsa/search?q=www&site=#[collection]#">GSA interface</a> using the 'site' operator.
|
||||
To use this option, the 'collection_sxt'-field must be switched on in the <a href="/IndexFederated_p.html">Solr Schema</a>
|
||||
</span></span>
|
||||
<input name="collection" id="collection" type="text" size="60" maxlength="100" value="#[collection]#" #(collectionEnabled)#disabled="disabled"::#(/collectionEnabled)# />
|
||||
</td>
|
||||
<td>
|
||||
A crawl result can be tagged with names which are candidates for a collection request. These tags can be selected with the <a href="/gsa/search?q=www&site=#[collection]#">GSA interface</a> using the 'site' operator. To use this option, the 'collection_sxt'-field must be switched on in the <a href="/IndexFederated_p.html">Solr Schema</a>
|
||||
</td>
|
||||
</tr>
|
||||
<tr valign="top" class="TableCellSummary">
|
||||
<td colspan="5"><input type="submit" name="crawlingstart" value="Start New Crawl" class="submitready"/></td>
|
||||
</tr>
|
||||
</table>
|
||||
</dd>
|
||||
</dl>
|
||||
</fieldset>
|
||||
|
||||
<dt><input type="submit" name="crawlingstart" value="Start New Crawl Job" class="submitready"/></dt><dd></dd>
|
||||
</dl>
|
||||
</fieldset>
|
||||
</form>
|
||||
|
||||
#%env/templates/footer.template%#
|
||||
|
|
|
@@ -49,6 +49,8 @@ public class CrawlStartExpert_p {
|
|||
prop.put("mustnotmatch", CrawlProfile.MATCH_NEVER_STRING);
|
||||
prop.put("indexmustmatch", CrawlProfile.MATCH_ALL_STRING);
|
||||
prop.put("indexmustnotmatch", CrawlProfile.MATCH_NEVER_STRING);
|
||||
prop.put("indexcontentmustmatch", CrawlProfile.MATCH_ALL_STRING);
|
||||
prop.put("indexcontentmustnotmatch", CrawlProfile.MATCH_NEVER_STRING);
|
||||
prop.put("ipMustmatch", sb.getConfig("crawlingIPMustMatch", CrawlProfile.MATCH_ALL_STRING));
|
||||
prop.put("ipMustnotmatch", sb.getConfig("crawlingIPMustNotMatch", CrawlProfile.MATCH_NEVER_STRING));
|
||||
prop.put("countryMustMatch", sb.getConfig("crawlingCountryMustMatch", ""));
|
||||
|
|
|
@@ -212,6 +212,8 @@ public class Crawler_p {
|
|||
String crawlerNoDepthLimitMatch = post.get("crawlingDepthExtension", CrawlProfile.MATCH_NEVER_STRING);
|
||||
final String indexUrlMustMatch = post.get("indexmustmatch", CrawlProfile.MATCH_ALL_STRING);
|
||||
final String indexUrlMustNotMatch = post.get("indexmustnotmatch", CrawlProfile.MATCH_NEVER_STRING);
|
||||
final String indexContentMustMatch = post.get("indexcontentmustmatch", CrawlProfile.MATCH_ALL_STRING);
|
||||
final String indexContentMustNotMatch = post.get("indexcontentmustnotmatch", CrawlProfile.MATCH_NEVER_STRING);
|
||||
|
||||
final boolean crawlOrder = post.get("crawlOrder", "off").equals("on");
|
||||
env.setConfig("crawlOrder", crawlOrder);
|
||||
|
@@ -352,6 +354,8 @@ public class Crawler_p {
|
|||
crawlerNoDepthLimitMatch,
|
||||
indexUrlMustMatch,
|
||||
indexUrlMustNotMatch,
|
||||
indexContentMustMatch,
|
||||
indexContentMustNotMatch,
|
||||
newcrawlingdepth,
|
||||
directDocByURL,
|
||||
crawlingIfOlder,
|
||||
|
|
|
@@ -135,10 +135,12 @@ public class QuickCrawlLink_p {
|
|||
crawlingMustNotMatch, //crawlerUrlMustNotMatch
|
||||
CrawlProfile.MATCH_ALL_STRING, //crawlerIpMustMatch
|
||||
CrawlProfile.MATCH_NEVER_STRING, //crawlerIpMustNotMatch
|
||||
"", //crawlerCountryMustMatch
|
||||
CrawlProfile.MATCH_NEVER_STRING, //crawlerCountryMustMatch
|
||||
CrawlProfile.MATCH_NEVER_STRING, //crawlerNoDepthLimitMatch
|
||||
CrawlProfile.MATCH_ALL_STRING, //indexUrlMustMatch
|
||||
CrawlProfile.MATCH_NEVER_STRING, //indexUrlMustNotMatch
|
||||
CrawlProfile.MATCH_ALL_STRING, //indexContentMustMatch
|
||||
CrawlProfile.MATCH_NEVER_STRING, //indexContentMustNotMatch
|
||||
CrawlingDepth,
|
||||
true,
|
||||
60 * 24 * 30, // recrawlIfOlder (minutes); here: one month
|
||||
|
|
htroot/env/base.css (4, vendored)
|
@@ -97,7 +97,7 @@ td {
|
|||
|
||||
fieldset {
|
||||
margin:10px 5px;
|
||||
padding:10px;
|
||||
padding:2px 10px 2px 10px;
|
||||
}
|
||||
|
||||
legend {
|
||||
|
@@ -1009,7 +1009,7 @@ div#info:hover span {
|
|||
padding: 3px;
|
||||
color: #000000;
|
||||
background: #DDDDDD;
|
||||
text-align: center;
|
||||
text-align: left;
|
||||
border: 1px dashed black;
|
||||
z-index: 100;
|
||||
}
|
|
@@ -239,10 +239,12 @@ public final class CrawlSwitchboard {
|
|||
CrawlProfile.MATCH_NEVER_STRING, //crawlerUrlMustNotMatch
|
||||
CrawlProfile.MATCH_ALL_STRING, //crawlerIpMustMatch
|
||||
CrawlProfile.MATCH_NEVER_STRING, //crawlerIpMustNotMatch
|
||||
"", //crawlerCountryMustMatch
|
||||
CrawlProfile.MATCH_NEVER_STRING, //crawlerCountryMustMatch
|
||||
CrawlProfile.MATCH_NEVER_STRING, //crawlerNoDepthLimitMatch
|
||||
CrawlProfile.MATCH_ALL_STRING, //indexUrlMustMatch
|
||||
CrawlProfile.MATCH_NEVER_STRING, //indexUrlMustNotMatch
|
||||
CrawlProfile.MATCH_ALL_STRING, //indexContentMustMatch
|
||||
CrawlProfile.MATCH_NEVER_STRING, //indexContentMustNotMatch
|
||||
0 /*Integer.parseInt(getConfig(PROXY_PREFETCH_DEPTH, "0"))*/,
|
||||
true,
|
||||
CrawlProfile.getRecrawlDate(CRAWL_PROFILE_PROXY_RECRAWL_CYCLE),
|
||||
|
@@ -265,10 +267,12 @@ public final class CrawlSwitchboard {
|
|||
CrawlProfile.MATCH_NEVER_STRING, //crawlerUrlMustNotMatch
|
||||
CrawlProfile.MATCH_ALL_STRING, //crawlerIpMustMatch
|
||||
CrawlProfile.MATCH_NEVER_STRING, //crawlerIpMustNotMatch
|
||||
"", //crawlerCountryMustMatch
|
||||
CrawlProfile.MATCH_NEVER_STRING, //crawlerCountryMustMatch
|
||||
CrawlProfile.MATCH_NEVER_STRING, //crawlerNoDepthLimitMatch
|
||||
CrawlProfile.MATCH_ALL_STRING, //indexUrlMustMatch
|
||||
CrawlProfile.MATCH_NEVER_STRING, //indexUrlMustNotMatch
|
||||
CrawlProfile.MATCH_ALL_STRING, //indexContentMustMatch
|
||||
CrawlProfile.MATCH_NEVER_STRING, //indexContentMustNotMatch
|
||||
0,
|
||||
false,
|
||||
-1,
|
||||
|
@@ -291,10 +295,12 @@ public final class CrawlSwitchboard {
|
|||
CrawlProfile.MATCH_NEVER_STRING, //crawlerUrlMustNotMatch
|
||||
CrawlProfile.MATCH_ALL_STRING, //crawlerIpMustMatch
|
||||
CrawlProfile.MATCH_NEVER_STRING, //crawlerIpMustNotMatch
|
||||
"", //crawlerCountryMustMatch
|
||||
CrawlProfile.MATCH_NEVER_STRING, //crawlerCountryMustMatch
|
||||
CrawlProfile.MATCH_NEVER_STRING, //crawlerNoDepthLimitMatch
|
||||
CrawlProfile.MATCH_ALL_STRING, //indexUrlMustMatch
|
||||
CrawlProfile.MATCH_NEVER_STRING, //indexUrlMustNotMatch
|
||||
CrawlProfile.MATCH_ALL_STRING, //indexContentMustMatch
|
||||
CrawlProfile.MATCH_NEVER_STRING, //indexContentMustNotMatch
|
||||
0,
|
||||
false,
|
||||
CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SNIPPET_LOCAL_TEXT_RECRAWL_CYCLE),
|
||||
|
@@ -317,10 +323,12 @@ public final class CrawlSwitchboard {
|
|||
CrawlProfile.MATCH_NEVER_STRING, //crawlerUrlMustNotMatch
|
||||
CrawlProfile.MATCH_ALL_STRING, //crawlerIpMustMatch
|
||||
CrawlProfile.MATCH_NEVER_STRING, //crawlerIpMustNotMatch
|
||||
"", //crawlerCountryMustMatch
|
||||
CrawlProfile.MATCH_NEVER_STRING, //crawlerCountryMustMatch
|
||||
CrawlProfile.MATCH_NEVER_STRING, //crawlerNoDepthLimitMatch
|
||||
CrawlProfile.MATCH_ALL_STRING, //indexUrlMustMatch
|
||||
CrawlProfile.MATCH_NEVER_STRING, //indexUrlMustNotMatch
|
||||
CrawlProfile.MATCH_ALL_STRING, //indexContentMustMatch
|
||||
CrawlProfile.MATCH_NEVER_STRING, //indexContentMustNotMatch
|
||||
0,
|
||||
false,
|
||||
CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT_RECRAWL_CYCLE),
|
||||
|
@@ -344,10 +352,12 @@ public final class CrawlSwitchboard {
|
|||
CrawlProfile.MATCH_NEVER_STRING, //crawlerUrlMustNotMatch
|
||||
CrawlProfile.MATCH_ALL_STRING, //crawlerIpMustMatch
|
||||
CrawlProfile.MATCH_NEVER_STRING, //crawlerIpMustNotMatch
|
||||
"", //crawlerCountryMustMatch
|
||||
CrawlProfile.MATCH_NEVER_STRING, //crawlerCountryMustMatch
|
||||
CrawlProfile.MATCH_NEVER_STRING, //crawlerNoDepthLimitMatch
|
||||
CrawlProfile.MATCH_ALL_STRING, //indexUrlMustMatch
|
||||
CrawlProfile.MATCH_NEVER_STRING, //indexUrlMustNotMatch
|
||||
CrawlProfile.MATCH_ALL_STRING, //indexContentMustMatch
|
||||
CrawlProfile.MATCH_NEVER_STRING, //indexContentMustNotMatch
|
||||
0,
|
||||
false,
|
||||
CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA_RECRAWL_CYCLE),
|
||||
|
@@ -370,10 +380,12 @@ public final class CrawlSwitchboard {
|
|||
CrawlProfile.MATCH_NEVER_STRING, //crawlerUrlMustNotMatch
|
||||
CrawlProfile.MATCH_ALL_STRING, //crawlerIpMustMatch
|
||||
CrawlProfile.MATCH_NEVER_STRING, //crawlerIpMustNotMatch
|
||||
"", //crawlerCountryMustMatch
|
||||
CrawlProfile.MATCH_NEVER_STRING, //crawlerCountryMustMatch
|
||||
CrawlProfile.MATCH_NEVER_STRING, //crawlerNoDepthLimitMatch
|
||||
CrawlProfile.MATCH_ALL_STRING, //indexUrlMustMatch
|
||||
CrawlProfile.MATCH_NEVER_STRING, //indexUrlMustNotMatch
|
||||
CrawlProfile.MATCH_ALL_STRING, //indexContentMustMatch
|
||||
CrawlProfile.MATCH_NEVER_STRING, //indexContentMustNotMatch
|
||||
0,
|
||||
false,
|
||||
CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA_RECRAWL_CYCLE),
|
||||
|
@@ -396,10 +408,12 @@ public final class CrawlSwitchboard {
|
|||
CrawlProfile.MATCH_NEVER_STRING, //crawlerUrlMustNotMatch
|
||||
CrawlProfile.MATCH_ALL_STRING, //crawlerIpMustMatch
|
||||
CrawlProfile.MATCH_NEVER_STRING, //crawlerIpMustNotMatch
|
||||
"", //crawlerCountryMustMatch
|
||||
CrawlProfile.MATCH_NEVER_STRING, //crawlerCountryMustMatch
|
||||
CrawlProfile.MATCH_NEVER_STRING, //crawlerNoDepthLimitMatch
|
||||
CrawlProfile.MATCH_ALL_STRING, //indexUrlMustMatch
|
||||
CrawlProfile.MATCH_NEVER_STRING, //indexUrlMustNotMatch
|
||||
CrawlProfile.MATCH_ALL_STRING, //indexContentMustMatch
|
||||
CrawlProfile.MATCH_NEVER_STRING, //indexContentMustNotMatch
|
||||
0,
|
||||
false,
|
||||
CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SURROGATE_RECRAWL_CYCLE),
|
||||
|
|
|
@@ -76,11 +76,14 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
|
|||
public static final String CRAWLER_URL_NODEPTHLIMITMATCH = "crawlerNoLimitURLMustMatch";
|
||||
public static final String INDEXING_URL_MUSTMATCH = "indexURLMustMatch";
|
||||
public static final String INDEXING_URL_MUSTNOTMATCH = "indexURLMustNotMatch";
|
||||
public static final String INDEXING_CONTENT_MUSTMATCH = "indexContentMustMatch";
|
||||
public static final String INDEXING_CONTENT_MUSTNOTMATCH = "indexContentMustNotMatch";
|
||||
|
||||
private Pattern crawlerurlmustmatch = null, crawlerurlmustnotmatch = null;
|
||||
private Pattern crawleripmustmatch = null, crawleripmustnotmatch = null;
|
||||
private Pattern crawlernodepthlimitmatch = null;
|
||||
private Pattern indexurlmustmatch = null, indexurlmustnotmatch = null;
|
||||
private Pattern indexcontentmustmatch = null, indexcontentmustnotmatch = null;
|
||||
|
||||
private final Map<String, AtomicInteger> doms;
|
||||
|
||||
|
@@ -96,6 +99,8 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
|
|||
* @param crawlerNoDepthLimitMatch if matches, no depth limit is applied to the crawler
|
||||
* @param indexUrlMustMatch URLs which do not match this regex will be ignored for indexing
|
||||
* @param indexUrlMustNotMatch URLs which match this regex will be ignored for indexing
|
||||
* @param indexContentMustMatch content which do not match this regex will be ignored for indexing
|
||||
* @param indexContentMustNotMatch content which match this regex will be ignored for indexing
|
||||
* @param depth height of the tree which will be created by the crawler
|
||||
* @param directDocByURL if true, then linked documents that cannot be parsed are indexed as document
|
||||
* @param recrawlIfOlder documents which have been indexed in the past will
|
||||
|
@@ -118,6 +123,7 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
|
|||
final String crawlerIpMustMatch, final String crawlerIpMustNotMatch,
|
||||
final String crawlerCountryMustMatch, final String crawlerNoDepthLimitMatch,
|
||||
final String indexUrlMustMatch, final String indexUrlMustNotMatch,
|
||||
final String indexContentMustMatch, final String indexContentMustNotMatch,
|
||||
final int depth,
|
||||
final boolean directDocByURL,
|
||||
final long recrawlIfOlder /*date*/,
|
||||
|
@@ -146,6 +152,8 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
|
|||
put(CRAWLER_URL_NODEPTHLIMITMATCH, (crawlerNoDepthLimitMatch == null) ? CrawlProfile.MATCH_NEVER_STRING : crawlerNoDepthLimitMatch);
|
||||
put(INDEXING_URL_MUSTMATCH, (indexUrlMustMatch == null) ? CrawlProfile.MATCH_NEVER_STRING : indexUrlMustMatch);
|
||||
put(INDEXING_URL_MUSTNOTMATCH, (indexUrlMustNotMatch == null) ? CrawlProfile.MATCH_NEVER_STRING : indexUrlMustNotMatch);
|
||||
put(INDEXING_CONTENT_MUSTMATCH, (indexContentMustMatch == null) ? CrawlProfile.MATCH_NEVER_STRING : indexContentMustMatch);
|
||||
put(INDEXING_CONTENT_MUSTNOTMATCH, (indexContentMustNotMatch == null) ? CrawlProfile.MATCH_NEVER_STRING : indexContentMustNotMatch);
|
||||
put(DEPTH, depth);
|
||||
put(DIRECT_DOC_BY_URL, directDocByURL);
|
||||
put(RECRAWL_IF_OLDER, recrawlIfOlder);
|
||||
|
@@ -277,7 +285,7 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
|
|||
if (this.crawlerurlmustmatch == null) {
|
||||
final String r = get(CRAWLER_URL_MUSTMATCH);
|
||||
try {
|
||||
this.crawlerurlmustmatch = (r == null || r.equals(CrawlProfile.MATCH_ALL_STRING)) ? CrawlProfile.MATCH_ALL_PATTERN : Pattern.compile(r);
|
||||
this.crawlerurlmustmatch = (r == null || r.equals(CrawlProfile.MATCH_ALL_STRING)) ? CrawlProfile.MATCH_ALL_PATTERN : Pattern.compile(r, Pattern.CASE_INSENSITIVE);
|
||||
} catch (PatternSyntaxException e) { this.crawlerurlmustmatch = CrawlProfile.MATCH_NEVER_PATTERN; }
|
||||
}
|
||||
return this.crawlerurlmustmatch;
|
||||
|
@@ -291,7 +299,7 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
|
|||
if (this.crawlerurlmustnotmatch == null) {
|
||||
final String r = get(CRAWLER_URL_MUSTNOTMATCH);
|
||||
try {
|
||||
this.crawlerurlmustnotmatch = (r == null || r.equals(CrawlProfile.MATCH_NEVER_STRING)) ? CrawlProfile.MATCH_NEVER_PATTERN : Pattern.compile(r);
|
||||
this.crawlerurlmustnotmatch = (r == null || r.equals(CrawlProfile.MATCH_NEVER_STRING)) ? CrawlProfile.MATCH_NEVER_PATTERN : Pattern.compile(r, Pattern.CASE_INSENSITIVE);
|
||||
} catch (PatternSyntaxException e) { this.crawlerurlmustnotmatch = CrawlProfile.MATCH_NEVER_PATTERN; }
|
||||
}
|
||||
return this.crawlerurlmustnotmatch;
|
||||
|
@@ -305,7 +313,7 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
|
|||
if (this.crawleripmustmatch == null) {
|
||||
final String r = get(CRAWLER_IP_MUSTMATCH);
|
||||
try {
|
||||
this.crawleripmustmatch = (r == null || r.equals(CrawlProfile.MATCH_ALL_STRING)) ? CrawlProfile.MATCH_ALL_PATTERN : Pattern.compile(r);
|
||||
this.crawleripmustmatch = (r == null || r.equals(CrawlProfile.MATCH_ALL_STRING)) ? CrawlProfile.MATCH_ALL_PATTERN : Pattern.compile(r, Pattern.CASE_INSENSITIVE);
|
||||
} catch (PatternSyntaxException e) { this.crawleripmustmatch = CrawlProfile.MATCH_NEVER_PATTERN; }
|
||||
}
|
||||
return this.crawleripmustmatch;
|
||||
|
@@ -319,7 +327,7 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
|
|||
if (this.crawleripmustnotmatch == null) {
|
||||
final String r = get(CRAWLER_IP_MUSTNOTMATCH);
|
||||
try {
|
||||
this.crawleripmustnotmatch = (r == null || r.equals(CrawlProfile.MATCH_NEVER_STRING)) ? CrawlProfile.MATCH_NEVER_PATTERN : Pattern.compile(r);
|
||||
this.crawleripmustnotmatch = (r == null || r.equals(CrawlProfile.MATCH_NEVER_STRING)) ? CrawlProfile.MATCH_NEVER_PATTERN : Pattern.compile(r, Pattern.CASE_INSENSITIVE);
|
||||
} catch (PatternSyntaxException e) { this.crawleripmustnotmatch = CrawlProfile.MATCH_NEVER_PATTERN; }
|
||||
}
|
||||
return this.crawleripmustnotmatch;
|
||||
|
@@ -346,7 +354,7 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
|
|||
if (this.crawlernodepthlimitmatch == null) {
|
||||
final String r = get(CRAWLER_URL_NODEPTHLIMITMATCH);
|
||||
try {
|
||||
this.crawlernodepthlimitmatch = (r == null || r.equals(CrawlProfile.MATCH_NEVER_STRING)) ? CrawlProfile.MATCH_NEVER_PATTERN : Pattern.compile(r);
|
||||
this.crawlernodepthlimitmatch = (r == null || r.equals(CrawlProfile.MATCH_NEVER_STRING)) ? CrawlProfile.MATCH_NEVER_PATTERN : Pattern.compile(r, Pattern.CASE_INSENSITIVE);
|
||||
} catch (PatternSyntaxException e) { this.crawlernodepthlimitmatch = CrawlProfile.MATCH_NEVER_PATTERN; }
|
||||
}
|
||||
return this.crawlernodepthlimitmatch;
|
||||
|
@@ -360,7 +368,7 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
|
|||
if (this.indexurlmustmatch == null) {
|
||||
final String r = get(INDEXING_URL_MUSTMATCH);
|
||||
try {
|
||||
this.indexurlmustmatch = (r == null || r.equals(CrawlProfile.MATCH_ALL_STRING)) ? CrawlProfile.MATCH_ALL_PATTERN : Pattern.compile(r);
|
||||
this.indexurlmustmatch = (r == null || r.equals(CrawlProfile.MATCH_ALL_STRING)) ? CrawlProfile.MATCH_ALL_PATTERN : Pattern.compile(r, Pattern.CASE_INSENSITIVE);
|
||||
} catch (PatternSyntaxException e) { this.indexurlmustmatch = CrawlProfile.MATCH_NEVER_PATTERN; }
|
||||
}
|
||||
return this.indexurlmustmatch;
|
||||
|
@@ -374,12 +382,40 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
|
|||
if (this.indexurlmustnotmatch == null) {
|
||||
final String r = get(INDEXING_URL_MUSTNOTMATCH);
|
||||
try {
|
||||
this.indexurlmustnotmatch = (r == null || r.equals(CrawlProfile.MATCH_NEVER_STRING)) ? CrawlProfile.MATCH_NEVER_PATTERN : Pattern.compile(r);
|
||||
this.indexurlmustnotmatch = (r == null || r.equals(CrawlProfile.MATCH_NEVER_STRING)) ? CrawlProfile.MATCH_NEVER_PATTERN : Pattern.compile(r, Pattern.CASE_INSENSITIVE);
|
||||
} catch (PatternSyntaxException e) { this.indexurlmustnotmatch = CrawlProfile.MATCH_NEVER_PATTERN; }
|
||||
}
|
||||
return this.indexurlmustnotmatch;
|
||||
}
|
||||
|
||||
/**
|
||||
* Gets the regex which must be matched by URLs in order to be indexed.
|
||||
* @return regex which must be matched
|
||||
*/
|
||||
public Pattern indexContentMustMatchPattern() {
|
||||
if (this.indexcontentmustmatch == null) {
|
||||
final String r = get(INDEXING_CONTENT_MUSTMATCH);
|
||||
try {
|
||||
this.indexcontentmustmatch = (r == null || r.equals(CrawlProfile.MATCH_ALL_STRING)) ? CrawlProfile.MATCH_ALL_PATTERN : Pattern.compile(r, Pattern.CASE_INSENSITIVE);
|
||||
} catch (PatternSyntaxException e) { this.indexcontentmustmatch = CrawlProfile.MATCH_NEVER_PATTERN; }
|
||||
}
|
||||
return this.indexcontentmustmatch;
|
||||
}
|
||||
|
||||
/**
|
||||
* Gets the regex which must not be matched by URLs in order to be indexed.
|
||||
* @return regex which must not be matched
|
||||
*/
|
||||
public Pattern indexContentMustNotMatchPattern() {
|
||||
if (this.indexcontentmustnotmatch == null) {
|
||||
final String r = get(INDEXING_CONTENT_MUSTNOTMATCH);
|
||||
try {
|
||||
this.indexcontentmustnotmatch = (r == null || r.equals(CrawlProfile.MATCH_NEVER_STRING)) ? CrawlProfile.MATCH_NEVER_PATTERN : Pattern.compile(r, Pattern.CASE_INSENSITIVE);
|
||||
} catch (PatternSyntaxException e) { this.indexcontentmustnotmatch = CrawlProfile.MATCH_NEVER_PATTERN; }
|
||||
}
|
||||
return this.indexcontentmustnotmatch;
|
||||
}
|
||||
|
||||
/**
|
||||
* Gets depth of crawl job (or height of the tree which will be
|
||||
* created by the crawler).
|
||||
|
|
|
@@ -172,10 +172,12 @@ public class YMarkCrawlStart extends HashMap<String,String>{
|
|||
urlMustNotMatch,
|
||||
CrawlProfile.MATCH_ALL_STRING,
|
||||
CrawlProfile.MATCH_NEVER_STRING,
|
||||
"",
|
||||
CrawlProfile.MATCH_NEVER_STRING,
|
||||
CrawlProfile.MATCH_ALL_STRING,
|
||||
CrawlProfile.MATCH_NEVER_STRING,
|
||||
CrawlProfile.MATCH_NEVER_STRING,
|
||||
CrawlProfile.MATCH_ALL_STRING,
|
||||
CrawlProfile.MATCH_NEVER_STRING,
|
||||
CrawlProfile.MATCH_ALL_STRING,
|
||||
CrawlProfile.MATCH_NEVER_STRING,
|
||||
depth,
|
||||
medialink,
|
||||
CrawlProfile.getRecrawlDate(CrawlSwitchboard.CRAWL_PROFILE_PROXY_RECRAWL_CYCLE),
|
||||
|
|
|
@@ -343,20 +343,19 @@ dc_rights
|
|||
|
||||
public String getTextString() {
|
||||
try {
|
||||
if (this.text == null) return "";
|
||||
if (this.text instanceof String) {
|
||||
return (String) this.text;
|
||||
if (this.text == null) {
|
||||
this.text = "";
|
||||
} else if (this.text instanceof InputStream) {
|
||||
return UTF8.String(FileUtils.read((InputStream) this.text));
|
||||
this.text = UTF8.String(FileUtils.read((InputStream) this.text));
|
||||
} else if (this.text instanceof File) {
|
||||
return UTF8.String(FileUtils.read((File) this.text));
|
||||
this.text = UTF8.String(FileUtils.read((File) this.text));
|
||||
} else if (this.text instanceof byte[]) {
|
||||
return UTF8.String((byte[]) this.text);
|
||||
this.text = UTF8.String((byte[]) this.text);
|
||||
} else if (this.text instanceof ByteArrayOutputStream) {
|
||||
return UTF8.String(((ByteArrayOutputStream) this.text).toByteArray());
|
||||
this.text = UTF8.String(((ByteArrayOutputStream) this.text).toByteArray());
|
||||
}
|
||||
assert false : this.text.getClass().toString();
|
||||
return null;
|
||||
assert this.text instanceof String : this.text.getClass().toString();
|
||||
return (String) this.text;
|
||||
} catch (final Exception e) {
|
||||
Log.logException(e);
|
||||
}
|
||||
|
|
|
@@ -2555,17 +2555,24 @@ public final class Switchboard extends serverSwitch {
|
|||
if (this.log.isInfo()) this.log.logInfo("Not Condensed Resource '" + urls + "': indexing of this media type not wanted by crawl profile");
|
||||
return new IndexingQueueEntry(in.queueEntry, in.documents, null);
|
||||
}
|
||||
if (!profile.indexUrlMustMatchPattern().matcher(urls).matches() ||
|
||||
profile.indexUrlMustNotMatchPattern().matcher(urls).matches() ) {
|
||||
if (!(profile.indexUrlMustMatchPattern() == CrawlProfile.MATCH_ALL_PATTERN || profile.indexUrlMustMatchPattern().matcher(urls).matches()) ||
|
||||
(profile.indexUrlMustNotMatchPattern() != CrawlProfile.MATCH_NEVER_PATTERN && profile.indexUrlMustNotMatchPattern().matcher(urls).matches())) {
|
||||
if (this.log.isInfo()) this.log.logInfo("Not Condensed Resource '" + urls + "': indexing prevented by regular expression on url; indexUrlMustMatchPattern = " + profile.indexUrlMustMatchPattern().pattern() + ", indexUrlMustNotMatchPattern = " + profile.indexUrlMustNotMatchPattern().pattern());
|
||||
addURLtoErrorDB(
|
||||
in.queueEntry.url(),
|
||||
in.queueEntry.referrerHash(),
|
||||
in.queueEntry.initiator(),
|
||||
in.queueEntry.name(),
|
||||
FailCategory.FINAL_PROCESS_CONTEXT,
|
||||
"indexing prevented by regular expression on url; indexUrlMustMatchPattern = " + profile.indexUrlMustMatchPattern().pattern() + ", indexUrlMustNotMatchPattern = " + profile.indexUrlMustNotMatchPattern().pattern());
|
||||
return new IndexingQueueEntry(in.queueEntry, in.documents, null);
|
||||
}
|
||||
|
||||
// check which files may take part in the indexing process
|
||||
final List<Document> doclist = new ArrayList<Document>();
|
||||
for ( final Document document : in.documents ) {
|
||||
if ( document.indexingDenied() ) {
|
||||
if ( this.log.isInfo() ) this.log.logInfo("Not Condensed Resource '" + urls + "': denied by document-attached noindexing rule");
|
||||
docloop: for (final Document document : in.documents) {
|
||||
if (document.indexingDenied()) {
|
||||
if (this.log.isInfo()) this.log.logInfo("Not Condensed Resource '" + urls + "': denied by document-attached noindexing rule");
|
||||
addURLtoErrorDB(
|
||||
in.queueEntry.url(),
|
||||
in.queueEntry.referrerHash(),
|
||||
|
@@ -2573,7 +2580,19 @@ public final class Switchboard extends serverSwitch {
|
|||
in.queueEntry.name(),
|
||||
FailCategory.FINAL_PROCESS_CONTEXT,
|
||||
"denied by document-attached noindexing rule");
|
||||
continue;
|
||||
continue docloop;
|
||||
}
|
||||
if (!(profile.indexContentMustMatchPattern() == CrawlProfile.MATCH_ALL_PATTERN || profile.indexContentMustMatchPattern().matcher(document.getTextString()).matches()) ||
|
||||
(profile.indexContentMustNotMatchPattern() != CrawlProfile.MATCH_NEVER_PATTERN && profile.indexContentMustNotMatchPattern().matcher(document.getTextString()).matches())) {
|
||||
if (this.log.isInfo()) this.log.logInfo("Not Condensed Resource '" + urls + "': indexing prevented by regular expression on content; indexContentMustMatchPattern = " + profile.indexContentMustMatchPattern().pattern() + ", indexContentMustNotMatchPattern = " + profile.indexContentMustNotMatchPattern().pattern());
|
||||
addURLtoErrorDB(
|
||||
in.queueEntry.url(),
|
||||
in.queueEntry.referrerHash(),
|
||||
in.queueEntry.initiator(),
|
||||
in.queueEntry.name(),
|
||||
FailCategory.FINAL_PROCESS_CONTEXT,
|
||||
"indexing prevented by regular expression on content; indexContentMustMatchPattern = " + profile.indexContentMustMatchPattern().pattern() + ", indexContentMustNotMatchPattern = " + profile.indexContentMustNotMatchPattern().pattern());
|
||||
continue docloop;
|
||||
}
|
||||
doclist.add(document);
|
||||
}
|
||||
|
|
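
A closing note on the Switchboard change above: the new content check compares the compiled patterns against the catch-all and catch-none constants before running any regex, so the default profiles created in CrawlSwitchboard (MATCH_ALL_STRING / MATCH_NEVER_STRING throughout) never pay for matching the full document text. A minimal sketch of that guard follows; the wrapper class and the concrete constant values are assumptions for illustration, while the boolean logic mirrors the diff.

    import java.util.regex.Pattern;

    final class ContentFilterGuard {
        // Stand-ins for CrawlProfile.MATCH_ALL_PATTERN / MATCH_NEVER_PATTERN; the exact
        // constant values in YaCy may differ, the identity comparison is what matters here.
        static final Pattern MATCH_ALL_PATTERN = Pattern.compile(".*");
        static final Pattern MATCH_NEVER_PATTERN = Pattern.compile("");

        // True when the document text passes both content filters; the identity checks
        // short-circuit so the regex only runs for non-default profiles.
        static boolean contentAllowed(final Pattern mustMatch, final Pattern mustNotMatch, final String text) {
            final boolean mustMatchOk = (mustMatch == MATCH_ALL_PATTERN) || mustMatch.matcher(text).matches();
            final boolean mustNotMatchOk = (mustNotMatch == MATCH_NEVER_PATTERN) || !mustNotMatch.matcher(text).matches();
            return mustMatchOk && mustNotMatchOk;
        }
    }

With both filters left at their defaults, both identity checks hit and the guard reduces to a constant true, so documents pass through unchanged.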