- added a new field for the regular expression in crawl start

- added the field in crawl profile
- adapted logging and error management
- adapted duplicate document detection
- added a new rule to the indexing process to reject non-matching
content
- full redesign of the expert crawl start servlet
The new filter field can now be seen in /CrawlStartExpert_p.html in the
section "Document Filter", under the subsection item "Filter on Content of
Document".
This commit is contained in:
Michael Peter Christen 2013-04-26 10:49:55 +02:00
parent c091000165
commit 25499eead5
11 changed files with 339 additions and 256 deletions

View File

@ -64,17 +64,19 @@ public class CrawlProfileEditor_p {
private static final List <eentry> labels = new ArrayList<eentry>();
static {
labels.add(new eentry(CrawlProfile.NAME, "Name", true, eentry.STRING));
labels.add(new eentry(CrawlProfile.COLLECTIONS, "Collections (comma-separated list)", false, eentry.STRING));
labels.add(new eentry(CrawlProfile.CRAWLER_URL_MUSTMATCH, "URL Must-Match Filter", false, eentry.STRING));
labels.add(new eentry(CrawlProfile.CRAWLER_URL_MUSTNOTMATCH, "URL Must-Not-Match Filter", false, eentry.STRING));
labels.add(new eentry(CrawlProfile.CRAWLER_IP_MUSTMATCH, "IP Must-Match Filter", false, eentry.STRING));
labels.add(new eentry(CrawlProfile.CRAWLER_IP_MUSTNOTMATCH, "IP Must-Not-Match Filter", false, eentry.STRING));
labels.add(new eentry(CrawlProfile.CRAWLER_COUNTRY_MUSTMATCH, "Country Must-Match Filter", false, eentry.STRING));
labels.add(new eentry(CrawlProfile.CRAWLER_URL_NODEPTHLIMITMATCH, "URL No-Depth-Limit Must-Match Filter", false, eentry.STRING));
labels.add(new eentry(CrawlProfile.INDEXING_URL_MUSTMATCH, "Indexing Must-Match Filter", false, eentry.STRING));
labels.add(new eentry(CrawlProfile.INDEXING_URL_MUSTNOTMATCH, "Indexing Must-Not-Match Filter", false, eentry.STRING));
labels.add(new eentry(CrawlProfile.CACHE_STRAGEGY, "Cache Strategy (NOCACHE,IFFRESH,IFEXIST,CACHEONLY)", false, eentry.STRING));
labels.add(new eentry(CrawlProfile.NAME, "Name", true, eentry.STRING));
labels.add(new eentry(CrawlProfile.COLLECTIONS, "Collections (comma-separated list)", false, eentry.STRING));
labels.add(new eentry(CrawlProfile.CRAWLER_URL_MUSTMATCH, "URL Must-Match Filter", false, eentry.STRING));
labels.add(new eentry(CrawlProfile.CRAWLER_URL_MUSTNOTMATCH, "URL Must-Not-Match Filter", false, eentry.STRING));
labels.add(new eentry(CrawlProfile.CRAWLER_IP_MUSTMATCH, "IP Must-Match Filter", false, eentry.STRING));
labels.add(new eentry(CrawlProfile.CRAWLER_IP_MUSTNOTMATCH, "IP Must-Not-Match Filter", false, eentry.STRING));
labels.add(new eentry(CrawlProfile.CRAWLER_COUNTRY_MUSTMATCH, "Country Must-Match Filter", false, eentry.STRING));
labels.add(new eentry(CrawlProfile.CRAWLER_URL_NODEPTHLIMITMATCH, "URL No-Depth-Limit Must-Match Filter", false, eentry.STRING));
labels.add(new eentry(CrawlProfile.INDEXING_URL_MUSTMATCH, "Indexing URL Must-Match Filter", false, eentry.STRING));
labels.add(new eentry(CrawlProfile.INDEXING_URL_MUSTNOTMATCH, "Indexing URL Must-Not-Match Filter", false, eentry.STRING));
labels.add(new eentry(CrawlProfile.INDEXING_CONTENT_MUSTMATCH, "Indexing Content Must-Match Filter", false, eentry.STRING));
labels.add(new eentry(CrawlProfile.INDEXING_CONTENT_MUSTNOTMATCH, "Indexing Content Must-Not-Match Filter",false, eentry.STRING));
labels.add(new eentry(CrawlProfile.CACHE_STRAGEGY, "Cache Strategy (NOCACHE,IFFRESH,IFEXIST,CACHEONLY)", false, eentry.STRING));
labels.add(new eentry(CrawlProfile.DEPTH, "Crawl Depth", false, eentry.INTEGER));
labels.add(new eentry(CrawlProfile.RECRAWL_IF_OLDER, "Recrawl If Older", false, eentry.INTEGER));
labels.add(new eentry(CrawlProfile.DOM_MAX_PAGES, "Domain Max. Pages", false, eentry.INTEGER));

View File

@ -36,119 +36,159 @@
</p>
<form id="Crawler" action="Crawler_p.html" method="post" enctype="multipart/form-data" accept-charset="UTF-8">
<table border="0" cellpadding="5" cellspacing="1">
<tr class="TableHeader">
<td><strong>Attribute</strong></td>
<td><strong>Value</strong></td>
<td><strong>Description</strong></td>
</tr>
<tr valign="top" class="TableCellSummary">
<td>Starting Point:</td>
<td>
<table cellpadding="0" cellspacing="0">
<tr>
<td width="160"><label for="url">One Start URL or a list of URLs:<br/>(must start with http:// https:// ftp:// smb:// file://)</label>:</td>
<td><input type="radio" name="crawlingMode" id="url" value="url" checked="checked" /></td>
<td>
<textarea name="crawlingURL" id="crawlingURL" cols="42" rows="3" size="41" onkeypress="changed()" onfocus="check('url')" >#[starturl]#</textarea>
&nbsp;
<span id="robotsOK"></span>
<span id="title"><br/></span>
<img id="ajax" src="/env/grafics/empty.gif" alt="empty" />
</td>
</tr>
<tr>
<td></td>
<td></td>
<td>
<input name="bookmarkTitle" id="bookmarkTitle" type="text" size="46" maxlength="256" value="" readonly="readonly" style="background:transparent; border:0px"/>
</td>
</tr>
<tr>
<td><label for="url"><span class="nobr">From Link-List of URL</span></label>:</td>
<td><input type="radio" name="crawlingMode" id="sitelist" value="sitelist" disabled="disabled" onclick="document.getElementById('Crawler').rangeDomain.checked = true;"/></td>
<td>
<div id="sitelistURLs"></div>
</td>
</tr>
<tr>
<td><label for="url"><span class="nobr">From Sitemap</span></label>:</td>
<td><input type="radio" name="crawlingMode" id="sitemap" value="sitemap" disabled="disabled"/></td>
<td>
<input name="sitemapURL" type="text" size="48" maxlength="256" value="" readonly="readonly"/>
</td>
</tr>
<tr>
<td><label for="file"><span class="nobr">From File (enter a path<br/>within your local file system)</span></label>:</td>
<td><input type="radio" name="crawlingMode" id="file" value="file" onclick="document.getElementById('Crawler').rangeDomain.checked = true;"/></td>
<td><input type="text" name="crawlingFile" size="48" onfocus="check('file')"/><!--<input type="file" name="crawlingFile" size="18" onfocus="check('file')"/>--></td>
</tr>
</table>
</td>
<td colspan="3">
Define the start-url(s) here. You can submit more than one URL, each line one URL please.
<fieldset>
<legend>
<label>Crawl Job</label>
</legend>
<p>A Crawl Job consists of one or more start points, crawl limitations and document freshness rules.</p>
<fieldset>
<legend><label>Start Point</label></legend>
<dl>
<dt>One Start URL or a list of URLs:<br/>(must start with http:// https:// ftp:// smb:// file://)</dt>
<dd>
<span class="info" style="float:right"><img src="/env/grafics/i16.gif" width="16" height="16" alt="info"/><span style="right:0px;">Define the start-url(s) here. You can submit more than one URL, each line one URL please.
Each of these URLs are the root for a crawl start, existing start URLs are always re-loaded.
Other already visited URLs are sorted out as "double", if they are not allowed using the re-crawl option.
</td>
</tr>
<tr valign="top" class="TableCellLight">
<td>Crawling Depth:</td>
<td>
<input name="crawlingDepth" id="crawlingDepth" type="text" size="2" maxlength="2" value="#[crawlingDepth]#" />&nbsp;&nbsp;&nbsp;
<input type="checkbox" name="directDocByURL" id="directDocByURL" #(directDocByURLChecked)#::checked="checked"#(/directDocByURLChecked)# />also all linked non-parsable documents<br/>
Unlimited crawl depth for URLs matching with: <input name="crawlingDepthExtension" id="crawlingDepthExtension" type="text" size="40" maxlength="100" value="#[crawlingDepthExtension]#" />
</td>
<td>
</span></span>
<input type="radio" align="top" name="crawlingMode" id="url" value="url" checked="checked" />
<textarea name="crawlingURL" id="crawlingURL" cols="64" rows="3" size="41" onkeypress="changed()" onfocus="check('url')" >#[starturl]#</textarea>
&nbsp;
<span id="robotsOK"></span>
<span id="title"><br/></span>
<img id="ajax" src="/env/grafics/empty.gif" alt="empty" />
</dd>
<dt></dt>
<dd>
<input name="bookmarkTitle" id="bookmarkTitle" type="text" size="46" maxlength="256" value="" readonly="readonly" style="background:transparent; border:0px"/>
</dd>
<dt>From Link-List of URL</dt>
<dd>
<input type="radio" name="crawlingMode" id="sitelist" value="sitelist" disabled="disabled" onclick="document.getElementById('Crawler').rangeDomain.checked = true;"/><br />
<div id="sitelistURLs"></div>
</dd>
<dt>From Sitemap</dt>
<dd>
<input type="radio" name="crawlingMode" id="sitemap" value="sitemap" disabled="disabled"/><input name="sitemapURL" type="text" size="71" maxlength="256" value="" readonly="readonly"/>
</dd>
<dt>From File (enter a path<br/>within your local file system)</dt>
<dd>
<input type="radio" name="crawlingMode" id="file" value="file" onclick="document.getElementById('Crawler').rangeDomain.checked = true;"/><input type="text" name="crawlingFile" size="71" maxlength="256" onfocus="check('file')"/><!--<input type="file" name="crawlingFile" size="18" onfocus="check('file')"/>-->
</dd>
</dl>
</fieldset>
<fieldset>
<legend><label>Crawler Filter</label></legend>
<p>These are limitations on the crawl stacker. The filters will be applied before a web page is loaded.</p>
<dl>
<dt>Crawling Depth</dt>
<dd>
<span class="info" style="float:right"><img src="/env/grafics/i16.gif" width="16" height="16" alt="info"/><span style="right:0px;">
This defines how often the Crawler will follow links (of links..) embedded in websites.
0 means that only the page you enter under "Starting Point" will be added
to the index. 2-4 is good for normal indexing. Values over 8 are not useful, since a depth-8 crawl will
index approximately 25.600.000.000 pages, maybe this is the whole WWW.
</td>
</tr>
<tr valign="top" class="TableCellDark">
<td><label for="mustmatch">Must-Match Filter</label>:</td>
<td>
<table border="0">
<tr><td width="160">on URLs for Crawling:<br/>
<input type="radio" name="range" id="rangeDomain" value="domain" onclick="document.getElementById('mustmatch').disabled=true;document.getElementById('deleteoldon').disabled=false;document.getElementById('deleteoldage').disabled=false;document.getElementById('deleteoldon').checked=true;"/>Restrict to start domain(s)<br />
<input type="radio" name="range" id="rangeSubpath" value="subpath" onclick="document.getElementById('mustmatch').disabled=true;document.getElementById('deleteoldon').disabled=false;document.getElementById('deleteoldage').disabled=false;document.getElementById('deleteoldon').checked=true;" />Restrict to sub-path(s)<br />
<input type="radio" name="range" id="rangeWide" value="wide" checked="checked" onclick="document.getElementById('mustmatch').disabled=false;document.getElementById('deleteoldoff').checked=true;document.getElementById('deleteoldon').disabled=true;document.getElementById('deleteoldage').disabled=true;"/>Use filter</td>
<td valign="bottom"><input name="mustmatch" id="mustmatch" type="text" size="55" maxlength="100000" value="#[mustmatch]#" onclick="document.getElementById('deleteoldon').disabled=false;document.getElementById('deleteoldage').disabled=false"/></td></tr>
<tr><td>on IPs for Crawling:</td><td><input name="ipMustmatch" id="ipMustmatch" type="text" size="55" maxlength="100000" value="#[ipMustmatch]#" /></td></tr>
<tr><td>on URLs for Indexing</td><td><input name="indexmustmatch" id="indexmustmatch" type="text" size="55" maxlength="100000" value="#[indexmustmatch]#" /></td></tr>
</table>
</td>
<td>
The filter is a <b><a href="http://download.oracle.com/javase/6/docs/api/java/util/regex/Pattern.html">regular expression</a></b>
that <b>must match</b> with the URLs which are used to be crawled; default is 'catch all'.
Example: to allow only urls that contain the word 'science', set the filter to '.*science.*'.
</span></span>
<input name="crawlingDepth" id="crawlingDepth" type="text" size="2" maxlength="2" value="#[crawlingDepth]#" />&nbsp;&nbsp;&nbsp;
<input type="checkbox" name="directDocByURL" id="directDocByURL" #(directDocByURLChecked)#::checked="checked"#(/directDocByURLChecked)# />also all linked non-parsable documents
</dd>
<dt>Unlimited crawl depth for URLs matching with</dt>
<dd>
<input name="crawlingDepthExtension" id="crawlingDepthExtension" type="text" size="40" maxlength="100" value="#[crawlingDepthExtension]#" />
</dd>
<dt>Maximum Pages per Domain</dt>
<dd>
<span class="info" style="float:right"><img src="/env/grafics/i16.gif" width="16" height="16" alt="info"/><span style="right:0px;">
You can limit the maximum number of pages that are fetched and indexed from a single domain with this option.
You can combine this limitation with the 'Auto-Dom-Filter', so that the limit is applied to all the domains within
the given depth. Domains outside the given depth are then sorted-out anyway.
</span></span>
<label for="crawlingDomMaxCheck">Use</label>:
<input type="checkbox" name="crawlingDomMaxCheck" id="crawlingDomMaxCheck" #(crawlingDomMaxCheck)#::checked="checked"#(/crawlingDomMaxCheck)# />&nbsp;&nbsp;
<label for="crawlingDomMaxPages">Page-Count</label>:
<input name="crawlingDomMaxPages" id="crawlingDomMaxPages" type="text" size="6" maxlength="6" value="#[crawlingDomMaxPages]#" />
</dd>
<dt><label for="crawlingQ">Accept URLs with '?' / dynamic URLs</label></dt>
<dd>
<span class="info" style="float:right"><img src="/env/grafics/i16.gif" width="16" height="16" alt="info"/><span style="right:0px;">
A questionmark is usually a hint for a dynamic page. URLs pointing to dynamic content should usually not be crawled.
However, there are sometimes web pages with static content that
is accessed with URLs containing question marks. If you are unsure, do not check this to avoid crawl loops.
</span></span>
<input type="checkbox" name="crawlingQ" id="crawlingQ" #(crawlingQChecked)#::checked="checked"#(/crawlingQChecked)# />
</dd>
<dt>Load Filter on URLs</dt>
<dd><span class="info" style="float:right"><img src="/env/grafics/i16.gif" width="16" height="16" alt="info"/><span style="right:0px;">
The filter is a <b><a href="http://download.oracle.com/javase/6/docs/api/java/util/regex/Pattern.html">regular expression</a></b>.
Example: to allow only urls that contain the word 'science', set the must-match filter to '.*science.*'.
You can also use an automatic domain-restriction to fully crawl a single domain.
</td>
</tr>
<tr valign="top" class="TableCellLight">
<td><label for="mustnotmatch">Must-Not-Match Filter</label>:</td>
<td>
</span></span>
<table border="0">
<tr><td width="160">on URLs for Crawling:</td><td><input name="mustnotmatch" id="mustnotmatch" type="text" size="55" maxlength="100000" value="#[mustnotmatch]#" /></td></tr>
<tr><td>on IPs for Crawling:</td><td><input name="ipMustnotmatch" id="ipMustnotmatch" type="text" size="55" maxlength="100000" value="#[ipMustnotmatch]#" /></td></tr>
<tr><td>on URLs for Indexing:</td><td><input name="indexmustnotmatch" id="indexmustnotmatch" type="text" size="55" maxlength="100000" value="#[indexmustnotmatch]#" /></td></tr>
<tr><td width="110"><img src="/env/grafics/plus.gif"> must-match</td><td></td></tr>
<tr><td colspan="2"><input type="radio" name="range" id="rangeDomain" value="domain" onclick="document.getElementById('mustmatch').disabled=true;document.getElementById('deleteoldon').disabled=false;document.getElementById('deleteoldage').disabled=false;document.getElementById('deleteoldon').checked=true;"/>Restrict to start domain(s)</td></tr>
<tr><td colspan="2"><input type="radio" name="range" id="rangeSubpath" value="subpath" onclick="document.getElementById('mustmatch').disabled=true;document.getElementById('deleteoldon').disabled=false;document.getElementById('deleteoldage').disabled=false;document.getElementById('deleteoldon').checked=true;" />Restrict to sub-path(s)</td></tr>
<tr><td><input type="radio" name="range" id="rangeWide" value="wide" checked="checked" onclick="document.getElementById('mustmatch').disabled=false;document.getElementById('deleteoldoff').checked=true;document.getElementById('deleteoldon').disabled=true;document.getElementById('deleteoldage').disabled=true;"/>Use filter</td>
<td valign="bottom"><input name="mustmatch" id="mustmatch" type="text" size="55" maxlength="100000" value="#[mustmatch]#" onclick="document.getElementById('deleteoldon').disabled=false;document.getElementById('deleteoldage').disabled=false"/></td></tr>
<tr><td><img src="/env/grafics/minus.gif"> must-not-match</td><td><input name="mustnotmatch" id="mustnotmatch" type="text" size="55" maxlength="100000" value="#[mustnotmatch]#" /></td></tr>
</table>
</td>
<td>
</dd>
<dt>Load Filter on IPs</dt>
<dd>
<table border="0">
<tr><td width="110"><img src="/env/grafics/plus.gif"> must-match</td><td><input name="ipMustmatch" id="ipMustmatch" type="text" size="55" maxlength="100000" value="#[ipMustmatch]#" /></td></tr>
<tr><td><img src="/env/grafics/minus.gif"> must-not-match</td><td><input name="ipMustnotmatch" id="ipMustnotmatch" type="text" size="55" maxlength="100000" value="#[ipMustnotmatch]#" /></td></tr>
</table>
</dd>
<dt><label for="crawlingCountryMustMatch">Must-Match List for Country Codes</label>
</dt>
<dd><span class="info" style="float:right"><img src="/env/grafics/i16.gif" width="16" height="16" alt="info"/><span style="right:0px;">
Crawls can be restricted to specific countries. This uses the country code that can be computed from
the IP of the server that hosts the page. The filter is not a regular expressions but a list of country codes, separated by comma.
</span></span>
<input type="radio" name="countryMustMatchSwitch" id="countryMustMatchSwitch" value="false" checked="checked" />no country code restriction<br />
<input type="radio" name="countryMustMatchSwitch" id="countryMustMatchSwitch" value="true" />Use filter&nbsp;&nbsp;
<input name="countryMustMatchList" id="countryMustMatchList" type="text" size="60" maxlength="256" value="#[countryMustMatch]#" />
</dd>
</dl>
</fieldset>
<fieldset>
<legend><label>Document Filter</label></legend>
<p>These are limitations on index feeder. The filters will be applied after a web page was loaded.</p>
<dl>
<dt>Filter on URLs</dt>
<dd>
<span class="info" style="float:right"><img src="/env/grafics/i16.gif" width="16" height="16" alt="info"/><span style="right:0px;">
The filter is a <b><a href="http://download.oracle.com/javase/6/docs/api/java/util/regex/Pattern.html">regular expression</a></b>
that <b>must not match</b> with the URLs to allow that the content of the url is indexed.
</td>
</tr>
<tr valign="top" class="TableCellDark">
<td>Document Deletion</td>
<td>
<dl>
<dt>No Deletion<input type="radio" name="deleteold" id="deleteoldoff" value="off" checked="checked"/></dt>
<dd>Do not delete any document before the crawl is started.</dd>
<dt>Delete sub-path<input type="radio" name="deleteold" id="deleteoldon" value="on" disabled="true"/></dt>
<dd>For each host in the start url list, delete all documents (in the given subpath) from that host.</dd>
<dt>Delete only old<input type="radio" name="deleteold" id="deleteoldage" value="age" disabled="true"/></dt>
<dd>Treat documents that are loaded
</span></span>
<table border="0">
<tr><td width="110"><img src="/env/grafics/plus.gif"> must-match</td><td><input name="indexmustmatch" id="indexmustmatch" type="text" size="55" maxlength="100000" value="#[indexmustmatch]#" /></td></tr>
<tr><td><img src="/env/grafics/minus.gif"> must-not-match</td><td><input name="indexmustnotmatch" id="indexmustnotmatch" type="text" size="55" maxlength="100000" value="#[indexmustnotmatch]#" /></td></tr>
</table>
</dd>
<dt>Filter on Content of Document<br/>(all visible text, including camel-case-tokenized url and title)</dt>
<dd>
<table border="0">
<tr><td width="110"><img src="/env/grafics/plus.gif"> must-match</td><td><input name="indexcontentmustmatch" id="indexcontentmustmatch" type="text" size="55" maxlength="100000" value="#[indexcontentmustmatch]#" /></td></tr>
<tr><td><img src="/env/grafics/minus.gif"> must-not-match</td><td><input name="indexcontentmustnotmatch" id="indexcontentmustnotmatch" type="text" size="55" maxlength="100000" value="#[indexcontentmustnotmatch]#" /></td></tr>
</table>
</dd>
</dl>
</fieldset>
<fieldset>
<legend><label>Clean-Up before Crawl Start</label></legend>
<dl>
<dt>No Deletion</dt>
<dd><span class="info" style="float:right"><img src="/env/grafics/i16.gif" width="16" height="16" alt="info"/><span style="right:0px;">
After a crawl was done in the past, document may become stale and eventually they are also deleted on the target host.
To remove old files from the search index it is not sufficient to just consider them for re-load but it may be necessary
to delete them because they simply do not exist any more. Use this in combination with re-crawl while this time should be longer.
</span></span><input type="radio" name="deleteold" id="deleteoldoff" value="off" checked="checked"/>Do not delete any document before the crawl is started.</dd>
<dt>Delete sub-path</dt>
<dd><input type="radio" name="deleteold" id="deleteoldon" value="on" disabled="true"/>For each host in the start url list, delete all documents (in the given subpath) from that host.</dd>
<dt>Delete only old</dt>
<dd><input type="radio" name="deleteold" id="deleteoldage" value="age" disabled="true"/>Treat documents that are loaded
<select name="deleteIfOlderNumber" id="deleteIfOlderNumber">
<option value="1">1</option><option value="2">2</option><option value="3">3</option>
<option value="4">4</option><option value="5">5</option><option value="6">6</option>
@ -164,22 +204,19 @@
<option value="hour">hours</option>
</select> ago as stale and delete them before the crawl is started.
</dd>
</dl>
</td>
<td>
After a crawl was done in the past, document may become stale and eventually they are also deleted on the target host.
To remove old files from the search index it is not sufficient to just consider them for re-load but it may be necessary
to delete them because they simply do not exist any more. Use this in combination with re-crawl while this time should be longer.
</td>
</tr>
<tr valign="top" class="TableCellLight">
<td>Document Double-Check</td>
<td>
<dl>
<dt>No&nbsp;Doubles<input type="radio" name="recrawl" value="nodoubles" checked="checked"/></dt>
<dd>Never load any page that is already known.<br/>Only the start-url may be loaded again.</dd>
<dt>Re-load<input type="radio" name="recrawl" value="reload"/></dt>
<dd>Treat documents that are loaded
</dl>
</fieldset>
<fieldset>
<legend><label>Double-Check Rules</label></legend>
<dl>
<dt>No&nbsp;Doubles</dt>
<dd><span class="info" style="float:right"><img src="/env/grafics/i16.gif" width="16" height="16" alt="info"/><span style="right:0px;">
A web crawl performs a double-check on all links found in the internet against the internal database. If the same url is found again,
then the url is treated as double when you check the 'no doubles' option. A url may be loaded again when it has reached a specific age,
to use that check the 're-load' option.
</span></span><input type="radio" name="recrawl" value="nodoubles" checked="checked"/>Never load any page that is already known. Only the start-url may be loaded again.</dd>
<dt>Re-load</dt>
<dd><input type="radio" name="recrawl" value="reload"/>Treat documents that are loaded
<select name="reloadIfOlderNumber" id="reloadIfOlderNumber">
<option value="1">1</option><option value="2">2</option><option value="3">3</option>
<option value="4">4</option><option value="5">5</option><option value="6">6</option>
@ -195,87 +232,58 @@
<option value="hour">hours</option>
</select> ago as stale and load them again. If they are younger, they are ignored.
</dd>
</dl>
</td>
<td>
A web crawl performs a double-check on all links found in the internet against the internal database. If the same url is found again,
then the url is treated as double when you check the 'no doubles' option. A url may be loaded again when it has reached a specific age,
to use that check the 're-load' option.
</td>
</tr>
<tr valign="top" class="TableCellDark">
<td><label for="crawlingCountryMustMatch">Must-Match List for Country Codes</label>:</td>
<td>
<input type="radio" name="countryMustMatchSwitch" id="countryMustMatchSwitch" value="true" />Use filter&nbsp;&nbsp;
<input name="countryMustMatchList" id="countryMustMatchList" type="text" size="60" maxlength="256" value="#[countryMustMatch]#" /><br />
<input type="radio" name="countryMustMatchSwitch" id="countryMustMatchSwitch" value="false" checked="checked" />no country code restriction
</td>
<td>
Crawls can be restricted to specific countries. This uses the country code that can be computed from
the IP of the server that hosts the page. The filter is not a regular expressions but a list of country codes, separated by comma.
</td>
</tr>
<tr valign="top" class="TableCellLight">
<td>Maximum Pages per Domain:</td>
<td>
<label for="crawlingDomMaxCheck">Use</label>:
<input type="checkbox" name="crawlingDomMaxCheck" id="crawlingDomMaxCheck" #(crawlingDomMaxCheck)#::checked="checked"#(/crawlingDomMaxCheck)# />&nbsp;&nbsp;
<label for="crawlingDomMaxPages">Page-Count</label>:
<input name="crawlingDomMaxPages" id="crawlingDomMaxPages" type="text" size="6" maxlength="6" value="#[crawlingDomMaxPages]#" />
</td>
<td>
You can limit the maximum number of pages that are fetched and indexed from a single domain with this option.
You can combine this limitation with the 'Auto-Dom-Filter', so that the limit is applied to all the domains within
the given depth. Domains outside the given depth are then sorted-out anyway.
</td>
</tr>
<tr valign="top" class="TableCellDark">
<td><label for="crawlingQ">Accept URLs with '?' / dynamic URLs</label>:</td>
<td><input type="checkbox" name="crawlingQ" id="crawlingQ" #(crawlingQChecked)#::checked="checked"#(/crawlingQChecked)# /></td>
<td>
A questionmark is usually a hint for a dynamic page. URLs pointing to dynamic content should usually not be crawled. However, there are sometimes web pages with static content that
is accessed with URLs containing question marks. If you are unsure, do not check this to avoid crawl loops.
</td>
</tr>
<tr valign="top" class="TableCellLight">
<td><label for="storeHTCache">Store to Web Cache</label>:</td>
<td><input type="checkbox" name="storeHTCache" id="storeHTCache" #(storeHTCacheChecked)#::checked="checked"#(/storeHTCacheChecked)# /></td>
<td>
</dl>
</fieldset>
<fieldset>
<legend><label>Document Cache</label></legend>
<dl><dt><label for="storeHTCache">Store to Web Cache</label></dt>
<dd>
<span class="info" style="float:right"><img src="/env/grafics/i16.gif" width="16" height="16" alt="info"/><span style="right:0px;">
This option is used by default for proxy prefetch, but is not needed for explicit crawling.
</td>
</tr>
<tr valign="top" class="TableCellDark">
<td><label for="mustmatch">Policy for usage of Web Cache</label>:</td>
<td>
<input type="radio" name="cachePolicy" value="nocache" />no&nbsp;cache&nbsp;&nbsp;&nbsp;
<input type="radio" name="cachePolicy" value="iffresh" checked="checked" />if&nbsp;fresh&nbsp;&nbsp;&nbsp;
<input type="radio" name="cachePolicy" value="ifexist" />if&nbsp;exist&nbsp;&nbsp;&nbsp;
<input type="radio" name="cachePolicy" value="cacheonly" />cache&nbsp;only
</td>
<td>
</span></span>
<input type="checkbox" name="storeHTCache" id="storeHTCache" #(storeHTCacheChecked)#::checked="checked"#(/storeHTCacheChecked)# />
</dd>
<dt><label for="mustmatch">Policy for usage of Web Cache</label></dt>
<dd>
<span class="info" style="float:right"><img src="/env/grafics/i16.gif" width="16" height="16" alt="info"/><span style="right:0px;">
The caching policy states when to use the cache during crawling:
<b>no&nbsp;cache</b>: never use the cache, all content from fresh internet source;
<b>if&nbsp;fresh</b>: use the cache if the cache exists and is fresh using the proxy-fresh rules;
<b>if&nbsp;exist</b>: use the cache if the cache exists. Do not check freshness. Otherwise use online source;
<b>cache&nbsp;only</b>: never go online, use all content from cache. If no cache exist, treat content as unavailable
</td>
</tr>
<tr valign="top" class="TableCellLight">
<td>Do Local Indexing:</td>
<td>
</span></span>
<input type="radio" name="cachePolicy" value="nocache" />no&nbsp;cache&nbsp;&nbsp;&nbsp;
<input type="radio" name="cachePolicy" value="iffresh" checked="checked" />if&nbsp;fresh&nbsp;&nbsp;&nbsp;
<input type="radio" name="cachePolicy" value="ifexist" />if&nbsp;exist&nbsp;&nbsp;&nbsp;
<input type="radio" name="cachePolicy" value="cacheonly" />cache&nbsp;only
</dd>
</dl>
</fieldset>
<fieldset>
<legend><label>Index Administration</label></legend>
<dl>
<dt>Do Local Indexing</dt>
<dd>
<span class="info" style="float:right"><img src="/env/grafics/i16.gif" width="16" height="16" alt="info"/><span style="right:0px;">
This enables indexing of the webpages the crawler will download. This should be switched on by default, unless you want to crawl only to fill the
Document Cache without indexing.
</span></span>
<label for="indexText">index text</label>:
<input type="checkbox" name="indexText" id="indexText" #(indexingTextChecked)#::checked="checked"#(/indexingTextChecked)# />&nbsp;&nbsp;&nbsp;
<label for="indexMedia">index media</label>:
<input type="checkbox" name="indexMedia" id="indexMedia" #(indexingMediaChecked)#::checked="checked"#(/indexingMediaChecked)# />
</td>
<td>
This enables indexing of the wepages the crawler will download. This should be switched on by default, unless you want to crawl only to fill the
Document Cache without indexing.
</td>
</tr>
<tr valign="top" class="TableCellDark">
<td><label for="crawlOrder">Do Remote Indexing</label>:</td>
<td>
</dd>
<dt><label for="crawlOrder">Do Remote Indexing</label></dt>
<dd>
<span class="info" style="float:right"><img src="/env/grafics/i16.gif" width="16" height="16" alt="info"/><span style="right:0px;">
If checked, the crawler will contact other peers and use them as remote indexers for your crawl.
If you need your crawling results locally, you should switch this off.
Only senior and principal peers can initiate or receive remote crawls.
<strong>A YaCyNews message will be created to inform all peers about a global crawl</strong>,
so they can omit starting a crawl with the same start point.
</span></span>
<table border="0" cellpadding="2" cellspacing="0">
<tr>
<td>
@ -288,28 +296,23 @@
</td>
</tr>
</table>
</td>
<td>
If checked, the crawler will contact other peers and use them as remote indexers for your crawl.
If you need your crawling results locally, you should switch this off.
Only senior and principal peers can initiate or receive remote crawls.
<strong>A YaCyNews message will be created to inform all peers about a global crawl</strong>,
so they can omit starting a crawl with the same start point.
</td>
</tr>
<tr valign="top" class="TableCellLight">
<td><label for="collection">Add Crawl result to collection(s)</label>:</td>
<td>
</dd>
<dt><label for="collection">Add Crawl result to collection(s)</label></dt>
<dd>
<span class="info" style="float:right"><img src="/env/grafics/i16.gif" width="16" height="16" alt="info"/><span style="right:0px;">
A crawl result can be tagged with names which are candidates for a collection request.
These tags can be selected with the <a href="/gsa/search?q=www&site=#[collection]#">GSA interface</a> using the 'site' operator.
To use this option, the 'collection_sxt'-field must be switched on in the <a href="/IndexFederated_p.html">Solr Schema</a>
</span></span>
<input name="collection" id="collection" type="text" size="60" maxlength="100" value="#[collection]#" #(collectionEnabled)#disabled="disabled"::#(/collectionEnabled)# />
</td>
<td>
A crawl result can be tagged with names which are candidates for a collection request. These tags can be selected with the <a href="/gsa/search?q=www&site=#[collection]#">GSA interface</a> using the 'site' operator. To use this option, the 'collection_sxt'-field must be switched on in the <a href="/IndexFederated_p.html">Solr Schema</a>
</td>
</tr>
<tr valign="top" class="TableCellSummary">
<td colspan="5"><input type="submit" name="crawlingstart" value="Start New Crawl" class="submitready"/></td>
</tr>
</table>
</dd>
</dl>
</fieldset>
<dt><input type="submit" name="crawlingstart" value="Start New Crawl Job" class="submitready"/></dt><dd></dd>
</dl>
</fieldset>
</form>
#%env/templates/footer.template%#

View File

@ -49,6 +49,8 @@ public class CrawlStartExpert_p {
prop.put("mustnotmatch", CrawlProfile.MATCH_NEVER_STRING);
prop.put("indexmustmatch", CrawlProfile.MATCH_ALL_STRING);
prop.put("indexmustnotmatch", CrawlProfile.MATCH_NEVER_STRING);
prop.put("indexcontentmustmatch", CrawlProfile.MATCH_ALL_STRING);
prop.put("indexcontentmustnotmatch", CrawlProfile.MATCH_NEVER_STRING);
prop.put("ipMustmatch", sb.getConfig("crawlingIPMustMatch", CrawlProfile.MATCH_ALL_STRING));
prop.put("ipMustnotmatch", sb.getConfig("crawlingIPMustNotMatch", CrawlProfile.MATCH_NEVER_STRING));
prop.put("countryMustMatch", sb.getConfig("crawlingCountryMustMatch", ""));

View File

@ -212,6 +212,8 @@ public class Crawler_p {
String crawlerNoDepthLimitMatch = post.get("crawlingDepthExtension", CrawlProfile.MATCH_NEVER_STRING);
final String indexUrlMustMatch = post.get("indexmustmatch", CrawlProfile.MATCH_ALL_STRING);
final String indexUrlMustNotMatch = post.get("indexmustnotmatch", CrawlProfile.MATCH_NEVER_STRING);
final String indexContentMustMatch = post.get("indexcontentmustmatch", CrawlProfile.MATCH_ALL_STRING);
final String indexContentMustNotMatch = post.get("indexcontentmustnotmatch", CrawlProfile.MATCH_NEVER_STRING);
final boolean crawlOrder = post.get("crawlOrder", "off").equals("on");
env.setConfig("crawlOrder", crawlOrder);
@ -352,6 +354,8 @@ public class Crawler_p {
crawlerNoDepthLimitMatch,
indexUrlMustMatch,
indexUrlMustNotMatch,
indexContentMustMatch,
indexContentMustNotMatch,
newcrawlingdepth,
directDocByURL,
crawlingIfOlder,

View File

@ -135,10 +135,12 @@ public class QuickCrawlLink_p {
crawlingMustNotMatch, //crawlerUrlMustNotMatch
CrawlProfile.MATCH_ALL_STRING, //crawlerIpMustMatch
CrawlProfile.MATCH_NEVER_STRING, //crawlerIpMustNotMatch
"", //crawlerCountryMustMatch
CrawlProfile.MATCH_NEVER_STRING, //crawlerCountryMustMatch
CrawlProfile.MATCH_NEVER_STRING, //crawlerNoDepthLimitMatch
CrawlProfile.MATCH_ALL_STRING, //indexUrlMustMatch
CrawlProfile.MATCH_NEVER_STRING, //indexUrlMustNotMatch
CrawlProfile.MATCH_ALL_STRING, //indexContentMustMatch
CrawlProfile.MATCH_NEVER_STRING, //indexContentMustNotMatch
CrawlingDepth,
true,
60 * 24 * 30, // recrawlIfOlder (minutes); here: one month

4
htroot/env/base.css vendored
View File

@ -97,7 +97,7 @@ td {
fieldset {
margin:10px 5px;
padding:10px;
padding:2px 10px 2px 10px;
}
legend {
@ -1009,7 +1009,7 @@ div#info:hover span {
padding: 3px;
color: #000000;
background: #DDDDDD;
text-align: center;
text-align: left;
border: 1px dashed black;
z-index: 100;
}

View File

@ -239,10 +239,12 @@ public final class CrawlSwitchboard {
CrawlProfile.MATCH_NEVER_STRING, //crawlerUrlMustNotMatch
CrawlProfile.MATCH_ALL_STRING, //crawlerIpMustMatch
CrawlProfile.MATCH_NEVER_STRING, //crawlerIpMustNotMatch
"", //crawlerCountryMustMatch
CrawlProfile.MATCH_NEVER_STRING, //crawlerCountryMustMatch
CrawlProfile.MATCH_NEVER_STRING, //crawlerNoDepthLimitMatch
CrawlProfile.MATCH_ALL_STRING, //indexUrlMustMatch
CrawlProfile.MATCH_NEVER_STRING, //indexUrlMustNotMatch
CrawlProfile.MATCH_ALL_STRING, //indexContentMustMatch
CrawlProfile.MATCH_NEVER_STRING, //indexContentMustNotMatch
0 /*Integer.parseInt(getConfig(PROXY_PREFETCH_DEPTH, "0"))*/,
true,
CrawlProfile.getRecrawlDate(CRAWL_PROFILE_PROXY_RECRAWL_CYCLE),
@ -265,10 +267,12 @@ public final class CrawlSwitchboard {
CrawlProfile.MATCH_NEVER_STRING, //crawlerUrlMustNotMatch
CrawlProfile.MATCH_ALL_STRING, //crawlerIpMustMatch
CrawlProfile.MATCH_NEVER_STRING, //crawlerIpMustNotMatch
"", //crawlerCountryMustMatch
CrawlProfile.MATCH_NEVER_STRING, //crawlerCountryMustMatch
CrawlProfile.MATCH_NEVER_STRING, //crawlerNoDepthLimitMatch
CrawlProfile.MATCH_ALL_STRING, //indexUrlMustMatch
CrawlProfile.MATCH_NEVER_STRING, //indexUrlMustNotMatch
CrawlProfile.MATCH_ALL_STRING, //indexContentMustMatch
CrawlProfile.MATCH_NEVER_STRING, //indexContentMustNotMatch
0,
false,
-1,
@ -291,10 +295,12 @@ public final class CrawlSwitchboard {
CrawlProfile.MATCH_NEVER_STRING, //crawlerUrlMustNotMatch
CrawlProfile.MATCH_ALL_STRING, //crawlerIpMustMatch
CrawlProfile.MATCH_NEVER_STRING, //crawlerIpMustNotMatch
"", //crawlerCountryMustMatch
CrawlProfile.MATCH_NEVER_STRING, //crawlerCountryMustMatch
CrawlProfile.MATCH_NEVER_STRING, //crawlerNoDepthLimitMatch
CrawlProfile.MATCH_ALL_STRING, //indexUrlMustMatch
CrawlProfile.MATCH_NEVER_STRING, //indexUrlMustNotMatch
CrawlProfile.MATCH_ALL_STRING, //indexContentMustMatch
CrawlProfile.MATCH_NEVER_STRING, //indexContentMustNotMatch
0,
false,
CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SNIPPET_LOCAL_TEXT_RECRAWL_CYCLE),
@ -317,10 +323,12 @@ public final class CrawlSwitchboard {
CrawlProfile.MATCH_NEVER_STRING, //crawlerUrlMustNotMatch
CrawlProfile.MATCH_ALL_STRING, //crawlerIpMustMatch
CrawlProfile.MATCH_NEVER_STRING, //crawlerIpMustNotMatch
"", //crawlerCountryMustMatch
CrawlProfile.MATCH_NEVER_STRING, //crawlerCountryMustMatch
CrawlProfile.MATCH_NEVER_STRING, //crawlerNoDepthLimitMatch
CrawlProfile.MATCH_ALL_STRING, //indexUrlMustMatch
CrawlProfile.MATCH_NEVER_STRING, //indexUrlMustNotMatch
CrawlProfile.MATCH_ALL_STRING, //indexContentMustMatch
CrawlProfile.MATCH_NEVER_STRING, //indexContentMustNotMatch
0,
false,
CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT_RECRAWL_CYCLE),
@ -344,10 +352,12 @@ public final class CrawlSwitchboard {
CrawlProfile.MATCH_NEVER_STRING, //crawlerUrlMustNotMatch
CrawlProfile.MATCH_ALL_STRING, //crawlerIpMustMatch
CrawlProfile.MATCH_NEVER_STRING, //crawlerIpMustNotMatch
"", //crawlerCountryMustMatch
CrawlProfile.MATCH_NEVER_STRING, //crawlerCountryMustMatch
CrawlProfile.MATCH_NEVER_STRING, //crawlerNoDepthLimitMatch
CrawlProfile.MATCH_ALL_STRING, //indexUrlMustMatch
CrawlProfile.MATCH_NEVER_STRING, //indexUrlMustNotMatch
CrawlProfile.MATCH_ALL_STRING, //indexContentMustMatch
CrawlProfile.MATCH_NEVER_STRING, //indexContentMustNotMatch
0,
false,
CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA_RECRAWL_CYCLE),
@ -370,10 +380,12 @@ public final class CrawlSwitchboard {
CrawlProfile.MATCH_NEVER_STRING, //crawlerUrlMustNotMatch
CrawlProfile.MATCH_ALL_STRING, //crawlerIpMustMatch
CrawlProfile.MATCH_NEVER_STRING, //crawlerIpMustNotMatch
"", //crawlerCountryMustMatch
CrawlProfile.MATCH_NEVER_STRING, //crawlerCountryMustMatch
CrawlProfile.MATCH_NEVER_STRING, //crawlerNoDepthLimitMatch
CrawlProfile.MATCH_ALL_STRING, //indexUrlMustMatch
CrawlProfile.MATCH_NEVER_STRING, //indexUrlMustNotMatch
CrawlProfile.MATCH_ALL_STRING, //indexContentMustMatch
CrawlProfile.MATCH_NEVER_STRING, //indexContentMustNotMatch
0,
false,
CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA_RECRAWL_CYCLE),
@ -396,10 +408,12 @@ public final class CrawlSwitchboard {
CrawlProfile.MATCH_NEVER_STRING, //crawlerUrlMustNotMatch
CrawlProfile.MATCH_ALL_STRING, //crawlerIpMustMatch
CrawlProfile.MATCH_NEVER_STRING, //crawlerIpMustNotMatch
"", //crawlerCountryMustMatch
CrawlProfile.MATCH_NEVER_STRING, //crawlerCountryMustMatch
CrawlProfile.MATCH_NEVER_STRING, //crawlerNoDepthLimitMatch
CrawlProfile.MATCH_ALL_STRING, //indexUrlMustMatch
CrawlProfile.MATCH_NEVER_STRING, //indexUrlMustNotMatch
CrawlProfile.MATCH_ALL_STRING, //indexContentMustMatch
CrawlProfile.MATCH_NEVER_STRING, //indexContentMustNotMatch
0,
false,
CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SURROGATE_RECRAWL_CYCLE),

View File

@ -76,11 +76,14 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
public static final String CRAWLER_URL_NODEPTHLIMITMATCH = "crawlerNoLimitURLMustMatch";
public static final String INDEXING_URL_MUSTMATCH = "indexURLMustMatch";
public static final String INDEXING_URL_MUSTNOTMATCH = "indexURLMustNotMatch";
public static final String INDEXING_CONTENT_MUSTMATCH = "indexContentMustMatch";
public static final String INDEXING_CONTENT_MUSTNOTMATCH = "indexContentMustNotMatch";
private Pattern crawlerurlmustmatch = null, crawlerurlmustnotmatch = null;
private Pattern crawleripmustmatch = null, crawleripmustnotmatch = null;
private Pattern crawlernodepthlimitmatch = null;
private Pattern indexurlmustmatch = null, indexurlmustnotmatch = null;
private Pattern indexcontentmustmatch = null, indexcontentmustnotmatch = null;
private final Map<String, AtomicInteger> doms;
@ -96,6 +99,8 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
* @param crawlerNoDepthLimitMatch if matches, no depth limit is applied to the crawler
* @param indexUrlMustMatch URLs which do not match this regex will be ignored for indexing
* @param indexUrlMustNotMatch URLs which match this regex will be ignored for indexing
* @param indexContentMustMatch content which does not match this regex will be ignored for indexing
* @param indexContentMustNotMatch content which matches this regex will be ignored for indexing
* @param depth height of the tree which will be created by the crawler
* @param directDocByURL if true, then linked documents that cannot be parsed are indexed as document
* @param recrawlIfOlder documents which have been indexed in the past will
@ -118,6 +123,7 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
final String crawlerIpMustMatch, final String crawlerIpMustNotMatch,
final String crawlerCountryMustMatch, final String crawlerNoDepthLimitMatch,
final String indexUrlMustMatch, final String indexUrlMustNotMatch,
final String indexContentMustMatch, final String indexContentMustNotMatch,
final int depth,
final boolean directDocByURL,
final long recrawlIfOlder /*date*/,
@ -146,6 +152,8 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
put(CRAWLER_URL_NODEPTHLIMITMATCH, (crawlerNoDepthLimitMatch == null) ? CrawlProfile.MATCH_NEVER_STRING : crawlerNoDepthLimitMatch);
put(INDEXING_URL_MUSTMATCH, (indexUrlMustMatch == null) ? CrawlProfile.MATCH_NEVER_STRING : indexUrlMustMatch);
put(INDEXING_URL_MUSTNOTMATCH, (indexUrlMustNotMatch == null) ? CrawlProfile.MATCH_NEVER_STRING : indexUrlMustNotMatch);
put(INDEXING_CONTENT_MUSTMATCH, (indexContentMustMatch == null) ? CrawlProfile.MATCH_NEVER_STRING : indexContentMustMatch);
put(INDEXING_CONTENT_MUSTNOTMATCH, (indexContentMustNotMatch == null) ? CrawlProfile.MATCH_NEVER_STRING : indexContentMustNotMatch);
put(DEPTH, depth);
put(DIRECT_DOC_BY_URL, directDocByURL);
put(RECRAWL_IF_OLDER, recrawlIfOlder);
@ -277,7 +285,7 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
if (this.crawlerurlmustmatch == null) {
final String r = get(CRAWLER_URL_MUSTMATCH);
try {
this.crawlerurlmustmatch = (r == null || r.equals(CrawlProfile.MATCH_ALL_STRING)) ? CrawlProfile.MATCH_ALL_PATTERN : Pattern.compile(r);
this.crawlerurlmustmatch = (r == null || r.equals(CrawlProfile.MATCH_ALL_STRING)) ? CrawlProfile.MATCH_ALL_PATTERN : Pattern.compile(r, Pattern.CASE_INSENSITIVE);
} catch (PatternSyntaxException e) { this.crawlerurlmustmatch = CrawlProfile.MATCH_NEVER_PATTERN; }
}
return this.crawlerurlmustmatch;
@ -291,7 +299,7 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
if (this.crawlerurlmustnotmatch == null) {
final String r = get(CRAWLER_URL_MUSTNOTMATCH);
try {
this.crawlerurlmustnotmatch = (r == null || r.equals(CrawlProfile.MATCH_NEVER_STRING)) ? CrawlProfile.MATCH_NEVER_PATTERN : Pattern.compile(r);
this.crawlerurlmustnotmatch = (r == null || r.equals(CrawlProfile.MATCH_NEVER_STRING)) ? CrawlProfile.MATCH_NEVER_PATTERN : Pattern.compile(r, Pattern.CASE_INSENSITIVE);
} catch (PatternSyntaxException e) { this.crawlerurlmustnotmatch = CrawlProfile.MATCH_NEVER_PATTERN; }
}
return this.crawlerurlmustnotmatch;
@ -305,7 +313,7 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
if (this.crawleripmustmatch == null) {
final String r = get(CRAWLER_IP_MUSTMATCH);
try {
this.crawleripmustmatch = (r == null || r.equals(CrawlProfile.MATCH_ALL_STRING)) ? CrawlProfile.MATCH_ALL_PATTERN : Pattern.compile(r);
this.crawleripmustmatch = (r == null || r.equals(CrawlProfile.MATCH_ALL_STRING)) ? CrawlProfile.MATCH_ALL_PATTERN : Pattern.compile(r, Pattern.CASE_INSENSITIVE);
} catch (PatternSyntaxException e) { this.crawleripmustmatch = CrawlProfile.MATCH_NEVER_PATTERN; }
}
return this.crawleripmustmatch;
@ -319,7 +327,7 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
if (this.crawleripmustnotmatch == null) {
final String r = get(CRAWLER_IP_MUSTNOTMATCH);
try {
this.crawleripmustnotmatch = (r == null || r.equals(CrawlProfile.MATCH_NEVER_STRING)) ? CrawlProfile.MATCH_NEVER_PATTERN : Pattern.compile(r);
this.crawleripmustnotmatch = (r == null || r.equals(CrawlProfile.MATCH_NEVER_STRING)) ? CrawlProfile.MATCH_NEVER_PATTERN : Pattern.compile(r, Pattern.CASE_INSENSITIVE);
} catch (PatternSyntaxException e) { this.crawleripmustnotmatch = CrawlProfile.MATCH_NEVER_PATTERN; }
}
return this.crawleripmustnotmatch;
@ -346,7 +354,7 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
if (this.crawlernodepthlimitmatch == null) {
final String r = get(CRAWLER_URL_NODEPTHLIMITMATCH);
try {
this.crawlernodepthlimitmatch = (r == null || r.equals(CrawlProfile.MATCH_NEVER_STRING)) ? CrawlProfile.MATCH_NEVER_PATTERN : Pattern.compile(r);
this.crawlernodepthlimitmatch = (r == null || r.equals(CrawlProfile.MATCH_NEVER_STRING)) ? CrawlProfile.MATCH_NEVER_PATTERN : Pattern.compile(r, Pattern.CASE_INSENSITIVE);
} catch (PatternSyntaxException e) { this.crawlernodepthlimitmatch = CrawlProfile.MATCH_NEVER_PATTERN; }
}
return this.crawlernodepthlimitmatch;
@ -360,7 +368,7 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
if (this.indexurlmustmatch == null) {
final String r = get(INDEXING_URL_MUSTMATCH);
try {
this.indexurlmustmatch = (r == null || r.equals(CrawlProfile.MATCH_ALL_STRING)) ? CrawlProfile.MATCH_ALL_PATTERN : Pattern.compile(r);
this.indexurlmustmatch = (r == null || r.equals(CrawlProfile.MATCH_ALL_STRING)) ? CrawlProfile.MATCH_ALL_PATTERN : Pattern.compile(r, Pattern.CASE_INSENSITIVE);
} catch (PatternSyntaxException e) { this.indexurlmustmatch = CrawlProfile.MATCH_NEVER_PATTERN; }
}
return this.indexurlmustmatch;
@ -374,12 +382,40 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
if (this.indexurlmustnotmatch == null) {
final String r = get(INDEXING_URL_MUSTNOTMATCH);
try {
this.indexurlmustnotmatch = (r == null || r.equals(CrawlProfile.MATCH_NEVER_STRING)) ? CrawlProfile.MATCH_NEVER_PATTERN : Pattern.compile(r);
this.indexurlmustnotmatch = (r == null || r.equals(CrawlProfile.MATCH_NEVER_STRING)) ? CrawlProfile.MATCH_NEVER_PATTERN : Pattern.compile(r, Pattern.CASE_INSENSITIVE);
} catch (PatternSyntaxException e) { this.indexurlmustnotmatch = CrawlProfile.MATCH_NEVER_PATTERN; }
}
return this.indexurlmustnotmatch;
}
/**
 * Gets the compiled regex which the content of a document must match
 * in order to be indexed. The pattern is compiled case-insensitively on
 * the first call and then kept in a field for subsequent calls.
 * If the stored expression is missing or equals the match-all string,
 * the shared match-all pattern is used; if the expression cannot be
 * compiled, the match-never pattern is used instead.
 * @return pattern which must be matched by the document content
 */
public Pattern indexContentMustMatchPattern() {
    if (this.indexcontentmustmatch == null) {
        final String regex = get(INDEXING_CONTENT_MUSTMATCH);
        Pattern compiled;
        if (regex == null || regex.equals(CrawlProfile.MATCH_ALL_STRING)) {
            // no restriction configured: use the shared match-all instance
            compiled = CrawlProfile.MATCH_ALL_PATTERN;
        } else {
            try {
                compiled = Pattern.compile(regex, Pattern.CASE_INSENSITIVE);
            } catch (PatternSyntaxException e) {
                // an invalid must-match expression falls back to the match-never pattern
                compiled = CrawlProfile.MATCH_NEVER_PATTERN;
            }
        }
        this.indexcontentmustmatch = compiled;
    }
    return this.indexcontentmustmatch;
}
/**
 * Gets the compiled regex which the content of a document must not match
 * in order to be indexed. The pattern is compiled case-insensitively on
 * the first call and then kept in a field for subsequent calls.
 * If the stored expression is missing, equals the match-never string,
 * or cannot be compiled, the shared match-never pattern is used.
 * @return pattern which must not be matched by the document content
 */
public Pattern indexContentMustNotMatchPattern() {
    if (this.indexcontentmustnotmatch == null) {
        final String regex = get(INDEXING_CONTENT_MUSTNOTMATCH);
        Pattern compiled;
        if (regex == null || regex.equals(CrawlProfile.MATCH_NEVER_STRING)) {
            // no exclusion configured: use the shared match-never instance
            compiled = CrawlProfile.MATCH_NEVER_PATTERN;
        } else {
            try {
                compiled = Pattern.compile(regex, Pattern.CASE_INSENSITIVE);
            } catch (PatternSyntaxException e) {
                // an invalid must-not-match expression falls back to the match-never pattern
                compiled = CrawlProfile.MATCH_NEVER_PATTERN;
            }
        }
        this.indexcontentmustnotmatch = compiled;
    }
    return this.indexcontentmustnotmatch;
}
/**
* Gets depth of crawl job (or height of the tree which will be
* created by the crawler).

View File

@ -172,10 +172,12 @@ public class YMarkCrawlStart extends HashMap<String,String>{
urlMustNotMatch,
CrawlProfile.MATCH_ALL_STRING,
CrawlProfile.MATCH_NEVER_STRING,
"",
CrawlProfile.MATCH_NEVER_STRING,
CrawlProfile.MATCH_ALL_STRING,
CrawlProfile.MATCH_NEVER_STRING,
CrawlProfile.MATCH_NEVER_STRING,
CrawlProfile.MATCH_ALL_STRING,
CrawlProfile.MATCH_NEVER_STRING,
CrawlProfile.MATCH_ALL_STRING,
CrawlProfile.MATCH_NEVER_STRING,
depth,
medialink,
CrawlProfile.getRecrawlDate(CrawlSwitchboard.CRAWL_PROFILE_PROXY_RECRAWL_CYCLE),

View File

@ -343,20 +343,19 @@ dc_rights
public String getTextString() {
try {
if (this.text == null) return "";
if (this.text instanceof String) {
return (String) this.text;
if (this.text == null) {
this.text = "";
} else if (this.text instanceof InputStream) {
return UTF8.String(FileUtils.read((InputStream) this.text));
this.text = UTF8.String(FileUtils.read((InputStream) this.text));
} else if (this.text instanceof File) {
return UTF8.String(FileUtils.read((File) this.text));
this.text = UTF8.String(FileUtils.read((File) this.text));
} else if (this.text instanceof byte[]) {
return UTF8.String((byte[]) this.text);
this.text = UTF8.String((byte[]) this.text);
} else if (this.text instanceof ByteArrayOutputStream) {
return UTF8.String(((ByteArrayOutputStream) this.text).toByteArray());
this.text = UTF8.String(((ByteArrayOutputStream) this.text).toByteArray());
}
assert false : this.text.getClass().toString();
return null;
assert this.text instanceof String : this.text.getClass().toString();
return (String) this.text;
} catch (final Exception e) {
Log.logException(e);
}

View File

@ -2555,17 +2555,24 @@ public final class Switchboard extends serverSwitch {
if (this.log.isInfo()) this.log.logInfo("Not Condensed Resource '" + urls + "': indexing of this media type not wanted by crawl profile");
return new IndexingQueueEntry(in.queueEntry, in.documents, null);
}
if (!profile.indexUrlMustMatchPattern().matcher(urls).matches() ||
profile.indexUrlMustNotMatchPattern().matcher(urls).matches() ) {
if (!(profile.indexUrlMustMatchPattern() == CrawlProfile.MATCH_ALL_PATTERN || profile.indexUrlMustMatchPattern().matcher(urls).matches()) ||
(profile.indexUrlMustNotMatchPattern() != CrawlProfile.MATCH_NEVER_PATTERN && profile.indexUrlMustNotMatchPattern().matcher(urls).matches())) {
if (this.log.isInfo()) this.log.logInfo("Not Condensed Resource '" + urls + "': indexing prevented by regular expression on url; indexUrlMustMatchPattern = " + profile.indexUrlMustMatchPattern().pattern() + ", indexUrlMustNotMatchPattern = " + profile.indexUrlMustNotMatchPattern().pattern());
addURLtoErrorDB(
in.queueEntry.url(),
in.queueEntry.referrerHash(),
in.queueEntry.initiator(),
in.queueEntry.name(),
FailCategory.FINAL_PROCESS_CONTEXT,
"indexing prevented by regular expression on url; indexUrlMustMatchPattern = " + profile.indexUrlMustMatchPattern().pattern() + ", indexUrlMustNotMatchPattern = " + profile.indexUrlMustNotMatchPattern().pattern());
return new IndexingQueueEntry(in.queueEntry, in.documents, null);
}
// check which files may take part in the indexing process
final List<Document> doclist = new ArrayList<Document>();
for ( final Document document : in.documents ) {
if ( document.indexingDenied() ) {
if ( this.log.isInfo() ) this.log.logInfo("Not Condensed Resource '" + urls + "': denied by document-attached noindexing rule");
docloop: for (final Document document : in.documents) {
if (document.indexingDenied()) {
if (this.log.isInfo()) this.log.logInfo("Not Condensed Resource '" + urls + "': denied by document-attached noindexing rule");
addURLtoErrorDB(
in.queueEntry.url(),
in.queueEntry.referrerHash(),
@ -2573,7 +2580,19 @@ public final class Switchboard extends serverSwitch {
in.queueEntry.name(),
FailCategory.FINAL_PROCESS_CONTEXT,
"denied by document-attached noindexing rule");
continue;
continue docloop;
}
if (!(profile.indexContentMustMatchPattern() == CrawlProfile.MATCH_ALL_PATTERN || profile.indexContentMustMatchPattern().matcher(document.getTextString()).matches()) ||
(profile.indexContentMustNotMatchPattern() != CrawlProfile.MATCH_NEVER_PATTERN && profile.indexContentMustNotMatchPattern().matcher(document.getTextString()).matches())) {
if (this.log.isInfo()) this.log.logInfo("Not Condensed Resource '" + urls + "': indexing prevented by regular expression on content; indexContentMustMatchPattern = " + profile.indexContentMustMatchPattern().pattern() + ", indexContentMustNotMatchPattern = " + profile.indexContentMustNotMatchPattern().pattern());
addURLtoErrorDB(
in.queueEntry.url(),
in.queueEntry.referrerHash(),
in.queueEntry.initiator(),
in.queueEntry.name(),
FailCategory.FINAL_PROCESS_CONTEXT,
"indexing prevented by regular expression on content; indexContentMustMatchPattern = " + profile.indexContentMustMatchPattern().pattern() + ", indexContentMustNotMatchPattern = " + profile.indexContentMustNotMatchPattern().pattern());
continue docloop;
}
doclist.add(document);
}