mirror of
https://github.com/yacy/yacy_search_server.git
synced 2024-09-19 00:01:41 +02:00
added canonical filter
attention: this is on by default! (it should do the right thing)
This commit is contained in:
parent
5a52b01c09
commit
9fcd8f1bda
|
@ -1,48 +1,49 @@
|
|||
<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
|
||||
<crawlProfiles>
|
||||
#{crawlProfiles}#
|
||||
<crawlProfile>
|
||||
<handle>#[handle]#</handle>
|
||||
<name>#[name]#</name>
|
||||
<collections>#[collections]#</collections>
|
||||
<agentName>#[agentName]#</agentName>
|
||||
<userAgent>#[userAgent]#</userAgent>
|
||||
<depth>#[depth]#</depth>
|
||||
<directDocByURL>#(directDocByURL)#false::true#(/directDocByURL)#</directDocByURL>
|
||||
<recrawlIfOlder>#[recrawlIfOlder]#</recrawlIfOlder>
|
||||
<domMaxPages>#[domMaxPages]#</domMaxPages>
|
||||
<crawlingQ>#(crawlingQ)#false::true#(/crawlingQ)#</crawlingQ>
|
||||
<followFrames>#(followFrames)#false::true#(/followFrames)#</followFrames>
|
||||
<obeyHtmlRobotsNoindex>#(obeyHtmlRobotsNoindex)#false::true#(/obeyHtmlRobotsNoindex)#</obeyHtmlRobotsNoindex>
|
||||
<obeyHtmlRobotsNofollow>#(obeyHtmlRobotsNofollow)#false::true#(/obeyHtmlRobotsNofollow)#</obeyHtmlRobotsNofollow>
|
||||
<indexText>#(indexText)#false::true#(/indexText)#</indexText>
|
||||
<indexMedia>#(indexMedia)#false::true#(/indexMedia)#</indexMedia>
|
||||
<storeHTCache>#(storeHTCache)#false::true#(/storeHTCache)#</storeHTCache>
|
||||
<remoteIndexing>#(remoteIndexing)#false::true#(/remoteIndexing)#</remoteIndexing>
|
||||
<cacheStrategy>#[cacheStrategy]#</cacheStrategy>
|
||||
<crawlerAlwaysCheckMediaType>#(crawlerAlwaysCheckMediaType)#false::true#(/crawlerAlwaysCheckMediaType)#</crawlerAlwaysCheckMediaType>
|
||||
<crawlerURLMustMatch>#[crawlerURLMustMatch]#</crawlerURLMustMatch>
|
||||
<crawlerURLMustNotMatch>#[crawlerURLMustNotMatch]#</crawlerURLMustNotMatch>
|
||||
<crawlerOriginURLMustMatch>#[crawlerOriginURLMustMatch]#</crawlerOriginURLMustMatch>
|
||||
<crawlerOriginURLMustNotMatch>#[crawlerOriginURLMustNotMatch]#</crawlerOriginURLMustNotMatch>
|
||||
<crawlerIPMustMatch>#[crawlerIPMustMatch]#</crawlerIPMustMatch>
|
||||
<crawlerIPMustNotMatch>#[crawlerIPMustNotMatch]#</crawlerIPMustNotMatch>
|
||||
<crawlerCountryMustMatch>#[crawlerCountryMustMatch]#</crawlerCountryMustMatch>
|
||||
<crawlerNoLimitURLMustMatch>#[crawlerNoLimitURLMustMatch]#</crawlerNoLimitURLMustMatch>
|
||||
<indexURLMustMatch>#[indexURLMustMatch]#</indexURLMustMatch>
|
||||
<indexURLMustNotMatch>#[indexURLMustNotMatch]#</indexURLMustNotMatch>
|
||||
<indexContentMustMatch>#[indexContentMustMatch]#</indexContentMustMatch>
|
||||
<indexContentMustNotMatch>#[indexContentMustNotMatch]#</indexContentMustNotMatch>
|
||||
<indexMediaTypeMustMatch>#[indexMediaTypeMustMatch]#</indexMediaTypeMustMatch>
|
||||
<indexMediaTypeMustNotMatch>#[indexMediaTypeMustNotMatch]#</indexMediaTypeMustNotMatch>
|
||||
<indexSolrQueryMustMatch>#[indexSolrQueryMustMatch]#</indexSolrQueryMustMatch>
|
||||
<indexSolrQueryMustNotMatch>#[indexSolrQueryMustNotMatch]#</indexSolrQueryMustNotMatch>
|
||||
<status>#(status)#terminated::active::system#(/status)#</status>
|
||||
<crawlingDomFilterContent>
|
||||
#{crawlingDomFilterContent}#
|
||||
<item>#[item]#</item>
|
||||
#{/crawlingDomFilterContent}#
|
||||
</crawlingDomFilterContent>
|
||||
</crawlProfile>
|
||||
<crawlProfile>
|
||||
<handle>#[handle]#</handle>
|
||||
<name>#[name]#</name>
|
||||
<collections>#[collections]#</collections>
|
||||
<agentName>#[agentName]#</agentName>
|
||||
<userAgent>#[userAgent]#</userAgent>
|
||||
<depth>#[depth]#</depth>
|
||||
<directDocByURL>#(directDocByURL)#false::true#(/directDocByURL)#</directDocByURL>
|
||||
<recrawlIfOlder>#[recrawlIfOlder]#</recrawlIfOlder>
|
||||
<domMaxPages>#[domMaxPages]#</domMaxPages>
|
||||
<crawlingQ>#(crawlingQ)#false::true#(/crawlingQ)#</crawlingQ>
|
||||
<followFrames>#(followFrames)#false::true#(/followFrames)#</followFrames>
|
||||
<obeyHtmlRobotsNoindex>#(obeyHtmlRobotsNoindex)#false::true#(/obeyHtmlRobotsNoindex)#</obeyHtmlRobotsNoindex>
|
||||
<obeyHtmlRobotsNofollow>#(obeyHtmlRobotsNofollow)#false::true#(/obeyHtmlRobotsNofollow)#</obeyHtmlRobotsNofollow>
|
||||
<indexText>#(indexText)#false::true#(/indexText)#</indexText>
|
||||
<indexMedia>#(indexMedia)#false::true#(/indexMedia)#</indexMedia>
|
||||
<storeHTCache>#(storeHTCache)#false::true#(/storeHTCache)#</storeHTCache>
|
||||
<remoteIndexing>#(remoteIndexing)#false::true#(/remoteIndexing)#</remoteIndexing>
|
||||
<cacheStrategy>#[cacheStrategy]#</cacheStrategy>
|
||||
<crawlerAlwaysCheckMediaType>#(crawlerAlwaysCheckMediaType)#false::true#(/crawlerAlwaysCheckMediaType)#</crawlerAlwaysCheckMediaType>
|
||||
<crawlerURLMustMatch>#[crawlerURLMustMatch]#</crawlerURLMustMatch>
|
||||
<crawlerURLMustNotMatch>#[crawlerURLMustNotMatch]#</crawlerURLMustNotMatch>
|
||||
<crawlerOriginURLMustMatch>#[crawlerOriginURLMustMatch]#</crawlerOriginURLMustMatch>
|
||||
<crawlerOriginURLMustNotMatch>#[crawlerOriginURLMustNotMatch]#</crawlerOriginURLMustNotMatch>
|
||||
<crawlerIPMustMatch>#[crawlerIPMustMatch]#</crawlerIPMustMatch>
|
||||
<crawlerIPMustNotMatch>#[crawlerIPMustNotMatch]#</crawlerIPMustNotMatch>
|
||||
<crawlerCountryMustMatch>#[crawlerCountryMustMatch]#</crawlerCountryMustMatch>
|
||||
<crawlerNoLimitURLMustMatch>#[crawlerNoLimitURLMustMatch]#</crawlerNoLimitURLMustMatch>
|
||||
<indexURLMustMatch>#[indexURLMustMatch]#</indexURLMustMatch>
|
||||
<indexURLMustNotMatch>#[indexURLMustNotMatch]#</indexURLMustNotMatch>
|
||||
<indexContentMustMatch>#[indexContentMustMatch]#</indexContentMustMatch>
|
||||
<indexContentMustNotMatch>#[indexContentMustNotMatch]#</indexContentMustNotMatch>
|
||||
<indexMediaTypeMustMatch>#[indexMediaTypeMustMatch]#</indexMediaTypeMustMatch>
|
||||
<indexMediaTypeMustNotMatch>#[indexMediaTypeMustNotMatch]#</indexMediaTypeMustNotMatch>
|
||||
<indexSolrQueryMustMatch>#[indexSolrQueryMustMatch]#</indexSolrQueryMustMatch>
|
||||
<indexSolrQueryMustNotMatch>#[indexSolrQueryMustNotMatch]#</indexSolrQueryMustNotMatch>
|
||||
<noindexWhenCanonicalUnequalURL>#(noindexWhenCanonicalUnequalURL)#false::true#(/noindexWhenCanonicalUnequalURL)#</noindexWhenCanonicalUnequalURL>
|
||||
<status>#(status)#terminated::active::system#(/status)#</status>
|
||||
<crawlingDomFilterContent>
|
||||
#{crawlingDomFilterContent}#
|
||||
<item>#[item]#</item>
|
||||
#{/crawlingDomFilterContent}#
|
||||
</crawlingDomFilterContent>
|
||||
</crawlProfile>
|
||||
#{/crawlProfiles}#
|
||||
</crawlProfiles>
|
||||
|
|
|
@ -412,6 +412,9 @@
|
|||
<table style="border-width: 0px">
|
||||
<tr><td style="width:110px"><img src="env/grafics/plus.gif" alt=""> must-match</td><td><input name="indexmustmatch" id="indexmustmatch" type="text" size="55" maxlength="100000" value="#[indexmustmatch]#" onblur="if (this.value=='') this.value='.*';"/> (must not be empty)</td></tr>
|
||||
<tr><td><img src="env/grafics/minus.gif" alt=""> must-not-match</td><td><input name="indexmustnotmatch" id="indexmustnotmatch" type="text" size="55" maxlength="100000" value="#[indexmustnotmatch]#" /></td></tr>
|
||||
<tr>
|
||||
<td colspan="2"><input type="checkbox" name="noindexWhenCanonicalUnequalURL" id="noindexWhenCanonicalUnequalURL" #(noindexWhenCanonicalUnequalURLChecked)#::checked="checked"#(/noindexWhenCanonicalUnequalURLChecked)#/> No Indexing when Canonical present and Canonical != URL</td>
|
||||
</tr>
|
||||
</table>
|
||||
</dd>
|
||||
<dt>Filter on Content of Document<br/>(all visible text, including camel-case-tokenized url and title)</dt>
|
||||
|
@ -470,7 +473,7 @@
|
|||
<tr>
|
||||
<td style="width:110px"><img src="env/grafics/minus.gif" alt=""> must-not-match</td>
|
||||
<td>
|
||||
<input name="indexSolrQueryMustNotMatch" id="indexSolrQueryMustNotMatch" type="text" size="55" maxlength="100000" value="#[indexSolrQueryMustNotMatch]#" aria-describedby="indexSolrQueryInfo" />
|
||||
<input name="indexSolrQueryMustNotMatch" id="indexSolrQueryMustNotMatch" type="text" size="55" maxlength="100000" value="#[indexSolrQueryMustNotMatch]#" aria-describedby="indexSolrQueryInfo" enabled="false"/>
|
||||
</td>
|
||||
</tr>
|
||||
#(/embeddedSolrConnected)#
|
||||
|
|
|
@ -488,7 +488,7 @@ public final class CrawlStacker implements WorkflowTask<Request>{
|
|||
// check if ip is local ip address
|
||||
final String urlRejectReason = this.urlInAcceptedDomain(url);
|
||||
if (urlRejectReason != null) {
|
||||
if (CrawlStacker.log.isFine()) CrawlStacker.log.fine("denied_(" + urlRejectReason + ")");
|
||||
if (CrawlStacker.log.isFine()) CrawlStacker.log.fine("URL not in accepted Domain (" + urlRejectReason + ")");
|
||||
return "denied_(" + urlRejectReason + ")";
|
||||
}
|
||||
|
||||
|
|
|
@ -294,6 +294,7 @@ public final class CrawlSwitchboard {
|
|||
CrawlProfile.MATCH_NEVER_STRING, //indexUrlMustNotMatch
|
||||
CrawlProfile.MATCH_ALL_STRING, //indexContentMustMatch
|
||||
CrawlProfile.MATCH_NEVER_STRING, //indexContentMustNotMatch
|
||||
true, //noindexWhenCanonicalUnequalURL
|
||||
Integer.parseInt(sb.getConfig(SwitchboardConstants.AUTOCRAWL_DEEP_DEPTH, "3")),
|
||||
true,
|
||||
CrawlProfile.getRecrawlDate(Integer.parseInt(sb.getConfig(SwitchboardConstants.AUTOCRAWL_DAYS, "1"))*1440),
|
||||
|
@ -328,6 +329,7 @@ public final class CrawlSwitchboard {
|
|||
CrawlProfile.MATCH_NEVER_STRING, //indexUrlMustNotMatch
|
||||
CrawlProfile.MATCH_ALL_STRING, //indexContentMustMatch
|
||||
CrawlProfile.MATCH_NEVER_STRING, //indexContentMustNotMatch
|
||||
true, //noindexWhenCanonicalUnequalURL
|
||||
Integer.parseInt(sb.getConfig(SwitchboardConstants.AUTOCRAWL_SHALLOW_DEPTH, "1")),
|
||||
true,
|
||||
CrawlProfile.getRecrawlDate(Integer.parseInt(sb.getConfig(SwitchboardConstants.AUTOCRAWL_DAYS, "1"))*1440),
|
||||
|
@ -362,6 +364,7 @@ public final class CrawlSwitchboard {
|
|||
CrawlProfile.MATCH_NEVER_STRING, //indexUrlMustNotMatch
|
||||
CrawlProfile.MATCH_ALL_STRING, //indexContentMustMatch
|
||||
CrawlProfile.MATCH_NEVER_STRING, //indexContentMustNotMatch
|
||||
true, //noindexWhenCanonicalUnequalURL
|
||||
Integer.parseInt(sb.getConfig(SwitchboardConstants.PROXY_PREFETCH_DEPTH, "0")),
|
||||
true,
|
||||
CrawlProfile.getRecrawlDate(CRAWL_PROFILE_PROXY_RECRAWL_CYCLE),
|
||||
|
@ -395,6 +398,7 @@ public final class CrawlSwitchboard {
|
|||
CrawlProfile.MATCH_NEVER_STRING, //indexUrlMustNotMatch
|
||||
CrawlProfile.MATCH_ALL_STRING, //indexContentMustMatch
|
||||
CrawlProfile.MATCH_NEVER_STRING, //indexContentMustNotMatch
|
||||
true, //noindexWhenCanonicalUnequalURL
|
||||
0,
|
||||
false,
|
||||
null,
|
||||
|
@ -428,6 +432,7 @@ public final class CrawlSwitchboard {
|
|||
CrawlProfile.MATCH_NEVER_STRING, //indexUrlMustNotMatch
|
||||
CrawlProfile.MATCH_ALL_STRING, //indexContentMustMatch
|
||||
CrawlProfile.MATCH_NEVER_STRING, //indexContentMustNotMatch
|
||||
true, //noindexWhenCanonicalUnequalURL
|
||||
0,
|
||||
false,
|
||||
CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SNIPPET_LOCAL_TEXT_RECRAWL_CYCLE),
|
||||
|
@ -461,6 +466,7 @@ public final class CrawlSwitchboard {
|
|||
CrawlProfile.MATCH_NEVER_STRING, //indexUrlMustNotMatch
|
||||
CrawlProfile.MATCH_ALL_STRING, //indexContentMustMatch
|
||||
CrawlProfile.MATCH_NEVER_STRING, //indexContentMustNotMatch
|
||||
true, //noindexWhenCanonicalUnequalURL
|
||||
0,
|
||||
false,
|
||||
CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT_RECRAWL_CYCLE),
|
||||
|
@ -502,6 +508,7 @@ public final class CrawlSwitchboard {
|
|||
CrawlProfile.MATCH_NEVER_STRING, //indexUrlMustNotMatch
|
||||
CrawlProfile.MATCH_ALL_STRING, //indexContentMustMatch
|
||||
CrawlProfile.MATCH_NEVER_STRING, //indexContentMustNotMatch
|
||||
true, //noindexWhenCanonicalUnequalURL
|
||||
0,
|
||||
false,
|
||||
CrawlProfile.getRecrawlDate(CRAWL_PROFILE_GREEDY_LEARNING_TEXT_RECRAWL_CYCLE),
|
||||
|
@ -535,6 +542,7 @@ public final class CrawlSwitchboard {
|
|||
CrawlProfile.MATCH_NEVER_STRING, //indexUrlMustNotMatch
|
||||
CrawlProfile.MATCH_ALL_STRING, //indexContentMustMatch
|
||||
CrawlProfile.MATCH_NEVER_STRING, //indexContentMustNotMatch
|
||||
true, //noindexWhenCanonicalUnequalURL
|
||||
0,
|
||||
false,
|
||||
CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA_RECRAWL_CYCLE),
|
||||
|
@ -568,6 +576,7 @@ public final class CrawlSwitchboard {
|
|||
CrawlProfile.MATCH_NEVER_STRING, //indexUrlMustNotMatch
|
||||
CrawlProfile.MATCH_ALL_STRING, //indexContentMustMatch
|
||||
CrawlProfile.MATCH_NEVER_STRING, //indexContentMustNotMatch
|
||||
true, //noindexWhenCanonicalUnequalURL
|
||||
0,
|
||||
false,
|
||||
CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA_RECRAWL_CYCLE),
|
||||
|
@ -601,6 +610,7 @@ public final class CrawlSwitchboard {
|
|||
CrawlProfile.MATCH_NEVER_STRING, //indexUrlMustNotMatch
|
||||
CrawlProfile.MATCH_ALL_STRING, //indexContentMustMatch
|
||||
CrawlProfile.MATCH_NEVER_STRING, //indexContentMustNotMatch
|
||||
true, //noindexWhenCanonicalUnequalURL
|
||||
0,
|
||||
false,
|
||||
CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SURROGATE_RECRAWL_CYCLE),
|
||||
|
@ -637,6 +647,7 @@ public final class CrawlSwitchboard {
|
|||
CrawlProfile.MATCH_NEVER_STRING, //indexUrlMustNotMatch
|
||||
CrawlProfile.MATCH_ALL_STRING, //indexContentMustMatch
|
||||
CrawlProfile.MATCH_NEVER_STRING, //indexContentMustNotMatch
|
||||
true, //noindexWhenCanonicalUnequalURL
|
||||
0,
|
||||
false,
|
||||
null,
|
||||
|
|
|
@ -352,6 +352,7 @@ public class RecrawlBusyThread extends AbstractBusyThread {
|
|||
CrawlProfile.MATCH_NEVER_STRING, // indexUrlMustNotMatch
|
||||
CrawlProfile.MATCH_ALL_STRING, // indexContentMustMatch
|
||||
CrawlProfile.MATCH_NEVER_STRING, // indexContentMustNotMatch
|
||||
true, //noindexWhenCanonicalUnequalURL
|
||||
0, false, CrawlProfile.getRecrawlDate(CrawlSwitchboard.CRAWL_PROFILE_RECRAWL_JOB_RECRAWL_CYCLE), -1,
|
||||
true, true, true, false, // crawlingQ, followFrames, obeyHtmlRobotsNoindex, obeyHtmlRobotsNofollow,
|
||||
true, true, true, false, -1, false, true, CrawlProfile.MATCH_NEVER_STRING, CacheStrategy.IFFRESH,
|
||||
|
|
|
@ -115,6 +115,7 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
|
|||
INDEXING_MEDIA_TYPE_MUSTNOTMATCH("indexMediaTypeMustNotMatch", false, CrawlAttribute.STRING, "Indexing Media Type (MIME) Must-Not-Match Filter"),
|
||||
INDEXING_SOLR_QUERY_MUSTMATCH("indexSolrQueryMustMatch", false, CrawlAttribute.STRING, "Indexing Solr Query Must-Match Filter"),
|
||||
INDEXING_SOLR_QUERY_MUSTNOTMATCH("indexSolrQueryMustNotMatch", false, CrawlAttribute.STRING, "Indexing Solr Query Must-Not-Match Filter"),
|
||||
// Boolean flag: it is read back by noindexWhenCanonicalUnequalURL() via a comparison with Boolean.TRUE.toString(), so it is typed BOOLEAN like STORE_HTCACHE.
NOINDEX_WHEN_CANONICAL_UNEQUAL_URL("noindexWhenCanonicalUnequalURL", false, CrawlAttribute.BOOLEAN, "No Indexing for Documents with Canonical != URL"),
|
||||
RECRAWL_IF_OLDER ("recrawlIfOlder", false, CrawlAttribute.INTEGER, "Recrawl If Older"),
|
||||
STORE_HTCACHE ("storeHTCache", false, CrawlAttribute.BOOLEAN, "Store in HTCache"),
|
||||
CACHE_STRAGEGY ("cacheStrategy", false, CrawlAttribute.STRING, "Cache Strategy (NOCACHE,IFFRESH,IFEXIST,CACHEONLY)"),
|
||||
|
@ -223,6 +224,7 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
|
|||
final String crawlerCountryMustMatch, final String crawlerNoDepthLimitMatch,
|
||||
final String indexUrlMustMatch, final String indexUrlMustNotMatch,
|
||||
final String indexContentMustMatch, final String indexContentMustNotMatch,
|
||||
final boolean noindexWhenCanonicalUnequalURL,
|
||||
final int depth,
|
||||
final boolean directDocByURL,
|
||||
final Date recrawlIfOlder /*date*/,
|
||||
|
@ -300,6 +302,7 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
|
|||
put(CrawlAttribute.INDEXING_MEDIA_TYPE_MUSTNOTMATCH.key, CrawlProfile.MATCH_NEVER_STRING);
|
||||
put(CrawlAttribute.INDEXING_SOLR_QUERY_MUSTMATCH.key, CrawlProfile.SOLR_MATCH_ALL_QUERY);
|
||||
put(CrawlAttribute.INDEXING_SOLR_QUERY_MUSTNOTMATCH.key, CrawlProfile.SOLR_EMPTY_QUERY);
|
||||
put(CrawlAttribute.NOINDEX_WHEN_CANONICAL_UNEQUAL_URL.key, noindexWhenCanonicalUnequalURL);
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -851,6 +854,12 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
|
|||
return (r.equals(Boolean.TRUE.toString()));
|
||||
}
|
||||
|
||||
public boolean noindexWhenCanonicalUnequalURL() {
|
||||
final String r = get(CrawlAttribute.NOINDEX_WHEN_CANONICAL_UNEQUAL_URL.key);
|
||||
if (r == null) return true;
|
||||
return (r.equals(Boolean.TRUE.toString()));
|
||||
}
|
||||
|
||||
public boolean storeHTCache() {
|
||||
final String r = get(CrawlAttribute.STORE_HTCACHE.key);
|
||||
if (r == null) return false;
|
||||
|
@ -997,6 +1006,7 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
|
|||
prop.putXML(CRAWL_PROFILE_PREFIX + count + "_indexURLMustNotMatch", this.get(CrawlAttribute.INDEXING_URL_MUSTNOTMATCH.key));
|
||||
prop.putXML(CRAWL_PROFILE_PREFIX + count + "_indexContentMustMatch", this.get(CrawlAttribute.INDEXING_CONTENT_MUSTMATCH.key));
|
||||
prop.putXML(CRAWL_PROFILE_PREFIX + count + "_indexContentMustNotMatch", this.get(CrawlAttribute.INDEXING_CONTENT_MUSTNOTMATCH.key));
|
||||
prop.put(CRAWL_PROFILE_PREFIX + count + "_" + CrawlAttribute.NOINDEX_WHEN_CANONICAL_UNEQUAL_URL, noindexWhenCanonicalUnequalURL() ? 1 : 0);
|
||||
prop.putXML(CRAWL_PROFILE_PREFIX + count + "_" + CrawlAttribute.INDEXING_MEDIA_TYPE_MUSTMATCH.key, this.get(CrawlAttribute.INDEXING_MEDIA_TYPE_MUSTMATCH.key));
|
||||
prop.putXML(CRAWL_PROFILE_PREFIX + count + "_" + CrawlAttribute.INDEXING_MEDIA_TYPE_MUSTNOTMATCH.key, this.get(CrawlAttribute.INDEXING_MEDIA_TYPE_MUSTNOTMATCH.key));
|
||||
prop.putXML(CRAWL_PROFILE_PREFIX + count + "_" + CrawlAttribute.INDEXING_SOLR_QUERY_MUSTMATCH.key, this.get(CrawlAttribute.INDEXING_SOLR_QUERY_MUSTMATCH.key));
|
||||
|
|
|
@ -175,10 +175,10 @@ public class Response {
|
|||
int p = mime.indexOf('/');
|
||||
if (p < 0) return new String[]{mime};
|
||||
if (doctype == DT_TEXT) return new String[]{"text" + mime.substring(p)};
|
||||
if (doctype == DT_IMAGE) return new String[]{"image" + mime.substring(p)};
|
||||
if (doctype == DT_AUDIO) return new String[]{"audio" + mime.substring(p)};
|
||||
if (doctype == DT_MOVIE) return new String[]{"video" + mime.substring(p)};
|
||||
return new String[]{mime};
|
||||
if (doctype == DT_IMAGE) return new String[]{"image" + mime.substring(p)};
|
||||
if (doctype == DT_AUDIO) return new String[]{"audio" + mime.substring(p)};
|
||||
if (doctype == DT_MOVIE) return new String[]{"video" + mime.substring(p)};
|
||||
return new String[]{mime};
|
||||
}
|
||||
|
||||
public static final int QUEUE_STATE_FRESH = 0;
|
||||
|
@ -235,16 +235,16 @@ public class Response {
|
|||
* @return the original request that produced this response
|
||||
*/
|
||||
public Request getRequest() {
|
||||
return request;
|
||||
}
|
||||
return request;
|
||||
}
|
||||
|
||||
public ResponseHeader getResponseHeader() {
|
||||
return this.responseHeader;
|
||||
}
|
||||
|
||||
public RequestHeader getRequestHeader() {
|
||||
return this.requestHeader;
|
||||
}
|
||||
return this.requestHeader;
|
||||
}
|
||||
|
||||
public boolean fromCache() {
|
||||
return this.fromCache;
|
||||
|
@ -260,11 +260,11 @@ public class Response {
|
|||
return this.request.name();
|
||||
}
|
||||
|
||||
/**
|
||||
* @return the requested URL that produced this response. When redirection(s)
|
||||
* occurred, this is not the initial URL, but the last redirection
|
||||
* target.
|
||||
*/
|
||||
/**
|
||||
* @return the requested URL that produced this response. When redirection(s)
|
||||
* occurred, this is not the initial URL, but the last redirection
|
||||
* target.
|
||||
*/
|
||||
public DigestURL url() {
|
||||
return this.request.url();
|
||||
}
|
||||
|
@ -745,11 +745,11 @@ public class Response {
|
|||
// -ranges in request
|
||||
// we checked that in shallStoreCache
|
||||
|
||||
/*
|
||||
* Eventually check if a parser supports the media yype. Depending on the crawl
|
||||
* profile, the indexingDocumentProcessor can eventually index only URL metadata
|
||||
* using the generic parser for unsupported media types
|
||||
*/
|
||||
/*
|
||||
* Eventually check if a parser supports the media type. Depending on the crawl
|
||||
* profile, the indexingDocumentProcessor can eventually index only URL metadata
|
||||
* using the generic parser for unsupported media types
|
||||
*/
|
||||
if (this.responseHeader != null && !profile().isIndexNonParseableUrls()) {
|
||||
final String mimeType = this.responseHeader.getContentType();
|
||||
final String parserError = TextParser.supportsMime(mimeType);
|
||||
|
|
|
@ -91,12 +91,12 @@ public class Document {
|
|||
/** links to icons that belongs to the document (mapped by absolute URL) */
|
||||
private Map<DigestURL, IconEntry> icons;
|
||||
|
||||
/**
|
||||
* URLs of linked data item types/classes referenced by the document (for example in
|
||||
* HTML with standard annotations such as RDFa, microdata, microformats or
|
||||
* JSON-LD)
|
||||
*/
|
||||
private Set<DigestURL> linkedDataTypes;
|
||||
/**
|
||||
* URLs of linked data item types/classes referenced by the document (for example in
|
||||
* HTML with standard annotations such as RDFa, microdata, microformats or
|
||||
* JSON-LD)
|
||||
*/
|
||||
private Set<DigestURL> linkedDataTypes;
|
||||
private boolean resorted;
|
||||
private final Set<String> languages;
|
||||
private boolean indexingDenied;
|
||||
|
@ -131,13 +131,13 @@ public class Document {
|
|||
this.parserObject = parserObject;
|
||||
this.keywords = new LinkedHashSet<String>();
|
||||
if (keywords != null) {
|
||||
Collections.addAll(this.keywords, keywords);
|
||||
Collections.addAll(this.keywords, keywords);
|
||||
}
|
||||
this.titles = (titles == null) ? new ArrayList<String>(1) : titles;
|
||||
this.creator = (author == null) ? new StringBuilder(0) : new StringBuilder(author);
|
||||
this.sections = new LinkedList<String>() ;
|
||||
if (sections != null) {
|
||||
Collections.addAll(this.sections, sections);
|
||||
Collections.addAll(this.sections, sections);
|
||||
}
|
||||
this.descriptions = (abstrcts == null) ? new ArrayList<String>() : abstrcts;
|
||||
if (lat >= -90.0d && lat <= 90.0d && lon >= -180.0d && lon <= 180.0d) {
|
||||
|
@ -216,13 +216,21 @@ public class Document {
|
|||
}
|
||||
this.scraperObject = scraper;
|
||||
}
|
||||
|
||||
public AnchorURL getCanonical() {
|
||||
final Object scraper = this.getScraperObject();
|
||||
if (!(scraper instanceof ContentScraper)) return null;
|
||||
final ContentScraper html = (ContentScraper) scraper;
|
||||
AnchorURL canonical = html.getCanonical();
|
||||
return canonical;
|
||||
}
|
||||
|
||||
public Set<String> getContentLanguages() {
|
||||
return this.languages;
|
||||
}
|
||||
|
||||
public String getFileName() {
|
||||
return this.source.getFileName();
|
||||
return this.source.getFileName();
|
||||
}
|
||||
|
||||
public Map<String, Set<String>> getGenericFacets() {
|
||||
|
@ -233,15 +241,15 @@ public class Document {
|
|||
* @return true when this document is the result of a partially parsed resource, for example due to resource content size exceeding a given limit
|
||||
*/
|
||||
public boolean isPartiallyParsed() {
|
||||
return this.partiallyParsed;
|
||||
}
|
||||
return this.partiallyParsed;
|
||||
}
|
||||
|
||||
/**
|
||||
* @param partiallyParsed set to true to indicate this document is the result of a partially parsed resource, for example due to resource content size exceeding a given limit
|
||||
*/
|
||||
public void setPartiallyParsed(final boolean partiallyParsed) {
|
||||
this.partiallyParsed = partiallyParsed;
|
||||
}
|
||||
this.partiallyParsed = partiallyParsed;
|
||||
}
|
||||
|
||||
/**
|
||||
* compute a set of languages that this document contains
|
||||
|
@ -637,13 +645,13 @@ dc_rights
|
|||
// we add artificial hyperlinks to the hyperlink set
|
||||
// that can be calculated from given hyperlinks and imagelinks
|
||||
|
||||
/*
|
||||
* Should we also include icons ? with
|
||||
* this.hyperlinks.putAll(allReflinks(this.icons.keySet())); It is
|
||||
* problematic as allReflinks will modify icons set set, removing those whose URL is
|
||||
* starting with "/www" but it is not desired for icons such as
|
||||
* www.wikipedia.org/static/favicon/wikipedia.ico
|
||||
*/
|
||||
/*
|
||||
* Should we also include icons ? with
|
||||
* this.hyperlinks.putAll(allReflinks(this.icons.keySet())); It is
|
||||
* problematic as allReflinks will modify the icons set, removing those whose URL is
|
||||
* starting with "/www" but it is not desired for icons such as
|
||||
* www.wikipedia.org/static/favicon/wikipedia.ico
|
||||
*/
|
||||
|
||||
this.hyperlinks.putAll(allReflinks(this.images.values()));
|
||||
this.hyperlinks.putAll(allReflinks(this.audiolinks.keySet()));
|
||||
|
@ -804,16 +812,16 @@ dc_rights
|
|||
}
|
||||
InputStream textStream = doc.getTextStream();
|
||||
try {
|
||||
FileUtils.copy(textStream, (ByteArrayOutputStream) this.text);
|
||||
FileUtils.copy(textStream, (ByteArrayOutputStream) this.text);
|
||||
} finally {
|
||||
try {
|
||||
if(textStream != null) {
|
||||
/* textStream can be a FileInputStream : we must close it to ensure releasing system resource */
|
||||
textStream.close();
|
||||
}
|
||||
} catch(IOException e) {
|
||||
ConcurrentLog.warn("DOCUMENT", "Could not close text input stream");
|
||||
}
|
||||
try {
|
||||
if(textStream != null) {
|
||||
/* textStream can be a FileInputStream : we must close it to ensure releasing system resource */
|
||||
textStream.close();
|
||||
}
|
||||
} catch(IOException e) {
|
||||
ConcurrentLog.warn("DOCUMENT", "Could not close text input stream");
|
||||
}
|
||||
}
|
||||
|
||||
this.anchors.addAll(doc.getAnchors());
|
||||
|
@ -826,41 +834,41 @@ dc_rights
|
|||
* @return links to icons that belongs to the document (mapped by absolute URL)
|
||||
*/
|
||||
public Map<DigestURL, IconEntry> getIcons() {
|
||||
return icons;
|
||||
}
|
||||
return icons;
|
||||
}
|
||||
|
||||
/**
|
||||
* Set links to icons that belongs to the document (mapped by absolute URL)
|
||||
* @param icons
|
||||
*/
|
||||
public void setIcons(final Map<DigestURL, IconEntry> icons) {
|
||||
/* Better to ensure now icons property will not be null */
|
||||
if(icons != null) {
|
||||
this.icons = icons;
|
||||
} else {
|
||||
this.icons = new HashMap<>();
|
||||
}
|
||||
}
|
||||
/* Better to ensure now icons property will not be null */
|
||||
if(icons != null) {
|
||||
this.icons = icons;
|
||||
} else {
|
||||
this.icons = new HashMap<>();
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* @return URLs of linked data item types/classes referenced by the document (for example in
|
||||
* HTML with standard annotations such as RDFa, microdata, microformats or
|
||||
* JSON-LD)
|
||||
*/
|
||||
/**
|
||||
* @return URLs of linked data item types/classes referenced by the document (for example in
|
||||
* HTML with standard annotations such as RDFa, microdata, microformats or
|
||||
* JSON-LD)
|
||||
*/
|
||||
public Set<DigestURL> getLinkedDataTypes() {
|
||||
return this.linkedDataTypes;
|
||||
}
|
||||
return this.linkedDataTypes;
|
||||
}
|
||||
|
||||
/**
|
||||
* @return URLs of linked data item types/classes referenced by the document
|
||||
*/
|
||||
/**
|
||||
* @param linkedDataTypes URLs of linked data item types/classes referenced by the document
|
||||
*/
|
||||
public void setLinkedDataTypes(final Set<DigestURL> linkedDataTypes) {
|
||||
if(linkedDataTypes != null) {
|
||||
/* Ensure non null property */
|
||||
this.linkedDataTypes = linkedDataTypes;
|
||||
} else {
|
||||
this.linkedDataTypes.clear();
|
||||
}
|
||||
if(linkedDataTypes != null) {
|
||||
/* Ensure non null property */
|
||||
this.linkedDataTypes = linkedDataTypes;
|
||||
} else {
|
||||
this.linkedDataTypes.clear();
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
@ -1034,14 +1042,14 @@ dc_rights
|
|||
} catch (final IOException e) {
|
||||
ConcurrentLog.logException(e);
|
||||
} finally {
|
||||
try {
|
||||
if(textStream != null) {
|
||||
/* textStream can be a FileInputStream : we must close it to ensure releasing system resource */
|
||||
textStream.close();
|
||||
}
|
||||
} catch (IOException e) {
|
||||
ConcurrentLog.warn("DOCUMENT", "Could not close text input stream");
|
||||
}
|
||||
try {
|
||||
if(textStream != null) {
|
||||
/* textStream can be a FileInputStream : we must close it to ensure releasing system resource */
|
||||
textStream.close();
|
||||
}
|
||||
} catch (IOException e) {
|
||||
ConcurrentLog.warn("DOCUMENT", "Could not close text input stream");
|
||||
}
|
||||
}
|
||||
}
|
||||
anchors.addAll(doc.getAnchors());
|
||||
|
@ -1098,7 +1106,7 @@ dc_rights
|
|||
public final static String IFRAME_MARKER = "iframe";
|
||||
public final static String FRAME_MARKER = "frame";
|
||||
public final static String EMBED_MARKER = "embed";
|
||||
|
||||
|
||||
public static Map<AnchorURL, String> getHyperlinks(final Document[] documents, boolean includeNofollow) {
|
||||
final Map<AnchorURL, String> result = new HashMap<>();
|
||||
for (final Document d: documents) {
|
||||
|
|
|
@ -369,6 +369,13 @@ public class CrawlStartExpert {
|
|||
}
|
||||
}
|
||||
|
||||
// Check Canonical?
|
||||
if (post == null) {
|
||||
prop.put("noindexWhenCanonicalUnequalURLChecked", 1);
|
||||
} else {
|
||||
prop.put("noindexWhenCanonicalUnequalURLChecked",
|
||||
post.getBoolean("noindexWhenCanonicalUnequalURL") ? 1 : 0);
|
||||
}
|
||||
|
||||
// ---------- Clean-Up before Crawl Start
|
||||
// delete if older settings: number value
|
||||
|
|
|
@ -316,6 +316,7 @@ public class Crawler_p {
|
|||
final String indexUrlMustNotMatch = post.get("indexmustnotmatch", CrawlProfile.MATCH_NEVER_STRING);
|
||||
final String indexContentMustMatch = post.get("indexcontentmustmatch", CrawlProfile.MATCH_ALL_STRING);
|
||||
final String indexContentMustNotMatch = post.get("indexcontentmustnotmatch", CrawlProfile.MATCH_NEVER_STRING);
|
||||
final boolean noindexWhenCanonicalUnequalURL = "on".equals(post.get("noindexWhenCanonicalUnequalURL", "off"));
|
||||
|
||||
final boolean crawlOrder = post.get("crawlOrder", "off").equals("on");
|
||||
env.setConfig("crawlOrder", crawlOrder);
|
||||
|
@ -614,6 +615,7 @@ public class Crawler_p {
|
|||
indexUrlMustNotMatch,
|
||||
indexContentMustMatch,
|
||||
indexContentMustNotMatch,
|
||||
noindexWhenCanonicalUnequalURL,
|
||||
newcrawlingdepth,
|
||||
directDocByURL,
|
||||
crawlingIfOlder,
|
||||
|
|
|
@ -150,6 +150,7 @@ public class QuickCrawlLink_p {
|
|||
CrawlProfile.MATCH_NEVER_STRING, //indexUrlMustNotMatch
|
||||
CrawlProfile.MATCH_ALL_STRING, //indexContentMustMatch
|
||||
CrawlProfile.MATCH_NEVER_STRING, //indexContentMustNotMatch
|
||||
false,
|
||||
CrawlingDepth,
|
||||
true,
|
||||
CrawlProfile.getRecrawlDate(60 * 24 * 30), // recrawlIfOlder (minutes); here: one month
|
||||
|
|
|
@ -3152,28 +3152,73 @@ public final class Switchboard extends serverSwitch {
|
|||
return new IndexingQueueEntry(in.queueEntry, in.documents, null);
|
||||
}
|
||||
}
|
||||
if (!(profile.indexUrlMustMatchPattern() == CrawlProfile.MATCH_ALL_PATTERN || profile.indexUrlMustMatchPattern().matcher(urls).matches()) ||
|
||||
(profile.indexUrlMustNotMatchPattern() != CrawlProfile.MATCH_NEVER_PATTERN && profile.indexUrlMustNotMatchPattern().matcher(urls).matches())) {
|
||||
if (this.log.isInfo()) this.log.info("Not Condensed Resource '" + urls + "': indexing prevented by regular expression on url; indexUrlMustMatchPattern = " + profile.indexUrlMustMatchPattern().pattern() + ", indexUrlMustNotMatchPattern = " + profile.indexUrlMustNotMatchPattern().pattern());
|
||||
|
||||
// check mustmatch pattern
|
||||
Pattern mustmatchurl = profile.indexUrlMustMatchPattern();
|
||||
if (mustmatchurl != CrawlProfile.MATCH_ALL_PATTERN && !mustmatchurl.matcher(urls).matches()) {
|
||||
String info = "Not Condensed Resource '" + urls + "': indexing prevented by regular expression on url; indexUrlMustMatchPattern = " + mustmatchurl.pattern();
|
||||
if (this.log.isInfo()) this.log.info(info);
|
||||
// create a new errorURL DB entry
|
||||
this.crawlQueues.errorURL.push(in.queueEntry.url(), in.queueEntry.depth(), profile, FailCategory.FINAL_PROCESS_CONTEXT, "indexing prevented by regular expression on url; indexUrlMustMatchPattern = " + profile.indexUrlMustMatchPattern().pattern() + ", indexUrlMustNotMatchPattern = " + profile.indexUrlMustNotMatchPattern().pattern(), -1);
|
||||
this.crawlQueues.errorURL.push(in.queueEntry.url(), in.queueEntry.depth(), profile, FailCategory.FINAL_PROCESS_CONTEXT, info, -1);
|
||||
return new IndexingQueueEntry(in.queueEntry, in.documents, null);
|
||||
}
|
||||
|
||||
// check mustnotmatch
|
||||
Pattern mustnotmatchurl = profile.indexUrlMustNotMatchPattern();
|
||||
if (mustnotmatchurl != CrawlProfile.MATCH_NEVER_PATTERN && mustnotmatchurl.matcher(urls).matches()) {
|
||||
String info = "Not Condensed Resource '" + urls + "': indexing prevented by regular expression on url; indexUrlMustNotMatchPattern = " + mustnotmatchurl;
|
||||
if (this.log.isInfo()) this.log.info(info);
|
||||
// create a new errorURL DB entry
|
||||
this.crawlQueues.errorURL.push(in.queueEntry.url(), in.queueEntry.depth(), profile, FailCategory.FINAL_PROCESS_CONTEXT, info, -1);
|
||||
return new IndexingQueueEntry(in.queueEntry, in.documents, null);
|
||||
}
|
||||
|
||||
// check which files may take part in the indexing process
|
||||
final List<Document> doclist = new ArrayList<>();
|
||||
docloop: for (final Document document : in.documents) {
|
||||
|
||||
// check canonical
|
||||
if (profile.noindexWhenCanonicalUnequalURL()) {
|
||||
AnchorURL canonical = document.getCanonical();
|
||||
DigestURL source = document.dc_source();
|
||||
if (canonical != null && source != null) {
|
||||
String canonical_norm = canonical.toNormalform(true);
|
||||
String source_norm = source.toNormalform(true);
|
||||
if (!canonical_norm.equals(source_norm)) {
|
||||
String info = "Not Condensed Resource '" + urls + "': denied, canonical != source; canonical = " +canonical_norm + "; source = " + source_norm;
|
||||
if (this.log.isInfo()) this.log.info(info);
|
||||
// create a new errorURL DB entry
|
||||
this.crawlQueues.errorURL.push(in.queueEntry.url(), in.queueEntry.depth(), profile, FailCategory.FINAL_PROCESS_CONTEXT, info, -1);
|
||||
continue docloop;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// check indexing denied flags
|
||||
if (document.indexingDenied() && profile.obeyHtmlRobotsNoindex() && !this.isIntranetMode()) {
|
||||
if (this.log.isInfo()) this.log.info("Not Condensed Resource '" + urls + "': denied by document-attached noindexing rule");
|
||||
// create a new errorURL DB entry
|
||||
this.crawlQueues.errorURL.push(in.queueEntry.url(), in.queueEntry.depth(), profile, FailCategory.FINAL_PROCESS_CONTEXT, "denied by document-attached noindexing rule", -1);
|
||||
continue docloop;
|
||||
}
|
||||
if (!(profile.indexContentMustMatchPattern() == CrawlProfile.MATCH_ALL_PATTERN || profile.indexContentMustMatchPattern().matcher(document.getTextString()).matches()) ||
|
||||
(profile.indexContentMustNotMatchPattern() != CrawlProfile.MATCH_NEVER_PATTERN && profile.indexContentMustNotMatchPattern().matcher(document.getTextString()).matches())) {
|
||||
if (this.log.isInfo()) this.log.info("Not Condensed Resource '" + urls + "': indexing prevented by regular expression on content; indexContentMustMatchPattern = " + profile.indexContentMustMatchPattern().pattern() + ", indexContentMustNotMatchPattern = " + profile.indexContentMustNotMatchPattern().pattern());
|
||||
|
||||
// check content pattern must-match
|
||||
Pattern mustmatchcontent = profile.indexContentMustMatchPattern();
|
||||
if (mustmatchcontent != CrawlProfile.MATCH_ALL_PATTERN && !mustmatchcontent.matcher(document.getTextString()).matches()) {
|
||||
String info = "Not Condensed Resource '" + urls + "': indexing prevented by regular expression on content; indexContentMustMatchPattern = " + mustmatchcontent.pattern() ;
|
||||
if (this.log.isInfo()) this.log.info(info);
|
||||
// create a new errorURL DB entry
|
||||
this.crawlQueues.errorURL.push(in.queueEntry.url(), in.queueEntry.depth(), profile, FailCategory.FINAL_PROCESS_CONTEXT, "indexing prevented by regular expression on content; indexContentMustMatchPattern = " + profile.indexContentMustMatchPattern().pattern() + ", indexContentMustNotMatchPattern = " + profile.indexContentMustNotMatchPattern().pattern(), -1);
|
||||
this.crawlQueues.errorURL.push(in.queueEntry.url(), in.queueEntry.depth(), profile, FailCategory.FINAL_PROCESS_CONTEXT, info, -1);
|
||||
continue docloop;
|
||||
}
|
||||
|
||||
// check content pattern must-not-match
|
||||
Pattern mustnotmatchcontent = profile.indexContentMustNotMatchPattern();
|
||||
if (mustnotmatchcontent != CrawlProfile.MATCH_NEVER_PATTERN && mustnotmatchcontent.matcher(document.getTextString()).matches()) {
|
||||
String info = "Not Condensed Resource '" + urls + "': indexing prevented by regular expression on content; indexContentMustNotMatchPattern = " + mustnotmatchcontent.pattern();
|
||||
if (this.log.isInfo()) this.log.info(info);
|
||||
// create a new errorURL DB entry
|
||||
this.crawlQueues.errorURL.push(in.queueEntry.url(), in.queueEntry.depth(), profile, FailCategory.FINAL_PROCESS_CONTEXT, info, -1);
|
||||
continue docloop;
|
||||
}
|
||||
|
||||
|
|
Loading…
Reference in New Issue
Block a user