added canonical filter

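Attention: the canonical filter is enabled by default! Documents that declare a canonical URL different from their own URL are then not indexed (this should be the right behaviour in most cases).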
2024-09-19 00:01:41 +02:00 · 2023-01-16 14:50:30 +01:00 · 2023-01-16 14:50:30 +01:00 · 9fcd8f1bda
commit 9fcd8f1bda
parent 5a52b01c09
12 changed files with 223 additions and 134 deletions
--- a/htroot/CrawlProfileEditor_p.xml
+++ b/htroot/CrawlProfileEditor_p.xml
@ -1,48 +1,49 @@
 <?xml version="1.0" encoding="UTF-8" standalone="yes"?>
 <crawlProfiles>
 #{crawlProfiles}# 
-	<crawlProfile>
-		<handle>#[handle]#</handle>
-		<name>#[name]#</name>
-		<collections>#[collections]#</collections>
-		<agentName>#[agentName]#</agentName>
-		<userAgent>#[userAgent]#</userAgent>
-		<depth>#[depth]#</depth>
-		<directDocByURL>#(directDocByURL)#false::true#(/directDocByURL)#</directDocByURL>
-		<recrawlIfOlder>#[recrawlIfOlder]#</recrawlIfOlder>
-		<domMaxPages>#[domMaxPages]#</domMaxPages>
-		<crawlingQ>#(crawlingQ)#false::true#(/crawlingQ)#</crawlingQ>
-		<followFrames>#(followFrames)#false::true#(/followFrames)#</followFrames>
-		<obeyHtmlRobotsNoindex>#(obeyHtmlRobotsNoindex)#false::true#(/obeyHtmlRobotsNoindex)#</obeyHtmlRobotsNoindex>
-		<obeyHtmlRobotsNofollow>#(obeyHtmlRobotsNofollow)#false::true#(/obeyHtmlRobotsNofollow)#</obeyHtmlRobotsNofollow>
-		<indexText>#(indexText)#false::true#(/indexText)#</indexText>
-		<indexMedia>#(indexMedia)#false::true#(/indexMedia)#</indexMedia>
-		<storeHTCache>#(storeHTCache)#false::true#(/storeHTCache)#</storeHTCache>
-		<remoteIndexing>#(remoteIndexing)#false::true#(/remoteIndexing)#</remoteIndexing>
-		<cacheStrategy>#[cacheStrategy]#</cacheStrategy>
-		<crawlerAlwaysCheckMediaType>#(crawlerAlwaysCheckMediaType)#false::true#(/crawlerAlwaysCheckMediaType)#</crawlerAlwaysCheckMediaType>
-		<crawlerURLMustMatch>#[crawlerURLMustMatch]#</crawlerURLMustMatch>
-		<crawlerURLMustNotMatch>#[crawlerURLMustNotMatch]#</crawlerURLMustNotMatch>
-		<crawlerOriginURLMustMatch>#[crawlerOriginURLMustMatch]#</crawlerOriginURLMustMatch>
-		<crawlerOriginURLMustNotMatch>#[crawlerOriginURLMustNotMatch]#</crawlerOriginURLMustNotMatch>
-		<crawlerIPMustMatch>#[crawlerIPMustMatch]#</crawlerIPMustMatch>
-		<crawlerIPMustNotMatch>#[crawlerIPMustNotMatch]#</crawlerIPMustNotMatch>
-		<crawlerCountryMustMatch>#[crawlerCountryMustMatch]#</crawlerCountryMustMatch>
-		<crawlerNoLimitURLMustMatch>#[crawlerNoLimitURLMustMatch]#</crawlerNoLimitURLMustMatch>
-		<indexURLMustMatch>#[indexURLMustMatch]#</indexURLMustMatch>
-		<indexURLMustNotMatch>#[indexURLMustNotMatch]#</indexURLMustNotMatch>
-		<indexContentMustMatch>#[indexContentMustMatch]#</indexContentMustMatch>
-		<indexContentMustNotMatch>#[indexContentMustNotMatch]#</indexContentMustNotMatch>
-		<indexMediaTypeMustMatch>#[indexMediaTypeMustMatch]#</indexMediaTypeMustMatch>
-		<indexMediaTypeMustNotMatch>#[indexMediaTypeMustNotMatch]#</indexMediaTypeMustNotMatch>
-		<indexSolrQueryMustMatch>#[indexSolrQueryMustMatch]#</indexSolrQueryMustMatch>
-		<indexSolrQueryMustNotMatch>#[indexSolrQueryMustNotMatch]#</indexSolrQueryMustNotMatch>
-		<status>#(status)#terminated::active::system#(/status)#</status>
-		<crawlingDomFilterContent>
-		#{crawlingDomFilterContent}#
-			<item>#[item]#</item>
-		#{/crawlingDomFilterContent}#
-		</crawlingDomFilterContent>
-	</crawlProfile>
+    <crawlProfile>
+        <handle>#[handle]#</handle>
+        <name>#[name]#</name>
+        <collections>#[collections]#</collections>
+        <agentName>#[agentName]#</agentName>
+        <userAgent>#[userAgent]#</userAgent>
+        <depth>#[depth]#</depth>
+        <directDocByURL>#(directDocByURL)#false::true#(/directDocByURL)#</directDocByURL>
+        <recrawlIfOlder>#[recrawlIfOlder]#</recrawlIfOlder>
+        <domMaxPages>#[domMaxPages]#</domMaxPages>
+        <crawlingQ>#(crawlingQ)#false::true#(/crawlingQ)#</crawlingQ>
+        <followFrames>#(followFrames)#false::true#(/followFrames)#</followFrames>
+        <obeyHtmlRobotsNoindex>#(obeyHtmlRobotsNoindex)#false::true#(/obeyHtmlRobotsNoindex)#</obeyHtmlRobotsNoindex>
+        <obeyHtmlRobotsNofollow>#(obeyHtmlRobotsNofollow)#false::true#(/obeyHtmlRobotsNofollow)#</obeyHtmlRobotsNofollow>
+        <indexText>#(indexText)#false::true#(/indexText)#</indexText>
+        <indexMedia>#(indexMedia)#false::true#(/indexMedia)#</indexMedia>
+        <storeHTCache>#(storeHTCache)#false::true#(/storeHTCache)#</storeHTCache>
+        <remoteIndexing>#(remoteIndexing)#false::true#(/remoteIndexing)#</remoteIndexing>
+        <cacheStrategy>#[cacheStrategy]#</cacheStrategy>
+        <crawlerAlwaysCheckMediaType>#(crawlerAlwaysCheckMediaType)#false::true#(/crawlerAlwaysCheckMediaType)#</crawlerAlwaysCheckMediaType>
+        <crawlerURLMustMatch>#[crawlerURLMustMatch]#</crawlerURLMustMatch>
+        <crawlerURLMustNotMatch>#[crawlerURLMustNotMatch]#</crawlerURLMustNotMatch>
+        <crawlerOriginURLMustMatch>#[crawlerOriginURLMustMatch]#</crawlerOriginURLMustMatch>
+        <crawlerOriginURLMustNotMatch>#[crawlerOriginURLMustNotMatch]#</crawlerOriginURLMustNotMatch>
+        <crawlerIPMustMatch>#[crawlerIPMustMatch]#</crawlerIPMustMatch>
+        <crawlerIPMustNotMatch>#[crawlerIPMustNotMatch]#</crawlerIPMustNotMatch>
+        <crawlerCountryMustMatch>#[crawlerCountryMustMatch]#</crawlerCountryMustMatch>
+        <crawlerNoLimitURLMustMatch>#[crawlerNoLimitURLMustMatch]#</crawlerNoLimitURLMustMatch>
+        <indexURLMustMatch>#[indexURLMustMatch]#</indexURLMustMatch>
+        <indexURLMustNotMatch>#[indexURLMustNotMatch]#</indexURLMustNotMatch>
+        <indexContentMustMatch>#[indexContentMustMatch]#</indexContentMustMatch>
+        <indexContentMustNotMatch>#[indexContentMustNotMatch]#</indexContentMustNotMatch>
+        <indexMediaTypeMustMatch>#[indexMediaTypeMustMatch]#</indexMediaTypeMustMatch>
+        <indexMediaTypeMustNotMatch>#[indexMediaTypeMustNotMatch]#</indexMediaTypeMustNotMatch>
+        <indexSolrQueryMustMatch>#[indexSolrQueryMustMatch]#</indexSolrQueryMustMatch>
+        <indexSolrQueryMustNotMatch>#[indexSolrQueryMustNotMatch]#</indexSolrQueryMustNotMatch>
+        <noindexWhenCanonicalUnequalURL>#(noindexWhenCanonicalUnequalURL)#false::true#(/noindexWhenCanonicalUnequalURL)#</noindexWhenCanonicalUnequalURL>
+        <status>#(status)#terminated::active::system#(/status)#</status>
+        <crawlingDomFilterContent>
+        #{crawlingDomFilterContent}#
+            <item>#[item]#</item>
+        #{/crawlingDomFilterContent}#
+        </crawlingDomFilterContent>
+    </crawlProfile>
 #{/crawlProfiles}# 
 </crawlProfiles>
--- a/htroot/CrawlStartExpert.html
+++ b/htroot/CrawlStartExpert.html
@ -412,6 +412,9 @@
            <table style="border-width: 0px">
            <tr><td style="width:110px"><img src="env/grafics/plus.gif" alt=""> must-match</td><td><input name="indexmustmatch" id="indexmustmatch" type="text" size="55" maxlength="100000" value="#[indexmustmatch]#" onblur="if (this.value=='') this.value='.*';"/> (must not be empty)</td></tr>
            <tr><td><img src="env/grafics/minus.gif" alt=""> must-not-match</td><td><input name="indexmustnotmatch" id="indexmustnotmatch" type="text" size="55" maxlength="100000" value="#[indexmustnotmatch]#" /></td></tr>
+            <tr>
+              <td colspan="2"><input type="checkbox" name="noindexWhenCanonicalUnequalURL" id="noindexWhenCanonicalUnequalURL" #(noindexWhenCanonicalUnequalURLChecked)#::checked="checked"#(/noindexWhenCanonicalUnequalURLChecked)#/> No Indexing when Canonical present and Canonical != URL</td>
+            </tr>
            </table>
            </dd>
            <dt>Filter on Content of Document<br/>(all visible text, including camel-case-tokenized url and title)</dt>
@ -470,7 +473,7 @@
                    <tr>
                        <td style="width:110px"><img src="env/grafics/minus.gif" alt=""> must-not-match</td>
                        <td>
-                            <input name="indexSolrQueryMustNotMatch" id="indexSolrQueryMustNotMatch" type="text" size="55" maxlength="100000" value="#[indexSolrQueryMustNotMatch]#" aria-describedby="indexSolrQueryInfo" />
+                            <input name="indexSolrQueryMustNotMatch" id="indexSolrQueryMustNotMatch" type="text" size="55" maxlength="100000" value="#[indexSolrQueryMustNotMatch]#" aria-describedby="indexSolrQueryInfo" />
                        </td>
                    </tr>
                    #(/embeddedSolrConnected)#
--- a/source/net/yacy/crawler/CrawlStacker.java
+++ b/source/net/yacy/crawler/CrawlStacker.java
@ -488,7 +488,7 @@ public final class CrawlStacker implements WorkflowTask<Request>{
        // check if ip is local ip address
        final String urlRejectReason = this.urlInAcceptedDomain(url);
        if (urlRejectReason != null) {
-            if (CrawlStacker.log.isFine()) CrawlStacker.log.fine("denied_(" + urlRejectReason + ")");
+            if (CrawlStacker.log.isFine()) CrawlStacker.log.fine("URL not in accepted Domain (" + urlRejectReason + ")");
            return "denied_(" + urlRejectReason + ")";
        }

--- a/source/net/yacy/crawler/CrawlSwitchboard.java
+++ b/source/net/yacy/crawler/CrawlSwitchboard.java
@ -294,6 +294,7 @@ public final class CrawlSwitchboard {
                CrawlProfile.MATCH_NEVER_STRING, //indexUrlMustNotMatch
                CrawlProfile.MATCH_ALL_STRING,   //indexContentMustMatch
                CrawlProfile.MATCH_NEVER_STRING, //indexContentMustNotMatch
+                true, //noindexWhenCanonicalUnequalURL
                Integer.parseInt(sb.getConfig(SwitchboardConstants.AUTOCRAWL_DEEP_DEPTH, "3")),
                true,
                CrawlProfile.getRecrawlDate(Integer.parseInt(sb.getConfig(SwitchboardConstants.AUTOCRAWL_DAYS, "1"))*1440),
@ -328,6 +329,7 @@ public final class CrawlSwitchboard {
                CrawlProfile.MATCH_NEVER_STRING, //indexUrlMustNotMatch
                CrawlProfile.MATCH_ALL_STRING,   //indexContentMustMatch
                CrawlProfile.MATCH_NEVER_STRING, //indexContentMustNotMatch
+                true, //noindexWhenCanonicalUnequalURL
                Integer.parseInt(sb.getConfig(SwitchboardConstants.AUTOCRAWL_SHALLOW_DEPTH, "1")),
                true,
                CrawlProfile.getRecrawlDate(Integer.parseInt(sb.getConfig(SwitchboardConstants.AUTOCRAWL_DAYS, "1"))*1440),
@ -362,6 +364,7 @@ public final class CrawlSwitchboard {
                CrawlProfile.MATCH_NEVER_STRING, //indexUrlMustNotMatch
                CrawlProfile.MATCH_ALL_STRING,   //indexContentMustMatch
                CrawlProfile.MATCH_NEVER_STRING, //indexContentMustNotMatch
+                true, //noindexWhenCanonicalUnequalURL
                Integer.parseInt(sb.getConfig(SwitchboardConstants.PROXY_PREFETCH_DEPTH, "0")),
                true,
                CrawlProfile.getRecrawlDate(CRAWL_PROFILE_PROXY_RECRAWL_CYCLE),
@ -395,6 +398,7 @@ public final class CrawlSwitchboard {
                CrawlProfile.MATCH_NEVER_STRING, //indexUrlMustNotMatch
                CrawlProfile.MATCH_ALL_STRING,   //indexContentMustMatch
                CrawlProfile.MATCH_NEVER_STRING, //indexContentMustNotMatch
+                true, //noindexWhenCanonicalUnequalURL
                0,
                false,
                null,
@ -428,6 +432,7 @@ public final class CrawlSwitchboard {
                CrawlProfile.MATCH_NEVER_STRING, //indexUrlMustNotMatch
                CrawlProfile.MATCH_ALL_STRING,   //indexContentMustMatch
                CrawlProfile.MATCH_NEVER_STRING, //indexContentMustNotMatch
+                true, //noindexWhenCanonicalUnequalURL
                0,
                false,
                CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SNIPPET_LOCAL_TEXT_RECRAWL_CYCLE),
@ -461,6 +466,7 @@ public final class CrawlSwitchboard {
                CrawlProfile.MATCH_NEVER_STRING, //indexUrlMustNotMatch
                CrawlProfile.MATCH_ALL_STRING,   //indexContentMustMatch
                CrawlProfile.MATCH_NEVER_STRING, //indexContentMustNotMatch
+                true, //noindexWhenCanonicalUnequalURL
                0,
                false,
                CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT_RECRAWL_CYCLE),
@ -502,6 +508,7 @@ public final class CrawlSwitchboard {
                CrawlProfile.MATCH_NEVER_STRING, //indexUrlMustNotMatch
                CrawlProfile.MATCH_ALL_STRING,   //indexContentMustMatch
                CrawlProfile.MATCH_NEVER_STRING, //indexContentMustNotMatch
+                true, //noindexWhenCanonicalUnequalURL
                0,
                false,
                CrawlProfile.getRecrawlDate(CRAWL_PROFILE_GREEDY_LEARNING_TEXT_RECRAWL_CYCLE),
@ -535,6 +542,7 @@ public final class CrawlSwitchboard {
                CrawlProfile.MATCH_NEVER_STRING, //indexUrlMustNotMatch
                CrawlProfile.MATCH_ALL_STRING,   //indexContentMustMatch
                CrawlProfile.MATCH_NEVER_STRING, //indexContentMustNotMatch
+                true, //noindexWhenCanonicalUnequalURL
                0,
                false,
                CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA_RECRAWL_CYCLE),
@ -568,6 +576,7 @@ public final class CrawlSwitchboard {
                CrawlProfile.MATCH_NEVER_STRING, //indexUrlMustNotMatch
                CrawlProfile.MATCH_ALL_STRING,   //indexContentMustMatch
                CrawlProfile.MATCH_NEVER_STRING, //indexContentMustNotMatch
+                true, //noindexWhenCanonicalUnequalURL
                0,
                false,
                CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA_RECRAWL_CYCLE),
@ -601,6 +610,7 @@ public final class CrawlSwitchboard {
                CrawlProfile.MATCH_NEVER_STRING, //indexUrlMustNotMatch
                CrawlProfile.MATCH_ALL_STRING,   //indexContentMustMatch
                CrawlProfile.MATCH_NEVER_STRING, //indexContentMustNotMatch
+                true, //noindexWhenCanonicalUnequalURL
                0,
                false,
                CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SURROGATE_RECRAWL_CYCLE),
@ -637,6 +647,7 @@ public final class CrawlSwitchboard {
                CrawlProfile.MATCH_NEVER_STRING, //indexUrlMustNotMatch
                CrawlProfile.MATCH_ALL_STRING,   //indexContentMustMatch
                CrawlProfile.MATCH_NEVER_STRING, //indexContentMustNotMatch
+                true, //noindexWhenCanonicalUnequalURL
                0,
                false,
                null,
--- a/source/net/yacy/crawler/RecrawlBusyThread.java
+++ b/source/net/yacy/crawler/RecrawlBusyThread.java
@ -352,6 +352,7 @@ public class RecrawlBusyThread extends AbstractBusyThread {
                CrawlProfile.MATCH_NEVER_STRING, // indexUrlMustNotMatch
                CrawlProfile.MATCH_ALL_STRING, // indexContentMustMatch
                CrawlProfile.MATCH_NEVER_STRING, // indexContentMustNotMatch
+                true, //noindexWhenCanonicalUnequalURL
                0, false, CrawlProfile.getRecrawlDate(CrawlSwitchboard.CRAWL_PROFILE_RECRAWL_JOB_RECRAWL_CYCLE), -1,
                true, true, true, false, // crawlingQ, followFrames, obeyHtmlRobotsNoindex, obeyHtmlRobotsNofollow,
                true, true, true, false, -1, false, true, CrawlProfile.MATCH_NEVER_STRING, CacheStrategy.IFFRESH,
--- a/source/net/yacy/crawler/data/CrawlProfile.java
+++ b/source/net/yacy/crawler/data/CrawlProfile.java
@ -115,6 +115,7 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
        INDEXING_MEDIA_TYPE_MUSTNOTMATCH("indexMediaTypeMustNotMatch", false, CrawlAttribute.STRING, "Indexing Media Type (MIME) Must-Not-Match Filter"),
        INDEXING_SOLR_QUERY_MUSTMATCH("indexSolrQueryMustMatch",    false, CrawlAttribute.STRING,  "Indexing Solr Query Must-Match Filter"),
        INDEXING_SOLR_QUERY_MUSTNOTMATCH("indexSolrQueryMustNotMatch", false, CrawlAttribute.STRING,  "Indexing Solr Query Must-Not-Match Filter"),
+        // boolean flag (see the boolean constructor argument and the noindexWhenCanonicalUnequalURL() getter)
+        NOINDEX_WHEN_CANONICAL_UNEQUAL_URL("noindexWhenCanonicalUnequalURL", false, CrawlAttribute.BOOLEAN, "No Indexing for Documents with Canonical != URL"),
        RECRAWL_IF_OLDER             ("recrawlIfOlder",             false, CrawlAttribute.INTEGER, "Recrawl If Older"),
        STORE_HTCACHE                ("storeHTCache",               false, CrawlAttribute.BOOLEAN, "Store in HTCache"),
        CACHE_STRAGEGY               ("cacheStrategy",              false, CrawlAttribute.STRING,  "Cache Strategy (NOCACHE,IFFRESH,IFEXIST,CACHEONLY)"),
@ -223,6 +224,7 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
                 final String crawlerCountryMustMatch, final String crawlerNoDepthLimitMatch,
                 final String indexUrlMustMatch, final String indexUrlMustNotMatch,
                 final String indexContentMustMatch, final String indexContentMustNotMatch,
+                 final boolean noindexWhenCanonicalUnequalURL,
                 final int depth,
                 final boolean directDocByURL,
                 final Date recrawlIfOlder /*date*/,
@ -300,6 +302,7 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
        put(CrawlAttribute.INDEXING_MEDIA_TYPE_MUSTNOTMATCH.key, CrawlProfile.MATCH_NEVER_STRING);
        put(CrawlAttribute.INDEXING_SOLR_QUERY_MUSTMATCH.key, CrawlProfile.SOLR_MATCH_ALL_QUERY);
        put(CrawlAttribute.INDEXING_SOLR_QUERY_MUSTNOTMATCH.key, CrawlProfile.SOLR_EMPTY_QUERY);
+        put(CrawlAttribute.NOINDEX_WHEN_CANONICAL_UNEQUAL_URL.key, noindexWhenCanonicalUnequalURL);
    }

    /**
@ -851,6 +854,12 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
        return (r.equals(Boolean.TRUE.toString()));
    }

+    /**
+     * Reads the "No Indexing for Documents with Canonical != URL" option of this profile.
+     * @return true when the attribute is not set in this profile or is stored as "true", false otherwise
+     */
+    public boolean noindexWhenCanonicalUnequalURL() {
+        final String flag = get(CrawlAttribute.NOINDEX_WHEN_CANONICAL_UNEQUAL_URL.key);
+        // a missing entry counts as enabled
+        return flag == null || flag.equals(Boolean.TRUE.toString());
+    }
+
    public boolean storeHTCache() {
        final String r = get(CrawlAttribute.STORE_HTCACHE.key);
        if (r == null) return false;
@ -997,6 +1006,7 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
        prop.putXML(CRAWL_PROFILE_PREFIX + count + "_indexURLMustNotMatch", this.get(CrawlAttribute.INDEXING_URL_MUSTNOTMATCH.key));
        prop.putXML(CRAWL_PROFILE_PREFIX + count + "_indexContentMustMatch", this.get(CrawlAttribute.INDEXING_CONTENT_MUSTMATCH.key));
        prop.putXML(CRAWL_PROFILE_PREFIX + count + "_indexContentMustNotMatch", this.get(CrawlAttribute.INDEXING_CONTENT_MUSTNOTMATCH.key));
+        // use the attribute's key string explicitly, like the entries below, instead of relying on the enum's toString()
+        prop.put(CRAWL_PROFILE_PREFIX + count + "_" + CrawlAttribute.NOINDEX_WHEN_CANONICAL_UNEQUAL_URL.key, noindexWhenCanonicalUnequalURL() ? 1 : 0);
        prop.putXML(CRAWL_PROFILE_PREFIX + count + "_" + CrawlAttribute.INDEXING_MEDIA_TYPE_MUSTMATCH.key, this.get(CrawlAttribute.INDEXING_MEDIA_TYPE_MUSTMATCH.key));
        prop.putXML(CRAWL_PROFILE_PREFIX + count + "_" + CrawlAttribute.INDEXING_MEDIA_TYPE_MUSTNOTMATCH.key, this.get(CrawlAttribute.INDEXING_MEDIA_TYPE_MUSTNOTMATCH.key));
        prop.putXML(CRAWL_PROFILE_PREFIX + count + "_" + CrawlAttribute.INDEXING_SOLR_QUERY_MUSTMATCH.key, this.get(CrawlAttribute.INDEXING_SOLR_QUERY_MUSTMATCH.key));
--- a/source/net/yacy/crawler/retrieval/Response.java
+++ b/source/net/yacy/crawler/retrieval/Response.java
@ -175,10 +175,10 @@ public class Response {
        int p = mime.indexOf('/');
        if (p < 0) return new String[]{mime};
        if (doctype == DT_TEXT) return new String[]{"text" + mime.substring(p)};
-    	if (doctype == DT_IMAGE) return new String[]{"image" + mime.substring(p)};
-    	if (doctype == DT_AUDIO) return new String[]{"audio" + mime.substring(p)};
-    	if (doctype == DT_MOVIE) return new String[]{"video" + mime.substring(p)};
-    	return new String[]{mime};
+        if (doctype == DT_IMAGE) return new String[]{"image" + mime.substring(p)};
+        if (doctype == DT_AUDIO) return new String[]{"audio" + mime.substring(p)};
+        if (doctype == DT_MOVIE) return new String[]{"video" + mime.substring(p)};
+        return new String[]{mime};
    }

    public static final int QUEUE_STATE_FRESH             = 0;
@ -235,16 +235,16 @@ public class Response {
     * @return the original request that produced this response
     */
    public Request getRequest() {
-		return request;
-	}
+        return request;
+    }

    public ResponseHeader getResponseHeader() {
        return this.responseHeader;
    }
    
    public RequestHeader getRequestHeader() {
-		return this.requestHeader;
-	}
+        return this.requestHeader;
+    }

    public boolean fromCache() {
        return this.fromCache;
@ -260,11 +260,11 @@ public class Response {
        return this.request.name();
    }

-	/**
-	 * @return the requested URL that produced this response. When redirection(s)
-	 *         occurred, this is not the initial URL, but the last redirection
-	 *         target.
-	 */
+    /**
+     * @return the requested URL that produced this response. When redirection(s)
+     *         occurred, this is not the initial URL, but the last redirection
+     *         target.
+     */
    public DigestURL url() {
        return this.request.url();
    }
@ -745,11 +745,11 @@ public class Response {
        // -ranges in request
        // we checked that in shallStoreCache

-		/*
-		 * Eventually check if a parser supports the media yype. Depending on the crawl
-		 * profile, the indexingDocumentProcessor can eventually index only URL metadata
-		 * using the generic parser for unsupported media types
-		 */
+        /*
+         * Eventually check if a parser supports the media type. Depending on the crawl
+         * profile, the indexingDocumentProcessor can eventually index only URL metadata
+         * using the generic parser for unsupported media types
+         */
        if (this.responseHeader != null && !profile().isIndexNonParseableUrls()) {
            final String mimeType = this.responseHeader.getContentType();
            final String parserError = TextParser.supportsMime(mimeType);
--- a/source/net/yacy/document/Document.java
+++ b/source/net/yacy/document/Document.java
@ -91,12 +91,12 @@ public class Document {
    /** links to icons that belongs to the document (mapped by absolute URL) */
    private Map<DigestURL, IconEntry> icons;
    
-	/**
-	 * URLs of linked data item types/classes referenced by the document (for example in
-	 * HTML with standard annotations such as RDFa, microdata, microformats or
-	 * JSON-LD)
-	 */
-	private Set<DigestURL> linkedDataTypes;
+    /**
+     * URLs of linked data item types/classes referenced by the document (for example in
+     * HTML with standard annotations such as RDFa, microdata, microformats or
+     * JSON-LD)
+     */
+    private Set<DigestURL> linkedDataTypes;
    private boolean resorted;
    private final Set<String> languages;
    private boolean indexingDenied;
@ -131,13 +131,13 @@ public class Document {
        this.parserObject = parserObject;
        this.keywords = new LinkedHashSet<String>();
        if (keywords != null) {
-        	Collections.addAll(this.keywords, keywords);
+           Collections.addAll(this.keywords, keywords);
        }
        this.titles = (titles == null) ? new ArrayList<String>(1) : titles;
        this.creator = (author == null) ? new StringBuilder(0) : new StringBuilder(author);
        this.sections =  new LinkedList<String>() ;
        if (sections != null) {
-        	Collections.addAll(this.sections, sections);
+           Collections.addAll(this.sections, sections);
        }
        this.descriptions = (abstrcts == null) ? new ArrayList<String>() : abstrcts;
        if (lat >= -90.0d && lat <= 90.0d && lon >= -180.0d && lon <= 180.0d) {
@ -216,13 +216,21 @@ public class Document {
        }
        this.scraperObject = scraper;
    }
+    
+    /**
+     * @return the value of ContentScraper.getCanonical() for this document's scraper object,
+     *         or null when the scraper object is not a ContentScraper
+     */
+    public AnchorURL getCanonical() {
+        final Object scraper = this.getScraperObject();
+        return (scraper instanceof ContentScraper) ? ((ContentScraper) scraper).getCanonical() : null;
+    }

    public Set<String> getContentLanguages() {
        return this.languages;
    }

    public String getFileName() {
-    	return this.source.getFileName();
+       return this.source.getFileName();
    }

    public Map<String, Set<String>> getGenericFacets() {
@ -233,15 +241,15 @@ public class Document {
     * @return true when this document is the result of a partially parsed resource, for example due to resource content size exceeding a given limit
     */
    public boolean isPartiallyParsed() {
-		return this.partiallyParsed;
-	}
+        return this.partiallyParsed;
+    }
    
    /**
     * @param partiallyParsed set to true to indicates this document is the result of a partially parsed resource, for example due to resource content size exceeding a given limit
     */
    public void setPartiallyParsed(final boolean partiallyParsed) {
-		this.partiallyParsed = partiallyParsed;
-	}
+        this.partiallyParsed = partiallyParsed;
+    }
    
    /**
     * compute a set of languages that this document contains
@ -637,13 +645,13 @@ dc_rights
            // we add artificial hyperlinks to the hyperlink set
            // that can be calculated from given hyperlinks and imagelinks
            
-			/*
-			 * Should we also include icons ? with
-			 * this.hyperlinks.putAll(allReflinks(this.icons.keySet())); It is
-			 * problematic as allReflinks will modify icons set set, removing those whose URL is
-			 * starting with "/www" but it is not desired for icons such as
-			 * www.wikipedia.org/static/favicon/wikipedia.ico
-			 */
+            /*
+             * Should we also include icons ? with
+             * this.hyperlinks.putAll(allReflinks(this.icons.keySet())); It is
+             * problematic as allReflinks will modify the icons set, removing those whose URL is
+             * starting with "/www" but it is not desired for icons such as
+             * www.wikipedia.org/static/favicon/wikipedia.ico
+             */

            this.hyperlinks.putAll(allReflinks(this.images.values()));
            this.hyperlinks.putAll(allReflinks(this.audiolinks.keySet()));
@ -804,16 +812,16 @@ dc_rights
            }
            InputStream textStream = doc.getTextStream();
            try {
-            	FileUtils.copy(textStream, (ByteArrayOutputStream) this.text);
+               FileUtils.copy(textStream, (ByteArrayOutputStream) this.text);
            } finally {
-            	try {
-                	if(textStream != null) {
-                		/* textStream can be a FileInputStream : we must close it to ensure releasing system resource */
-                		textStream.close();
-                	}
-            	} catch(IOException e) {
-            		ConcurrentLog.warn("DOCUMENT", "Could not close text input stream");
-            	}
+               try {
+                   if(textStream != null) {
+                       /* textStream can be a FileInputStream : we must close it to ensure releasing system resource */
+                       textStream.close();
+                   }
+               } catch(IOException e) {
+                   ConcurrentLog.warn("DOCUMENT", "Could not close text input stream");
+               }
            }

            this.anchors.addAll(doc.getAnchors());
@ -826,41 +834,41 @@ dc_rights
     * @return links to icons that belongs to the document (mapped by absolute URL)
     */
    public Map<DigestURL, IconEntry> getIcons() {
-		return icons;
-	}
+        return icons;
+    }
    
    /**
     * Set links to icons that belongs to the document (mapped by absolute URL)
     * @param icons
     */
    public void setIcons(final Map<DigestURL, IconEntry> icons) {
-    	/* Better to ensure now icons property will not be null */
-    	if(icons != null) {
-    		this.icons = icons;	
-    	} else {
-    		this.icons = new HashMap<>();
-    	}
-	}
+       /* Better to ensure now icons property will not be null */
+       if(icons != null) {
+           this.icons = icons;    
+       } else {
+           this.icons = new HashMap<>();
+       }
+    }
    
-	/**
-	 * @return URLs of linked data item types/classes referenced by the document (for example in
-	 * HTML with standard annotations such as RDFa, microdata, microformats or
-	 * JSON-LD)
-	 */
+    /**
+     * @return URLs of linked data item types/classes referenced by the document (for example in
+     * HTML with standard annotations such as RDFa, microdata, microformats or
+     * JSON-LD)
+     */
    public Set<DigestURL> getLinkedDataTypes() {
-		return this.linkedDataTypes;
-	}
+        return this.linkedDataTypes;
+    }
    
-	/**
-	 * @return URLs of linked data item types/classes referenced by the document
-	 */
+    /**
+     * @param linkedDataTypes URLs of linked data item types/classes referenced by the document;
+     *            when null, the current set is cleared instead of being replaced
+     */
    public void setLinkedDataTypes(final Set<DigestURL> linkedDataTypes) {
-    	if(linkedDataTypes != null) {
-    		/* Ensure non null property */
-    		this.linkedDataTypes = linkedDataTypes;
-    	} else {
-    		this.linkedDataTypes.clear();
-    	}
+       if(linkedDataTypes != null) {
+           /* Ensure non null property */
+           this.linkedDataTypes = linkedDataTypes;
+       } else {
+           this.linkedDataTypes.clear();
+       }
    }
    

@ -1034,14 +1042,14 @@ dc_rights
                } catch (final IOException e) {
                    ConcurrentLog.logException(e);
                } finally {
-                	try {
-                    	if(textStream != null) {
-                    		/* textStream can be a FileInputStream : we must close it to ensure releasing system resource */
-                    		textStream.close();
-                    	}
-					} catch (IOException e) {
-						ConcurrentLog.warn("DOCUMENT", "Could not close text input stream");
-					}
+                   try {
+                       if(textStream != null) {
+                           /* textStream can be a FileInputStream : we must close it to ensure releasing system resource */
+                           textStream.close();
+                       }
+                    } catch (IOException e) {
+                        ConcurrentLog.warn("DOCUMENT", "Could not close text input stream");
+                    }
                }
            }
            anchors.addAll(doc.getAnchors());
@ -1098,7 +1106,7 @@ dc_rights
    public final static String IFRAME_MARKER = "iframe";
    public final static String FRAME_MARKER = "frame";
    public final static String EMBED_MARKER = "embed";
-    
+
    public static Map<AnchorURL, String> getHyperlinks(final Document[] documents, boolean includeNofollow) {
        final Map<AnchorURL, String> result = new HashMap<>();
        for (final Document d: documents) {
--- a/source/net/yacy/htroot/CrawlStartExpert.java
+++ b/source/net/yacy/htroot/CrawlStartExpert.java
@ -369,6 +369,13 @@ public class CrawlStartExpert {
            }
        }

+        // Check Canonical?
+        if (post == null) {
+            prop.put("noindexWhenCanonicalUnequalURLChecked", 1);
+        } else {
+            prop.put("noindexWhenCanonicalUnequalURLChecked",
+                    post.getBoolean("noindexWhenCanonicalUnequalURL") ? 1 : 0);
+        }

        // ---------- Clean-Up before Crawl Start
        // delete if older settings: number value
--- a/source/net/yacy/htroot/Crawler_p.java
+++ b/source/net/yacy/htroot/Crawler_p.java
@ -316,6 +316,7 @@ public class Crawler_p {
                final String indexUrlMustNotMatch = post.get("indexmustnotmatch", CrawlProfile.MATCH_NEVER_STRING);
                final String indexContentMustMatch = post.get("indexcontentmustmatch", CrawlProfile.MATCH_ALL_STRING);
                final String indexContentMustNotMatch = post.get("indexcontentmustnotmatch", CrawlProfile.MATCH_NEVER_STRING);
+                final boolean noindexWhenCanonicalUnequalURL = "on".equals(post.get("noindexWhenCanonicalUnequalURL", "off"));

                final boolean crawlOrder = post.get("crawlOrder", "off").equals("on");
                env.setConfig("crawlOrder", crawlOrder);
@ -614,6 +615,7 @@ public class Crawler_p {
                            indexUrlMustNotMatch,
                            indexContentMustMatch,
                            indexContentMustNotMatch,
+                            noindexWhenCanonicalUnequalURL,
                            newcrawlingdepth,
                            directDocByURL,
                            crawlingIfOlder,
--- a/source/net/yacy/htroot/QuickCrawlLink_p.java
+++ b/source/net/yacy/htroot/QuickCrawlLink_p.java
@ -150,6 +150,7 @@ public class QuickCrawlLink_p {
                        CrawlProfile.MATCH_NEVER_STRING, //indexUrlMustNotMatch
                        CrawlProfile.MATCH_ALL_STRING,   //indexContentMustMatch
                        CrawlProfile.MATCH_NEVER_STRING, //indexContentMustNotMatch
+                        false, // noindexWhenCanonicalUnequalURL
                        CrawlingDepth,
                        true,
                        CrawlProfile.getRecrawlDate(60 * 24 * 30), // recrawlIfOlder (minutes); here: one month
--- a/source/net/yacy/search/Switchboard.java
+++ b/source/net/yacy/search/Switchboard.java
@ -3152,28 +3152,73 @@ public final class Switchboard extends serverSwitch {
                return new IndexingQueueEntry(in.queueEntry, in.documents, null);
            }
        }
-        if (!(profile.indexUrlMustMatchPattern() == CrawlProfile.MATCH_ALL_PATTERN || profile.indexUrlMustMatchPattern().matcher(urls).matches()) ||
-                (profile.indexUrlMustNotMatchPattern() != CrawlProfile.MATCH_NEVER_PATTERN && profile.indexUrlMustNotMatchPattern().matcher(urls).matches())) {
-            if (this.log.isInfo()) this.log.info("Not Condensed Resource '" + urls + "': indexing prevented by regular expression on url; indexUrlMustMatchPattern = " + profile.indexUrlMustMatchPattern().pattern() + ", indexUrlMustNotMatchPattern = " + profile.indexUrlMustNotMatchPattern().pattern());
+
+        // check mustmatch pattern: unless the profile uses MATCH_ALL_PATTERN, the url must match it; otherwise the entry is logged, pushed to errorURL and returned without a condenser
+        Pattern mustmatchurl = profile.indexUrlMustMatchPattern();
+        if (mustmatchurl != CrawlProfile.MATCH_ALL_PATTERN && !mustmatchurl.matcher(urls).matches()) {
+            String info = "Not Condensed Resource '" + urls + "': indexing prevented by regular expression on url; indexUrlMustMatchPattern = " + mustmatchurl.pattern();
+            if (this.log.isInfo()) this.log.info(info);
            // create a new errorURL DB entry
-            this.crawlQueues.errorURL.push(in.queueEntry.url(), in.queueEntry.depth(), profile, FailCategory.FINAL_PROCESS_CONTEXT, "indexing prevented by regular expression on url; indexUrlMustMatchPattern = " + profile.indexUrlMustMatchPattern().pattern() + ", indexUrlMustNotMatchPattern = " + profile.indexUrlMustNotMatchPattern().pattern(), -1);
+            this.crawlQueues.errorURL.push(in.queueEntry.url(), in.queueEntry.depth(), profile, FailCategory.FINAL_PROCESS_CONTEXT, info, -1);
+            return new IndexingQueueEntry(in.queueEntry, in.documents, null);
+        }
+
+        // check mustnotmatch pattern: unless the profile uses MATCH_NEVER_PATTERN, a url matching it is rejected (logged and pushed to errorURL)
+        Pattern mustnotmatchurl = profile.indexUrlMustNotMatchPattern();
+        if (mustnotmatchurl != CrawlProfile.MATCH_NEVER_PATTERN && mustnotmatchurl.matcher(urls).matches()) {
+            String info = "Not Condensed Resource '" + urls + "': indexing prevented by regular expression on url; indexUrlMustNotMatchPattern = " + mustnotmatchurl.pattern();
+            if (this.log.isInfo()) this.log.info(info);
+            // create a new errorURL DB entry
+            this.crawlQueues.errorURL.push(in.queueEntry.url(), in.queueEntry.depth(), profile, FailCategory.FINAL_PROCESS_CONTEXT, info, -1);
            return new IndexingQueueEntry(in.queueEntry, in.documents, null);
        }

        // check which files may take part in the indexing process
        final List<Document> doclist = new ArrayList<>();
        docloop: for (final Document document : in.documents) {
+
+            // canonical filter: if enabled in the profile, skip a document whose canonical url (normal form) differs from its own source url
+            if (profile.noindexWhenCanonicalUnequalURL()) {
+                final AnchorURL canonicalUrl = document.getCanonical();
+                final DigestURL sourceUrl = document.dc_source();
+                if (canonicalUrl != null && sourceUrl != null) {
+                    final String canonicalForm = canonicalUrl.toNormalform(true);
+                    final String sourceForm = sourceUrl.toNormalform(true);
+                    if (!sourceForm.equals(canonicalForm)) {
+                        final String info = "Not Condensed Resource '" + urls + "': denied, canonical != source; canonical = " + canonicalForm + "; source = " + sourceForm;
+                        if (this.log.isInfo()) this.log.info(info);
+                        // record the rejection in the errorURL DB, then move on to the next document
+                        this.crawlQueues.errorURL.push(in.queueEntry.url(), in.queueEntry.depth(), profile, FailCategory.FINAL_PROCESS_CONTEXT, info, -1);
+                        continue docloop;
+                    }
+                }
+            }
+
+            // check indexing denied flags
            if (document.indexingDenied() && profile.obeyHtmlRobotsNoindex() && !this.isIntranetMode()) {
                if (this.log.isInfo()) this.log.info("Not Condensed Resource '" + urls + "': denied by document-attached noindexing rule");
                // create a new errorURL DB entry
                this.crawlQueues.errorURL.push(in.queueEntry.url(), in.queueEntry.depth(), profile, FailCategory.FINAL_PROCESS_CONTEXT, "denied by document-attached noindexing rule", -1);
                continue docloop;
            }
-            if (!(profile.indexContentMustMatchPattern() == CrawlProfile.MATCH_ALL_PATTERN || profile.indexContentMustMatchPattern().matcher(document.getTextString()).matches()) ||
-                    (profile.indexContentMustNotMatchPattern() != CrawlProfile.MATCH_NEVER_PATTERN && profile.indexContentMustNotMatchPattern().matcher(document.getTextString()).matches())) {
-                if (this.log.isInfo()) this.log.info("Not Condensed Resource '" + urls + "': indexing prevented by regular expression on content; indexContentMustMatchPattern = " + profile.indexContentMustMatchPattern().pattern() + ", indexContentMustNotMatchPattern = " + profile.indexContentMustNotMatchPattern().pattern());
+
+            // check content pattern must-match: unless the profile uses MATCH_ALL_PATTERN, the document text must match it; otherwise the document is logged, pushed to errorURL and skipped
+            Pattern mustmatchcontent = profile.indexContentMustMatchPattern();
+            if (mustmatchcontent != CrawlProfile.MATCH_ALL_PATTERN && !mustmatchcontent.matcher(document.getTextString()).matches()) {
+                String info = "Not Condensed Resource '" + urls + "': indexing prevented by regular expression on content; indexContentMustMatchPattern = " + mustmatchcontent.pattern() ;
+                if (this.log.isInfo()) this.log.info(info);
                // create a new errorURL DB entry
-                this.crawlQueues.errorURL.push(in.queueEntry.url(), in.queueEntry.depth(), profile, FailCategory.FINAL_PROCESS_CONTEXT, "indexing prevented by regular expression on content; indexContentMustMatchPattern = " + profile.indexContentMustMatchPattern().pattern() + ", indexContentMustNotMatchPattern = " + profile.indexContentMustNotMatchPattern().pattern(), -1);
+                this.crawlQueues.errorURL.push(in.queueEntry.url(), in.queueEntry.depth(), profile, FailCategory.FINAL_PROCESS_CONTEXT, info, -1);
+                continue docloop;
+            }
+
+            // check content pattern must-not-match: a document whose text matches a pattern other than MATCH_NEVER_PATTERN is skipped
+            final Pattern contentDenyPattern = profile.indexContentMustNotMatchPattern();
+            final boolean contentDenied = contentDenyPattern != CrawlProfile.MATCH_NEVER_PATTERN && contentDenyPattern.matcher(document.getTextString()).matches();
+            if (contentDenied) {
+                final String info = "Not Condensed Resource '" + urls + "': indexing prevented by regular expression on content; indexContentMustNotMatchPattern = " + contentDenyPattern.pattern();
+                if (this.log.isInfo()) this.log.info(info);
+                this.crawlQueues.errorURL.push(in.queueEntry.url(), in.queueEntry.depth(), profile, FailCategory.FINAL_PROCESS_CONTEXT, info, -1); // new errorURL DB entry
                continue docloop;
            }