handle noarchive robots directive: skip writing the page to the cache when the X-Robots-Tag header contains "noarchive"

http://mantis.tokeek.de/view.php?id=44
2024-09-19 00:01:41 +02:00 · 2014-10-01 04:35:34 +02:00 · 2014-10-01 04:35:34 +02:00 · fb1fcc2b03
commit fb1fcc2b03
parent fe917deb2d
3 changed files with 22 additions and 13 deletions
--- a/source/net/yacy/crawler/data/Cache.java
+++ b/source/net/yacy/crawler/data/Cache.java
@ -201,6 +201,7 @@ public final class Cache {

    public static void store(final DigestURL url, final ResponseHeader responseHeader, final byte[] file) throws IOException {
        if (maxCacheSize == 0) return;
+        if (responseHeader.getXRobotsTag().contains("noarchive")) return; // don't cache, see http://noarchive.net/
        if (responseHeader == null) throw new IOException("Cache.store of url " + url.toNormalform(false) + " not possible: responseHeader == null");
        if (file == null) throw new IOException("Cache.store of url " + url.toNormalform(false) + " not possible: file == null");
        log.info("storing content of url " + url.toNormalform(false) + ", " + file.length + " bytes");
--- a/source/net/yacy/search/schema/CollectionConfiguration.java
+++ b/source/net/yacy/search/schema/CollectionConfiguration.java
@ -557,6 +557,7 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
            // bit  2: "follow" contained in html header meta
            // bit  3: "noindex" contained in html header meta
            // bit  4: "nofollow" contained in html header meta
+            // bit  5: "noarchive" contained in html header meta
            // bit  8: "all" contained in http header X-Robots-Tag
            // bit  9: "noindex" contained in http header X-Robots-Tag
            // bit 10: "nofollow" contained in http header X-Robots-Tag
@ -576,6 +577,7 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
                if (robots_meta.indexOf("follow",0) == 0 || robots_meta.indexOf(" follow",0) >= 0 || robots_meta.indexOf(",follow",0) >= 0 ) b += 4; // set bit 2
                if (robots_meta.indexOf("noindex",0) >= 0) b += 8;  // set bit 3
                if (robots_meta.indexOf("nofollow",0) >= 0) b += 16; // set bit 4
+                if (robots_meta.indexOf("noarchive",0) >= 0) b += 32; // set bit 5
            }
            String x_robots_tag = responseHeader == null ? "" : responseHeader.getXRobotsTag();
            if (!x_robots_tag.isEmpty()) {
@ -1494,10 +1496,10 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
                    try {
                        String doccountquery = 
                                CollectionSchema.host_id_s.getSolrFieldName() + ":\"" + hostid + "\" AND " +
-                                "-" + CollectionSchema.robots_i.getSolrFieldName() + ":8 AND " + // bit 3
-                                "-" + CollectionSchema.robots_i.getSolrFieldName() + ":24 AND " + // bit 3 + 4
-                                "-" + CollectionSchema.robots_i.getSolrFieldName() + ":512 AND " + // bit 9
-                                "-" + CollectionSchema.robots_i.getSolrFieldName() + ":1536 AND " + // bit 9 + 10
+                                "-" + CollectionSchema.robots_i.getSolrFieldName() + ":8 AND " + // bit 3 (noindex)
+                                "-" + CollectionSchema.robots_i.getSolrFieldName() + ":24 AND " + // bit 3 + 4 (noindex + nofollow)
+                                "-" + CollectionSchema.robots_i.getSolrFieldName() + ":512 AND " + // bit 9 (noindex)
+                                "-" + CollectionSchema.robots_i.getSolrFieldName() + ":1536 AND " + // bit 9 + 10 (noindex + nofollow)
                                "((-" + CollectionSchema.canonical_equal_sku_b.getSolrFieldName() + ":" + AbstractSolrConnector.CATCHALL_TERM + ") OR (" + CollectionSchema.canonical_equal_sku_b.getSolrFieldName() + ":true)) AND " +
                                CollectionSchema.httpstatus_i.getSolrFieldName() + ":200 AND " +
                                "-" + CollectionSchema.id.getSolrFieldName() + ":\"" + urlhash + "\" AND " +
--- a/source/net/yacy/search/schema/CollectionSchema.java
+++ b/source/net/yacy/search/schema/CollectionSchema.java
@ -111,15 +111,21 @@ public enum CollectionSchema implements SchemaDeclaration {
    scripts_sxt(SolrType.string, true, true, true, false, false, "normalized urls within a scripts tag"),
    scriptscount_i(SolrType.num_integer, true, true, false, false, false, "number of entries in scripts_sxt"),
    // encoded as binary value into an integer:
-    // bit  0: "all" contained in html header meta
-    // bit  1: "index" contained in html header meta
-    // bit  2: "noindex" contained in html header meta
-    // bit  3: "nofollow" contained in html header meta
-    // bit  8: "noarchive" contained in http header properties
-    // bit  9: "nosnippet" contained in http header properties
-    // bit 10: "noindex" contained in http header properties
-    // bit 11: "nofollow" contained in http header properties
-    // bit 12: "unavailable_after" contained in http header properties
+    // bit  0: "all" contained in html header meta
+    // bit  1: "index" contained in html header meta
+    // bit  2: "follow" contained in html header meta
+    // bit  3: "noindex" contained in html header meta
+    // bit  4: "nofollow" contained in html header meta
+    // bit  5: "noarchive" contained in html header meta
+    // bit  8: "all" contained in http header X-Robots-Tag
+    // bit  9: "noindex" contained in http header X-Robots-Tag
+    // bit 10: "nofollow" contained in http header X-Robots-Tag
+    // bit 11: "noarchive" contained in http header X-Robots-Tag
+    // bit 12: "nosnippet" contained in http header X-Robots-Tag
+    // bit 13: "noodp" contained in http header X-Robots-Tag
+    // bit 14: "notranslate" contained in http header X-Robots-Tag
+    // bit 15: "noimageindex" contained in http header X-Robots-Tag
+    // bit 16: "unavailable_after" contained in http header X-Robots-Tag
    robots_i(SolrType.num_integer, true, true, false, false, false, "content of <meta name=\"robots\" content=#content#> tag and the \"X-Robots-Tag\" HTTP property"),
    metagenerator_t(SolrType.text_general, true, true, false, false, false, "content of <meta name=\"generator\" content=#content#> tag"),
    inboundlinks_protocol_sxt(SolrType.string, true, true, true, false, false, "internal links, only the protocol"),