handle noarchive tag, skip writing page to cache

http://mantis.tokeek.de/view.php?id=44
This commit is contained in:
reger 2014-10-01 04:35:34 +02:00
parent fe917deb2d
commit fb1fcc2b03
3 changed files with 22 additions and 13 deletions

View File

@ -201,6 +201,7 @@ public final class Cache {
public static void store(final DigestURL url, final ResponseHeader responseHeader, final byte[] file) throws IOException {
// a maximum cache size of 0 means nothing is stored at all
if (maxCacheSize == 0) return;
// the null guard must run before the first dereference of responseHeader; otherwise a missing
// header surfaces as a NullPointerException instead of the IOException declared for this case
if (responseHeader == null) throw new IOException("Cache.store of url " + url.toNormalform(false) + " not possible: responseHeader == null");
// honour the "noarchive" directive of the X-Robots-Tag http header: silently skip writing the page to the cache
if (responseHeader.getXRobotsTag().contains("noarchive")) return; // don't cache, see http://noarchive.net/
if (file == null) throw new IOException("Cache.store of url " + url.toNormalform(false) + " not possible: file == null");
log.info("storing content of url " + url.toNormalform(false) + ", " + file.length + " bytes");

View File

@ -557,6 +557,7 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
// bit 2: "follow" contained in html header meta
// bit 3: "noindex" contained in html header meta
// bit 4: "nofollow" contained in html header meta
// bit 5: "noarchive" contained in html header meta
// bit 8: "all" contained in http header X-Robots-Tag
// bit 9: "noindex" contained in http header X-Robots-Tag
// bit 10: "nofollow" contained in http header X-Robots-Tag
@ -576,6 +577,7 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
if (robots_meta.indexOf("follow",0) == 0 || robots_meta.indexOf(" follow",0) >= 0 || robots_meta.indexOf(",follow",0) >= 0 ) b += 4; // set bit 2
if (robots_meta.indexOf("noindex",0) >= 0) b += 8; // set bit 3
if (robots_meta.indexOf("nofollow",0) >= 0) b += 16; // set bit 4
if (robots_meta.indexOf("noarchive",0) >= 0) b += 32; // set bit 5
}
String x_robots_tag = responseHeader == null ? "" : responseHeader.getXRobotsTag();
if (!x_robots_tag.isEmpty()) {
@ -1494,10 +1496,10 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
try {
String doccountquery =
CollectionSchema.host_id_s.getSolrFieldName() + ":\"" + hostid + "\" AND " +
"-" + CollectionSchema.robots_i.getSolrFieldName() + ":8 AND " + // bit 3
"-" + CollectionSchema.robots_i.getSolrFieldName() + ":24 AND " + // bit 3 + 4
"-" + CollectionSchema.robots_i.getSolrFieldName() + ":512 AND " + // bit 9
"-" + CollectionSchema.robots_i.getSolrFieldName() + ":1536 AND " + // bit 9 + 10
"-" + CollectionSchema.robots_i.getSolrFieldName() + ":8 AND " + // bit 3 (noindex)
"-" + CollectionSchema.robots_i.getSolrFieldName() + ":24 AND " + // bit 3 + 4 (noindex + nofollow)
"-" + CollectionSchema.robots_i.getSolrFieldName() + ":512 AND " + // bit 9 (noindex)
"-" + CollectionSchema.robots_i.getSolrFieldName() + ":1536 AND " + // bit 9 + 10 (noindex + nofollow)
"((-" + CollectionSchema.canonical_equal_sku_b.getSolrFieldName() + ":" + AbstractSolrConnector.CATCHALL_TERM + ") OR (" + CollectionSchema.canonical_equal_sku_b.getSolrFieldName() + ":true)) AND " +
CollectionSchema.httpstatus_i.getSolrFieldName() + ":200 AND " +
"-" + CollectionSchema.id.getSolrFieldName() + ":\"" + urlhash + "\" AND " +

View File

@ -111,15 +111,21 @@ public enum CollectionSchema implements SchemaDeclaration {
scripts_sxt(SolrType.string, true, true, true, false, false, "normalized urls within a scripts tag"),
scriptscount_i(SolrType.num_integer, true, true, false, false, false, "number of entries in scripts_sxt"),
// encoded as binary value into an integer:
// bit 0: "all" contained in html header meta
// bit 1: "index" contained in html header meta
// bit 2: "noindex" contained in html header meta
// bit 3: "nofollow" contained in html header meta
// bit 8: "noarchive" contained in http header properties
// bit 9: "nosnippet" contained in http header properties
// bit 10: "noindex" contained in http header properties
// bit 11: "nofollow" contained in http header properties
// bit 12: "unavailable_after" contained in http header properties
// bit 0: "all" contained in html header meta
// bit 1: "index" contained in html header meta
// bit 2: "follow" contained in html header meta
// bit 3: "noindex" contained in html header meta
// bit 4: "nofollow" contained in html header meta
// bit 5: "noarchive" contained in html header meta
// bit 8: "all" contained in http header X-Robots-Tag
// bit 9: "noindex" contained in http header X-Robots-Tag
// bit 10: "nofollow" contained in http header X-Robots-Tag
// bit 11: "noarchive" contained in http header X-Robots-Tag
// bit 12: "nosnippet" contained in http header X-Robots-Tag
// bit 13: "noodp" contained in http header X-Robots-Tag
// bit 14: "notranslate" contained in http header X-Robots-Tag
// bit 15: "noimageindex" contained in http header X-Robots-Tag
// bit 16: "unavailable_after" contained in http header X-Robots-Tag
robots_i(SolrType.num_integer, true, true, false, false, false, "content of <meta name=\"robots\" content=#content#> tag and the \"X-Robots-Tag\" HTTP property"),
metagenerator_t(SolrType.text_general, true, true, false, false, false, "content of <meta name=\"generator\" content=#content#> tag"),
inboundlinks_protocol_sxt(SolrType.string, true, true, true, false, false, "internal links, only the protocol"),