mirror of
https://github.com/yacy/yacy_search_server.git
synced 2024-09-19 00:01:41 +02:00
handle noarchive tag, skip writing page to cache
http://mantis.tokeek.de/view.php?id=44
This commit is contained in:
parent
fe917deb2d
commit
fb1fcc2b03
|
@ -201,6 +201,7 @@ public final class Cache {
|
|||
|
||||
public static void store(final DigestURL url, final ResponseHeader responseHeader, final byte[] file) throws IOException {
|
||||
if (maxCacheSize == 0) return;
|
||||
if (responseHeader.getXRobotsTag().contains("noarchive")) return; // don't cache, see http://noarchive.net/
|
||||
if (responseHeader == null) throw new IOException("Cache.store of url " + url.toNormalform(false) + " not possible: responseHeader == null");
|
||||
if (file == null) throw new IOException("Cache.store of url " + url.toNormalform(false) + " not possible: file == null");
|
||||
log.info("storing content of url " + url.toNormalform(false) + ", " + file.length + " bytes");
|
||||
|
|
|
@ -557,6 +557,7 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
|
|||
// bit 2: "follow" contained in html header meta
|
||||
// bit 3: "noindex" contained in html header meta
|
||||
// bit 4: "nofollow" contained in html header meta
|
||||
// bit 5: "noarchive" contained in html header meta
|
||||
// bit 8: "all" contained in http header X-Robots-Tag
|
||||
// bit 9: "noindex" contained in http header X-Robots-Tag
|
||||
// bit 10: "nofollow" contained in http header X-Robots-Tag
|
||||
|
@ -576,6 +577,7 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
|
|||
if (robots_meta.indexOf("follow",0) == 0 || robots_meta.indexOf(" follow",0) >= 0 || robots_meta.indexOf(",follow",0) >= 0 ) b += 4; // set bit 2
|
||||
if (robots_meta.indexOf("noindex",0) >= 0) b += 8; // set bit 3
|
||||
if (robots_meta.indexOf("nofollow",0) >= 0) b += 16; // set bit 4
|
||||
if (robots_meta.indexOf("noarchive",0) >= 0) b += 32; // set bit 5
|
||||
}
|
||||
String x_robots_tag = responseHeader == null ? "" : responseHeader.getXRobotsTag();
|
||||
if (!x_robots_tag.isEmpty()) {
|
||||
|
@ -1494,10 +1496,10 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
|
|||
try {
|
||||
String doccountquery =
|
||||
CollectionSchema.host_id_s.getSolrFieldName() + ":\"" + hostid + "\" AND " +
|
||||
"-" + CollectionSchema.robots_i.getSolrFieldName() + ":8 AND " + // bit 3
|
||||
"-" + CollectionSchema.robots_i.getSolrFieldName() + ":24 AND " + // bit 3 + 4
|
||||
"-" + CollectionSchema.robots_i.getSolrFieldName() + ":512 AND " + // bit 9
|
||||
"-" + CollectionSchema.robots_i.getSolrFieldName() + ":1536 AND " + // bit 9 + 10
|
||||
"-" + CollectionSchema.robots_i.getSolrFieldName() + ":8 AND " + // bit 3 (noindex)
|
||||
"-" + CollectionSchema.robots_i.getSolrFieldName() + ":24 AND " + // bit 3 + 4 (noindex + nofollow)
|
||||
"-" + CollectionSchema.robots_i.getSolrFieldName() + ":512 AND " + // bit 9 (noindex)
|
||||
"-" + CollectionSchema.robots_i.getSolrFieldName() + ":1536 AND " + // bit 9 + 10 (noindex + nofollow)
|
||||
"((-" + CollectionSchema.canonical_equal_sku_b.getSolrFieldName() + ":" + AbstractSolrConnector.CATCHALL_TERM + ") OR (" + CollectionSchema.canonical_equal_sku_b.getSolrFieldName() + ":true)) AND " +
|
||||
CollectionSchema.httpstatus_i.getSolrFieldName() + ":200 AND " +
|
||||
"-" + CollectionSchema.id.getSolrFieldName() + ":\"" + urlhash + "\" AND " +
|
||||
|
|
|
@ -111,15 +111,21 @@ public enum CollectionSchema implements SchemaDeclaration {
|
|||
scripts_sxt(SolrType.string, true, true, true, false, false, "normalized urls within a scripts tag"),
|
||||
scriptscount_i(SolrType.num_integer, true, true, false, false, false, "number of entries in scripts_sxt"),
|
||||
// encoded as binary value into an integer:
|
||||
// bit 0: "all" contained in html header meta
|
||||
// bit 1: "index" contained in html header meta
|
||||
// bit 2: "noindex" contained in html header meta
|
||||
// bit 3: "nofollow" contained in html header meta
|
||||
// bit 8: "noarchive" contained in http header properties
|
||||
// bit 9: "nosnippet" contained in http header properties
|
||||
// bit 10: "noindex" contained in http header properties
|
||||
// bit 11: "nofollow" contained in http header properties
|
||||
// bit 12: "unavailable_after" contained in http header properties
|
||||
// bit 0: "all" contained in html header meta
|
||||
// bit 1: "index" contained in html header meta
|
||||
// bit 2: "follow" contained in html header meta
|
||||
// bit 3: "noindex" contained in html header meta
|
||||
// bit 4: "nofollow" contained in html header meta
|
||||
// bit 5: "noarchive" contained in html header meta
|
||||
// bit 8: "all" contained in http header X-Robots-Tag
|
||||
// bit 9: "noindex" contained in http header X-Robots-Tag
|
||||
// bit 10: "nofollow" contained in http header X-Robots-Tag
|
||||
// bit 11: "noarchive" contained in http header X-Robots-Tag
|
||||
// bit 12: "nosnippet" contained in http header X-Robots-Tag
|
||||
// bit 13: "noodp" contained in http header X-Robots-Tag
|
||||
// bit 14: "notranslate" contained in http header X-Robots-Tag
|
||||
// bit 15: "noimageindex" contained in http header X-Robots-Tag
|
||||
// bit 16: "unavailable_after" contained in http header X-Robots-Tag
|
||||
robots_i(SolrType.num_integer, true, true, false, false, false, "content of <meta name=\"robots\" content=#content#> tag and the \"X-Robots-Tag\" HTTP property"),
|
||||
metagenerator_t(SolrType.text_general, true, true, false, false, false, "content of <meta name=\"generator\" content=#content#> tag"),
|
||||
inboundlinks_protocol_sxt(SolrType.string, true, true, true, false, false, "internal links, only the protocol"),
|
||||
|
|
Loading…
Reference in New Issue
Block a user