Fix processing of the noindex flag in the X-Robots-Tag HTTP response header

This commit is contained in:
Michael Peter Christen 2014-07-10 17:13:35 +02:00
parent b0d941626f
commit fb3dd56b02
4 changed files with 26 additions and 13 deletions

View File

@ -108,6 +108,7 @@ public class ResponseHeader extends HeaderFramework {
if (x_robots_tag.isEmpty()) { if (x_robots_tag.isEmpty()) {
x_robots_tag = this.get(HeaderFramework.X_ROBOTS, ""); x_robots_tag = this.get(HeaderFramework.X_ROBOTS, "");
} }
return x_robots_tag; return x_robots_tag.toLowerCase();
} }
} }

View File

@ -90,7 +90,7 @@ public class Document {
private MultiProtocolURL favicon; private MultiProtocolURL favicon;
private boolean resorted; private boolean resorted;
private final Set<String> languages; private final Set<String> languages;
private final boolean indexingDenied; private boolean indexingDenied;
private final double lon, lat; private final double lon, lat;
private final Object parserObject; // the source object that was used to create the Document private final Object parserObject; // the source object that was used to create the Document
private final Map<String, Set<String>> generic_facets; // a map from vocabulary names to the set of tags for that vocabulary which apply for this document private final Map<String, Set<String>> generic_facets; // a map from vocabulary names to the set of tags for that vocabulary which apply for this document
@ -733,6 +733,10 @@ dc_rights
return this.indexingDenied; return this.indexingDenied;
} }
public void setIndexingDenied(boolean indexingDenied) {
this.indexingDenied = indexingDenied;
}
public void setDepth(int depth) { public void setDepth(int depth) {
this.crawldepth = depth; this.crawldepth = depth;
} }
@ -819,6 +823,7 @@ dc_rights
final LinkedHashMap<AnchorURL, ImageEntry> images = new LinkedHashMap<AnchorURL, ImageEntry>(); final LinkedHashMap<AnchorURL, ImageEntry> images = new LinkedHashMap<AnchorURL, ImageEntry>();
final Set<String> languages = new HashSet<String>(); final Set<String> languages = new HashSet<String>();
double lon = 0.0d, lat = 0.0d; double lon = 0.0d, lat = 0.0d;
boolean indexingDenied = false;
Date date = new Date(); Date date = new Date();
String charset = null; String charset = null;
@ -867,6 +872,8 @@ dc_rights
if (doc.getDepth() < mindepth) mindepth = doc.getDepth(); if (doc.getDepth() < mindepth) mindepth = doc.getDepth();
if (doc.dc_language() != null) languages.add(doc.dc_language()); if (doc.dc_language() != null) languages.add(doc.dc_language());
indexingDenied |= doc.indexingDenied;
} }
// clean up parser data // clean up parser data
@ -898,7 +905,7 @@ dc_rights
anchors, anchors,
rss, rss,
images, images,
false, indexingDenied,
date); date);
newDoc.setDepth(mindepth); newDoc.setDepth(mindepth);
return newDoc; return newDoc;

View File

@ -355,7 +355,14 @@ public final class LoaderDispatcher {
if (response.getContent() == null || response.getResponseHeader() == null) throw new IOException("no Content available for url " + url); if (response.getContent() == null || response.getResponseHeader() == null) throw new IOException("no Content available for url " + url);
// parse resource // parse resource
return response.parse(); Document[] documents = response.parse();
String x_robots_tag = response.getResponseHeader().getXRobotsTag();
if (x_robots_tag.indexOf("noindex",0) >= 0) {
for (Document d: documents) d.setIndexingDenied(true);
}
return documents;
} }
public Document loadDocument(final DigestURL location, final CacheStrategy cachePolicy, BlacklistType blacklistType, final ClientIdentification.Agent agent) throws IOException { public Document loadDocument(final DigestURL location, final CacheStrategy cachePolicy, BlacklistType blacklistType, final ClientIdentification.Agent agent) throws IOException {
@ -371,7 +378,12 @@ public final class LoaderDispatcher {
// parse resource // parse resource
try { try {
Document[] documents = response.parse(); Document[] documents = response.parse();
return Document.mergeDocuments(location, response.getMimeType(), documents); Document merged = Document.mergeDocuments(location, response.getMimeType(), documents);
String x_robots_tag = response.getResponseHeader().getXRobotsTag();
if (x_robots_tag.indexOf("noindex",0) >= 0) merged.setIndexingDenied(true);
return merged;
} catch(final Parser.Failure e) { } catch(final Parser.Failure e) {
throw new IOException(e.getMessage()); throw new IOException(e.getMessage());
} }

View File

@ -570,15 +570,8 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
if (robots_meta.indexOf("noindex",0) >= 0) b += 8; // set bit 3 if (robots_meta.indexOf("noindex",0) >= 0) b += 8; // set bit 3
if (robots_meta.indexOf("nofollow",0) >= 0) b += 16; // set bit 4 if (robots_meta.indexOf("nofollow",0) >= 0) b += 16; // set bit 4
} }
String x_robots_tag = ""; String x_robots_tag = responseHeader.getXRobotsTag();
if (responseHeader != null) {
x_robots_tag = responseHeader.get(HeaderFramework.X_ROBOTS_TAG, "");
if (x_robots_tag.isEmpty()) {
x_robots_tag = responseHeader.get(HeaderFramework.X_ROBOTS, "");
}
}
if (!x_robots_tag.isEmpty()) { if (!x_robots_tag.isEmpty()) {
x_robots_tag = x_robots_tag.toLowerCase();
// this tag may have values: all, noindex, nofollow, noarchive, nosnippet, noodp, notranslate, noimageindex, unavailable_after, none; see https://developers.google.com/webmasters/control-crawl-index/docs/robots_meta_tag?hl=de // this tag may have values: all, noindex, nofollow, noarchive, nosnippet, noodp, notranslate, noimageindex, unavailable_after, none; see https://developers.google.com/webmasters/control-crawl-index/docs/robots_meta_tag?hl=de
if (x_robots_tag.indexOf("all",0) >= 0) b += 1<<8; // set bit 8 if (x_robots_tag.indexOf("all",0) >= 0) b += 1<<8; // set bit 8
if (x_robots_tag.indexOf("noindex",0) >= 0||x_robots_tag.indexOf("none",0) >= 0) b += 1<<9; // set bit 9 if (x_robots_tag.indexOf("noindex",0) >= 0||x_robots_tag.indexOf("none",0) >= 0) b += 1<<9; // set bit 9