mirror of
https://github.com/yacy/yacy_search_server.git
synced 2024-09-19 00:01:41 +02:00
fix for processing of noindex flag in http header
This commit is contained in:
parent
b0d941626f
commit
fb3dd56b02
|
@ -108,6 +108,7 @@ public class ResponseHeader extends HeaderFramework {
|
|||
if (x_robots_tag.isEmpty()) {
|
||||
x_robots_tag = this.get(HeaderFramework.X_ROBOTS, "");
|
||||
}
|
||||
return x_robots_tag;
|
||||
return x_robots_tag.toLowerCase();
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -90,7 +90,7 @@ public class Document {
|
|||
private MultiProtocolURL favicon;
|
||||
private boolean resorted;
|
||||
private final Set<String> languages;
|
||||
private final boolean indexingDenied;
|
||||
private boolean indexingDenied;
|
||||
private final double lon, lat;
|
||||
private final Object parserObject; // the source object that was used to create the Document
|
||||
private final Map<String, Set<String>> generic_facets; // a map from vocabulary names to the set of tags for that vocabulary which apply for this document
|
||||
|
@ -733,6 +733,10 @@ dc_rights
|
|||
return this.indexingDenied;
|
||||
}
|
||||
|
||||
/**
 * Sets the indexing-denied flag of this document.
 *
 * @param indexingDenied the new value stored in the {@code indexingDenied} field
 */
public void setIndexingDenied(boolean indexingDenied) {
|
||||
this.indexingDenied = indexingDenied;
|
||||
}
|
||||
|
||||
/**
 * Stores the given depth in this document's {@code crawldepth} field.
 *
 * @param depth the value to assign to {@code crawldepth}
 */
public void setDepth(int depth) {
|
||||
this.crawldepth = depth;
|
||||
}
|
||||
|
@ -819,6 +823,7 @@ dc_rights
|
|||
final LinkedHashMap<AnchorURL, ImageEntry> images = new LinkedHashMap<AnchorURL, ImageEntry>();
|
||||
final Set<String> languages = new HashSet<String>();
|
||||
double lon = 0.0d, lat = 0.0d;
|
||||
boolean indexingDenied = false;
|
||||
Date date = new Date();
|
||||
String charset = null;
|
||||
|
||||
|
@ -867,6 +872,8 @@ dc_rights
|
|||
|
||||
if (doc.getDepth() < mindepth) mindepth = doc.getDepth();
|
||||
if (doc.dc_language() != null) languages.add(doc.dc_language());
|
||||
|
||||
indexingDenied |= doc.indexingDenied;
|
||||
}
|
||||
|
||||
// clean up parser data
|
||||
|
@ -898,7 +905,7 @@ dc_rights
|
|||
anchors,
|
||||
rss,
|
||||
images,
|
||||
false,
|
||||
indexingDenied,
|
||||
date);
|
||||
newDoc.setDepth(mindepth);
|
||||
return newDoc;
|
||||
|
|
|
@ -355,7 +355,14 @@ public final class LoaderDispatcher {
|
|||
if (response.getContent() == null || response.getResponseHeader() == null) throw new IOException("no Content available for url " + url);
|
||||
|
||||
// parse resource
|
||||
return response.parse();
|
||||
Document[] documents = response.parse();
|
||||
|
||||
String x_robots_tag = response.getResponseHeader().getXRobotsTag();
|
||||
if (x_robots_tag.indexOf("noindex",0) >= 0) {
|
||||
for (Document d: documents) d.setIndexingDenied(true);
|
||||
}
|
||||
|
||||
return documents;
|
||||
}
|
||||
|
||||
public Document loadDocument(final DigestURL location, final CacheStrategy cachePolicy, BlacklistType blacklistType, final ClientIdentification.Agent agent) throws IOException {
|
||||
|
@ -371,7 +378,12 @@ public final class LoaderDispatcher {
|
|||
// parse resource
|
||||
try {
|
||||
Document[] documents = response.parse();
|
||||
return Document.mergeDocuments(location, response.getMimeType(), documents);
|
||||
Document merged = Document.mergeDocuments(location, response.getMimeType(), documents);
|
||||
|
||||
String x_robots_tag = response.getResponseHeader().getXRobotsTag();
|
||||
if (x_robots_tag.indexOf("noindex",0) >= 0) merged.setIndexingDenied(true);
|
||||
|
||||
return merged;
|
||||
} catch(final Parser.Failure e) {
|
||||
throw new IOException(e.getMessage());
|
||||
}
|
||||
|
|
|
@ -570,15 +570,8 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
|
|||
if (robots_meta.indexOf("noindex",0) >= 0) b += 8; // set bit 3
|
||||
if (robots_meta.indexOf("nofollow",0) >= 0) b += 16; // set bit 4
|
||||
}
|
||||
String x_robots_tag = "";
|
||||
if (responseHeader != null) {
|
||||
x_robots_tag = responseHeader.get(HeaderFramework.X_ROBOTS_TAG, "");
|
||||
if (x_robots_tag.isEmpty()) {
|
||||
x_robots_tag = responseHeader.get(HeaderFramework.X_ROBOTS, "");
|
||||
}
|
||||
}
|
||||
String x_robots_tag = responseHeader.getXRobotsTag();
|
||||
if (!x_robots_tag.isEmpty()) {
|
||||
x_robots_tag = x_robots_tag.toLowerCase();
|
||||
// this tag may have values: all, noindex, nofollow, noarchive, nosnippet, noodp, notranslate, noimageindex, unavailable_after, none; see https://developers.google.com/webmasters/control-crawl-index/docs/robots_meta_tag?hl=de
|
||||
if (x_robots_tag.indexOf("all",0) >= 0) b += 1<<8; // set bit 8
|
||||
if (x_robots_tag.indexOf("noindex",0) >= 0||x_robots_tag.indexOf("none",0) >= 0) b += 1<<9; // set bit 9
|
||||
|
|
Loading…
Reference in New Issue
Block a user