mirror of
https://github.com/yacy/yacy_search_server.git
synced 2024-09-19 00:01:41 +02:00
Set the rel attribute of all anchor links to "nofollow" if the HTML meta tags
contain robots:nofollow or if the HTTP response header contains "X-Robots-Tag: nofollow".
This commit is contained in:
parent
57e00baf26
commit
31920385f7
|
@ -390,7 +390,7 @@ public class ContentScraper extends AbstractScraper implements Scraper {
|
|||
|
||||
if (newLink != null) {
|
||||
tagopts.put("href", newLink.toNormalform(true));
|
||||
final String rel = tagopts.getProperty("rel", EMPTY_STRING);
|
||||
String rel = tagopts.getProperty("rel", EMPTY_STRING);
|
||||
final String linktitle = tagopts.getProperty("title", EMPTY_STRING);
|
||||
final String type = tagopts.getProperty("type", EMPTY_STRING);
|
||||
final String hreflang = tagopts.getProperty("hreflang", EMPTY_STRING);
|
||||
|
@ -475,6 +475,11 @@ public class ContentScraper extends AbstractScraper implements Scraper {
|
|||
final ImageEntry ie = new ImageEntry(url, recursiveParse(url, text), -1, -1, -1);
|
||||
this.images.add(ie);
|
||||
} else {
|
||||
if (followDenied()) {
|
||||
String rel = tagopts.getProperty("rel", EMPTY_STRING);
|
||||
if (rel.length() == 0) rel = "nofollow"; else if (rel.indexOf("nofollow") < 0) rel += ",nofollow";
|
||||
tagopts.put("rel", rel);
|
||||
}
|
||||
tagopts.put("text", new String(text));
|
||||
tagopts.put("href", url.toNormalform(true)); // we must assign this because the url may have resolved backpaths and may not be absolute
|
||||
url.setAll(tagopts);
|
||||
|
@ -765,6 +770,13 @@ public class ContentScraper extends AbstractScraper implements Scraper {
|
|||
if (s.indexOf("noindex",0) >= 0) return true;
|
||||
return false;
|
||||
}
|
||||
|
||||
public boolean followDenied() {
|
||||
final String s = this.metas.get("robots");
|
||||
if (s == null) return false;
|
||||
if (s.indexOf("nofollow",0) >= 0) return true;
|
||||
return false;
|
||||
}
|
||||
|
||||
public List<String> getDescriptions() {
|
||||
String s = this.metas.get("description");
|
||||
|
|
|
@ -120,14 +120,19 @@ public class WebgraphConfiguration extends SchemaConfiguration implements Serial
|
|||
final IndexCell<CitationReference> citations) {
|
||||
boolean allAttr = this.isEmpty();
|
||||
int target_order = 0;
|
||||
boolean generalNofollow = responseHeader.get("X-Robots-Tag", "").indexOf("nofollow") >= 0;
|
||||
for (final AnchorURL target_url: links) {
|
||||
|
||||
Set<ProcessType> processTypes = new LinkedHashSet<ProcessType>();
|
||||
|
||||
final String name = target_url.getNameProperty(); // the name attribute
|
||||
final String text = target_url.getTextProperty(); // the text between the <a></a> tag
|
||||
final String rel = target_url.getRelProperty(); // the rel-attribute
|
||||
String rel = target_url.getRelProperty(); // the rel-attribute
|
||||
int ioidx = inbound ? 0 : 1;
|
||||
if (generalNofollow) {
|
||||
// patch the rel attribute since the header makes nofollow valid for all links
|
||||
if (rel.length() == 0) rel = "nofollow"; else if (rel.indexOf("nofollow") < 0) rel += ",nofollow";
|
||||
}
|
||||
|
||||
// index organization
|
||||
StringBuilder idi = new StringBuilder(8);
|
||||
|
|
Loading…
Reference in New Issue
Block a user