Set the anchor rel attribute of all links to "nofollow" if the HTML meta

tags contain robots:nofollow or if the HTTP header contains
"X-Robots-Tag: nofollow"
This commit is contained in:
Michael Peter Christen 2013-09-16 16:14:56 +02:00
parent 57e00baf26
commit 31920385f7
2 changed files with 19 additions and 2 deletions

View File

@ -390,7 +390,7 @@ public class ContentScraper extends AbstractScraper implements Scraper {
if (newLink != null) {
tagopts.put("href", newLink.toNormalform(true));
final String rel = tagopts.getProperty("rel", EMPTY_STRING);
String rel = tagopts.getProperty("rel", EMPTY_STRING);
final String linktitle = tagopts.getProperty("title", EMPTY_STRING);
final String type = tagopts.getProperty("type", EMPTY_STRING);
final String hreflang = tagopts.getProperty("hreflang", EMPTY_STRING);
@ -475,6 +475,11 @@ public class ContentScraper extends AbstractScraper implements Scraper {
final ImageEntry ie = new ImageEntry(url, recursiveParse(url, text), -1, -1, -1);
this.images.add(ie);
} else {
if (followDenied()) {
String rel = tagopts.getProperty("rel", EMPTY_STRING);
if (rel.length() == 0) rel = "nofollow"; else if (rel.indexOf("nofollow") < 0) rel += ",nofollow";
tagopts.put("rel", rel);
}
tagopts.put("text", new String(text));
tagopts.put("href", url.toNormalform(true)); // we must assign this because the url may have resolved backpaths and may not be absolute
url.setAll(tagopts);
@ -765,6 +770,13 @@ public class ContentScraper extends AbstractScraper implements Scraper {
if (s.indexOf("noindex",0) >= 0) return true;
return false;
}
/**
 * Tells whether the scraped document forbids following its links.
 * Looks up the "robots" entry of the collected meta values and checks it
 * for the substring "nofollow" (case-sensitive match).
 *
 * @return true if a "robots" meta value exists and contains "nofollow";
 *         false if there is no such value or it lacks that substring
 */
public boolean followDenied() {
final String robots = this.metas.get("robots");
return robots != null && robots.contains("nofollow");
}
public List<String> getDescriptions() {
String s = this.metas.get("description");

View File

@ -120,14 +120,19 @@ public class WebgraphConfiguration extends SchemaConfiguration implements Serial
final IndexCell<CitationReference> citations) {
boolean allAttr = this.isEmpty();
int target_order = 0;
boolean generalNofollow = responseHeader.get("X-Robots-Tag", "").indexOf("nofollow") >= 0;
for (final AnchorURL target_url: links) {
Set<ProcessType> processTypes = new LinkedHashSet<ProcessType>();
final String name = target_url.getNameProperty(); // the name attribute
final String text = target_url.getTextProperty(); // the text between the <a></a> tag
final String rel = target_url.getRelProperty(); // the rel-attribute
String rel = target_url.getRelProperty(); // the rel-attribute
int ioidx = inbound ? 0 : 1;
if (generalNofollow) {
// patch the rel attribute since the header makes nofollow valid for all links
if (rel.length() == 0) rel = "nofollow"; else if (rel.indexOf("nofollow") < 0) rel += ",nofollow";
}
// index organization
StringBuilder idi = new StringBuilder(8);