added new schema fields:

hreflang_url_sxt and hreflang_cc_sxt
for
http://support.google.com/webmasters/bin/answer.py?hl=de&answer=189077

navigation_url_sxt and navigation_type_sxt
for
http://googlewebmastercentral.blogspot.de/2011/09/pagination-with-relnext-and-relprev.html

publisher_url_s
for http://support.google.com/plus/answer/1713826?hl=de

all fields are disabled by default and not written to the index.
This commit is contained in:
Michael Peter Christen 2013-04-18 17:21:17 +02:00
parent 566d6c980c
commit 50421171c3
4 changed files with 79 additions and 1 deletions

View File

@ -307,6 +307,21 @@ underline_txt
## number of attr_iframes, int
#iframesscount_i
## url of the hreflang link tag, see http://support.google.com/webmasters/bin/answer.py?hl=de&answer=189077
#hreflang_url_sxt
## country code of the hreflang link tag, see http://support.google.com/webmasters/bin/answer.py?hl=de&answer=189077
#hreflang_cc_sxt
## page navigation url, see http://googlewebmastercentral.blogspot.de/2011/09/pagination-with-relnext-and-relprev.html
#navigation_url_sxt
## page navigation rel property value, can contain one of {top,up,next,prev,first,last}
#navigation_type_sxt
## publisher url as defined in http://support.google.com/plus/answer/1713826?hl=de
#publisher_url_s
## the protocol of the url
url_protocol_s

View File

@ -128,6 +128,7 @@ public class ContentScraper extends AbstractScraper implements Scraper {
private final Map<DigestURI, EmbedEntry> embeds; // urlhash/embed relation
private final Map<DigestURI, ImageEntry> images; // urlhash/image relation
private final Map<String, String> metas;
private final Map<String, DigestURI> hreflang, navigation;
private LinkedHashSet<String> titles;
//private String headline;
private List<String>[] headlines;
@ -136,7 +137,7 @@ public class ContentScraper extends AbstractScraper implements Scraper {
private final CharBuffer content;
private final EventListenerList htmlFilterEventListeners;
private double lon, lat;
private DigestURI canonical;
private DigestURI canonical, publisher;
private final int maxLinks;
private int breadcrumbs;
@ -173,6 +174,8 @@ public class ContentScraper extends AbstractScraper implements Scraper {
this.frames = new SizeLimitedSet<DigestURI>(maxLinks);
this.iframes = new SizeLimitedSet<DigestURI>(maxLinks);
this.metas = new SizeLimitedMap<String, String>(maxLinks);
this.hreflang = new SizeLimitedMap<String, DigestURI>(maxLinks);
this.navigation = new SizeLimitedMap<String, DigestURI>(maxLinks);
this.script = new SizeLimitedSet<DigestURI>(maxLinks);
this.titles = new LinkedHashSet<String>();
this.headlines = new ArrayList[6];
@ -187,6 +190,7 @@ public class ContentScraper extends AbstractScraper implements Scraper {
this.lat = 0.0d;
this.evaluationScores.match(Element.url, root.toNormalform(true));
this.canonical = null;
this.publisher = null;
this.breadcrumbs = 0;
}
@ -398,6 +402,7 @@ public class ContentScraper extends AbstractScraper implements Scraper {
final String rel = tagopts.getProperty("rel", EMPTY_STRING);
final String linktitle = tagopts.getProperty("title", EMPTY_STRING);
final String type = tagopts.getProperty("type", EMPTY_STRING);
final String hreflang = tagopts.getProperty("hreflang", EMPTY_STRING);
if (rel.equalsIgnoreCase("shortcut icon")) {
final ImageEntry ie = new ImageEntry(newLink, linktitle, -1, -1, -1);
@ -407,8 +412,14 @@ public class ContentScraper extends AbstractScraper implements Scraper {
tagopts.put("name", this.titles.size() == 0 ? "" : this.titles.iterator().next());
mergeAnchors(newLink, tagopts);
this.canonical = newLink;
} else if (rel.equalsIgnoreCase("publisher")) {
this.publisher = newLink;
} else if (rel.equalsIgnoreCase("top") || rel.equalsIgnoreCase("up") || rel.equalsIgnoreCase("next") || rel.equalsIgnoreCase("prev") || rel.equalsIgnoreCase("first") || rel.equalsIgnoreCase("last")) {
this.navigation.put(rel, newLink);
} else if (rel.equalsIgnoreCase("alternate") && type.equalsIgnoreCase("application/rss+xml")) {
this.rss.put(newLink, linktitle);
} else if (rel.equalsIgnoreCase("alternate") && hreflang.length() > 0) {
this.hreflang.put(hreflang, newLink);
} else if (rel.equalsIgnoreCase("stylesheet") && type.equalsIgnoreCase("text/css")) {
this.css.put(newLink, rel);
this.evaluationScores.match(Element.csspath, href);
@ -708,6 +719,18 @@ public class ContentScraper extends AbstractScraper implements Scraper {
return this.canonical;
}
/**
 * Get the publisher link of the scraped document.
 * The value is taken from an element whose rel attribute equals "publisher"
 * (compared case-insensitively); if several such elements were scraped, the
 * last one wins because the field is simply overwritten.
 * @return the publisher url, or null if no such element has been scraped
 */
public DigestURI getPublisherLink() {
return this.publisher;
}
/**
 * Get the hreflang alternates of the scraped document.
 * An entry is recorded for an element with rel="alternate" and a non-empty
 * hreflang attribute, provided it was not already handled as an rss alternate.
 * The scraper's own map instance is returned, not a copy.
 * @return a map of <hreflang attribute value, target url>
 */
public Map<String, DigestURI> getHreflang() {
return this.hreflang;
}
/**
 * Get the page navigation links of the scraped document.
 * An entry is recorded for an element whose rel attribute matches one of
 * top, up, next, prev, first, last (compared case-insensitively). The key is
 * the rel value exactly as it was written in the document, so its case is not
 * normalised. The scraper's own map instance is returned, not a copy.
 * @return a map of <rel attribute value, target url>
 */
public Map<String, DigestURI> getNavigation() {
return this.navigation;
}
/**
* get all images
* @return a map of <urlhash, ImageEntry>

View File

@ -713,6 +713,40 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
// response time
add(doc, CollectionSchema.responsetime_i, responseHeader == null ? 0 : Integer.parseInt(responseHeader.get(HeaderFramework.RESPONSE_TIME_MILLIS, "0")));
// hreflang link tag, see http://support.google.com/webmasters/bin/answer.py?hl=de&answer=189077
if (allAttr || (contains(CollectionSchema.hreflang_url_sxt) && contains(CollectionSchema.hreflang_cc_sxt))) {
final String[] ccs = new String[html.getHreflang().size()];
final String[] urls = new String[html.getHreflang().size()];
c = 0;
for (Map.Entry<String, DigestURI> e: html.getHreflang().entrySet()) {
ccs[c] = e.getKey();
urls[c] = e.getValue().toNormalform(true);
c++;
}
add(doc, CollectionSchema.hreflang_cc_sxt, ccs);
add(doc, CollectionSchema.hreflang_url_sxt, urls);
}
// page navigation url, see http://googlewebmastercentral.blogspot.de/2011/09/pagination-with-relnext-and-relprev.html
if (allAttr || (contains(CollectionSchema.navigation_url_sxt) && contains(CollectionSchema.navigation_type_sxt))) {
final String[] navs = new String[html.getNavigation().size()];
final String[] urls = new String[html.getNavigation().size()];
c = 0;
for (Map.Entry<String, DigestURI> e: html.getNavigation().entrySet()) {
navs[c] = e.getKey();
urls[c] = e.getValue().toNormalform(true);
c++;
}
add(doc, CollectionSchema.navigation_type_sxt, navs);
add(doc, CollectionSchema.navigation_url_sxt, urls);
}
// publisher url as defined in http://support.google.com/plus/answer/1713826?hl=de
// The field-selection test is parenthesised so that the null check applies in every case:
// "&&" binds tighter than "||", so without the parentheses allAttr == true bypassed the
// null check and a document without a publisher link caused a NullPointerException below.
if ((allAttr || contains(CollectionSchema.publisher_url_s)) && html.getPublisherLink() != null) {
    add(doc, CollectionSchema.publisher_url_s, html.getPublisherLink().toNormalform(true));
}
}
// statistics about the links

View File

@ -142,6 +142,12 @@ public enum CollectionSchema implements SchemaDeclaration {
iframes_sxt(SolrType.string, true, true, true, false, false, "list of all links to iframes"),
iframesscount_i(SolrType.num_integer, true, true, false, false, false, "number of iframes_txt"),
hreflang_url_sxt(SolrType.string, true, true, true, false, false, "url of the hreflang link tag, see http://support.google.com/webmasters/bin/answer.py?hl=de&answer=189077"),
hreflang_cc_sxt(SolrType.string, true, true, true, false, false, "country code of the hreflang link tag, see http://support.google.com/webmasters/bin/answer.py?hl=de&answer=189077"),
navigation_url_sxt(SolrType.string, true, true, true, false, false, "page navigation url, see http://googlewebmastercentral.blogspot.de/2011/09/pagination-with-relnext-and-relprev.html"),
navigation_type_sxt(SolrType.string, true, true, true, false, false, "page navigation rel property value, can contain one of {top,up,next,prev,first,last}"),
publisher_url_s(SolrType.string, true, true, false, false, false, "publisher url as defined in http://support.google.com/plus/answer/1713826?hl=de"),
url_protocol_s(SolrType.string, true, true, false, false, false, "the protocol of the url"),
url_paths_sxt(SolrType.string, true, true, true, false, true, "all path elements in the url"),
url_file_ext_s(SolrType.string, true, true, false, false, false, "the file name extension"),