diff --git a/defaults/solr.collection.schema b/defaults/solr.collection.schema index b32ae0aa0..674a42be3 100644 --- a/defaults/solr.collection.schema +++ b/defaults/solr.collection.schema @@ -307,6 +307,21 @@ underline_txt ## number of attr_iframes, int #iframesscount_i +## url of the hreflang link tag, see http://support.google.com/webmasters/bin/answer.py?hl=de&answer=189077 +#hreflang_url_sxt + +## country code of the hreflang link tag, see http://support.google.com/webmasters/bin/answer.py?hl=de&answer=189077 +#hreflang_cc_sxt + +## page navigation url, see http://googlewebmastercentral.blogspot.de/2011/09/pagination-with-relnext-and-relprev.html +#navigation_url_sxt + +## page navigation rel property value, can contain one of {top,up,next,prev,first,last} +#navigation_type_sxt + +## publisher url as defined in http://support.google.com/plus/answer/1713826?hl=de +#publisher_url_s + ## the protocol of the url url_protocol_s diff --git a/source/net/yacy/document/parser/html/ContentScraper.java b/source/net/yacy/document/parser/html/ContentScraper.java index 55fbe8a75..854e5666d 100644 --- a/source/net/yacy/document/parser/html/ContentScraper.java +++ b/source/net/yacy/document/parser/html/ContentScraper.java @@ -128,6 +128,7 @@ public class ContentScraper extends AbstractScraper implements Scraper { private final Map embeds; // urlhash/embed relation private final Map images; // urlhash/image relation private final Map metas; + private final Map hreflang, navigation; private LinkedHashSet titles; //private String headline; private List[] headlines; @@ -136,7 +137,7 @@ public class ContentScraper extends AbstractScraper implements Scraper { private final CharBuffer content; private final EventListenerList htmlFilterEventListeners; private double lon, lat; - private DigestURI canonical; + private DigestURI canonical, publisher; private final int maxLinks; private int breadcrumbs; @@ -173,6 +174,8 @@ public class ContentScraper extends AbstractScraper implements 
Scraper { this.frames = new SizeLimitedSet(maxLinks); this.iframes = new SizeLimitedSet(maxLinks); this.metas = new SizeLimitedMap(maxLinks); + this.hreflang = new SizeLimitedMap(maxLinks); + this.navigation = new SizeLimitedMap(maxLinks); this.script = new SizeLimitedSet(maxLinks); this.titles = new LinkedHashSet(); this.headlines = new ArrayList[6]; @@ -187,6 +190,7 @@ public class ContentScraper extends AbstractScraper implements Scraper { this.lat = 0.0d; this.evaluationScores.match(Element.url, root.toNormalform(true)); this.canonical = null; + this.publisher = null; this.breadcrumbs = 0; } @@ -398,6 +402,7 @@ public class ContentScraper extends AbstractScraper implements Scraper { final String rel = tagopts.getProperty("rel", EMPTY_STRING); final String linktitle = tagopts.getProperty("title", EMPTY_STRING); final String type = tagopts.getProperty("type", EMPTY_STRING); + final String hreflang = tagopts.getProperty("hreflang", EMPTY_STRING); if (rel.equalsIgnoreCase("shortcut icon")) { final ImageEntry ie = new ImageEntry(newLink, linktitle, -1, -1, -1); @@ -407,8 +412,14 @@ public class ContentScraper extends AbstractScraper implements Scraper { tagopts.put("name", this.titles.size() == 0 ? 
"" : this.titles.iterator().next()); mergeAnchors(newLink, tagopts); this.canonical = newLink; + } else if (rel.equalsIgnoreCase("publisher")) { + this.publisher = newLink; + } else if (rel.equalsIgnoreCase("top") || rel.equalsIgnoreCase("up") || rel.equalsIgnoreCase("next") || rel.equalsIgnoreCase("prev") || rel.equalsIgnoreCase("first") || rel.equalsIgnoreCase("last")) { + this.navigation.put(rel.toLowerCase(java.util.Locale.ROOT), newLink); } else if (rel.equalsIgnoreCase("alternate") && type.equalsIgnoreCase("application/rss+xml")) { this.rss.put(newLink, linktitle); + } else if (rel.equalsIgnoreCase("alternate") && hreflang.length() > 0) { + this.hreflang.put(hreflang, newLink); } else if (rel.equalsIgnoreCase("stylesheet") && type.equalsIgnoreCase("text/css")) { this.css.put(newLink, rel); this.evaluationScores.match(Element.csspath, href); @@ -708,6 +719,21 @@ public class ContentScraper extends AbstractScraper implements Scraper { return this.canonical; } + /** @return the url of the rel="publisher" link, or null if the document declares none */ + public DigestURI getPublisherLink() { + return this.publisher; + } + + /** @return the hreflang attribute values of rel="alternate" links, mapped to the linked urls */ + public Map getHreflang() { + return this.hreflang; + } + + /** @return the navigation rel values (top, up, next, prev, first, last; lower case), mapped to the linked urls */ + public Map getNavigation() { + return this.navigation; + } + /** * get all images * @return a map of diff --git a/source/net/yacy/search/schema/CollectionConfiguration.java b/source/net/yacy/search/schema/CollectionConfiguration.java index 84a2db16e..30c83a084 100644 --- a/source/net/yacy/search/schema/CollectionConfiguration.java +++ b/source/net/yacy/search/schema/CollectionConfiguration.java @@ -713,6 +713,40 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri // response time add(doc, CollectionSchema.responsetime_i, responseHeader == null ? 
0 : Integer.parseInt(responseHeader.get(HeaderFramework.RESPONSE_TIME_MILLIS, "0"))); + + // hreflang link tag, see http://support.google.com/webmasters/bin/answer.py?hl=de&answer=189077 + if (allAttr || (contains(CollectionSchema.hreflang_url_sxt) && contains(CollectionSchema.hreflang_cc_sxt))) { + final String[] ccs = new String[html.getHreflang().size()]; + final String[] urls = new String[html.getHreflang().size()]; + c = 0; + for (Map.Entry e: html.getHreflang().entrySet()) { + ccs[c] = e.getKey(); + urls[c] = e.getValue().toNormalform(true); + c++; + } + add(doc, CollectionSchema.hreflang_cc_sxt, ccs); + add(doc, CollectionSchema.hreflang_url_sxt, urls); + } + + // page navigation url, see http://googlewebmastercentral.blogspot.de/2011/09/pagination-with-relnext-and-relprev.html + if (allAttr || (contains(CollectionSchema.navigation_url_sxt) && contains(CollectionSchema.navigation_type_sxt))) { + final String[] navs = new String[html.getNavigation().size()]; + final String[] urls = new String[html.getNavigation().size()]; + c = 0; + for (Map.Entry e: html.getNavigation().entrySet()) { + navs[c] = e.getKey(); + urls[c] = e.getValue().toNormalform(true); + c++; + } + add(doc, CollectionSchema.navigation_type_sxt, navs); + add(doc, CollectionSchema.navigation_url_sxt, urls); + + } + + // publisher url as defined in http://support.google.com/plus/answer/1713826?hl=de; written only if the document declares a publisher link, getPublisherLink() returns null otherwise + if ((allAttr || contains(CollectionSchema.publisher_url_s)) && html.getPublisherLink() != null) { + add(doc, CollectionSchema.publisher_url_s, html.getPublisherLink().toNormalform(true)); + } } // statistics about the links diff --git a/source/net/yacy/search/schema/CollectionSchema.java b/source/net/yacy/search/schema/CollectionSchema.java index eaae9c535..af34e59ba 100644 --- a/source/net/yacy/search/schema/CollectionSchema.java +++ b/source/net/yacy/search/schema/CollectionSchema.java @@ -142,6 +142,12 @@ public enum CollectionSchema implements SchemaDeclaration { iframes_sxt(SolrType.string, 
true, true, true, false, false, "list of all links to iframes"), iframesscount_i(SolrType.num_integer, true, true, false, false, false, "number of iframes_txt"), + hreflang_url_sxt(SolrType.string, true, true, true, false, false, "url of the hreflang link tag, see http://support.google.com/webmasters/bin/answer.py?hl=de&answer=189077"), + hreflang_cc_sxt(SolrType.string, true, true, true, false, false, "country code of the hreflang link tag, see http://support.google.com/webmasters/bin/answer.py?hl=de&answer=189077"), + navigation_url_sxt(SolrType.string, true, true, true, false, false, "page navigation url, see http://googlewebmastercentral.blogspot.de/2011/09/pagination-with-relnext-and-relprev.html"), + navigation_type_sxt(SolrType.string, true, true, true, false, false, "page navigation rel property value, can contain one of {top,up,next,prev,first,last}"), + publisher_url_s(SolrType.string, true, true, false, false, false, "publisher url as defined in http://support.google.com/plus/answer/1713826?hl=de"), + url_protocol_s(SolrType.string, true, true, false, false, false, "the protocol of the url"), url_paths_sxt(SolrType.string, true, true, true, false, true, "all path elements in the url"), url_file_ext_s(SolrType.string, true, true, false, false, false, "the file name extension"),