added new schema fields:

hreflang_url_sxt and hreflang_cc_sxt for http://support.google.com/webmasters/bin/answer.py?hl=de&answer=189077; navigation_url_sxt and navigation_type_sxt for http://googlewebmastercentral.blogspot.de/2011/09/pagination-with-relnext-and-relprev.html; publisher_url_s for http://support.google.com/plus/answer/1713826?hl=de. All fields are disabled by default and not written to the index.
2024-09-19 00:01:41 +02:00 · 2013-04-18 17:21:17 +02:00 · 2013-04-18 17:21:17 +02:00 · 50421171c3
commit 50421171c3
parent 566d6c980c
4 changed files with 79 additions and 1 deletions
--- a/defaults/solr.collection.schema
+++ b/defaults/solr.collection.schema
@ -307,6 +307,21 @@ underline_txt
 ## number of attr_iframes, int
 #iframesscount_i

+## url of the hreflang link tag, see http://support.google.com/webmasters/bin/answer.py?hl=de&answer=189077
+#hreflang_url_sxt
+
+## country code of the hreflang link tag, see http://support.google.com/webmasters/bin/answer.py?hl=de&answer=189077
+#hreflang_cc_sxt
+
+## page navigation url, see http://googlewebmastercentral.blogspot.de/2011/09/pagination-with-relnext-and-relprev.html
+#navigation_url_sxt
+
+## page navigation rel property value, can contain one of {top,up,next,prev,first,last}
+#navigation_type_sxt
+
+## publisher url as defined in http://support.google.com/plus/answer/1713826?hl=de
+#publisher_url_s
+
 ## the protocol of the url
 url_protocol_s

--- a/source/net/yacy/document/parser/html/ContentScraper.java
+++ b/source/net/yacy/document/parser/html/ContentScraper.java
@ -128,6 +128,7 @@ public class ContentScraper extends AbstractScraper implements Scraper {
    private final Map<DigestURI, EmbedEntry> embeds; // urlhash/embed relation
    private final Map<DigestURI, ImageEntry> images; // urlhash/image relation
    private final Map<String, String> metas;
+    private final Map<String, DigestURI> hreflang, navigation;
    private LinkedHashSet<String> titles;
    //private String headline;
    private List<String>[] headlines;
@ -136,7 +137,7 @@ public class ContentScraper extends AbstractScraper implements Scraper {
    private final CharBuffer content;
    private final EventListenerList htmlFilterEventListeners;
    private double lon, lat;
-    private DigestURI canonical;
+    private DigestURI canonical, publisher;
    private final int maxLinks;
    private int breadcrumbs;

@ -173,6 +174,8 @@ public class ContentScraper extends AbstractScraper implements Scraper {
        this.frames = new SizeLimitedSet<DigestURI>(maxLinks);
        this.iframes = new SizeLimitedSet<DigestURI>(maxLinks);
        this.metas = new SizeLimitedMap<String, String>(maxLinks);
+        this.hreflang = new SizeLimitedMap<String, DigestURI>(maxLinks);
+        this.navigation = new SizeLimitedMap<String, DigestURI>(maxLinks);
        this.script = new SizeLimitedSet<DigestURI>(maxLinks);
        this.titles = new LinkedHashSet<String>();
        this.headlines = new ArrayList[6];
@ -187,6 +190,7 @@ public class ContentScraper extends AbstractScraper implements Scraper {
        this.lat = 0.0d;
        this.evaluationScores.match(Element.url, root.toNormalform(true));
        this.canonical = null;
+        this.publisher = null;
        this.breadcrumbs = 0;
    }

@ -398,6 +402,7 @@ public class ContentScraper extends AbstractScraper implements Scraper {
                final String rel = tagopts.getProperty("rel", EMPTY_STRING);
                final String linktitle = tagopts.getProperty("title", EMPTY_STRING);
                final String type = tagopts.getProperty("type", EMPTY_STRING);
+                final String hreflang = tagopts.getProperty("hreflang", EMPTY_STRING);

                if (rel.equalsIgnoreCase("shortcut icon")) {
                    final ImageEntry ie = new ImageEntry(newLink, linktitle, -1, -1, -1);
@ -407,8 +412,14 @@ public class ContentScraper extends AbstractScraper implements Scraper {
                    tagopts.put("name", this.titles.size() == 0 ? "" : this.titles.iterator().next());
                    mergeAnchors(newLink, tagopts);
                    this.canonical = newLink;
+                } else if (rel.equalsIgnoreCase("publisher")) {
+                    this.publisher = newLink;
+                } else if (rel.equalsIgnoreCase("top") || rel.equalsIgnoreCase("up") || rel.equalsIgnoreCase("next") || rel.equalsIgnoreCase("prev") || rel.equalsIgnoreCase("first") || rel.equalsIgnoreCase("last")) {
+                    this.navigation.put(rel, newLink);
                } else if (rel.equalsIgnoreCase("alternate") && type.equalsIgnoreCase("application/rss+xml")) {
                    this.rss.put(newLink, linktitle);
+                } else if (rel.equalsIgnoreCase("alternate") && hreflang.length() > 0) {
+                    this.hreflang.put(hreflang, newLink);
                } else if (rel.equalsIgnoreCase("stylesheet") && type.equalsIgnoreCase("text/css")) {
                    this.css.put(newLink, rel);
                    this.evaluationScores.match(Element.csspath, href);
@ -708,6 +719,18 @@ public class ContentScraper extends AbstractScraper implements Scraper {
        return this.canonical;
    }

+    public DigestURI getPublisherLink() { // link target of the most recently scraped rel="publisher" tag
+        return this.publisher; // stays null (constructor default) when no such tag was seen - callers must null-check
+    }
+    
+    public Map<String, DigestURI> getHreflang() { // hreflang attribute value -> link target, filled from rel="alternate" tags with a non-empty hreflang
+        return this.hreflang; // the scraper's own map, not a copy; never null (created in the constructor)
+    }
+    
+    public Map<String, DigestURI> getNavigation() { // rel value (top, up, next, prev, first or last; stored as written in the tag, case is not normalized) -> link target
+        return this.navigation; // the scraper's own map, not a copy; never null (created in the constructor)
+    }
+
    /**
     * get all images
     * @return a map of <urlhash, ImageEntry>
--- a/source/net/yacy/search/schema/CollectionConfiguration.java
+++ b/source/net/yacy/search/schema/CollectionConfiguration.java
@ -713,6 +713,40 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri

            // response time
            add(doc, CollectionSchema.responsetime_i, responseHeader == null ? 0 : Integer.parseInt(responseHeader.get(HeaderFramework.RESPONSE_TIME_MILLIS, "0")));
+            
+            // hreflang link tag, see http://support.google.com/webmasters/bin/answer.py?hl=de&answer=189077
+            if (allAttr || (contains(CollectionSchema.hreflang_url_sxt) && contains(CollectionSchema.hreflang_cc_sxt))) {
+                final String[] ccs = new String[html.getHreflang().size()];
+                final String[] urls = new String[html.getHreflang().size()];
+                c = 0;
+                for (Map.Entry<String, DigestURI> e: html.getHreflang().entrySet()) {
+                    ccs[c] = e.getKey();
+                    urls[c] = e.getValue().toNormalform(true);
+                    c++;
+                }
+                add(doc, CollectionSchema.hreflang_cc_sxt, ccs);
+                add(doc, CollectionSchema.hreflang_url_sxt, urls);
+            }
+
+            // page navigation url, see http://googlewebmastercentral.blogspot.de/2011/09/pagination-with-relnext-and-relprev.html
+            if (allAttr || (contains(CollectionSchema.navigation_url_sxt) && contains(CollectionSchema.navigation_type_sxt))) {
+                final String[] navs = new String[html.getNavigation().size()];
+                final String[] urls = new String[html.getNavigation().size()];
+                c = 0;
+                for (Map.Entry<String, DigestURI> e: html.getNavigation().entrySet()) {
+                    navs[c] = e.getKey();
+                    urls[c] = e.getValue().toNormalform(true);
+                    c++;
+                }
+                add(doc, CollectionSchema.navigation_type_sxt, navs);
+                add(doc, CollectionSchema.navigation_url_sxt, urls);
+                
+            }
+
+            // publisher url as defined in http://support.google.com/plus/answer/1713826?hl=de; written only if the page declares a publisher link (the null check must also guard the allAttr case, otherwise the dereference below throws a NullPointerException)
+            if ((allAttr || contains(CollectionSchema.publisher_url_s)) && html.getPublisherLink() != null) {
+                add(doc, CollectionSchema.publisher_url_s, html.getPublisherLink().toNormalform(true));
+            }
        }

        // statistics about the links
--- a/source/net/yacy/search/schema/CollectionSchema.java
+++ b/source/net/yacy/search/schema/CollectionSchema.java
@ -142,6 +142,12 @@ public enum CollectionSchema implements SchemaDeclaration {
    iframes_sxt(SolrType.string, true, true, true, false, false, "list of all links to iframes"),
    iframesscount_i(SolrType.num_integer, true, true, false, false, false, "number of iframes_txt"),

+    hreflang_url_sxt(SolrType.string, true, true, true, false, false, "url of the hreflang link tag, see http://support.google.com/webmasters/bin/answer.py?hl=de&answer=189077"),
+    hreflang_cc_sxt(SolrType.string, true, true, true, false, false, "country code of the hreflang link tag, see http://support.google.com/webmasters/bin/answer.py?hl=de&answer=189077"),
+    navigation_url_sxt(SolrType.string, true, true, true, false, false, "page navigation url, see http://googlewebmastercentral.blogspot.de/2011/09/pagination-with-relnext-and-relprev.html"),
+    navigation_type_sxt(SolrType.string, true, true, true, false, false, "page navigation rel property value, can contain one of {top,up,next,prev,first,last}"),
+    publisher_url_s(SolrType.string, true, true, false, false, false, "publisher url as defined in http://support.google.com/plus/answer/1713826?hl=de"),
+    
    url_protocol_s(SolrType.string, true, true, false, false, false, "the protocol of the url"),
    url_paths_sxt(SolrType.string, true, true, true, false, true, "all path elements in the url"),
    url_file_ext_s(SolrType.string, true, true, false, false, false, "the file name extension"),