mirror of
https://github.com/yacy/yacy_search_server.git
synced 2024-09-19 00:01:41 +02:00
added new schema fields:
- hreflang_url_sxt and hreflang_cc_sxt for http://support.google.com/webmasters/bin/answer.py?hl=de&answer=189077
- navigation_url_sxt and navigation_type_sxt for http://googlewebmastercentral.blogspot.de/2011/09/pagination-with-relnext-and-relprev.html
- publisher_url_s for http://support.google.com/plus/answer/1713826?hl=de
All fields are disabled by default and are not written to the index.
This commit is contained in:
parent
566d6c980c
commit
50421171c3
|
@ -307,6 +307,21 @@ underline_txt
|
|||
## number of attr_iframes, int
|
||||
#iframesscount_i
|
||||
|
||||
## url of the hreflang link tag, see http://support.google.com/webmasters/bin/answer.py?hl=de&answer=189077
|
||||
#hreflang_url_sxt
|
||||
|
||||
## country code of the hreflang link tag, see http://support.google.com/webmasters/bin/answer.py?hl=de&answer=189077
|
||||
#hreflang_cc_sxt
|
||||
|
||||
## page navigation url, see http://googlewebmastercentral.blogspot.de/2011/09/pagination-with-relnext-and-relprev.html
|
||||
#navigation_url_sxt
|
||||
|
||||
## page navigation rel property value, can contain one of {top,up,next,prev,first,last}
|
||||
#navigation_type_sxt
|
||||
|
||||
## publisher url as defined in http://support.google.com/plus/answer/1713826?hl=de
|
||||
#publisher_url_s
|
||||
|
||||
## the protocol of the url
|
||||
url_protocol_s
|
||||
|
||||
|
|
|
@ -128,6 +128,7 @@ public class ContentScraper extends AbstractScraper implements Scraper {
|
|||
private final Map<DigestURI, EmbedEntry> embeds; // urlhash/embed relation
|
||||
private final Map<DigestURI, ImageEntry> images; // urlhash/image relation
|
||||
private final Map<String, String> metas;
|
||||
private final Map<String, DigestURI> hreflang, navigation;
|
||||
private LinkedHashSet<String> titles;
|
||||
//private String headline;
|
||||
private List<String>[] headlines;
|
||||
|
@ -136,7 +137,7 @@ public class ContentScraper extends AbstractScraper implements Scraper {
|
|||
private final CharBuffer content;
|
||||
private final EventListenerList htmlFilterEventListeners;
|
||||
private double lon, lat;
|
||||
private DigestURI canonical;
|
||||
private DigestURI canonical, publisher;
|
||||
private final int maxLinks;
|
||||
private int breadcrumbs;
|
||||
|
||||
|
@ -173,6 +174,8 @@ public class ContentScraper extends AbstractScraper implements Scraper {
|
|||
this.frames = new SizeLimitedSet<DigestURI>(maxLinks);
|
||||
this.iframes = new SizeLimitedSet<DigestURI>(maxLinks);
|
||||
this.metas = new SizeLimitedMap<String, String>(maxLinks);
|
||||
this.hreflang = new SizeLimitedMap<String, DigestURI>(maxLinks);
|
||||
this.navigation = new SizeLimitedMap<String, DigestURI>(maxLinks);
|
||||
this.script = new SizeLimitedSet<DigestURI>(maxLinks);
|
||||
this.titles = new LinkedHashSet<String>();
|
||||
this.headlines = new ArrayList[6];
|
||||
|
@ -187,6 +190,7 @@ public class ContentScraper extends AbstractScraper implements Scraper {
|
|||
this.lat = 0.0d;
|
||||
this.evaluationScores.match(Element.url, root.toNormalform(true));
|
||||
this.canonical = null;
|
||||
this.publisher = null;
|
||||
this.breadcrumbs = 0;
|
||||
}
|
||||
|
||||
|
@ -398,6 +402,7 @@ public class ContentScraper extends AbstractScraper implements Scraper {
|
|||
final String rel = tagopts.getProperty("rel", EMPTY_STRING);
|
||||
final String linktitle = tagopts.getProperty("title", EMPTY_STRING);
|
||||
final String type = tagopts.getProperty("type", EMPTY_STRING);
|
||||
final String hreflang = tagopts.getProperty("hreflang", EMPTY_STRING);
|
||||
|
||||
if (rel.equalsIgnoreCase("shortcut icon")) {
|
||||
final ImageEntry ie = new ImageEntry(newLink, linktitle, -1, -1, -1);
|
||||
|
@ -407,8 +412,14 @@ public class ContentScraper extends AbstractScraper implements Scraper {
|
|||
tagopts.put("name", this.titles.size() == 0 ? "" : this.titles.iterator().next());
|
||||
mergeAnchors(newLink, tagopts);
|
||||
this.canonical = newLink;
|
||||
} else if (rel.equalsIgnoreCase("publisher")) {
|
||||
this.publisher = newLink;
|
||||
} else if (rel.equalsIgnoreCase("top") || rel.equalsIgnoreCase("up") || rel.equalsIgnoreCase("next") || rel.equalsIgnoreCase("prev") || rel.equalsIgnoreCase("first") || rel.equalsIgnoreCase("last")) {
|
||||
this.navigation.put(rel, newLink);
|
||||
} else if (rel.equalsIgnoreCase("alternate") && type.equalsIgnoreCase("application/rss+xml")) {
|
||||
this.rss.put(newLink, linktitle);
|
||||
} else if (rel.equalsIgnoreCase("alternate") && hreflang.length() > 0) {
|
||||
this.hreflang.put(hreflang, newLink);
|
||||
} else if (rel.equalsIgnoreCase("stylesheet") && type.equalsIgnoreCase("text/css")) {
|
||||
this.css.put(newLink, rel);
|
||||
this.evaluationScores.match(Element.csspath, href);
|
||||
|
@ -708,6 +719,18 @@ public class ContentScraper extends AbstractScraper implements Scraper {
|
|||
return this.canonical;
|
||||
}
|
||||
|
||||
/**
 * get the publisher link of the document
 * The value is the target of the most recently processed link whose rel
 * attribute equals "publisher" (compared case-insensitively).
 * @return the publisher url, or null if it was never set
 */
public DigestURI getPublisherLink() {
|
||||
return this.publisher;
|
||||
}
|
||||
|
||||
/**
 * get all alternate-language links of the document
 * An entry is recorded for a link with rel="alternate" and a non-empty
 * hreflang attribute, provided no earlier rel case (e.g. the rss alternate
 * case) has already handled that link.
 * @return the internal map (not a copy) of <hreflang attribute value, link url>
 */
public Map<String, DigestURI> getHreflang() {
|
||||
return this.hreflang;
|
||||
}
|
||||
|
||||
/**
 * get all page navigation links of the document
 * An entry is recorded for a link whose rel attribute is one of
 * top, up, next, prev, first or last. The comparison ignores case, but the
 * key is stored exactly as written in the document, so callers should not
 * assume lower-case keys.
 * @return the internal map (not a copy) of <rel attribute value, link url>
 */
public Map<String, DigestURI> getNavigation() {
|
||||
return this.navigation;
|
||||
}
|
||||
|
||||
/**
|
||||
* get all images
|
||||
* @return a map of <urlhash, ImageEntry>
|
||||
|
|
|
@ -713,6 +713,40 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
|
|||
|
||||
// response time
|
||||
add(doc, CollectionSchema.responsetime_i, responseHeader == null ? 0 : Integer.parseInt(responseHeader.get(HeaderFramework.RESPONSE_TIME_MILLIS, "0")));
|
||||
|
||||
// hreflang link tag, see http://support.google.com/webmasters/bin/answer.py?hl=de&answer=189077
|
||||
if (allAttr || (contains(CollectionSchema.hreflang_url_sxt) && contains(CollectionSchema.hreflang_cc_sxt))) {
|
||||
final String[] ccs = new String[html.getHreflang().size()];
|
||||
final String[] urls = new String[html.getHreflang().size()];
|
||||
c = 0;
|
||||
for (Map.Entry<String, DigestURI> e: html.getHreflang().entrySet()) {
|
||||
ccs[c] = e.getKey();
|
||||
urls[c] = e.getValue().toNormalform(true);
|
||||
c++;
|
||||
}
|
||||
add(doc, CollectionSchema.hreflang_cc_sxt, ccs);
|
||||
add(doc, CollectionSchema.hreflang_url_sxt, urls);
|
||||
}
|
||||
|
||||
// page navigation url, see http://googlewebmastercentral.blogspot.de/2011/09/pagination-with-relnext-and-relprev.html
|
||||
if (allAttr || (contains(CollectionSchema.navigation_url_sxt) && contains(CollectionSchema.navigation_type_sxt))) {
|
||||
final String[] navs = new String[html.getNavigation().size()];
|
||||
final String[] urls = new String[html.getNavigation().size()];
|
||||
c = 0;
|
||||
for (Map.Entry<String, DigestURI> e: html.getNavigation().entrySet()) {
|
||||
navs[c] = e.getKey();
|
||||
urls[c] = e.getValue().toNormalform(true);
|
||||
c++;
|
||||
}
|
||||
add(doc, CollectionSchema.navigation_type_sxt, navs);
|
||||
add(doc, CollectionSchema.navigation_url_sxt, urls);
|
||||
|
||||
}
|
||||
|
||||
// publisher url as defined in http://support.google.com/plus/answer/1713826?hl=de
|
||||
// '&&' binds tighter than '||', so without the extra parentheses the null check
// is skipped whenever allAttr is true, and a document without a publisher link
// would throw a NullPointerException on toNormalform(). Group the field-selection
// test so that the null check guards both cases.
if ((allAttr || contains(CollectionSchema.publisher_url_s)) && html.getPublisherLink() != null) {
    add(doc, CollectionSchema.publisher_url_s, html.getPublisherLink().toNormalform(true));
}
|
||||
}
|
||||
|
||||
// statistics about the links
|
||||
|
|
|
@ -142,6 +142,12 @@ public enum CollectionSchema implements SchemaDeclaration {
|
|||
iframes_sxt(SolrType.string, true, true, true, false, false, "list of all links to iframes"),
|
||||
iframesscount_i(SolrType.num_integer, true, true, false, false, false, "number of iframes_txt"),
|
||||
|
||||
hreflang_url_sxt(SolrType.string, true, true, true, false, false, "url of the hreflang link tag, see http://support.google.com/webmasters/bin/answer.py?hl=de&answer=189077"),
|
||||
hreflang_cc_sxt(SolrType.string, true, true, true, false, false, "country code of the hreflang link tag, see http://support.google.com/webmasters/bin/answer.py?hl=de&answer=189077"),
|
||||
navigation_url_sxt(SolrType.string, true, true, true, false, false, "page navigation url, see http://googlewebmastercentral.blogspot.de/2011/09/pagination-with-relnext-and-relprev.html"),
|
||||
navigation_type_sxt(SolrType.string, true, true, true, false, false, "page navigation rel property value, can contain one of {top,up,next,prev,first,last}"),
|
||||
publisher_url_s(SolrType.string, true, true, false, false, false, "publisher url as defined in http://support.google.com/plus/answer/1713826?hl=de"),
|
||||
|
||||
url_protocol_s(SolrType.string, true, true, false, false, false, "the protocol of the url"),
|
||||
url_paths_sxt(SolrType.string, true, true, true, false, true, "all path elements in the url"),
|
||||
url_file_ext_s(SolrType.string, true, true, false, false, false, "the file name extension"),
|
||||
|
|
Loading…
Reference in New Issue
Block a user