diff --git a/defaults/solr.keys.list b/defaults/solr.keys.list index bb33c44ac..d75e0d98f 100644 --- a/defaults/solr.keys.list +++ b/defaults/solr.keys.list @@ -75,15 +75,21 @@ wordcount_i ## internal links, normalized (absolute URLs), as - tag with anchor text and nofollow, textgen attr_inboundlinks -## number of inbound links, int +## total number of inbound links, int inboundlinkscount_i +## number of inbound links with noindex tag, int +inboundlinksnoindexcount_i + ## external links, normalized (absolute URLs), as - tag with anchor text and nofollow, textgen attr_outboundlinks -## number of external links, int +## total number of external links, int outboundlinkscount_i +## number of external links with noindex tag, int +outboundlinksnoindexcount_i + ## h1 header, textgen attr_h1 diff --git a/htroot/IndexFederated_p.java b/htroot/IndexFederated_p.java index 627093db0..a0b59d143 100644 --- a/htroot/IndexFederated_p.java +++ b/htroot/IndexFederated_p.java @@ -61,11 +61,13 @@ public class IndexFederated_p { sb.solrConnector = null; } + final String schemename = sb.getConfig("federated.service.solr.indexing.schemefile", "solr.keys.default.list"); + final SolrScheme scheme = new SolrScheme(new File(env.getDataPath(), "DATA/SETTINGS/" + schemename)); + if (!solrWasOn && solrIsOnAfterwards) { // switch on final String solrurls = sb.getConfig("federated.service.solr.indexing.url", "http://127.0.0.1:8983/solr"); final boolean usesolr = sb.getConfigBool("federated.service.solr.indexing.enabled", false) & solrurls.length() > 0; - final SolrScheme scheme = new SolrScheme(new File(env.getDataPath(), "DATA/SETTINGS/solr.keys.default.list")); try { sb.solrConnector = (usesolr) ? new SolrChardingConnector(solrurls, scheme, SolrChardingSelection.Method.MODULO_HOST_MD5) : null; } catch (final IOException e) { @@ -75,7 +77,6 @@ public class IndexFederated_p { } // read index scheme table flags - final SolrScheme scheme = sb.solrConnector.getScheme(); final Iterator i = scheme.allIterator(); ConfigurationSet.Entry entry; while (i.hasNext()) { diff --git a/source/de/anomic/search/Switchboard.java b/source/de/anomic/search/Switchboard.java index ebd9aeada..3d21f00a0 100644 --- a/source/de/anomic/search/Switchboard.java +++ b/source/de/anomic/search/Switchboard.java @@ -562,7 +562,8 @@ public final class Switchboard extends serverSwitch { // prepare a solr index profile switch list final File solrBackupProfile = new File("defaults/solr.keys.list"); - final File solrWorkProfile = new File(getDataPath(), "DATA/SETTINGS/solr.keys.default.list"); + final String schemename = getConfig("federated.service.solr.indexing.schemefile", "solr.keys.default.list"); + final File solrWorkProfile = new File(getDataPath(), "DATA/SETTINGS/" + schemename); if (!solrWorkProfile.exists()) FileUtils.copy(solrBackupProfile, solrWorkProfile); final SolrScheme backupScheme = new SolrScheme(solrBackupProfile); final SolrScheme workingScheme = new SolrScheme(solrWorkProfile); diff --git a/source/net/yacy/cora/services/federated/solr/SolrScheme.java b/source/net/yacy/cora/services/federated/solr/SolrScheme.java index 7d6bd5513..868d0a9d1 100644 --- a/source/net/yacy/cora/services/federated/solr/SolrScheme.java +++ b/source/net/yacy/cora/services/federated/solr/SolrScheme.java @@ -111,14 +111,14 @@ public class SolrScheme extends ConfigurationSet { addSolr(solrdoc, "keywords", yacydoc.dc_subject(' ')); final String content = UTF8.String(yacydoc.getTextBytes()); addSolr(solrdoc, "text_t", content); - if (contains("wordcount_i")) { + if (isEmpty() || contains("wordcount_i")) { final int contentwc = content.split(" ").length; addSolr(solrdoc, "wordcount_i", contentwc); } // path elements of link final String path = digestURI.getPath(); - if (path != null && contains("attr_paths")) { + if (path != null && (isEmpty() || contains("attr_paths"))) { final String[] paths = path.split("/"); if (paths.length > 0) addSolr(solrdoc, "attr_paths", paths); } @@ -126,8 +126,9 @@ public class SolrScheme extends ConfigurationSet { // list all links final Map alllinks = yacydoc.getAnchors(); int c = 0; - addSolr(solrdoc, "inboundlinkscount_i", yacydoc.inboundLinkCount()); - if (contains("attr_inboundlinks")) { + if (isEmpty() || contains("inboundlinkscount_i")) addSolr(solrdoc, "inboundlinkscount_i", yacydoc.inboundLinkCount()); + if (isEmpty() || contains("inboundlinksnoindexcount_i")) addSolr(solrdoc, "inboundlinksnoindexcount_i", yacydoc.inboundLinkNoindexCount()); + if (isEmpty() || contains("attr_inboundlinks")) { final String[] inboundlinks = new String[yacydoc.inboundLinkCount()]; for (final MultiProtocolURI url: yacydoc.inboundLinks()) { final Properties p = alllinks.get(url); @@ -135,23 +136,24 @@ public class SolrScheme extends ConfigurationSet { final String rel = p.getProperty("rel", ""); inboundlinks[c++] = " 0 ? " rel=\"" + rel + "\"" : "") + ">" + ((name.length() > 0) ? name : "") + ""; } addSolr(solrdoc, "attr_inboundlinks", inboundlinks); } c = 0; - final String[] outboundlinks = new String[yacydoc.outboundLinkCount()]; - if (contains("attr_outboundlinks")) { - addSolr(solrdoc, "outboundlinkscount_i", outboundlinks.length); + if (isEmpty() || contains("outboundlinkscount_i")) addSolr(solrdoc, "outboundlinkscount_i", yacydoc.outboundLinkCount()); + if (isEmpty() || contains("outboundlinksnoindexcount_i")) addSolr(solrdoc, "outboundlinksnoindexcount_i", yacydoc.outboundLinkNoindexCount()); + if (isEmpty() || contains("attr_outboundlinks")) { + final String[] outboundlinks = new String[yacydoc.outboundLinkCount()]; for (final MultiProtocolURI url: yacydoc.outboundLinks()) { final Properties p = alllinks.get(url); final String name = p.getProperty("name", ""); final String rel = p.getProperty("rel", ""); outboundlinks[c++] = " 0 ? " rel=\"" + rel + "\"" : "") + ">" + ((name.length() > 0) ? name : "") + ""; } @@ -196,7 +198,7 @@ public class SolrScheme extends ConfigurationSet { addSolr(solrdoc, "boldcount_i", bold.length); if (bold.length > 0) { addSolr(solrdoc, "attr_bold", bold); - if (contains("attr_boldcount")) { + if (isEmpty() || contains("attr_boldcount")) { addSolr(solrdoc, "attr_boldcount", html.getBoldCount(bold)); } } @@ -204,7 +206,7 @@ public class SolrScheme extends ConfigurationSet { addSolr(solrdoc, "italiccount_i", italic.length); if (italic.length > 0) { addSolr(solrdoc, "attr_italic", italic); - if (contains("attr_italiccount")) { + if (isEmpty() || contains("attr_italiccount")) { addSolr(solrdoc, "attr_italiccount", html.getItalicCount(italic)); } } @@ -213,7 +215,7 @@ public class SolrScheme extends ConfigurationSet { if (li.length > 0) addSolr(solrdoc, "attr_li", li); // images - if (contains("attr_images")) { + if (isEmpty() || contains("attr_images")) { final Collection imagesc = html.getImages().values(); final String[] images = new String[imagesc.size()]; c = 0; @@ -223,7 +225,7 @@ public class SolrScheme extends ConfigurationSet { } // style sheets - if (contains("attr_css")) { + if (isEmpty() || contains("attr_css")) { final Map csss = html.getCSS(); final String[] css = new String[csss.size()]; c = 0; @@ -237,7 +239,7 @@ public class SolrScheme extends ConfigurationSet { } // Scripts - if (contains("attr_scripts")) { + if (isEmpty() || contains("attr_scripts")) { final Set scriptss = html.getScript(); final String[] scripts = new String[scriptss.size()]; c = 0; @@ -249,7 +251,7 @@ public class SolrScheme extends ConfigurationSet { } // Frames - if (contains("attr_frames")) { + if (isEmpty() || contains("attr_frames")) { final Set framess = html.getFrames(); final String[] frames = new String[framess.size()]; c = 0; @@ -261,7 +263,7 @@ public class SolrScheme extends ConfigurationSet { } // IFrames - if (contains("attr_iframes")) { + if (isEmpty() || contains("attr_iframes")) { final Set iframess = html.getIFrames(); final String[] iframes = new String[iframess.size()]; c = 0; @@ -277,7 +279,7 @@ public class SolrScheme extends ConfigurationSet { // generic evaluation pattern for (final String model: html.getEvaluationModelNames()) { - if (contains("attr_" + model)) { + if (isEmpty() || contains("attr_" + model)) { final String[] scorenames = html.getEvaluationModelScoreNames(model); if (scorenames.length > 0) { addSolr(solrdoc, "attr_" + model, scorenames); diff --git a/source/net/yacy/document/Document.java b/source/net/yacy/document/Document.java index 140185fe7..d334c5a12 100644 --- a/source/net/yacy/document/Document.java +++ b/source/net/yacy/document/Document.java @@ -403,13 +403,15 @@ dc_rights for (final Map.Entry entry: this.anchors.entrySet()) { url = entry.getKey(); if (url == null) continue; + final boolean noindex = entry.getValue().getProperty("rel", "").toLowerCase().indexOf("noindex") >= 0; + final boolean nofollow = entry.getValue().getProperty("rel", "").toLowerCase().indexOf("nofollow") >= 0; if ((thishost == null && url.getHost() == null) || ((thishost != null && url.getHost() != null) && (url.getHost().endsWith(thishost) || (thishost.startsWith("www.") && url.getHost().endsWith(thishost.substring(4)))))) { - this.inboundlinks.put(url, "anchor"); + this.inboundlinks.put(url, "anchor" + (noindex ? " noindex" : "") + (nofollow ? " nofollow" : "")); } else { - this.outboundlinks.put(url, "anchor"); + this.outboundlinks.put(url, "anchor" + (noindex ? " noindex" : "") + (nofollow ? " nofollow" : "")); } u = url.toNormalform(true, false); final String name = entry.getValue().getProperty("name", ""); @@ -605,6 +607,26 @@ dc_rights return (this.outboundlinks == null) ? 0 : this.outboundlinks.size(); } + public int inboundLinkNoindexCount() { + if (this.inboundlinks == null) resortLinks(); + if (this.inboundlinks == null) return 0; + int c = 0; + for (final String tag: this.inboundlinks.values()) { + if (tag.contains("noindex")) c++; + } + return c; + } + + public int outboundLinkNoindexCount() { + if (this.outboundlinks == null) resortLinks(); + if (this.outboundlinks == null) return 0; + int c = 0; + for (final String tag: this.outboundlinks.values()) { + if (tag.contains("noindex")) c++; + } + return c; + } + public Set inboundLinks() { if (this.inboundlinks == null) resortLinks(); return (this.inboundlinks == null) ? null : this.inboundlinks.keySet();