mirror of
https://github.com/yacy/yacy_search_server.git
synced 2024-09-19 00:01:41 +02:00
- added counting of links with noindex tag for solr index
- bugfixes for solr index
git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@7820 6c8d7289-2bf4-0310-a012-ef5d649a1542
This commit is contained in:
parent
528b59e078
commit
2d4bb139d3
|
@ -75,15 +75,21 @@ wordcount_i
|
|||
## internal links, normalized (absolute URLs), as <a> - tag with anchor text and nofollow, textgen
|
||||
attr_inboundlinks
|
||||
|
||||
## number of inbound links, int
|
||||
## total number of inbound links, int
|
||||
inboundlinkscount_i
|
||||
|
||||
## number of inbound links with noindex tag, int
|
||||
inboundlinksnoindexcount_i
|
||||
|
||||
## external links, normalized (absolute URLs), as <a> - tag with anchor text and nofollow, textgen
|
||||
attr_outboundlinks
|
||||
|
||||
## number of external links, int
|
||||
## total number of external links, int
|
||||
outboundlinkscount_i
|
||||
|
||||
## number of external links with noindex tag, int
|
||||
outboundlinksnoindexcount_i
|
||||
|
||||
## h1 header, textgen
|
||||
attr_h1
|
||||
|
||||
|
|
|
@ -61,11 +61,13 @@ public class IndexFederated_p {
|
|||
sb.solrConnector = null;
|
||||
}
|
||||
|
||||
final String schemename = sb.getConfig("federated.service.solr.indexing.schemefile", "solr.keys.default.list");
|
||||
final SolrScheme scheme = new SolrScheme(new File(env.getDataPath(), "DATA/SETTINGS/" + schemename));
|
||||
|
||||
if (!solrWasOn && solrIsOnAfterwards) {
|
||||
// switch on
|
||||
final String solrurls = sb.getConfig("federated.service.solr.indexing.url", "http://127.0.0.1:8983/solr");
|
||||
final boolean usesolr = sb.getConfigBool("federated.service.solr.indexing.enabled", false) & solrurls.length() > 0;
|
||||
final SolrScheme scheme = new SolrScheme(new File(env.getDataPath(), "DATA/SETTINGS/solr.keys.default.list"));
|
||||
try {
|
||||
sb.solrConnector = (usesolr) ? new SolrChardingConnector(solrurls, scheme, SolrChardingSelection.Method.MODULO_HOST_MD5) : null;
|
||||
} catch (final IOException e) {
|
||||
|
@ -75,7 +77,6 @@ public class IndexFederated_p {
|
|||
}
|
||||
|
||||
// read index scheme table flags
|
||||
final SolrScheme scheme = sb.solrConnector.getScheme();
|
||||
final Iterator<ConfigurationSet.Entry> i = scheme.allIterator();
|
||||
ConfigurationSet.Entry entry;
|
||||
while (i.hasNext()) {
|
||||
|
|
|
@ -562,7 +562,8 @@ public final class Switchboard extends serverSwitch {
|
|||
|
||||
// prepare a solr index profile switch list
|
||||
final File solrBackupProfile = new File("defaults/solr.keys.list");
|
||||
final File solrWorkProfile = new File(getDataPath(), "DATA/SETTINGS/solr.keys.default.list");
|
||||
final String schemename = getConfig("federated.service.solr.indexing.schemefile", "solr.keys.default.list");
|
||||
final File solrWorkProfile = new File(getDataPath(), "DATA/SETTINGS/" + schemename);
|
||||
if (!solrWorkProfile.exists()) FileUtils.copy(solrBackupProfile, solrWorkProfile);
|
||||
final SolrScheme backupScheme = new SolrScheme(solrBackupProfile);
|
||||
final SolrScheme workingScheme = new SolrScheme(solrWorkProfile);
|
||||
|
|
|
@ -111,14 +111,14 @@ public class SolrScheme extends ConfigurationSet {
|
|||
addSolr(solrdoc, "keywords", yacydoc.dc_subject(' '));
|
||||
final String content = UTF8.String(yacydoc.getTextBytes());
|
||||
addSolr(solrdoc, "text_t", content);
|
||||
if (contains("wordcount_i")) {
|
||||
if (isEmpty() || contains("wordcount_i")) {
|
||||
final int contentwc = content.split(" ").length;
|
||||
addSolr(solrdoc, "wordcount_i", contentwc);
|
||||
}
|
||||
|
||||
// path elements of link
|
||||
final String path = digestURI.getPath();
|
||||
if (path != null && contains("attr_paths")) {
|
||||
if (path != null && (isEmpty() || contains("attr_paths"))) {
|
||||
final String[] paths = path.split("/");
|
||||
if (paths.length > 0) addSolr(solrdoc, "attr_paths", paths);
|
||||
}
|
||||
|
@ -126,8 +126,9 @@ public class SolrScheme extends ConfigurationSet {
|
|||
// list all links
|
||||
final Map<MultiProtocolURI, Properties> alllinks = yacydoc.getAnchors();
|
||||
int c = 0;
|
||||
addSolr(solrdoc, "inboundlinkscount_i", yacydoc.inboundLinkCount());
|
||||
if (contains("attr_inboundlinks")) {
|
||||
if (isEmpty() || contains("inboundlinkscount_i")) addSolr(solrdoc, "inboundlinkscount_i", yacydoc.inboundLinkCount());
|
||||
if (isEmpty() || contains("inboundlinksnoindexcount_i")) addSolr(solrdoc, "inboundlinksnoindexcount_i", yacydoc.inboundLinkNoindexCount());
|
||||
if (isEmpty() || contains("attr_inboundlinks")) {
|
||||
final String[] inboundlinks = new String[yacydoc.inboundLinkCount()];
|
||||
for (final MultiProtocolURI url: yacydoc.inboundLinks()) {
|
||||
final Properties p = alllinks.get(url);
|
||||
|
@ -135,23 +136,24 @@ public class SolrScheme extends ConfigurationSet {
|
|||
final String rel = p.getProperty("rel", "");
|
||||
inboundlinks[c++] =
|
||||
"<a href=\"" + url.toNormalform(false, false) + "\"" +
|
||||
((rel.toLowerCase().equals("nofollow")) ? " rel=\"nofollow\"" : "") +
|
||||
(rel.length() > 0 ? " rel=\"" + rel + "\"" : "") +
|
||||
">" +
|
||||
((name.length() > 0) ? name : "") + "</a>";
|
||||
}
|
||||
addSolr(solrdoc, "attr_inboundlinks", inboundlinks);
|
||||
}
|
||||
c = 0;
|
||||
final String[] outboundlinks = new String[yacydoc.outboundLinkCount()];
|
||||
if (contains("attr_outboundlinks")) {
|
||||
addSolr(solrdoc, "outboundlinkscount_i", outboundlinks.length);
|
||||
if (isEmpty() || contains("outboundlinkscount_i")) addSolr(solrdoc, "outboundlinkscount_i", yacydoc.outboundLinkCount());
|
||||
if (isEmpty() || contains("outboundlinksnoindexcount_i")) addSolr(solrdoc, "outboundlinksnoindexcount_i", yacydoc.outboundLinkNoindexCount());
|
||||
if (isEmpty() || contains("attr_outboundlinks")) {
|
||||
final String[] outboundlinks = new String[yacydoc.outboundLinkCount()];
|
||||
for (final MultiProtocolURI url: yacydoc.outboundLinks()) {
|
||||
final Properties p = alllinks.get(url);
|
||||
final String name = p.getProperty("name", "");
|
||||
final String rel = p.getProperty("rel", "");
|
||||
outboundlinks[c++] =
|
||||
"<a href=\"" + url.toNormalform(false, false) + "\"" +
|
||||
((rel.toLowerCase().equals("nofollow")) ? " rel=\"nofollow\"" : "") +
|
||||
(rel.length() > 0 ? " rel=\"" + rel + "\"" : "") +
|
||||
">" +
|
||||
((name.length() > 0) ? name : "") + "</a>";
|
||||
}
|
||||
|
@ -196,7 +198,7 @@ public class SolrScheme extends ConfigurationSet {
|
|||
addSolr(solrdoc, "boldcount_i", bold.length);
|
||||
if (bold.length > 0) {
|
||||
addSolr(solrdoc, "attr_bold", bold);
|
||||
if (contains("attr_boldcount")) {
|
||||
if (isEmpty() || contains("attr_boldcount")) {
|
||||
addSolr(solrdoc, "attr_boldcount", html.getBoldCount(bold));
|
||||
}
|
||||
}
|
||||
|
@ -204,7 +206,7 @@ public class SolrScheme extends ConfigurationSet {
|
|||
addSolr(solrdoc, "italiccount_i", italic.length);
|
||||
if (italic.length > 0) {
|
||||
addSolr(solrdoc, "attr_italic", italic);
|
||||
if (contains("attr_italiccount")) {
|
||||
if (isEmpty() || contains("attr_italiccount")) {
|
||||
addSolr(solrdoc, "attr_italiccount", html.getItalicCount(italic));
|
||||
}
|
||||
}
|
||||
|
@ -213,7 +215,7 @@ public class SolrScheme extends ConfigurationSet {
|
|||
if (li.length > 0) addSolr(solrdoc, "attr_li", li);
|
||||
|
||||
// images
|
||||
if (contains("attr_images")) {
|
||||
if (isEmpty() || contains("attr_images")) {
|
||||
final Collection<ImageEntry> imagesc = html.getImages().values();
|
||||
final String[] images = new String[imagesc.size()];
|
||||
c = 0;
|
||||
|
@ -223,7 +225,7 @@ public class SolrScheme extends ConfigurationSet {
|
|||
}
|
||||
|
||||
// style sheets
|
||||
if (contains("attr_css")) {
|
||||
if (isEmpty() || contains("attr_css")) {
|
||||
final Map<MultiProtocolURI, String> csss = html.getCSS();
|
||||
final String[] css = new String[csss.size()];
|
||||
c = 0;
|
||||
|
@ -237,7 +239,7 @@ public class SolrScheme extends ConfigurationSet {
|
|||
}
|
||||
|
||||
// Scripts
|
||||
if (contains("attr_scripts")) {
|
||||
if (isEmpty() || contains("attr_scripts")) {
|
||||
final Set<MultiProtocolURI> scriptss = html.getScript();
|
||||
final String[] scripts = new String[scriptss.size()];
|
||||
c = 0;
|
||||
|
@ -249,7 +251,7 @@ public class SolrScheme extends ConfigurationSet {
|
|||
}
|
||||
|
||||
// Frames
|
||||
if (contains("attr_frames")) {
|
||||
if (isEmpty() || contains("attr_frames")) {
|
||||
final Set<MultiProtocolURI> framess = html.getFrames();
|
||||
final String[] frames = new String[framess.size()];
|
||||
c = 0;
|
||||
|
@ -261,7 +263,7 @@ public class SolrScheme extends ConfigurationSet {
|
|||
}
|
||||
|
||||
// IFrames
|
||||
if (contains("attr_iframes")) {
|
||||
if (isEmpty() || contains("attr_iframes")) {
|
||||
final Set<MultiProtocolURI> iframess = html.getIFrames();
|
||||
final String[] iframes = new String[iframess.size()];
|
||||
c = 0;
|
||||
|
@ -277,7 +279,7 @@ public class SolrScheme extends ConfigurationSet {
|
|||
|
||||
// generic evaluation pattern
|
||||
for (final String model: html.getEvaluationModelNames()) {
|
||||
if (contains("attr_" + model)) {
|
||||
if (isEmpty() || contains("attr_" + model)) {
|
||||
final String[] scorenames = html.getEvaluationModelScoreNames(model);
|
||||
if (scorenames.length > 0) {
|
||||
addSolr(solrdoc, "attr_" + model, scorenames);
|
||||
|
|
|
@ -403,13 +403,15 @@ dc_rights
|
|||
for (final Map.Entry<MultiProtocolURI, Properties> entry: this.anchors.entrySet()) {
|
||||
url = entry.getKey();
|
||||
if (url == null) continue;
|
||||
final boolean noindex = entry.getValue().getProperty("rel", "").toLowerCase().indexOf("noindex") >= 0;
|
||||
final boolean nofollow = entry.getValue().getProperty("rel", "").toLowerCase().indexOf("nofollow") >= 0;
|
||||
if ((thishost == null && url.getHost() == null) ||
|
||||
((thishost != null && url.getHost() != null) &&
|
||||
(url.getHost().endsWith(thishost) ||
|
||||
(thishost.startsWith("www.") && url.getHost().endsWith(thishost.substring(4)))))) {
|
||||
this.inboundlinks.put(url, "anchor");
|
||||
this.inboundlinks.put(url, "anchor" + (noindex ? " noindex" : "") + (nofollow ? " nofollow" : ""));
|
||||
} else {
|
||||
this.outboundlinks.put(url, "anchor");
|
||||
this.outboundlinks.put(url, "anchor" + (noindex ? " noindex" : "") + (nofollow ? " nofollow" : ""));
|
||||
}
|
||||
u = url.toNormalform(true, false);
|
||||
final String name = entry.getValue().getProperty("name", "");
|
||||
|
@ -605,6 +607,26 @@ dc_rights
|
|||
return (this.outboundlinks == null) ? 0 : this.outboundlinks.size();
|
||||
}
|
||||
|
||||
public int inboundLinkNoindexCount() {
|
||||
if (this.inboundlinks == null) resortLinks();
|
||||
if (this.inboundlinks == null) return 0;
|
||||
int c = 0;
|
||||
for (final String tag: this.inboundlinks.values()) {
|
||||
if (tag.contains("noindex")) c++;
|
||||
}
|
||||
return c;
|
||||
}
|
||||
|
||||
public int outboundLinkNoindexCount() {
|
||||
if (this.outboundlinks == null) resortLinks();
|
||||
if (this.outboundlinks == null) return 0;
|
||||
int c = 0;
|
||||
for (final String tag: this.outboundlinks.values()) {
|
||||
if (tag.contains("noindex")) c++;
|
||||
}
|
||||
return c;
|
||||
}
|
||||
|
||||
public Set<MultiProtocolURI> inboundLinks() {
|
||||
if (this.inboundlinks == null) resortLinks();
|
||||
return (this.inboundlinks == null) ? null : this.inboundlinks.keySet();
|
||||
|
|
Loading…
Reference in New Issue
Block a user