- added counting of links with noindex tag for solr index

- bugfixes for solr index

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@7820 6c8d7289-2bf4-0310-a012-ef5d649a1542
This commit is contained in:
orbiter 2011-07-03 06:40:05 +00:00
parent 528b59e078
commit 2d4bb139d3
5 changed files with 56 additions and 24 deletions

View File

@ -75,15 +75,21 @@ wordcount_i
## internal links, normalized (absolute URLs), as <a> - tag with anchor text and nofollow, textgen
attr_inboundlinks
## number of inbound links, int
## total number of inbound links, int
inboundlinkscount_i
## number of inbound links with noindex tag, int
inboundlinksnoindexcount_i
## external links, normalized (absolute URLs), as <a> - tag with anchor text and nofollow, textgen
attr_outboundlinks
## number of external links, int
## total number of external links, int
outboundlinkscount_i
## number of external links with noindex tag, int
outboundlinksnoindexcount_i
## h1 header, textgen
attr_h1

View File

@ -61,11 +61,13 @@ public class IndexFederated_p {
sb.solrConnector = null;
}
final String schemename = sb.getConfig("federated.service.solr.indexing.schemefile", "solr.keys.default.list");
final SolrScheme scheme = new SolrScheme(new File(env.getDataPath(), "DATA/SETTINGS/" + schemename));
if (!solrWasOn && solrIsOnAfterwards) {
// switch on
final String solrurls = sb.getConfig("federated.service.solr.indexing.url", "http://127.0.0.1:8983/solr");
final boolean usesolr = sb.getConfigBool("federated.service.solr.indexing.enabled", false) & solrurls.length() > 0;
final SolrScheme scheme = new SolrScheme(new File(env.getDataPath(), "DATA/SETTINGS/solr.keys.default.list"));
try {
sb.solrConnector = (usesolr) ? new SolrChardingConnector(solrurls, scheme, SolrChardingSelection.Method.MODULO_HOST_MD5) : null;
} catch (final IOException e) {
@ -75,7 +77,6 @@ public class IndexFederated_p {
}
// read index scheme table flags
final SolrScheme scheme = sb.solrConnector.getScheme();
final Iterator<ConfigurationSet.Entry> i = scheme.allIterator();
ConfigurationSet.Entry entry;
while (i.hasNext()) {

View File

@ -562,7 +562,8 @@ public final class Switchboard extends serverSwitch {
// prepare a solr index profile switch list
final File solrBackupProfile = new File("defaults/solr.keys.list");
final File solrWorkProfile = new File(getDataPath(), "DATA/SETTINGS/solr.keys.default.list");
final String schemename = getConfig("federated.service.solr.indexing.schemefile", "solr.keys.default.list");
final File solrWorkProfile = new File(getDataPath(), "DATA/SETTINGS/" + schemename);
if (!solrWorkProfile.exists()) FileUtils.copy(solrBackupProfile, solrWorkProfile);
final SolrScheme backupScheme = new SolrScheme(solrBackupProfile);
final SolrScheme workingScheme = new SolrScheme(solrWorkProfile);

View File

@ -111,14 +111,14 @@ public class SolrScheme extends ConfigurationSet {
addSolr(solrdoc, "keywords", yacydoc.dc_subject(' '));
final String content = UTF8.String(yacydoc.getTextBytes());
addSolr(solrdoc, "text_t", content);
if (contains("wordcount_i")) {
if (isEmpty() || contains("wordcount_i")) {
final int contentwc = content.split(" ").length;
addSolr(solrdoc, "wordcount_i", contentwc);
}
// path elements of link
final String path = digestURI.getPath();
if (path != null && contains("attr_paths")) {
if (path != null && (isEmpty() || contains("attr_paths"))) {
final String[] paths = path.split("/");
if (paths.length > 0) addSolr(solrdoc, "attr_paths", paths);
}
@ -126,8 +126,9 @@ public class SolrScheme extends ConfigurationSet {
// list all links
final Map<MultiProtocolURI, Properties> alllinks = yacydoc.getAnchors();
int c = 0;
addSolr(solrdoc, "inboundlinkscount_i", yacydoc.inboundLinkCount());
if (contains("attr_inboundlinks")) {
if (isEmpty() || contains("inboundlinkscount_i")) addSolr(solrdoc, "inboundlinkscount_i", yacydoc.inboundLinkCount());
if (isEmpty() || contains("inboundlinksnoindexcount_i")) addSolr(solrdoc, "inboundlinksnoindexcount_i", yacydoc.inboundLinkNoindexCount());
if (isEmpty() || contains("attr_inboundlinks")) {
final String[] inboundlinks = new String[yacydoc.inboundLinkCount()];
for (final MultiProtocolURI url: yacydoc.inboundLinks()) {
final Properties p = alllinks.get(url);
@ -135,23 +136,24 @@ public class SolrScheme extends ConfigurationSet {
final String rel = p.getProperty("rel", "");
inboundlinks[c++] =
"<a href=\"" + url.toNormalform(false, false) + "\"" +
((rel.toLowerCase().equals("nofollow")) ? " rel=\"nofollow\"" : "") +
(rel.length() > 0 ? " rel=\"" + rel + "\"" : "") +
">" +
((name.length() > 0) ? name : "") + "</a>";
}
addSolr(solrdoc, "attr_inboundlinks", inboundlinks);
}
c = 0;
final String[] outboundlinks = new String[yacydoc.outboundLinkCount()];
if (contains("attr_outboundlinks")) {
addSolr(solrdoc, "outboundlinkscount_i", outboundlinks.length);
if (isEmpty() || contains("outboundlinkscount_i")) addSolr(solrdoc, "outboundlinkscount_i", yacydoc.outboundLinkCount());
if (isEmpty() || contains("outboundlinksnoindexcount_i")) addSolr(solrdoc, "outboundlinksnoindexcount_i", yacydoc.outboundLinkNoindexCount());
if (isEmpty() || contains("attr_outboundlinks")) {
final String[] outboundlinks = new String[yacydoc.outboundLinkCount()];
for (final MultiProtocolURI url: yacydoc.outboundLinks()) {
final Properties p = alllinks.get(url);
final String name = p.getProperty("name", "");
final String rel = p.getProperty("rel", "");
outboundlinks[c++] =
"<a href=\"" + url.toNormalform(false, false) + "\"" +
((rel.toLowerCase().equals("nofollow")) ? " rel=\"nofollow\"" : "") +
(rel.length() > 0 ? " rel=\"" + rel + "\"" : "") +
">" +
((name.length() > 0) ? name : "") + "</a>";
}
@ -196,7 +198,7 @@ public class SolrScheme extends ConfigurationSet {
addSolr(solrdoc, "boldcount_i", bold.length);
if (bold.length > 0) {
addSolr(solrdoc, "attr_bold", bold);
if (contains("attr_boldcount")) {
if (isEmpty() || contains("attr_boldcount")) {
addSolr(solrdoc, "attr_boldcount", html.getBoldCount(bold));
}
}
@ -204,7 +206,7 @@ public class SolrScheme extends ConfigurationSet {
addSolr(solrdoc, "italiccount_i", italic.length);
if (italic.length > 0) {
addSolr(solrdoc, "attr_italic", italic);
if (contains("attr_italiccount")) {
if (isEmpty() || contains("attr_italiccount")) {
addSolr(solrdoc, "attr_italiccount", html.getItalicCount(italic));
}
}
@ -213,7 +215,7 @@ public class SolrScheme extends ConfigurationSet {
if (li.length > 0) addSolr(solrdoc, "attr_li", li);
// images
if (contains("attr_images")) {
if (isEmpty() || contains("attr_images")) {
final Collection<ImageEntry> imagesc = html.getImages().values();
final String[] images = new String[imagesc.size()];
c = 0;
@ -223,7 +225,7 @@ public class SolrScheme extends ConfigurationSet {
}
// style sheets
if (contains("attr_css")) {
if (isEmpty() || contains("attr_css")) {
final Map<MultiProtocolURI, String> csss = html.getCSS();
final String[] css = new String[csss.size()];
c = 0;
@ -237,7 +239,7 @@ public class SolrScheme extends ConfigurationSet {
}
// Scripts
if (contains("attr_scripts")) {
if (isEmpty() || contains("attr_scripts")) {
final Set<MultiProtocolURI> scriptss = html.getScript();
final String[] scripts = new String[scriptss.size()];
c = 0;
@ -249,7 +251,7 @@ public class SolrScheme extends ConfigurationSet {
}
// Frames
if (contains("attr_frames")) {
if (isEmpty() || contains("attr_frames")) {
final Set<MultiProtocolURI> framess = html.getFrames();
final String[] frames = new String[framess.size()];
c = 0;
@ -261,7 +263,7 @@ public class SolrScheme extends ConfigurationSet {
}
// IFrames
if (contains("attr_iframes")) {
if (isEmpty() || contains("attr_iframes")) {
final Set<MultiProtocolURI> iframess = html.getIFrames();
final String[] iframes = new String[iframess.size()];
c = 0;
@ -277,7 +279,7 @@ public class SolrScheme extends ConfigurationSet {
// generic evaluation pattern
for (final String model: html.getEvaluationModelNames()) {
if (contains("attr_" + model)) {
if (isEmpty() || contains("attr_" + model)) {
final String[] scorenames = html.getEvaluationModelScoreNames(model);
if (scorenames.length > 0) {
addSolr(solrdoc, "attr_" + model, scorenames);

View File

@ -403,13 +403,15 @@ dc_rights
for (final Map.Entry<MultiProtocolURI, Properties> entry: this.anchors.entrySet()) {
url = entry.getKey();
if (url == null) continue;
final boolean noindex = entry.getValue().getProperty("rel", "").toLowerCase().indexOf("noindex") >= 0;
final boolean nofollow = entry.getValue().getProperty("rel", "").toLowerCase().indexOf("nofollow") >= 0;
if ((thishost == null && url.getHost() == null) ||
((thishost != null && url.getHost() != null) &&
(url.getHost().endsWith(thishost) ||
(thishost.startsWith("www.") && url.getHost().endsWith(thishost.substring(4)))))) {
this.inboundlinks.put(url, "anchor");
this.inboundlinks.put(url, "anchor" + (noindex ? " noindex" : "") + (nofollow ? " nofollow" : ""));
} else {
this.outboundlinks.put(url, "anchor");
this.outboundlinks.put(url, "anchor" + (noindex ? " noindex" : "") + (nofollow ? " nofollow" : ""));
}
u = url.toNormalform(true, false);
final String name = entry.getValue().getProperty("name", "");
@ -605,6 +607,26 @@ dc_rights
return (this.outboundlinks == null) ? 0 : this.outboundlinks.size();
}
/**
 * Counts the inbound links whose stored tag value contains "noindex".
 * If the inbound link map is not set yet, resortLinks() is invoked first
 * (presumably it fills the map - confirm against its implementation).
 *
 * @return the number of inbound links tagged "noindex"; 0 if the map is
 *         still unset after the resortLinks() call
 */
public int inboundLinkNoindexCount() {
    if (this.inboundlinks == null) {
        resortLinks();
        // nothing to count if the map is still missing after the re-sort attempt
        if (this.inboundlinks == null) return 0;
    }
    int noindexLinks = 0;
    for (final String linkTag: this.inboundlinks.values()) {
        noindexLinks += (linkTag.indexOf("noindex") >= 0) ? 1 : 0;
    }
    return noindexLinks;
}
/**
 * Counts the outbound links whose stored tag value contains "noindex".
 * If the outbound link map is not set yet, resortLinks() is invoked first
 * (presumably it fills the map - confirm against its implementation).
 *
 * @return the number of outbound links tagged "noindex"; 0 if the map is
 *         still unset after the resortLinks() call
 */
public int outboundLinkNoindexCount() {
    if (this.outboundlinks == null) {
        resortLinks();
        // nothing to count if the map is still missing after the re-sort attempt
        if (this.outboundlinks == null) return 0;
    }
    int noindexLinks = 0;
    for (final String linkTag: this.outboundlinks.values()) {
        noindexLinks += (linkTag.indexOf("noindex") >= 0) ? 1 : 0;
    }
    return noindexLinks;
}
public Set<MultiProtocolURI> inboundLinks() {
if (this.inboundlinks == null) resortLinks();
return (this.inboundlinks == null) ? null : this.inboundlinks.keySet();