updated solr scheme: generic declaration of solr schemes

This commit is contained in:
Michael Peter Christen 2012-01-13 11:25:15 +01:00
parent 254adea51c
commit 987b412491
3 changed files with 333 additions and 147 deletions

View File

@ -46,16 +46,16 @@ keywords
charset_s
## tags of css entries, normalized with absolute URL, textgen
attr_css_tag
css_tag_txt
## urls of css entries, normalized with absolute URL, textgen
attr_css_url
css_url_txt
## number of css entries, int
csscount_i
## urls of script entries, normalized with absolute URL, textgen
attr_scripts
scripts_txt
## number of script entries, int
scriptscount_i
@ -86,25 +86,25 @@ text_t
wordcount_i
## internal links, normalized (absolute URLs), as <a> - tag with anchor text and nofollow, textgen
attr_inboundlinks_tag
inboundlinks_tag_txt
## internal links, only the protocol
#attr_inboundlinks_protocol
#inboundlinks_protocol_txt
## internal links, the url only without the protocol
#attr_inboundlinks_urlstub
#inboundlinks_urlstub_txt
## internal links, the name property of the a-tag
#attr_inboundlinks_name
#inboundlinks_name_txt
## internal links, the rel property of the a-tag
#attr_inboundlinks_rel
#inboundlinks_rel_txt
## internal links, the rel property of the a-tag, coded binary
#attr_inboundlinks_relflags
#inboundlinks_relflags_txt
## internal links, the text content of the a-tag
#attr_inboundlinks_text
#inboundlinks_text_txt
## total number of inbound links, int
inboundlinkscount_i
@ -113,70 +113,70 @@ inboundlinkscount_i
inboundlinksnoindexcount_i
## external links, normalized (absolute URLs), as <a> - tag with anchor text and nofollow, textgen
attr_outboundlinks_tag
outboundlinks_tag_txt
## external links, only the protocol
#attr_outboundlinks_protocol
#outboundlinks_protocol_txt
## external links, the url only without the protocol
#attr_outboundlinks_urlstub
#outboundlinks_urlstub_txt
## external links, the name property of the a-tag
#attr_outboundlinks_name
#outboundlinks_name_txt
## external links, the rel property of the a-tag
#attr_outboundlinks_rel
#outboundlinks_rel_txt
## external links, the rel property of the a-tag, coded binary
#attr_outboundlinks_relflags
#outboundlinks_relflags_txt
## external links, the text content of the a-tag
#attr_outboundlinks_text
#outboundlinks_text_txt
## total number of external links, int
outboundlinks_i
outboundlinkscount_i
## number of external links with noindex tag, int
outboundlinksnoindexcount_i
## all image tags, encoded as <img> tag inclusive alt- and title property, textgen
attr_images_tag
images_tag_txt
## all image links without the protocol and '://'
#attr_images_urlstub
#images_urlstub_txt
## all image link protocols
#attr_images_protocol
#images_protocol_txt
## all image link alt tag
#attr_images_alt
#images_alt_txt
## number of images, int
imagescount_i
## h1 header, textgen
attr_h1
h1_txt
## h2 header, textgen
attr_h2
h2_txt
## h3 header, textgen
attr_h3
h3_txt
## h4 header, textgen
attr_h4
h4_txt
## h5 header, textgen
attr_h5
h5_txt
## h6 header, textgen
attr_h6
h6_txt
## binary pattern for the existence of h1..h6 headlines, int
htags_i
## all path elements in the url, textgen
attr_paths
paths_txt
## host of the url, string
host_s
@ -185,79 +185,80 @@ host_s
canonical_s
## all texts in <li> tags, textgen
attr_li
li_txt
## number of <li> tags, int
licount_i
## all texts inside of <b> or <strong> tags. no doubles. listed in the order of number of occurrences in decreasing order, textgen
attr_bold
bold_txt
## number of occurrences of texts in attr_bold, textgen
attr_boldcount
## number of occurrences of texts in bold_txt, textgen
#bold_val
## total number of occurrences of <b> or <strong>, int
bold_i
boldcount_i
## all texts inside of <i> tags. no doubles. listed in the order of number of occurrences in decreasing order, textgen
attr_italic
italic_txt
## number of occurrences of texts in attr_italic, textgen
attr_italiccount
## number of occurrences of texts in italic_txt, textgen
#italic_val
## total number of occurrences of <i>, int
italic_i
italiccount_i
## flag that shows if a swf file is linked, boolean
flash_b
## list of all links to frames, textgen
attr_frames
frames_txt
## number of attr_frames, int
framesscount_i
## list of all links to iframes, textgen
attr_iframes
iframes_txt
## number of attr_iframes, int
iframesscount_i
## names of cms attributes; if several are recognized then they are listed in decreasing order of number of matching criteria, textgen
attr_cms
#ext_cms_txt
## number of attributes that count for a specific cms in attr_cms, textgen
attr_cmscount
#ext_cms_val
## names of ad-servers/ad-services, textgen
attr_ads
#ext_ads_txt
## number of attributes counts in attr_ads, textgen
attr_adscount
#ext_ads_val
## names of recognized community functions, textgen
attr_community
#ext_community_txt
## number of attribute counts in attr_community, textgen
attr_communitycount
#ext_community_val
## names of map services, textgen
attr_maps
#ext_maps_txt
## number of attribute counts in attr_maps, textgen
attr_mapscount
#ext_maps_val
## names of tracker server, textgen
attr_tracker
#ext_tracker_txt
## number of attribute counts in attr_tracker, textgen
attr_trackercount
#ext_tracker_val
## names matching title expressions, textgen
attr_title
#ext_title_txt
## number of matching title expressions, textgen
attr_titlecount
#ext_title_val
## fail reason if a page was not loaded. if the page was loaded then this field is empty, text
failreason_t

View File

@ -44,6 +44,7 @@ import net.yacy.document.Document;
import net.yacy.document.parser.html.ContentScraper;
import net.yacy.document.parser.html.ImageEntry;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.logging.Log;
import org.apache.solr.common.SolrDocument;
import org.apache.solr.common.SolrInputDocument;
@ -65,71 +66,238 @@ public class SolrScheme extends ConfigurationSet {
*/
public SolrScheme(final File configurationFile) {
    super(configurationFile);
    // consistency check: every attribute name listed in the configuration
    // should correspond to a constant of the Field enum; unknown names are
    // only reported, they do not abort the construction
    for (final String attribute : this) {
        if (isKnownField(attribute)) continue;
        Log.logWarning("SolrScheme", "solr scheme file " + configurationFile.getAbsolutePath() + " defines unknown attribute '" + attribute + "'");
    }
    // the reverse check (known Field constants that the file omits) is
    // intentionally disabled:
    /*
    for (Field field: Field.values()) {
        if (!this.contains(field.name())) {
            Log.logWarning("SolrScheme", "solr scheme file " + configurationFile.getAbsolutePath() + " omits known attribute '" + field.name() + "'");
        }
    }
    */
}

/**
 * Tells whether the given attribute name is the name of a Field constant.
 * @param attribute the attribute name to look up
 * @return true if Field.valueOf accepts the name, false if it rejects it with an IllegalArgumentException
 */
private static boolean isKnownField(final String attribute) {
    try {
        Field.valueOf(attribute);
        return true;
    } catch (final IllegalArgumentException e) {
        return false;
    }
}
// addSolr overloads (String, Date, int, String[], float, boolean, and String with boost):
// each one writes the given value into solrdoc under the key's name, but only
// when this configuration is empty or contains that name; otherwise the value
// is silently dropped.
// NOTE(review): this view interleaves the former String-keyed signatures with the
// Field-keyed signatures that replace them - confirm against the actual file
// which variant is in effect.
private void addSolr(final SolrInputDocument solrdoc, final String key, final String value) {
if (isEmpty() || contains(key)) solrdoc.setField(key, value);
private void addSolr(final SolrInputDocument solrdoc, final Field key, final String value) {
if (isEmpty() || contains(key.name())) solrdoc.setField(key.name(), value);
}
private void addSolr(final SolrInputDocument solrdoc, final String key, final Date value) {
if (isEmpty() || contains(key)) solrdoc.setField(key, value);
private void addSolr(final SolrInputDocument solrdoc, final Field key, final Date value) {
if (isEmpty() || contains(key.name())) solrdoc.setField(key.name(), value);
}
private void addSolr(final SolrInputDocument solrdoc, final String key, final int value) {
if (isEmpty() || contains(key)) solrdoc.setField(key, value);
private void addSolr(final SolrInputDocument solrdoc, final Field key, final int value) {
if (isEmpty() || contains(key.name())) solrdoc.setField(key.name(), value);
}
private void addSolr(final SolrInputDocument solrdoc, final String key, final String[] value) {
if (isEmpty() || contains(key)) solrdoc.setField(key, value);
private void addSolr(final SolrInputDocument solrdoc, final Field key, final String[] value) {
if (isEmpty() || contains(key.name())) solrdoc.setField(key.name(), value);
}
private void addSolr(final SolrInputDocument solrdoc, final String key, final float value) {
if (isEmpty() || contains(key)) solrdoc.setField(key, value);
private void addSolr(final SolrInputDocument solrdoc, final Field key, final float value) {
if (isEmpty() || contains(key.name())) solrdoc.setField(key.name(), value);
}
private void addSolr(final SolrInputDocument solrdoc, final String key, final boolean value) {
if (isEmpty() || contains(key)) solrdoc.setField(key, value);
private void addSolr(final SolrInputDocument solrdoc, final Field key, final boolean value) {
if (isEmpty() || contains(key.name())) solrdoc.setField(key.name(), value);
}
// variant that additionally passes a boost factor to setField
private void addSolr(final SolrInputDocument solrdoc, final String key, final String value, final float boost) {
if (isEmpty() || contains(key)) solrdoc.setField(key, value, boost);
private void addSolr(final SolrInputDocument solrdoc, final Field key, final String value, final float boost) {
if (isEmpty() || contains(key.name())) solrdoc.setField(key.name(), value, boost);
}
/**
 * Value types that a {@link Field} can be declared with.
 * Every constant has a print name; it equals the constant name unless an
 * explicit one is given (integer prints as "int", bool as "boolean").
 */
public enum Types {
    string,
    text_general,
    text_en_splitting_tight,
    date,
    integer("int"),
    tdouble,
    bool("boolean");

    /** the name returned by {@link #printName()}; fixed at construction time */
    private final String printName;

    /**
     * Creates a type whose print name is the constant name itself.
     */
    private Types() {
        this.printName = this.name();
    }

    /**
     * Creates a type with an explicit print name.
     * @param printName the name to print instead of the constant name
     */
    private Types(final String printName) {
        this.printName = printName;
    }

    /**
     * @return the print name of this type
     */
    public String printName() {
        return this.printName;
    }
}
/**
 * The attributes that the solr scheme knows. The constant name is the name
 * of the attribute; each constant carries its value type and the flags
 * indexed, stored, multiValued and omitNorms. The two latter flags default
 * to false when a constant does not state them.
 */
public enum Field {
    id(Types.string, true, true),
    sku(Types.text_en_splitting_tight, true, true, false, true),
    ip_s(Types.string, true, true),
    host_s(Types.string, true, true),
    title(Types.text_general, true, true, true),
    author(Types.text_general, true, true),
    description(Types.text_general, true, true),
    content_type(Types.string, true, true, true),
    last_modified(Types.date, true, true),
    keywords(Types.text_general, true, true),
    text_t(Types.text_general, true, true),
    wordcount_i(Types.integer, true, true),
    paths_txt(Types.text_general, true, true, true),
    inboundlinkscount_i(Types.integer, true, true),
    inboundlinksnoindexcount_i(Types.integer, true, true),
    inboundlinks_tag_txt(Types.text_general, true, true, true),
    inboundlinks_protocol_txt(Types.text_general, true, true, true),
    inboundlinks_urlstub_txt(Types.text_general, true, true, true),
    inboundlinks_name_txt(Types.text_general, true, true, true),
    inboundlinks_rel_txt(Types.text_general, true, true, true),
    inboundlinks_relflags_txt(Types.text_general, true, true, true),
    inboundlinks_text_txt(Types.text_general, true, true, true),
    outboundlinkscount_i(Types.integer, true, true),
    outboundlinksnoindexcount_i(Types.integer, true, true),
    outboundlinks_tag_txt(Types.text_general, true, true, true),
    outboundlinks_protocol_txt(Types.text_general, true, true, true),
    outboundlinks_urlstub_txt(Types.text_general, true, true, true),
    outboundlinks_name_txt(Types.text_general, true, true, true),
    outboundlinks_rel_txt(Types.text_general, true, true, true),
    outboundlinks_relflags_txt(Types.text_general, true, true, true),
    outboundlinks_text_txt(Types.text_general, true, true, true),
    charset_s(Types.string, true, true),
    lon_coordinate(Types.tdouble, true, false),
    lat_coordinate(Types.tdouble, true, false),
    httpstatus_i(Types.integer, true, true),
    h1_txt(Types.text_general, true, true, true),
    h2_txt(Types.text_general, true, true, true),
    h3_txt(Types.text_general, true, true, true),
    h4_txt(Types.text_general, true, true, true),
    h5_txt(Types.text_general, true, true, true),
    h6_txt(Types.text_general, true, true, true),
    htags_i(Types.integer, true, true),
    canonical_s(Types.string, true, true),
    robots_i(Types.integer, true, true),
    metagenerator_t(Types.text_general, true, true),
    boldcount_i(Types.integer, true, true),
    bold_txt(Types.text_general, true, true, true),
    bold_val(Types.integer, true, true, true),
    italiccount_i(Types.integer, true, true),
    italic_txt(Types.text_general, true, true, true),
    italic_val(Types.integer, true, true, true),
    licount_i(Types.integer, true, true),
    li_txt(Types.text_general, true, true, true),
    imagescount_i(Types.integer, true, true),
    images_tag_txt(Types.text_general, true, true, true),
    images_protocol_txt(Types.text_general, true, true, true),
    images_urlstub_txt(Types.text_general, true, true, true),
    images_alt_txt(Types.text_general, true, true, true),
    csscount_i(Types.integer, true, true),
    css_tag_txt(Types.text_general, true, true, true),
    css_url_txt(Types.text_general, true, true, true),
    scripts_txt(Types.text_general, true, true, true),
    scriptscount_i(Types.integer, true, true),
    frames_txt(Types.text_general, true, true, true),
    framesscount_i(Types.integer, true, true),
    iframes_txt(Types.text_general, true, true, true),
    iframesscount_i(Types.integer, true, true),
    flash_b(Types.bool, true, true),
    responsetime_i(Types.integer, true, true),
    ext_cms_txt(Types.text_general, true, true, true),
    ext_cms_val(Types.integer, true, true, true),
    ext_ads_txt(Types.text_general, true, true, true),
    ext_ads_val(Types.integer, true, true, true),
    ext_community_txt(Types.text_general, true, true, true),
    ext_community_val(Types.integer, true, true, true),
    ext_maps_txt(Types.text_general, true, true, true),
    ext_maps_val(Types.integer, true, true, true),
    ext_tracker_txt(Types.text_general, true, true, true),
    ext_tracker_val(Types.integer, true, true, true),
    ext_title_txt(Types.text_general, true, true, true),
    ext_title_val(Types.integer, true, true, true),
    failreason_t(Types.text_general, true, true);

    // all properties are fixed per constant, therefore every field is final
    final Types type;
    final boolean indexed, stored;
    final boolean multiValued, omitNorms;

    /**
     * Declares a single-valued field that keeps norms
     * (multiValued and omitNorms are both false).
     */
    private Field(final Types type, final boolean indexed, final boolean stored) {
        this(type, indexed, stored, false, false);
    }

    /**
     * Declares a field with an explicit multiValued flag; omitNorms is false.
     */
    private Field(final Types type, final boolean indexed, final boolean stored, final boolean multiValued) {
        this(type, indexed, stored, multiValued, false);
    }

    /**
     * Canonical constructor; the shorter constructors delegate to it.
     * @param type the value type of the field
     * @param indexed value returned by {@link #isIndexed()}
     * @param stored value returned by {@link #isStored()}
     * @param multiValued value returned by {@link #isMultiValued()}
     * @param omitNorms value returned by {@link #isOmitNorms()}
     */
    private Field(final Types type, final boolean indexed, final boolean stored, final boolean multiValued, final boolean omitNorms) {
        this.type = type;
        this.indexed = indexed;
        this.stored = stored;
        this.multiValued = multiValued;
        this.omitNorms = omitNorms;
    }

    /** @return the value type of this field */
    public final Types getType() {
        return this.type;
    }

    /** @return the indexed flag of this field */
    public final boolean isIndexed() {
        return this.indexed;
    }

    /** @return the stored flag of this field */
    public final boolean isStored() {
        return this.stored;
    }

    /** @return the multiValued flag of this field */
    public final boolean isMultiValued() {
        return this.multiValued;
    }

    /** @return the omitNorms flag of this field */
    public final boolean isOmitNorms() {
        return this.omitNorms;
    }
}
public SolrInputDocument yacy2solr(final String id, final ResponseHeader header, final Document yacydoc) {
// we use the SolrCell design as index scheme
final SolrInputDocument solrdoc = new SolrInputDocument();
final DigestURI digestURI = new DigestURI(yacydoc.dc_source());
addSolr(solrdoc, "failreason_t", ""); // overwrite a possible fail reason (in case that there was a fail reason before)
addSolr(solrdoc, "id", id);
addSolr(solrdoc, "sku", digestURI.toNormalform(true, false), 3.0f);
addSolr(solrdoc, Field.failreason_t, ""); // overwrite a possible fail reason (in case that there was a fail reason before)
addSolr(solrdoc, Field.id, id);
addSolr(solrdoc, Field.sku, digestURI.toNormalform(true, false), 3.0f);
final InetAddress address = digestURI.getInetAddress();
if (address != null) addSolr(solrdoc, "ip_s", address.getHostAddress());
if (digestURI.getHost() != null) addSolr(solrdoc, "host_s", digestURI.getHost());
addSolr(solrdoc, "title", yacydoc.dc_title());
addSolr(solrdoc, "author", yacydoc.dc_creator());
addSolr(solrdoc, "description", yacydoc.dc_description());
addSolr(solrdoc, "content_type", yacydoc.dc_format());
addSolr(solrdoc, "last_modified", header.lastModified());
addSolr(solrdoc, "keywords", yacydoc.dc_subject(' '));
if (address != null) addSolr(solrdoc, Field.ip_s, address.getHostAddress());
if (digestURI.getHost() != null) addSolr(solrdoc, Field.host_s, digestURI.getHost());
addSolr(solrdoc, Field.title, yacydoc.dc_title());
addSolr(solrdoc, Field.author, yacydoc.dc_creator());
addSolr(solrdoc, Field.description, yacydoc.dc_description());
addSolr(solrdoc, Field.content_type, yacydoc.dc_format());
addSolr(solrdoc, Field.last_modified, header.lastModified());
addSolr(solrdoc, Field.keywords, yacydoc.dc_subject(' '));
final String content = UTF8.String(yacydoc.getTextBytes());
addSolr(solrdoc, "text_t", content);
if (isEmpty() || contains("wordcount_i")) {
addSolr(solrdoc, Field.text_t, content);
if (isEmpty() || contains(Field.wordcount_i.name())) {
final int contentwc = content.split(" ").length;
addSolr(solrdoc, "wordcount_i", contentwc);
addSolr(solrdoc, Field.wordcount_i, contentwc);
}
// path elements of link
final String path = digestURI.getPath();
if (path != null && (isEmpty() || contains("attr_paths"))) {
if (path != null && (isEmpty() || contains(Field.paths_txt.name()))) {
final String[] paths = path.split("/");
if (paths.length > 0) addSolr(solrdoc, "attr_paths", paths);
if (paths.length > 0) addSolr(solrdoc, Field.paths_txt, paths);
}
// list all links
final Map<MultiProtocolURI, Properties> alllinks = yacydoc.getAnchors();
int c = 0;
if (isEmpty() || contains("inboundlinkscount_i")) addSolr(solrdoc, "inboundlinkscount_i", yacydoc.inboundLinkCount());
if (isEmpty() || contains("inboundlinksnoindexcount_i")) addSolr(solrdoc, "inboundlinksnoindexcount_i", yacydoc.inboundLinkNoindexCount());
if (isEmpty() || contains(Field.inboundlinkscount_i.name())) addSolr(solrdoc, Field.inboundlinkscount_i, yacydoc.inboundLinkCount());
if (isEmpty() || contains(Field.inboundlinksnoindexcount_i.name())) addSolr(solrdoc, Field.inboundlinksnoindexcount_i, yacydoc.inboundLinkNoindexCount());
final String[] inboundlinksTag = new String[yacydoc.inboundLinkCount()];
final String[] inboundlinksURLProtocol = new String[yacydoc.inboundLinkCount()];
final String[] inboundlinksURLStub = new String[yacydoc.inboundLinkCount()];
@ -156,17 +324,17 @@ public class SolrScheme extends ConfigurationSet {
((text.length() > 0) ? text : "") + "</a>";
c++;
}
if (isEmpty() || contains("attr_inboundlinks_tag")) addSolr(solrdoc, "attr_inboundlinks_tag", inboundlinksTag);
if (isEmpty() || contains("attr_inboundlinks_protocol")) addSolr(solrdoc, "attr_inboundlinks_protocol", inboundlinksURLProtocol);
if (isEmpty() || contains("attr_inboundlinks_urlstub")) addSolr(solrdoc, "attr_inboundlinks_urlstub", inboundlinksURLStub);
if (isEmpty() || contains("attr_inboundlinks_name")) addSolr(solrdoc, "attr_inboundlinks_name", inboundlinksName);
if (isEmpty() || contains("attr_inboundlinks_rel")) addSolr(solrdoc, "attr_inboundlinks_rel", inboundlinksRel);
if (isEmpty() || contains("attr_inboundlinks_relflags")) addSolr(solrdoc, "attr_inboundlinks_relflags", relEval(inboundlinksRel));
if (isEmpty() || contains("attr_inboundlinks_text")) addSolr(solrdoc, "attr_inboundlinks_text", inboundlinksText);
if (isEmpty() || contains(Field.inboundlinks_tag_txt.name())) addSolr(solrdoc, Field.inboundlinks_tag_txt, inboundlinksTag);
if (isEmpty() || contains(Field.inboundlinks_protocol_txt.name())) addSolr(solrdoc, Field.inboundlinks_protocol_txt, inboundlinksURLProtocol);
if (isEmpty() || contains(Field.inboundlinks_urlstub_txt.name())) addSolr(solrdoc, Field.inboundlinks_urlstub_txt, inboundlinksURLStub);
if (isEmpty() || contains(Field.inboundlinks_name_txt.name())) addSolr(solrdoc, Field.inboundlinks_name_txt, inboundlinksName);
if (isEmpty() || contains(Field.inboundlinks_rel_txt.name())) addSolr(solrdoc, Field.inboundlinks_rel_txt, inboundlinksRel);
if (isEmpty() || contains(Field.inboundlinks_relflags_txt.name())) addSolr(solrdoc, Field.inboundlinks_relflags_txt, relEval(inboundlinksRel));
if (isEmpty() || contains(Field.inboundlinks_text_txt.name())) addSolr(solrdoc, Field.inboundlinks_text_txt, inboundlinksText);
c = 0;
if (isEmpty() || contains("outboundlinkscount_i")) addSolr(solrdoc, "outboundlinkscount_i", yacydoc.outboundLinkCount());
if (isEmpty() || contains("outboundlinksnoindexcount_i")) addSolr(solrdoc, "outboundlinksnoindexcount_i", yacydoc.outboundLinkNoindexCount());
if (isEmpty() || contains(Field.outboundlinkscount_i.name())) addSolr(solrdoc, Field.outboundlinkscount_i, yacydoc.outboundLinkCount());
if (isEmpty() || contains(Field.outboundlinksnoindexcount_i.name())) addSolr(solrdoc, Field.outboundlinksnoindexcount_i, yacydoc.outboundLinkNoindexCount());
final String[] outboundlinksTag = new String[yacydoc.outboundLinkCount()];
final String[] outboundlinksURLProtocol = new String[yacydoc.outboundLinkCount()];
final String[] outboundlinksURLStub = new String[yacydoc.outboundLinkCount()];
@ -193,24 +361,24 @@ public class SolrScheme extends ConfigurationSet {
((text.length() > 0) ? text : "") + "</a>";
c++;
}
if (isEmpty() || contains("attr_outboundlinks_tag")) addSolr(solrdoc, "attr_outboundlinks_tag", outboundlinksTag);
if (isEmpty() || contains("attr_outboundlinks_protocol")) addSolr(solrdoc, "attr_outboundlinks_protocol", outboundlinksURLProtocol);
if (isEmpty() || contains("attr_outboundlinks_urlstub")) addSolr(solrdoc, "attr_outboundlinks_urlstub", outboundlinksURLStub);
if (isEmpty() || contains("attr_outboundlinks_name")) addSolr(solrdoc, "attr_outboundlinks_name", outboundlinksName);
if (isEmpty() || contains("attr_outboundlinks_rel")) addSolr(solrdoc, "attr_outboundlinks_rel", outboundlinksRel);
if (isEmpty() || contains("attr_outboundlinks_relflags")) addSolr(solrdoc, "attr_outboundlinks_relflags", relEval(inboundlinksRel));
if (isEmpty() || contains("attr_outboundlinks_text")) addSolr(solrdoc, "attr_outboundlinks_text", outboundlinksText);
if (isEmpty() || contains(Field.outboundlinks_tag_txt.name())) addSolr(solrdoc, Field.outboundlinks_tag_txt, outboundlinksTag);
if (isEmpty() || contains(Field.outboundlinks_protocol_txt.name())) addSolr(solrdoc, Field.outboundlinks_protocol_txt, outboundlinksURLProtocol);
if (isEmpty() || contains(Field.outboundlinks_urlstub_txt.name())) addSolr(solrdoc, Field.outboundlinks_urlstub_txt, outboundlinksURLStub);
if (isEmpty() || contains(Field.outboundlinks_name_txt.name())) addSolr(solrdoc, Field.outboundlinks_name_txt, outboundlinksName);
if (isEmpty() || contains(Field.outboundlinks_rel_txt.name())) addSolr(solrdoc, Field.outboundlinks_rel_txt, outboundlinksRel);
// the rel flags of the outbound links must be derived from the outbound rel values (the inbound array was used here by mistake)
if (isEmpty() || contains(Field.outboundlinks_relflags_txt.name())) addSolr(solrdoc, Field.outboundlinks_relflags_txt, relEval(outboundlinksRel));
if (isEmpty() || contains(Field.outboundlinks_text_txt.name())) addSolr(solrdoc, Field.outboundlinks_text_txt, outboundlinksText);
// charset
addSolr(solrdoc, "charset_s", yacydoc.getCharset());
addSolr(solrdoc, Field.charset_s, yacydoc.getCharset());
// coordinates
if (yacydoc.lat() != 0.0f && yacydoc.lon() != 0.0f) {
addSolr(solrdoc, "lon_coordinate", yacydoc.lon());
addSolr(solrdoc, "lat_coordinate", yacydoc.lat());
addSolr(solrdoc, Field.lon_coordinate, yacydoc.lon());
addSolr(solrdoc, Field.lat_coordinate, yacydoc.lat());
}
addSolr(solrdoc, "httpstatus_i", 200);
addSolr(solrdoc, Field.httpstatus_i, 200);
final Object parser = yacydoc.getParserObject();
if (parser instanceof ContentScraper) {
final ContentScraper html = (ContentScraper) parser;
@ -218,16 +386,19 @@ public class SolrScheme extends ConfigurationSet {
// header tags
int h = 0;
int f = 1;
for (int i = 1; i <= 6; i++) {
final String[] hs = html.getHeadlines(i);
h = h | (hs.length > 0 ? f : 0);
f = f * 2;
addSolr(solrdoc, "attr_h" + i, hs);
}
addSolr(solrdoc, "htags_i", h);
String[] hs;
hs = html.getHeadlines(1); h = h | (hs.length > 0 ? f : 0); f = f * 2; addSolr(solrdoc, Field.h1_txt, hs);
hs = html.getHeadlines(2); h = h | (hs.length > 0 ? f : 0); f = f * 2; addSolr(solrdoc, Field.h2_txt, hs);
hs = html.getHeadlines(3); h = h | (hs.length > 0 ? f : 0); f = f * 2; addSolr(solrdoc, Field.h3_txt, hs);
hs = html.getHeadlines(4); h = h | (hs.length > 0 ? f : 0); f = f * 2; addSolr(solrdoc, Field.h4_txt, hs);
hs = html.getHeadlines(5); h = h | (hs.length > 0 ? f : 0); f = f * 2; addSolr(solrdoc, Field.h5_txt, hs);
hs = html.getHeadlines(6); h = h | (hs.length > 0 ? f : 0); f = f * 2; addSolr(solrdoc, Field.h6_txt, hs);
addSolr(solrdoc, Field.htags_i, h);
// canonical tag
if (html.getCanonical() != null) addSolr(solrdoc, "canonical_s", html.getCanonical().toNormalform(false, false));
if (html.getCanonical() != null) addSolr(solrdoc, Field.canonical_s, html.getCanonical().toNormalform(false, false));
// noindex and nofollow attributes
// from HTML (meta-tag in HTML header: robots)
@ -261,32 +432,32 @@ public class SolrScheme extends ConfigurationSet {
if (x_robots_tag.indexOf("nofollow",0) >= 0) b += 2048; // set bit 11
if (x_robots_tag.indexOf("unavailable_after",0) >=0) b += 4096; // set bit 12
}
addSolr(solrdoc, "robots_i", b);
addSolr(solrdoc, Field.robots_i, b);
// meta tags: generator
final String generator = html.getMetas().get("generator");
if (generator != null) addSolr(solrdoc, "metagenerator_t", generator);
if (generator != null) addSolr(solrdoc, Field.metagenerator_t, generator);
// bold, italic
final String[] bold = html.getBold();
addSolr(solrdoc, "boldcount_i", bold.length);
addSolr(solrdoc, Field.boldcount_i, bold.length);
if (bold.length > 0) {
addSolr(solrdoc, "attr_bold", bold);
if (isEmpty() || contains("attr_boldcount")) {
addSolr(solrdoc, "attr_boldcount", html.getBoldCount(bold));
addSolr(solrdoc, Field.bold_txt, bold);
if (isEmpty() || contains(Field.bold_val.name())) {
addSolr(solrdoc, Field.bold_val, html.getBoldCount(bold));
}
}
final String[] italic = html.getItalic();
addSolr(solrdoc, "italiccount_i", italic.length);
addSolr(solrdoc, Field.italiccount_i, italic.length);
if (italic.length > 0) {
addSolr(solrdoc, "attr_italic", italic);
if (isEmpty() || contains("attr_italiccount")) {
addSolr(solrdoc, "attr_italiccount", html.getItalicCount(italic));
addSolr(solrdoc, Field.italic_txt, italic);
if (isEmpty() || contains(Field.italic_val.name())) {
addSolr(solrdoc, Field.italic_val, html.getItalicCount(italic));
}
}
final String[] li = html.getLi();
addSolr(solrdoc, "licount_i", li.length);
if (li.length > 0) addSolr(solrdoc, "attr_li", li);
addSolr(solrdoc, Field.licount_i, li.length);
if (li.length > 0) addSolr(solrdoc, Field.li_txt, li);
// images
final Collection<ImageEntry> imagesc = html.getImages().values();
@ -303,14 +474,14 @@ public class SolrScheme extends ConfigurationSet {
imgalts[c] = ie.alt();
c++;
}
addSolr(solrdoc, "imagescount_i", imgtags.length);
if (isEmpty() || contains("attr_images_tag")) addSolr(solrdoc, "attr_images_tag", imgtags);
if (isEmpty() || contains("attr_images_protocol")) addSolr(solrdoc, "attr_images_protocol", imgprots);
if (isEmpty() || contains("attr_images_urlstub")) addSolr(solrdoc, "attr_images_urlstub", imgstubs);
if (isEmpty() || contains("attr_images_alt")) addSolr(solrdoc, "attr_images_alt", imgalts);
addSolr(solrdoc, Field.imagescount_i, imgtags.length);
if (isEmpty() || contains(Field.images_tag_txt.name())) addSolr(solrdoc, Field.images_tag_txt, imgtags);
if (isEmpty() || contains(Field.images_protocol_txt.name())) addSolr(solrdoc, Field.images_protocol_txt, imgprots);
if (isEmpty() || contains(Field.images_urlstub_txt.name())) addSolr(solrdoc, Field.images_urlstub_txt, imgstubs);
if (isEmpty() || contains(Field.images_alt_txt.name())) addSolr(solrdoc, Field.images_alt_txt, imgalts);
// style sheets
if (isEmpty() || contains("attr_css")) {
if (isEmpty() || contains("css_txt")) {
final Map<MultiProtocolURI, String> csss = html.getCSS();
final String[] css_tag = new String[csss.size()];
final String[] css_url = new String[csss.size()];
@ -323,63 +494,64 @@ public class SolrScheme extends ConfigurationSet {
css_url[c] = url;
c++;
}
addSolr(solrdoc, "csscount_i", css_tag.length);
if (css_tag.length > 0) addSolr(solrdoc, "attr_css_tag", css_tag);
if (css_url.length > 0) addSolr(solrdoc, "attr_css_url", css_url);
addSolr(solrdoc, Field.csscount_i, css_tag.length);
if (css_tag.length > 0) addSolr(solrdoc, Field.css_tag_txt, css_tag);
if (css_url.length > 0) addSolr(solrdoc, Field.css_url_txt, css_url);
}
// Scripts
if (isEmpty() || contains("attr_scripts")) {
if (isEmpty() || contains(Field.scripts_txt.name())) {
final Set<MultiProtocolURI> scriptss = html.getScript();
final String[] scripts = new String[scriptss.size()];
c = 0;
for (final MultiProtocolURI url: scriptss) {
scripts[c++] = url.toNormalform(false, false, false, false);
}
addSolr(solrdoc, "scriptscount_i", scripts.length);
if (scripts.length > 0) addSolr(solrdoc, "attr_scripts", scripts);
addSolr(solrdoc, Field.scriptscount_i, scripts.length);
if (scripts.length > 0) addSolr(solrdoc, Field.scripts_txt, scripts);
}
// Frames
if (isEmpty() || contains("attr_frames")) {
if (isEmpty() || contains(Field.frames_txt.name())) {
final Set<MultiProtocolURI> framess = html.getFrames();
final String[] frames = new String[framess.size()];
c = 0;
for (final MultiProtocolURI entry: framess) {
frames[c++] = entry.toNormalform(false, false, false, false);
}
addSolr(solrdoc, "framesscount_i", frames.length);
if (frames.length > 0) addSolr(solrdoc, "attr_frames", frames);
addSolr(solrdoc, Field.framesscount_i, frames.length);
if (frames.length > 0) addSolr(solrdoc, Field.frames_txt, frames);
}
// IFrames
if (isEmpty() || contains("attr_iframes")) {
if (isEmpty() || contains(Field.iframes_txt.name()
)) {
final Set<MultiProtocolURI> iframess = html.getIFrames();
final String[] iframes = new String[iframess.size()];
c = 0;
for (final MultiProtocolURI entry: iframess) {
iframes[c++] = entry.toNormalform(false, false, false, false);
}
addSolr(solrdoc, "iframesscount_i", iframes.length);
if (iframes.length > 0) addSolr(solrdoc, "attr_iframes", iframes);
addSolr(solrdoc, Field.iframesscount_i, iframes.length);
if (iframes.length > 0) addSolr(solrdoc, Field.iframes_txt, iframes);
}
// flash embedded
addSolr(solrdoc, "flash_b", html.containsFlash());
addSolr(solrdoc, Field.flash_b, html.containsFlash());
// generic evaluation pattern
for (final String model: html.getEvaluationModelNames()) {
if (isEmpty() || contains("attr_" + model)) {
if (isEmpty() || contains("ext_" + model + "_txt")) {
final String[] scorenames = html.getEvaluationModelScoreNames(model);
if (scorenames.length > 0) {
addSolr(solrdoc, "attr_" + model, scorenames);
addSolr(solrdoc, "attr_" + model + "count", html.getEvaluationModelScoreCounts(model, scorenames));
addSolr(solrdoc, Field.valueOf("ext_" + model + "_txt"), scorenames);
addSolr(solrdoc, Field.valueOf("ext_" + model + "_val"), html.getEvaluationModelScoreCounts(model, scorenames));
}
}
}
// response time
addSolr(solrdoc, "responsetime_i", header.get(HeaderFramework.RESPONSE_TIME_MILLIS, "0"));
addSolr(solrdoc, Field.responsetime_i, header.get(HeaderFramework.RESPONSE_TIME_MILLIS, "0"));
}
return solrdoc;
}

View File

@ -143,6 +143,7 @@ public class SolrSingleConnector implements SolrConnector {
/**
 * Requests termination by clearing the shallRun flag; run() loops only
 * while that flag is set.
 */
public void pleaseStop() {
this.shallRun = false;
}
@Override
public void run() {
while (this.shallRun) {
if (SolrSingleConnector.this.transmissionQueue[this.idx].size() > 0) {
@ -165,6 +166,7 @@ public class SolrSingleConnector implements SolrConnector {
}
}
@Override
public void close() {
for (int i = 0; i < transmissionQueueCount; i++) {
if (this.transmissionWorker[i].isAlive()) {
@ -204,6 +206,7 @@ public class SolrSingleConnector implements SolrConnector {
* delete everything in the solr index
* @throws IOException
*/
@Override
public void clear() throws IOException {
try {
this.server.deleteByQuery("*:*");
@ -213,6 +216,7 @@ public class SolrSingleConnector implements SolrConnector {
}
}
@Override
public void delete(final String id) throws IOException {
try {
this.server.deleteById(id);
@ -221,6 +225,7 @@ public class SolrSingleConnector implements SolrConnector {
}
}
@Override
public void delete(final List<String> ids) throws IOException {
try {
this.server.deleteById(ids);
@ -229,6 +234,7 @@ public class SolrSingleConnector implements SolrConnector {
}
}
@Override
public boolean exists(final String id) throws IOException {
try {
final SolrDocumentList list = get("id:" + id, 0, 1);
@ -254,10 +260,12 @@ public class SolrSingleConnector implements SolrConnector {
}
}
/**
 * Converts the given document into a solr input document using the
 * scheme's yacy2solr method and passes the result on to
 * add(SolrInputDocument).
 * @param id the id handed to yacy2solr
 * @param header the response header handed to yacy2solr
 * @param doc the document to convert and add
 * @throws IOException
 * @throws SolrException
 */
@Override
public void add(final String id, final ResponseHeader header, final Document doc) throws IOException, SolrException {
add(this.scheme.yacy2solr(id, header, doc));
}
@Override
public void add(final SolrInputDocument solrdoc) throws IOException, SolrException {
int thisrrc = this.transmissionRoundRobinCounter;
int nextrrc = thisrrc++;
@ -284,11 +292,15 @@ public class SolrSingleConnector implements SolrConnector {
req.add( docs );
UpdateResponse rsp = req.process( server );
*/
} catch (final SolrException e) {
// the field is probably not known
Log.logWarning("SolrConnector", e.getMessage());
} catch (final Throwable e) {
throw new IOException(e);
}
}
@Override
public void err(final DigestURI digestURI, final String failReason, final int httpstatus) throws IOException {
final SolrInputDocument solrdoc = new SolrInputDocument();
@ -330,6 +342,7 @@ public class SolrSingleConnector implements SolrConnector {
* @param querystring
* @throws IOException
*/
@Override
public SolrDocumentList get(final String querystring, final int offset, final int count) throws IOException {
// construct query
final SolrQuery query = new SolrQuery();