mirror of
https://github.com/yacy/yacy_search_server.git
synced 2024-09-19 00:01:41 +02:00
updated solr scheme: generic declaration of solr schemes
This commit is contained in:
parent
254adea51c
commit
987b412491
|
@ -46,16 +46,16 @@ keywords
|
|||
charset_s
|
||||
|
||||
## tags of css entries, normalized with absolute URL, textgen
|
||||
attr_css_tag
|
||||
css_tag_txt
|
||||
|
||||
## urls of css entries, normalized with absolute URL, textgen
|
||||
attr_css_url
|
||||
css_url_txt
|
||||
|
||||
## number of css entries, int
|
||||
csscount_i
|
||||
|
||||
## urls of script entries, normalized with absolute URL, textgen
|
||||
attr_scripts
|
||||
scripts_txt
|
||||
|
||||
## number of script entries, int
|
||||
scriptscount_i
|
||||
|
@ -86,25 +86,25 @@ text_t
|
|||
wordcount_i
|
||||
|
||||
## internal links, normalized (absolute URLs), as <a> - tag with anchor text and nofollow, textgen
|
||||
attr_inboundlinks_tag
|
||||
inboundlinks_tag_txt
|
||||
|
||||
## internal links, only the protocol
|
||||
#attr_inboundlinks_protocol
|
||||
#inboundlinks_protocol_txt
|
||||
|
||||
## internal links, the url only without the protocol
|
||||
#attr_inboundlinks_urlstub
|
||||
#inboundlinks_urlstub_txt
|
||||
|
||||
## internal links, the name property of the a-tag
|
||||
#attr_inboundlinks_name
|
||||
#inboundlinks_name_txt
|
||||
|
||||
## internal links, the rel property of the a-tag
|
||||
#attr_inboundlinks_rel
|
||||
#inboundlinks_rel_txt
|
||||
|
||||
## internal links, the rel property of the a-tag, coded binary
|
||||
#attr_inboundlinks_relflags
|
||||
#inboundlinks_relflags_txt
|
||||
|
||||
## internal links, the text content of the a-tag
|
||||
#attr_inboundlinks_text
|
||||
#inboundlinks_text_txt
|
||||
|
||||
## total number of inbound links, int
|
||||
inboundlinkscount_i
|
||||
|
@ -113,70 +113,70 @@ inboundlinkscount_i
|
|||
inboundlinksnoindexcount_i
|
||||
|
||||
## external links, normalized (absolute URLs), as <a> - tag with anchor text and nofollow, textgen
|
||||
attr_outboundlinks_tag
|
||||
outboundlinks_tag_txt
|
||||
|
||||
## external links, only the protocol
|
||||
#attr_outboundlinks_protocol
|
||||
#outboundlinks_protocol_txt
|
||||
|
||||
## external links, the url only without the protocol
|
||||
#attr_outboundlinks_urlstub
|
||||
#outboundlinks_urlstub_txt
|
||||
|
||||
## external links, the name property of the a-tag
|
||||
#attr_outboundlinks_name
|
||||
#outboundlinks_name_txt
|
||||
|
||||
## external links, the rel property of the a-tag
|
||||
#attr_outboundlinks_rel
|
||||
#outboundlinks_rel_txt
|
||||
|
||||
## external links, the rel property of the a-tag, coded binary
|
||||
#attr_outboundlinks_relflags
|
||||
#outboundlinks_relflags_txt
|
||||
|
||||
## external links, the text content of the a-tag
|
||||
#attr_outboundlinks_text
|
||||
#outboundlinks_text_txt
|
||||
|
||||
## total number of outbound links, int
|
||||
outboundlinks_i
|
||||
outboundlinkscount_i
|
||||
|
||||
## number of external links with noindex tag, int
|
||||
outboundlinksnoindexcount_i
|
||||
|
||||
## all image tags, encoded as <img> tag including the alt and title properties, textgen
|
||||
attr_images_tag
|
||||
images_tag_txt
|
||||
|
||||
## all image links without the protocol and '://'
|
||||
#attr_images_urlstub
|
||||
#images_urlstub_txt
|
||||
|
||||
## all image link protocols
|
||||
#attr_images_protocol
|
||||
#images_protocol_txt
|
||||
|
||||
## all image link alt tag
|
||||
#attr_images_alt
|
||||
#images_alt_txt
|
||||
|
||||
## number of images, int
|
||||
imagescount_i
|
||||
|
||||
## h1 header, textgen
|
||||
attr_h1
|
||||
h1_txt
|
||||
|
||||
## h2 header, textgen
|
||||
attr_h2
|
||||
h2_txt
|
||||
|
||||
## h3 header, textgen
|
||||
attr_h3
|
||||
h3_txt
|
||||
|
||||
## h4 header, textgen
|
||||
attr_h4
|
||||
h4_txt
|
||||
|
||||
## h5 header, textgen
|
||||
attr_h5
|
||||
h5_txt
|
||||
|
||||
## h6 header, textgen
|
||||
attr_h6
|
||||
h6_txt
|
||||
|
||||
## binary pattern for the existence of h1..h6 headlines, int
|
||||
htags_i
|
||||
|
||||
## all path elements in the url, textgen
|
||||
attr_paths
|
||||
paths_txt
|
||||
|
||||
## host of the url, string
|
||||
host_s
|
||||
|
@ -185,79 +185,80 @@ host_s
|
|||
canonical_s
|
||||
|
||||
## all texts in <li> tags, textgen
|
||||
attr_li
|
||||
li_txt
|
||||
|
||||
## number of <li> tags, int
|
||||
licount_i
|
||||
|
||||
## all texts inside of <b> or <strong> tags. no doubles. listed in the order of number of occurrences in decreasing order, textgen
|
||||
attr_bold
|
||||
bold_txt
|
||||
|
||||
## number of occurrences of texts in attr_bold, textgen
|
||||
attr_boldcount
|
||||
## number of occurrences of texts in bold_txt, textgen
|
||||
#bold_val
|
||||
|
||||
## total number of occurrences of <b> or <strong>, int
|
||||
bold_i
|
||||
boldcount_i
|
||||
|
||||
## all texts inside of <i> tags. no doubles. listed in the order of number of occurrences in decreasing order, textgen
|
||||
attr_italic
|
||||
italic_txt
|
||||
|
||||
## number of occurrences of texts in attr_italic, textgen
|
||||
attr_italiccount
|
||||
## number of occurrences of texts in italic_txt, textgen
|
||||
#italic_val
|
||||
|
||||
## total number of occurrences of <i>, int
|
||||
italic_i
|
||||
italiccount_i
|
||||
|
||||
## flag that shows if a swf file is linked, boolean
|
||||
flash_b
|
||||
|
||||
## list of all links to frames, textgen
|
||||
attr_frames
|
||||
frames_txt
|
||||
|
||||
## number of attr_frames, int
|
||||
framesscount_i
|
||||
|
||||
## list of all links to iframes, textgen
|
||||
attr_iframes
|
||||
iframes_txt
|
||||
|
||||
## number of attr_iframes, int
|
||||
iframesscount_i
|
||||
|
||||
## names of cms attributes; if several are recognized then they are listed in decreasing order of number of matching criteria, textgen
|
||||
attr_cms
|
||||
#ext_cms_txt
|
||||
|
||||
##number of attributes that count for a specific cms in attr_cms, textgen
|
||||
attr_cmscount
|
||||
#ext_cms_val
|
||||
|
||||
## names of ad-servers/ad-services, textgen
|
||||
attr_ads
|
||||
#ext_ads_txt
|
||||
|
||||
## number of attributes counts in attr_ads, textgen
|
||||
attr_adscount
|
||||
#ext_ads_val
|
||||
|
||||
## names of recognized community functions, textgen
|
||||
attr_community
|
||||
#ext_community_txt
|
||||
|
||||
## number of attribute counts in attr_community, textgen
|
||||
attr_communitycount
|
||||
#ext_community_val
|
||||
|
||||
## names of map services, textgen
|
||||
attr_maps
|
||||
#ext_maps_txt
|
||||
|
||||
## number of attribute counts in attr_maps, textgen
|
||||
attr_mapscount
|
||||
#ext_maps_val
|
||||
|
||||
## names of tracker server, textgen
|
||||
attr_tracker
|
||||
#ext_tracker_txt
|
||||
|
||||
## number of attribute counts in attr_tracker, textgen
|
||||
attr_trackercount
|
||||
#ext_tracker_val
|
||||
|
||||
## names matching title expressions, textgen
|
||||
attr_title
|
||||
#ext_title_txt
|
||||
|
||||
## number of matching title expressions, textgen
|
||||
attr_titlecount
|
||||
#ext_title_val
|
||||
|
||||
|
||||
## fail reason if a page was not loaded. if the page was loaded then this field is empty, text
|
||||
failreason_t
|
||||
|
|
|
@ -44,6 +44,7 @@ import net.yacy.document.Document;
|
|||
import net.yacy.document.parser.html.ContentScraper;
|
||||
import net.yacy.document.parser.html.ImageEntry;
|
||||
import net.yacy.kelondro.data.meta.DigestURI;
|
||||
import net.yacy.kelondro.logging.Log;
|
||||
|
||||
import org.apache.solr.common.SolrDocument;
|
||||
import org.apache.solr.common.SolrInputDocument;
|
||||
|
@ -65,71 +66,238 @@ public class SolrScheme extends ConfigurationSet {
|
|||
*/
|
||||
public SolrScheme(final File configurationFile) {
|
||||
super(configurationFile);
|
||||
// check consistency: compare with Field enum
|
||||
for (String name: this) {
|
||||
try {
|
||||
Field.valueOf(name);
|
||||
} catch (IllegalArgumentException e) {
|
||||
Log.logWarning("SolrScheme", "solr scheme file " + configurationFile.getAbsolutePath() + " defines unknown attribute '" + name + "'");
|
||||
}
|
||||
}
|
||||
/*
|
||||
for (Field field: Field.values()) {
|
||||
if (!this.contains(field.name())) {
|
||||
Log.logWarning("SolrScheme", "solr scheme file " + configurationFile.getAbsolutePath() + " omits known attribute '" + field.name() + "'");
|
||||
}
|
||||
}
|
||||
*/
|
||||
}
|
||||
|
||||
private void addSolr(final SolrInputDocument solrdoc, final String key, final String value) {
|
||||
if (isEmpty() || contains(key)) solrdoc.setField(key, value);
|
||||
private void addSolr(final SolrInputDocument solrdoc, final Field key, final String value) {
|
||||
if (isEmpty() || contains(key.name())) solrdoc.setField(key.name(), value);
|
||||
}
|
||||
|
||||
private void addSolr(final SolrInputDocument solrdoc, final String key, final Date value) {
|
||||
if (isEmpty() || contains(key)) solrdoc.setField(key, value);
|
||||
private void addSolr(final SolrInputDocument solrdoc, final Field key, final Date value) {
|
||||
if (isEmpty() || contains(key.name())) solrdoc.setField(key.name(), value);
|
||||
}
|
||||
|
||||
private void addSolr(final SolrInputDocument solrdoc, final String key, final int value) {
|
||||
if (isEmpty() || contains(key)) solrdoc.setField(key, value);
|
||||
private void addSolr(final SolrInputDocument solrdoc, final Field key, final int value) {
|
||||
if (isEmpty() || contains(key.name())) solrdoc.setField(key.name(), value);
|
||||
}
|
||||
|
||||
private void addSolr(final SolrInputDocument solrdoc, final String key, final String[] value) {
|
||||
if (isEmpty() || contains(key)) solrdoc.setField(key, value);
|
||||
private void addSolr(final SolrInputDocument solrdoc, final Field key, final String[] value) {
|
||||
if (isEmpty() || contains(key.name())) solrdoc.setField(key.name(), value);
|
||||
}
|
||||
|
||||
private void addSolr(final SolrInputDocument solrdoc, final String key, final float value) {
|
||||
if (isEmpty() || contains(key)) solrdoc.setField(key, value);
|
||||
private void addSolr(final SolrInputDocument solrdoc, final Field key, final float value) {
|
||||
if (isEmpty() || contains(key.name())) solrdoc.setField(key.name(), value);
|
||||
}
|
||||
|
||||
private void addSolr(final SolrInputDocument solrdoc, final String key, final boolean value) {
|
||||
if (isEmpty() || contains(key)) solrdoc.setField(key, value);
|
||||
private void addSolr(final SolrInputDocument solrdoc, final Field key, final boolean value) {
|
||||
if (isEmpty() || contains(key.name())) solrdoc.setField(key.name(), value);
|
||||
}
|
||||
|
||||
private void addSolr(final SolrInputDocument solrdoc, final String key, final String value, final float boost) {
|
||||
if (isEmpty() || contains(key)) solrdoc.setField(key, value, boost);
|
||||
private void addSolr(final SolrInputDocument solrdoc, final Field key, final String value, final float boost) {
|
||||
if (isEmpty() || contains(key.name())) solrdoc.setField(key.name(), value, boost);
|
||||
}
|
||||
|
||||
public static enum Types {
|
||||
string,
|
||||
text_general,
|
||||
text_en_splitting_tight,
|
||||
date,
|
||||
integer("int"),
|
||||
tdouble,
|
||||
bool("boolean");
|
||||
|
||||
private String printName;
|
||||
private Types() {
|
||||
this.printName = this.name();
|
||||
}
|
||||
private Types(String printName) {
|
||||
this.printName = printName;
|
||||
}
|
||||
public String printName() {
|
||||
return this.printName;
|
||||
}
|
||||
}
|
||||
|
||||
public static enum Field {
|
||||
|
||||
id(Types.string, true, true),
|
||||
sku(Types.text_en_splitting_tight, true, true, false, true),
|
||||
ip_s(Types.string, true, true),
|
||||
host_s(Types.string, true, true),
|
||||
title(Types.text_general, true, true, true),
|
||||
author(Types.text_general, true, true),
|
||||
description(Types.text_general, true, true),
|
||||
content_type(Types.string, true, true, true),
|
||||
last_modified(Types.date, true, true),
|
||||
keywords(Types.text_general, true, true),
|
||||
text_t(Types.text_general, true, true),
|
||||
wordcount_i(Types.integer, true, true),
|
||||
paths_txt(Types.text_general, true, true, true),
|
||||
inboundlinkscount_i(Types.integer, true, true),
|
||||
inboundlinksnoindexcount_i(Types.integer, true, true),
|
||||
inboundlinks_tag_txt(Types.text_general, true, true, true),
|
||||
inboundlinks_protocol_txt(Types.text_general, true, true, true),
|
||||
inboundlinks_urlstub_txt(Types.text_general, true, true, true),
|
||||
inboundlinks_name_txt(Types.text_general, true, true, true),
|
||||
inboundlinks_rel_txt(Types.text_general, true, true, true),
|
||||
inboundlinks_relflags_txt(Types.text_general, true, true, true),
|
||||
inboundlinks_text_txt(Types.text_general, true, true, true),
|
||||
outboundlinkscount_i(Types.integer, true, true),
|
||||
outboundlinksnoindexcount_i(Types.integer, true, true),
|
||||
outboundlinks_tag_txt(Types.text_general, true, true, true),
|
||||
outboundlinks_protocol_txt(Types.text_general, true, true, true),
|
||||
outboundlinks_urlstub_txt(Types.text_general, true, true, true),
|
||||
outboundlinks_name_txt(Types.text_general, true, true, true),
|
||||
outboundlinks_rel_txt(Types.text_general, true, true, true),
|
||||
outboundlinks_relflags_txt(Types.text_general, true, true, true),
|
||||
outboundlinks_text_txt(Types.text_general, true, true, true),
|
||||
charset_s(Types.string, true, true),
|
||||
lon_coordinate(Types.tdouble, true, false),
|
||||
lat_coordinate(Types.tdouble, true, false),
|
||||
httpstatus_i(Types.integer, true, true),
|
||||
h1_txt(Types.text_general, true, true, true),
|
||||
h2_txt(Types.text_general, true, true, true),
|
||||
h3_txt(Types.text_general, true, true, true),
|
||||
h4_txt(Types.text_general, true, true, true),
|
||||
h5_txt(Types.text_general, true, true, true),
|
||||
h6_txt(Types.text_general, true, true, true),
|
||||
htags_i(Types.integer, true, true),
|
||||
canonical_s(Types.string, true, true),
|
||||
robots_i(Types.integer, true, true),
|
||||
metagenerator_t(Types.text_general, true, true),
|
||||
boldcount_i(Types.integer, true, true),
|
||||
bold_txt(Types.text_general, true, true, true),
|
||||
bold_val(Types.integer, true, true, true),
|
||||
italiccount_i(Types.integer, true, true),
|
||||
italic_txt(Types.text_general, true, true, true),
|
||||
italic_val(Types.integer, true, true, true),
|
||||
licount_i(Types.integer, true, true),
|
||||
li_txt(Types.text_general, true, true, true),
|
||||
imagescount_i(Types.integer, true, true),
|
||||
images_tag_txt(Types.text_general, true, true, true),
|
||||
images_protocol_txt(Types.text_general, true, true, true),
|
||||
images_urlstub_txt(Types.text_general, true, true, true),
|
||||
images_alt_txt(Types.text_general, true, true, true),
|
||||
csscount_i(Types.integer, true, true),
|
||||
css_tag_txt(Types.text_general, true, true, true),
|
||||
css_url_txt(Types.text_general, true, true, true),
|
||||
scripts_txt(Types.text_general, true, true, true),
|
||||
scriptscount_i(Types.integer, true, true),
|
||||
frames_txt(Types.text_general, true, true, true),
|
||||
framesscount_i(Types.integer, true, true),
|
||||
iframes_txt(Types.text_general, true, true, true),
|
||||
iframesscount_i(Types.integer, true, true),
|
||||
flash_b(Types.bool, true, true),
|
||||
responsetime_i(Types.integer, true, true),
|
||||
|
||||
ext_cms_txt(Types.text_general, true, true, true),
|
||||
ext_cms_val(Types.integer, true, true, true),
|
||||
ext_ads_txt(Types.text_general, true, true, true),
|
||||
ext_ads_val(Types.integer, true, true, true),
|
||||
ext_community_txt(Types.text_general, true, true, true),
|
||||
ext_community_val(Types.integer, true, true, true),
|
||||
ext_maps_txt(Types.text_general, true, true, true),
|
||||
ext_maps_val(Types.integer, true, true, true),
|
||||
ext_tracker_txt(Types.text_general, true, true, true),
|
||||
ext_tracker_val(Types.integer, true, true, true),
|
||||
ext_title_txt(Types.text_general, true, true, true),
|
||||
ext_title_val(Types.integer, true, true, true),
|
||||
|
||||
failreason_t(Types.text_general, true, true);
|
||||
|
||||
final Types type;
|
||||
final boolean indexed, stored;
|
||||
boolean multiValued, omitNorms;
|
||||
|
||||
private Field(final Types type, final boolean indexed, final boolean stored) {
|
||||
this.type = type;
|
||||
this.indexed = indexed;
|
||||
this.stored = stored;
|
||||
this.multiValued = false;
|
||||
this.omitNorms = false;
|
||||
}
|
||||
|
||||
private Field(final Types type, final boolean indexed, final boolean stored, final boolean multiValued) {
|
||||
this(type, indexed, stored);
|
||||
this.multiValued = multiValued;
|
||||
}
|
||||
|
||||
private Field(final Types type, final boolean indexed, final boolean stored, final boolean multiValued, final boolean omitNorms) {
|
||||
this(type, indexed, stored, multiValued);
|
||||
this.omitNorms = omitNorms;
|
||||
}
|
||||
|
||||
public final Types getType() {
|
||||
return this.type;
|
||||
}
|
||||
|
||||
public final boolean isIndexed() {
|
||||
return this.indexed;
|
||||
}
|
||||
|
||||
public final boolean isStored() {
|
||||
return this.stored;
|
||||
}
|
||||
|
||||
public final boolean isMultiValued() {
|
||||
return this.multiValued;
|
||||
}
|
||||
|
||||
public final boolean isOmitNorms() {
|
||||
return this.omitNorms;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
public SolrInputDocument yacy2solr(final String id, final ResponseHeader header, final Document yacydoc) {
|
||||
// we use the SolrCell design as index scheme
|
||||
final SolrInputDocument solrdoc = new SolrInputDocument();
|
||||
final DigestURI digestURI = new DigestURI(yacydoc.dc_source());
|
||||
addSolr(solrdoc, "failreason_t", ""); // overwrite a possible fail reason (in case that there was a fail reason before)
|
||||
addSolr(solrdoc, "id", id);
|
||||
addSolr(solrdoc, "sku", digestURI.toNormalform(true, false), 3.0f);
|
||||
addSolr(solrdoc, Field.failreason_t, ""); // overwrite a possible fail reason (in case that there was a fail reason before)
|
||||
addSolr(solrdoc, Field.id, id);
|
||||
addSolr(solrdoc, Field.sku, digestURI.toNormalform(true, false), 3.0f);
|
||||
final InetAddress address = digestURI.getInetAddress();
|
||||
if (address != null) addSolr(solrdoc, "ip_s", address.getHostAddress());
|
||||
if (digestURI.getHost() != null) addSolr(solrdoc, "host_s", digestURI.getHost());
|
||||
addSolr(solrdoc, "title", yacydoc.dc_title());
|
||||
addSolr(solrdoc, "author", yacydoc.dc_creator());
|
||||
addSolr(solrdoc, "description", yacydoc.dc_description());
|
||||
addSolr(solrdoc, "content_type", yacydoc.dc_format());
|
||||
addSolr(solrdoc, "last_modified", header.lastModified());
|
||||
addSolr(solrdoc, "keywords", yacydoc.dc_subject(' '));
|
||||
if (address != null) addSolr(solrdoc, Field.ip_s, address.getHostAddress());
|
||||
if (digestURI.getHost() != null) addSolr(solrdoc, Field.host_s, digestURI.getHost());
|
||||
addSolr(solrdoc, Field.title, yacydoc.dc_title());
|
||||
addSolr(solrdoc, Field.author, yacydoc.dc_creator());
|
||||
addSolr(solrdoc, Field.description, yacydoc.dc_description());
|
||||
addSolr(solrdoc, Field.content_type, yacydoc.dc_format());
|
||||
addSolr(solrdoc, Field.last_modified, header.lastModified());
|
||||
addSolr(solrdoc, Field.keywords, yacydoc.dc_subject(' '));
|
||||
final String content = UTF8.String(yacydoc.getTextBytes());
|
||||
addSolr(solrdoc, "text_t", content);
|
||||
if (isEmpty() || contains("wordcount_i")) {
|
||||
addSolr(solrdoc, Field.text_t, content);
|
||||
if (isEmpty() || contains(Field.wordcount_i.name())) {
|
||||
final int contentwc = content.split(" ").length;
|
||||
addSolr(solrdoc, "wordcount_i", contentwc);
|
||||
addSolr(solrdoc, Field.wordcount_i, contentwc);
|
||||
}
|
||||
|
||||
// path elements of link
|
||||
final String path = digestURI.getPath();
|
||||
if (path != null && (isEmpty() || contains("attr_paths"))) {
|
||||
if (path != null && (isEmpty() || contains(Field.paths_txt.name()))) {
|
||||
final String[] paths = path.split("/");
|
||||
if (paths.length > 0) addSolr(solrdoc, "attr_paths", paths);
|
||||
if (paths.length > 0) addSolr(solrdoc, Field.paths_txt, paths);
|
||||
}
|
||||
|
||||
// list all links
|
||||
final Map<MultiProtocolURI, Properties> alllinks = yacydoc.getAnchors();
|
||||
int c = 0;
|
||||
if (isEmpty() || contains("inboundlinkscount_i")) addSolr(solrdoc, "inboundlinkscount_i", yacydoc.inboundLinkCount());
|
||||
if (isEmpty() || contains("inboundlinksnoindexcount_i")) addSolr(solrdoc, "inboundlinksnoindexcount_i", yacydoc.inboundLinkNoindexCount());
|
||||
if (isEmpty() || contains(Field.inboundlinkscount_i.name())) addSolr(solrdoc, Field.inboundlinkscount_i, yacydoc.inboundLinkCount());
|
||||
if (isEmpty() || contains(Field.inboundlinksnoindexcount_i.name())) addSolr(solrdoc, Field.inboundlinksnoindexcount_i, yacydoc.inboundLinkNoindexCount());
|
||||
final String[] inboundlinksTag = new String[yacydoc.inboundLinkCount()];
|
||||
final String[] inboundlinksURLProtocol = new String[yacydoc.inboundLinkCount()];
|
||||
final String[] inboundlinksURLStub = new String[yacydoc.inboundLinkCount()];
|
||||
|
@ -156,17 +324,17 @@ public class SolrScheme extends ConfigurationSet {
|
|||
((text.length() > 0) ? text : "") + "</a>";
|
||||
c++;
|
||||
}
|
||||
if (isEmpty() || contains("attr_inboundlinks_tag")) addSolr(solrdoc, "attr_inboundlinks_tag", inboundlinksTag);
|
||||
if (isEmpty() || contains("attr_inboundlinks_protocol")) addSolr(solrdoc, "attr_inboundlinks_protocol", inboundlinksURLProtocol);
|
||||
if (isEmpty() || contains("attr_inboundlinks_urlstub")) addSolr(solrdoc, "attr_inboundlinks_urlstub", inboundlinksURLStub);
|
||||
if (isEmpty() || contains("attr_inboundlinks_name")) addSolr(solrdoc, "attr_inboundlinks_name", inboundlinksName);
|
||||
if (isEmpty() || contains("attr_inboundlinks_rel")) addSolr(solrdoc, "attr_inboundlinks_rel", inboundlinksRel);
|
||||
if (isEmpty() || contains("attr_inboundlinks_relflags")) addSolr(solrdoc, "attr_inboundlinks_relflags", relEval(inboundlinksRel));
|
||||
if (isEmpty() || contains("attr_inboundlinks_text")) addSolr(solrdoc, "attr_inboundlinks_text", inboundlinksText);
|
||||
if (isEmpty() || contains(Field.inboundlinks_tag_txt.name())) addSolr(solrdoc, Field.inboundlinks_tag_txt, inboundlinksTag);
|
||||
if (isEmpty() || contains(Field.inboundlinks_protocol_txt.name())) addSolr(solrdoc, Field.inboundlinks_protocol_txt, inboundlinksURLProtocol);
|
||||
if (isEmpty() || contains(Field.inboundlinks_urlstub_txt.name())) addSolr(solrdoc, Field.inboundlinks_urlstub_txt, inboundlinksURLStub);
|
||||
if (isEmpty() || contains(Field.inboundlinks_name_txt.name())) addSolr(solrdoc, Field.inboundlinks_name_txt, inboundlinksName);
|
||||
if (isEmpty() || contains(Field.inboundlinks_rel_txt.name())) addSolr(solrdoc, Field.inboundlinks_rel_txt, inboundlinksRel);
|
||||
if (isEmpty() || contains(Field.inboundlinks_relflags_txt.name())) addSolr(solrdoc, Field.inboundlinks_relflags_txt, relEval(inboundlinksRel));
|
||||
if (isEmpty() || contains(Field.inboundlinks_text_txt.name())) addSolr(solrdoc, Field.inboundlinks_text_txt, inboundlinksText);
|
||||
|
||||
c = 0;
|
||||
if (isEmpty() || contains("outboundlinkscount_i")) addSolr(solrdoc, "outboundlinkscount_i", yacydoc.outboundLinkCount());
|
||||
if (isEmpty() || contains("outboundlinksnoindexcount_i")) addSolr(solrdoc, "outboundlinksnoindexcount_i", yacydoc.outboundLinkNoindexCount());
|
||||
if (isEmpty() || contains(Field.outboundlinkscount_i.name())) addSolr(solrdoc, Field.outboundlinkscount_i, yacydoc.outboundLinkCount());
|
||||
if (isEmpty() || contains(Field.outboundlinksnoindexcount_i.name())) addSolr(solrdoc, Field.outboundlinksnoindexcount_i, yacydoc.outboundLinkNoindexCount());
|
||||
final String[] outboundlinksTag = new String[yacydoc.outboundLinkCount()];
|
||||
final String[] outboundlinksURLProtocol = new String[yacydoc.outboundLinkCount()];
|
||||
final String[] outboundlinksURLStub = new String[yacydoc.outboundLinkCount()];
|
||||
|
@ -193,24 +361,24 @@ public class SolrScheme extends ConfigurationSet {
|
|||
((text.length() > 0) ? text : "") + "</a>";
|
||||
c++;
|
||||
}
|
||||
if (isEmpty() || contains("attr_outboundlinks_tag")) addSolr(solrdoc, "attr_outboundlinks_tag", outboundlinksTag);
|
||||
if (isEmpty() || contains("attr_outboundlinks_protocol")) addSolr(solrdoc, "attr_outboundlinks_protocol", outboundlinksURLProtocol);
|
||||
if (isEmpty() || contains("attr_outboundlinks_urlstub")) addSolr(solrdoc, "attr_outboundlinks_urlstub", outboundlinksURLStub);
|
||||
if (isEmpty() || contains("attr_outboundlinks_name")) addSolr(solrdoc, "attr_outboundlinks_name", outboundlinksName);
|
||||
if (isEmpty() || contains("attr_outboundlinks_rel")) addSolr(solrdoc, "attr_outboundlinks_rel", outboundlinksRel);
|
||||
if (isEmpty() || contains("attr_outboundlinks_relflags")) addSolr(solrdoc, "attr_outboundlinks_relflags", relEval(inboundlinksRel));
|
||||
if (isEmpty() || contains("attr_outboundlinks_text")) addSolr(solrdoc, "attr_outboundlinks_text", outboundlinksText);
|
||||
if (isEmpty() || contains(Field.outboundlinks_tag_txt.name())) addSolr(solrdoc, Field.outboundlinks_tag_txt, outboundlinksTag);
|
||||
if (isEmpty() || contains(Field.outboundlinks_protocol_txt.name())) addSolr(solrdoc, Field.outboundlinks_protocol_txt, outboundlinksURLProtocol);
|
||||
if (isEmpty() || contains(Field.outboundlinks_urlstub_txt.name())) addSolr(solrdoc, Field.outboundlinks_urlstub_txt, outboundlinksURLStub);
|
||||
if (isEmpty() || contains(Field.outboundlinks_name_txt.name())) addSolr(solrdoc, Field.outboundlinks_name_txt, outboundlinksName);
|
||||
if (isEmpty() || contains(Field.outboundlinks_rel_txt.name())) addSolr(solrdoc, Field.outboundlinks_rel_txt, outboundlinksRel);
|
||||
if (isEmpty() || contains(Field.outboundlinks_relflags_txt.name())) addSolr(solrdoc, Field.outboundlinks_relflags_txt, relEval(inboundlinksRel));
|
||||
if (isEmpty() || contains(Field.outboundlinks_text_txt.name())) addSolr(solrdoc, Field.outboundlinks_text_txt, outboundlinksText);
|
||||
|
||||
|
||||
// charset
|
||||
addSolr(solrdoc, "charset_s", yacydoc.getCharset());
|
||||
addSolr(solrdoc, Field.charset_s, yacydoc.getCharset());
|
||||
|
||||
// coordinates
|
||||
if (yacydoc.lat() != 0.0f && yacydoc.lon() != 0.0f) {
|
||||
addSolr(solrdoc, "lon_coordinate", yacydoc.lon());
|
||||
addSolr(solrdoc, "lat_coordinate", yacydoc.lat());
|
||||
addSolr(solrdoc, Field.lon_coordinate, yacydoc.lon());
|
||||
addSolr(solrdoc, Field.lat_coordinate, yacydoc.lat());
|
||||
}
|
||||
addSolr(solrdoc, "httpstatus_i", 200);
|
||||
addSolr(solrdoc, Field.httpstatus_i, 200);
|
||||
final Object parser = yacydoc.getParserObject();
|
||||
if (parser instanceof ContentScraper) {
|
||||
final ContentScraper html = (ContentScraper) parser;
|
||||
|
@ -218,16 +386,19 @@ public class SolrScheme extends ConfigurationSet {
|
|||
// header tags
|
||||
int h = 0;
|
||||
int f = 1;
|
||||
for (int i = 1; i <= 6; i++) {
|
||||
final String[] hs = html.getHeadlines(i);
|
||||
h = h | (hs.length > 0 ? f : 0);
|
||||
f = f * 2;
|
||||
addSolr(solrdoc, "attr_h" + i, hs);
|
||||
}
|
||||
addSolr(solrdoc, "htags_i", h);
|
||||
String[] hs;
|
||||
|
||||
hs = html.getHeadlines(1); h = h | (hs.length > 0 ? f : 0); f = f * 2; addSolr(solrdoc, Field.h1_txt, hs);
|
||||
hs = html.getHeadlines(2); h = h | (hs.length > 0 ? f : 0); f = f * 2; addSolr(solrdoc, Field.h2_txt, hs);
|
||||
hs = html.getHeadlines(3); h = h | (hs.length > 0 ? f : 0); f = f * 2; addSolr(solrdoc, Field.h3_txt, hs);
|
||||
hs = html.getHeadlines(4); h = h | (hs.length > 0 ? f : 0); f = f * 2; addSolr(solrdoc, Field.h4_txt, hs);
|
||||
hs = html.getHeadlines(5); h = h | (hs.length > 0 ? f : 0); f = f * 2; addSolr(solrdoc, Field.h5_txt, hs);
|
||||
hs = html.getHeadlines(6); h = h | (hs.length > 0 ? f : 0); f = f * 2; addSolr(solrdoc, Field.h6_txt, hs);
|
||||
|
||||
addSolr(solrdoc, Field.htags_i, h);
|
||||
|
||||
// canonical tag
|
||||
if (html.getCanonical() != null) addSolr(solrdoc, "canonical_s", html.getCanonical().toNormalform(false, false));
|
||||
if (html.getCanonical() != null) addSolr(solrdoc, Field.canonical_s, html.getCanonical().toNormalform(false, false));
|
||||
|
||||
// noindex and nofollow attributes
|
||||
// from HTML (meta-tag in HTML header: robots)
|
||||
|
@ -261,32 +432,32 @@ public class SolrScheme extends ConfigurationSet {
|
|||
if (x_robots_tag.indexOf("nofollow",0) >= 0) b += 2048; // set bit 11
|
||||
if (x_robots_tag.indexOf("unavailable_after",0) >=0) b += 4096; // set bit 12
|
||||
}
|
||||
addSolr(solrdoc, "robots_i", b);
|
||||
addSolr(solrdoc, Field.robots_i, b);
|
||||
|
||||
// meta tags: generator
|
||||
final String generator = html.getMetas().get("generator");
|
||||
if (generator != null) addSolr(solrdoc, "metagenerator_t", generator);
|
||||
if (generator != null) addSolr(solrdoc, Field.metagenerator_t, generator);
|
||||
|
||||
// bold, italic
|
||||
final String[] bold = html.getBold();
|
||||
addSolr(solrdoc, "boldcount_i", bold.length);
|
||||
addSolr(solrdoc, Field.boldcount_i, bold.length);
|
||||
if (bold.length > 0) {
|
||||
addSolr(solrdoc, "attr_bold", bold);
|
||||
if (isEmpty() || contains("attr_boldcount")) {
|
||||
addSolr(solrdoc, "attr_boldcount", html.getBoldCount(bold));
|
||||
addSolr(solrdoc, Field.bold_txt, bold);
|
||||
if (isEmpty() || contains(Field.bold_val.name())) {
|
||||
addSolr(solrdoc, Field.bold_val, html.getBoldCount(bold));
|
||||
}
|
||||
}
|
||||
final String[] italic = html.getItalic();
|
||||
addSolr(solrdoc, "italiccount_i", italic.length);
|
||||
addSolr(solrdoc, Field.italiccount_i, italic.length);
|
||||
if (italic.length > 0) {
|
||||
addSolr(solrdoc, "attr_italic", italic);
|
||||
if (isEmpty() || contains("attr_italiccount")) {
|
||||
addSolr(solrdoc, "attr_italiccount", html.getItalicCount(italic));
|
||||
addSolr(solrdoc, Field.italic_txt, italic);
|
||||
if (isEmpty() || contains(Field.italic_val.name())) {
|
||||
addSolr(solrdoc, Field.italic_val, html.getItalicCount(italic));
|
||||
}
|
||||
}
|
||||
final String[] li = html.getLi();
|
||||
addSolr(solrdoc, "licount_i", li.length);
|
||||
if (li.length > 0) addSolr(solrdoc, "attr_li", li);
|
||||
addSolr(solrdoc, Field.licount_i, li.length);
|
||||
if (li.length > 0) addSolr(solrdoc, Field.li_txt, li);
|
||||
|
||||
// images
|
||||
final Collection<ImageEntry> imagesc = html.getImages().values();
|
||||
|
@ -303,14 +474,14 @@ public class SolrScheme extends ConfigurationSet {
|
|||
imgalts[c] = ie.alt();
|
||||
c++;
|
||||
}
|
||||
addSolr(solrdoc, "imagescount_i", imgtags.length);
|
||||
if (isEmpty() || contains("attr_images_tag")) addSolr(solrdoc, "attr_images_tag", imgtags);
|
||||
if (isEmpty() || contains("attr_images_protocol")) addSolr(solrdoc, "attr_images_protocol", imgprots);
|
||||
if (isEmpty() || contains("attr_images_urlstub")) addSolr(solrdoc, "attr_images_urlstub", imgstubs);
|
||||
if (isEmpty() || contains("attr_images_alt")) addSolr(solrdoc, "attr_images_alt", imgalts);
|
||||
addSolr(solrdoc, Field.imagescount_i, imgtags.length);
|
||||
if (isEmpty() || contains(Field.images_tag_txt.name())) addSolr(solrdoc, Field.images_tag_txt, imgtags);
|
||||
if (isEmpty() || contains(Field.images_protocol_txt.name())) addSolr(solrdoc, Field.images_protocol_txt, imgprots);
|
||||
if (isEmpty() || contains(Field.images_urlstub_txt.name())) addSolr(solrdoc, Field.images_urlstub_txt, imgstubs);
|
||||
if (isEmpty() || contains(Field.images_alt_txt.name())) addSolr(solrdoc, Field.images_alt_txt, imgalts);
|
||||
|
||||
// style sheets
|
||||
if (isEmpty() || contains("attr_css")) {
|
||||
if (isEmpty() || contains("css_txt")) {
|
||||
final Map<MultiProtocolURI, String> csss = html.getCSS();
|
||||
final String[] css_tag = new String[csss.size()];
|
||||
final String[] css_url = new String[csss.size()];
|
||||
|
@ -323,63 +494,64 @@ public class SolrScheme extends ConfigurationSet {
|
|||
css_url[c] = url;
|
||||
c++;
|
||||
}
|
||||
addSolr(solrdoc, "csscount_i", css_tag.length);
|
||||
if (css_tag.length > 0) addSolr(solrdoc, "attr_css_tag", css_tag);
|
||||
if (css_url.length > 0) addSolr(solrdoc, "attr_css_url", css_url);
|
||||
addSolr(solrdoc, Field.csscount_i, css_tag.length);
|
||||
if (css_tag.length > 0) addSolr(solrdoc, Field.css_tag_txt, css_tag);
|
||||
if (css_url.length > 0) addSolr(solrdoc, Field.css_url_txt, css_url);
|
||||
}
|
||||
|
||||
// Scripts
|
||||
if (isEmpty() || contains("attr_scripts")) {
|
||||
if (isEmpty() || contains(Field.scripts_txt.name())) {
|
||||
final Set<MultiProtocolURI> scriptss = html.getScript();
|
||||
final String[] scripts = new String[scriptss.size()];
|
||||
c = 0;
|
||||
for (final MultiProtocolURI url: scriptss) {
|
||||
scripts[c++] = url.toNormalform(false, false, false, false);
|
||||
}
|
||||
addSolr(solrdoc, "scriptscount_i", scripts.length);
|
||||
if (scripts.length > 0) addSolr(solrdoc, "attr_scripts", scripts);
|
||||
addSolr(solrdoc, Field.scriptscount_i, scripts.length);
|
||||
if (scripts.length > 0) addSolr(solrdoc, Field.scripts_txt, scripts);
|
||||
}
|
||||
|
||||
// Frames
|
||||
if (isEmpty() || contains("attr_frames")) {
|
||||
if (isEmpty() || contains(Field.frames_txt.name())) {
|
||||
final Set<MultiProtocolURI> framess = html.getFrames();
|
||||
final String[] frames = new String[framess.size()];
|
||||
c = 0;
|
||||
for (final MultiProtocolURI entry: framess) {
|
||||
frames[c++] = entry.toNormalform(false, false, false, false);
|
||||
}
|
||||
addSolr(solrdoc, "framesscount_i", frames.length);
|
||||
if (frames.length > 0) addSolr(solrdoc, "attr_frames", frames);
|
||||
addSolr(solrdoc, Field.framesscount_i, frames.length);
|
||||
if (frames.length > 0) addSolr(solrdoc, Field.frames_txt, frames);
|
||||
}
|
||||
|
||||
// IFrames
|
||||
if (isEmpty() || contains("attr_iframes")) {
|
||||
if (isEmpty() || contains(Field.iframes_txt.name()
|
||||
)) {
|
||||
final Set<MultiProtocolURI> iframess = html.getIFrames();
|
||||
final String[] iframes = new String[iframess.size()];
|
||||
c = 0;
|
||||
for (final MultiProtocolURI entry: iframess) {
|
||||
iframes[c++] = entry.toNormalform(false, false, false, false);
|
||||
}
|
||||
addSolr(solrdoc, "iframesscount_i", iframes.length);
|
||||
if (iframes.length > 0) addSolr(solrdoc, "attr_iframes", iframes);
|
||||
addSolr(solrdoc, Field.iframesscount_i, iframes.length);
|
||||
if (iframes.length > 0) addSolr(solrdoc, Field.iframes_txt, iframes);
|
||||
}
|
||||
|
||||
// flash embedded
|
||||
addSolr(solrdoc, "flash_b", html.containsFlash());
|
||||
addSolr(solrdoc, Field.flash_b, html.containsFlash());
|
||||
|
||||
// generic evaluation pattern
|
||||
for (final String model: html.getEvaluationModelNames()) {
|
||||
if (isEmpty() || contains("attr_" + model)) {
|
||||
if (isEmpty() || contains("ext_" + model + "_txt")) {
|
||||
final String[] scorenames = html.getEvaluationModelScoreNames(model);
|
||||
if (scorenames.length > 0) {
|
||||
addSolr(solrdoc, "attr_" + model, scorenames);
|
||||
addSolr(solrdoc, "attr_" + model + "count", html.getEvaluationModelScoreCounts(model, scorenames));
|
||||
addSolr(solrdoc, Field.valueOf("ext_" + model + "_txt"), scorenames);
|
||||
addSolr(solrdoc, Field.valueOf("ext_" + model + "_val"), html.getEvaluationModelScoreCounts(model, scorenames));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// response time
|
||||
addSolr(solrdoc, "responsetime_i", header.get(HeaderFramework.RESPONSE_TIME_MILLIS, "0"));
|
||||
addSolr(solrdoc, Field.responsetime_i, header.get(HeaderFramework.RESPONSE_TIME_MILLIS, "0"));
|
||||
}
|
||||
return solrdoc;
|
||||
}
|
||||
|
|
|
@ -143,6 +143,7 @@ public class SolrSingleConnector implements SolrConnector {
|
|||
public void pleaseStop() {
|
||||
this.shallRun = false;
|
||||
}
|
||||
@Override
|
||||
public void run() {
|
||||
while (this.shallRun) {
|
||||
if (SolrSingleConnector.this.transmissionQueue[this.idx].size() > 0) {
|
||||
|
@ -165,6 +166,7 @@ public class SolrSingleConnector implements SolrConnector {
|
|||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public void close() {
|
||||
for (int i = 0; i < transmissionQueueCount; i++) {
|
||||
if (this.transmissionWorker[i].isAlive()) {
|
||||
|
@ -204,6 +206,7 @@ public class SolrSingleConnector implements SolrConnector {
|
|||
* delete everything in the solr index
|
||||
* @throws IOException
|
||||
*/
|
||||
@Override
|
||||
public void clear() throws IOException {
|
||||
try {
|
||||
this.server.deleteByQuery("*:*");
|
||||
|
@ -213,6 +216,7 @@ public class SolrSingleConnector implements SolrConnector {
|
|||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public void delete(final String id) throws IOException {
|
||||
try {
|
||||
this.server.deleteById(id);
|
||||
|
@ -221,6 +225,7 @@ public class SolrSingleConnector implements SolrConnector {
|
|||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public void delete(final List<String> ids) throws IOException {
|
||||
try {
|
||||
this.server.deleteById(ids);
|
||||
|
@ -229,6 +234,7 @@ public class SolrSingleConnector implements SolrConnector {
|
|||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean exists(final String id) throws IOException {
|
||||
try {
|
||||
final SolrDocumentList list = get("id:" + id, 0, 1);
|
||||
|
@ -254,10 +260,12 @@ public class SolrSingleConnector implements SolrConnector {
|
|||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public void add(final String id, final ResponseHeader header, final Document doc) throws IOException, SolrException {
|
||||
add(this.scheme.yacy2solr(id, header, doc));
|
||||
}
|
||||
|
||||
@Override
|
||||
public void add(final SolrInputDocument solrdoc) throws IOException, SolrException {
|
||||
int thisrrc = this.transmissionRoundRobinCounter;
|
||||
int nextrrc = thisrrc++;
|
||||
|
@ -284,11 +292,15 @@ public class SolrSingleConnector implements SolrConnector {
|
|||
req.add( docs );
|
||||
UpdateResponse rsp = req.process( server );
|
||||
*/
|
||||
} catch (final SolrException e) {
|
||||
// the field is probably not known
|
||||
Log.logWarning("SolrConnector", e.getMessage());
|
||||
} catch (final Throwable e) {
|
||||
throw new IOException(e);
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public void err(final DigestURI digestURI, final String failReason, final int httpstatus) throws IOException {
|
||||
|
||||
final SolrInputDocument solrdoc = new SolrInputDocument();
|
||||
|
@ -330,6 +342,7 @@ public class SolrSingleConnector implements SolrConnector {
|
|||
* @param querystring
|
||||
* @throws IOException
|
||||
*/
|
||||
@Override
|
||||
public SolrDocumentList get(final String querystring, final int offset, final int count) throws IOException {
|
||||
// construct query
|
||||
final SolrQuery query = new SolrQuery();
|
||||
|
|
Loading…
Reference in New Issue
Block a user