2012-05-09 16:46:45 +02:00
/ * *
* SolrField
* Copyright 2011 by Michael Peter Christen
* First released 14 . 04 . 2011 at http : //yacy.net
*
* $LastChangedDate : 2011 - 04 - 14 22 : 05 : 04 + 0200 ( Do , 14 Apr 2011 ) $
* $LastChangedRevision : 7654 $
* $LastChangedBy : orbiter $
*
* This library is free software ; you can redistribute it and / or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation ; either
* version 2 . 1 of the License , or ( at your option ) any later version .
*
* This library is distributed in the hope that it will be useful ,
* but WITHOUT ANY WARRANTY ; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE . See the GNU
* Lesser General Public License for more details .
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program in the file lgpl21 . txt
* If not , see < http : //www.gnu.org/licenses/>.
* /
package net.yacy.search.index ;
2012-08-23 09:51:45 +02:00
import java.util.Date ;
import java.util.List ;
2012-08-10 06:47:13 +02:00
import net.yacy.cora.services.federated.solr.Schema ;
2012-05-09 16:46:45 +02:00
import net.yacy.cora.services.federated.solr.SolrType ;
2012-08-27 14:41:33 +02:00
import org.apache.solr.common.SolrInputDocument ;
2012-08-10 06:47:13 +02:00
public enum YaCySchema implements Schema {
2012-05-09 16:46:45 +02:00
2012-08-21 23:52:56 +02:00
// mandatory
2012-08-28 16:58:06 +02:00
id ( SolrType . string , true , true , false , " primary key of document, the URL hash **mandatory field** " ) ,
2012-05-09 16:46:45 +02:00
sku ( SolrType . text_en_splitting_tight , true , true , false , true , " url of document " ) ,
2012-08-28 16:58:06 +02:00
last_modified ( SolrType . date , true , true , false , " last-modified from http header " ) ,
2012-08-21 23:52:56 +02:00
content_type ( SolrType . string , true , true , true , " mime-type of document " ) ,
2012-05-09 16:46:45 +02:00
title ( SolrType . text_general , true , true , true , " content of title tag " ) ,
2012-08-28 16:58:06 +02:00
host_id_s ( SolrType . string , true , true , false , " id of the host, a 6-byte hash that is part of the document id " ) , // String hosthash();
md5_s ( SolrType . string , true , true , false , " the md5 of the raw source " ) , // String md5();
size_i ( SolrType . integer , true , true , false , " the size of the raw source " ) , // int size();
process_s ( SolrType . string , true , true , false , " index creation comment " ) ,
failreason_t ( SolrType . text_general , true , true , false , " fail reason if a page was not loaded. if the page was loaded then this field is empty " ) ,
httpstatus_i ( SolrType . integer , true , true , false , " html status return code (i.e. \" 200 \" for ok), -1 if not loaded " ) ,
httpstatus_redirect_s ( SolrType . integer , true , true , false , " html status return code (i.e. \" 200 \" for ok), -1 if not loaded " ) ,
2012-08-27 14:41:33 +02:00
2012-08-21 23:52:56 +02:00
// optional but recommended, part of index distribution
2012-08-28 16:58:06 +02:00
load_date_dt ( SolrType . date , true , true , false , " time when resource was loaded " ) ,
fresh_date_dt ( SolrType . date , true , true , false , " date until resource shall be considered as fresh " ) ,
2012-08-21 23:52:56 +02:00
referrer_id_txt ( SolrType . string , true , true , true , " ids of referrer to this document " ) , // byte[] referrerHash();
2012-08-28 16:58:06 +02:00
publisher_t ( SolrType . text_general , true , true , false , " the name of the publisher of the document " ) , // String dc_publisher();
language_s ( SolrType . string , true , true , false , " the language used in the document " ) , // byte[] language();
audiolinkscount_i ( SolrType . integer , true , true , false , " number of links to audio resources " ) , // int laudio();
videolinkscount_i ( SolrType . integer , true , true , false , " number of links to video resources " ) , // int lvideo();
applinkscount_i ( SolrType . integer , true , true , false , " number of links to application resources " ) , // int lapp();
2012-08-27 14:41:33 +02:00
2012-08-21 23:52:56 +02:00
// optional but recommended
2012-08-28 16:58:06 +02:00
// The geodetic datum is WGS84 (World Geodetic System 1984); the description previously misspelled it.
coordinate_p(SolrType.location, true, true, false, " point in degrees of latitude,longitude as declared in WGS84 "),
ip_s ( SolrType . string , true , true , false , " ip of host of url (after DNS lookup) " ) ,
author ( SolrType . text_general , true , true , false , " content of author-tag " ) ,
description ( SolrType . text_general , true , true , false , " content of description-tag " ) ,
keywords ( SolrType . text_general , true , true , false , " content of keywords tag; words are separated by space " ) ,
charset_s ( SolrType . string , true , true , false , " character encoding " ) ,
wordcount_i ( SolrType . integer , true , true , false , " number of words in visible area " ) ,
inboundlinkscount_i ( SolrType . integer , true , true , false , " total number of inbound links " ) ,
inboundlinksnofollowcount_i ( SolrType . integer , true , true , false , " number of inbound links with nofollow tag " ) ,
// Counterpart of inboundlinkscount_i: counts external (outbound) links, not inbound ones.
outboundlinkscount_i(SolrType.integer, true, true, false, " total number of external links "),
outboundlinksnofollowcount_i ( SolrType . integer , true , true , false , " number of external links with nofollow tag " ) ,
imagescount_i ( SolrType . integer , true , true , false , " number of images " ) ,
responsetime_i ( SolrType . integer , true , true , false , " response time of target server in milliseconds " ) ,
text_t ( SolrType . text_general , true , true , false , " all visible text " ) ,
h1_txt ( SolrType . text_general , true , true , true , " h1 header " ) ,
h2_txt ( SolrType . text_general , true , true , true , " h2 header " ) ,
h3_txt ( SolrType . text_general , true , true , true , " h3 header " ) ,
h4_txt ( SolrType . text_general , true , true , true , " h4 header " ) ,
h5_txt ( SolrType . text_general , true , true , true , " h5 header " ) ,
h6_txt ( SolrType . text_general , true , true , true , " h6 header " ) ,
2012-08-27 14:41:33 +02:00
// optional values
2012-08-28 16:58:06 +02:00
csscount_i ( SolrType . integer , true , true , false , " number of entries in css_tag_txt and css_url_txt " ) ,
2012-08-21 23:52:56 +02:00
css_tag_txt ( SolrType . text_general , true , true , true , " full css tag with normalized url " ) ,
css_url_txt ( SolrType . text_general , true , true , true , " normalized urls within a css tag " ) ,
scripts_txt ( SolrType . text_general , true , true , true , " normalized urls within a scripts tag " ) ,
2012-08-28 16:58:06 +02:00
scriptscount_i ( SolrType . integer , true , true , false , " number of entries in scripts_txt " ) ,
2012-05-09 16:46:45 +02:00
// encoded as binary value into an integer:
// bit 0: "all" contained in html header meta
// bit 1: "index" contained in html header meta
// bit 2: "noindex" contained in html header meta
// bit 3: "nofollow" contained in html header meta
// bit 8: "noarchive" contained in http header properties
// bit 9: "nosnippet" contained in http header properties
// bit 10: "noindex" contained in http header properties
// bit 11: "nofollow" contained in http header properties
// bit 12: "unavailable_after" contained in http header properties
2012-08-28 16:58:06 +02:00
robots_i ( SolrType . integer , true , true , false , " content of <meta name= \" robots \" content=#content#> tag and the \" X-Robots-Tag \" HTTP property " ) ,
metagenerator_t ( SolrType . text_general , true , true , false , " content of <meta name= \" generator \" content=#content#> tag " ) ,
2012-05-09 16:46:45 +02:00
inboundlinks_tag_txt ( SolrType . text_general , true , true , true , " internal links, normalized (absolute URLs), as <a> - tag with anchor text and nofollow " ) ,
2012-08-28 16:58:06 +02:00
inboundlinks_protocol_sxt ( SolrType . string , true , true , true , " internal links, only the protocol " ) ,
2012-05-09 16:46:45 +02:00
inboundlinks_urlstub_txt ( SolrType . text_general , true , true , true , " internal links, the url only without the protocol " ) ,
inboundlinks_name_txt ( SolrType . text_general , true , true , true , " internal links, the name property of the a-tag " ) ,
2012-08-28 16:58:06 +02:00
inboundlinks_rel_sxt ( SolrType . string , true , true , true , " internal links, the rel property of the a-tag " ) ,
inboundlinks_relflags_sxt ( SolrType . string , true , true , true , " internal links, the rel property of the a-tag, coded binary " ) ,
2012-05-09 16:46:45 +02:00
inboundlinks_text_txt ( SolrType . text_general , true , true , true , " internal links, the text content of the a-tag " ) ,
outboundlinks_tag_txt ( SolrType . text_general , true , true , true , " external links, normalized (absolute URLs), as <a> - tag with anchor text and nofollow " ) ,
2012-08-28 16:58:06 +02:00
outboundlinks_protocol_sxt ( SolrType . string , true , true , true , " external links, only the protocol " ) ,
2012-05-09 16:46:45 +02:00
outboundlinks_urlstub_txt ( SolrType . text_general , true , true , true , " external links, the url only without the protocol " ) ,
outboundlinks_name_txt ( SolrType . text_general , true , true , true , " external links, the name property of the a-tag " ) ,
2012-08-28 16:58:06 +02:00
outboundlinks_rel_sxt ( SolrType . string , true , true , true , " external links, the rel property of the a-tag " ) ,
outboundlinks_relflags_sxt ( SolrType . string , true , true , true , " external links, the rel property of the a-tag, coded binary " ) ,
2012-05-09 16:46:45 +02:00
outboundlinks_text_txt ( SolrType . text_general , true , true , true , " external links, the text content of the a-tag " ) ,
2012-08-21 23:52:56 +02:00
images_tag_txt ( SolrType . text_general , true , true , true , " all image tags, encoded as <img> tag inclusive alt- and title property " ) ,
images_urlstub_txt ( SolrType . text_general , true , true , true , " all image links without the protocol and '://' " ) ,
2012-08-28 16:58:06 +02:00
// "_sxt" marks a multi-valued string field; typed as string to agree with
// inboundlinks_protocol_sxt and outboundlinks_protocol_sxt.
images_protocol_sxt(SolrType.string, true, true, true, " all image link protocols "),
2012-08-21 23:52:56 +02:00
images_alt_txt ( SolrType . text_general , true , true , true , " all image link alt tag " ) ,
2012-08-28 16:58:06 +02:00
htags_i ( SolrType . integer , true , true , false , " binary pattern for the existance of h1..h6 headlines " ) ,
2012-08-21 23:52:56 +02:00
paths_txt ( SolrType . text_general , true , true , true , " all path elements in the url " ) ,
2012-08-28 16:58:06 +02:00
canonical_t ( SolrType . text_general , true , true , false , " url inside the canonical link element " ) ,
refresh_s ( SolrType . string , true , true , false , " link from the url property inside the refresh link element " ) ,
2012-08-21 23:52:56 +02:00
li_txt ( SolrType . text_general , true , true , true , " all texts in <li> tags " ) ,
2012-08-28 16:58:06 +02:00
licount_i ( SolrType . integer , true , true , false , " number of <li> tags " ) ,
2012-05-09 16:46:45 +02:00
bold_txt ( SolrType . text_general , true , true , true , " all texts inside of <b> or <strong> tags. no doubles. listed in the order of number of occurrences in decreasing order " ) ,
2012-08-28 16:58:06 +02:00
boldcount_i ( SolrType . integer , true , true , false , " total number of occurrences of <b> or <strong> " ) ,
2012-05-09 16:46:45 +02:00
italic_txt ( SolrType . text_general , true , true , true , " all texts inside of <i> tags. no doubles. listed in the order of number of occurrences in decreasing order " ) ,
2012-08-28 16:58:06 +02:00
italiccount_i ( SolrType . integer , true , true , false , " total number of occurrences of <i> " ) ,
flash_b ( SolrType . bool , true , true , false , " flag that shows if a swf file is linked " ) ,
2012-05-09 16:46:45 +02:00
frames_txt ( SolrType . text_general , true , true , true , " list of all links to frames " ) ,
2012-08-28 16:58:06 +02:00
framesscount_i ( SolrType . integer , true , true , false , " number of frames_txt " ) ,
2012-05-09 16:46:45 +02:00
iframes_txt ( SolrType . text_general , true , true , true , " list of all links to iframes " ) ,
2012-08-28 16:58:06 +02:00
iframesscount_i ( SolrType . integer , true , true , false , " number of iframes_txt " ) ,
host_s ( SolrType . string , true , true , false , " host of the url " ) ,
host_protocol_s ( SolrType . string , true , true , false , " the protocol of the url " ) ,
host_dnc_s ( SolrType . string , true , true , false , " the Domain Class Name, either the TLD or a combination of ccSLD+TLD if a ccSLD is used. " ) ,
host_organization_s ( SolrType . string , true , true , false , " either the second level domain or, if a ccSLD is used, the third level domain " ) ,
host_organizationdnc_s ( SolrType . string , true , true , false , " the organization and dnc concatenated with '.' " ) ,
host_subdomain_s ( SolrType . string , true , true , false , " the remaining part of the host without organizationdnc " ) ,
// special values; can only be used if '_val' type is defined in schema file; this is not standard
bold_val ( SolrType . integer , true , true , true , " number of occurrences of texts in bold_txt " ) ,
italic_val ( SolrType . integer , true , true , true , " number of occurrences of texts in italic_txt " ) ,
2012-05-09 16:46:45 +02:00
ext_cms_txt ( SolrType . text_general , true , true , true , " names of cms attributes; if several are recognized then they are listen in decreasing order of number of matching criterias " ) ,
ext_cms_val ( SolrType . integer , true , true , true , " number of attributes that count for a specific cms in ext_cms_txt " ) ,
ext_ads_txt ( SolrType . text_general , true , true , true , " names of ad-servers/ad-services " ) ,
ext_ads_val ( SolrType . integer , true , true , true , " number of attributes counts in ext_ads_txt " ) ,
ext_community_txt ( SolrType . text_general , true , true , true , " names of recognized community functions " ) ,
// The counts belong to the entries of ext_community_txt (same pairing as the other ext_*_val fields).
ext_community_val(SolrType.integer, true, true, true, " number of attribute counts in ext_community_txt "),
ext_maps_txt ( SolrType . text_general , true , true , true , " names of map services " ) ,
ext_maps_val ( SolrType . integer , true , true , true , " number of attribute counts in ext_maps_txt " ) ,
ext_tracker_txt ( SolrType . text_general , true , true , true , " names of tracker server " ) ,
ext_tracker_val ( SolrType . integer , true , true , true , " number of attribute counts in ext_tracker_txt " ) ,
ext_title_txt ( SolrType . text_general , true , true , true , " names matching title expressions " ) ,
2012-08-21 23:52:56 +02:00
ext_title_val ( SolrType . integer , true , true , true , " number of matching title expressions " ) ;
2012-08-27 14:41:33 +02:00
2012-05-15 22:34:02 +02:00
private String solrFieldName = null ; // solr field name in custom solr schema, defaults to solcell schema field name (= same as this.name() )
private final SolrType type ;
private final boolean indexed , stored ;
private boolean multiValued , omitNorms ;
private String comment ;
2012-05-09 16:46:45 +02:00
2012-08-28 16:58:06 +02:00
private YaCySchema ( final SolrType type , final boolean indexed , final boolean stored , final boolean multiValued , final String comment ) {
2012-05-09 16:46:45 +02:00
this . type = type ;
this . indexed = indexed ;
this . stored = stored ;
2012-08-28 16:58:06 +02:00
this . multiValued = multiValued ;
2012-05-09 16:46:45 +02:00
this . omitNorms = false ;
this . comment = comment ;
2012-08-28 16:58:06 +02:00
assert type . appropriateName ( this . name ( ) , this . multiValued ) : " bad configuration: " + this . name ( ) ;
2012-05-09 16:46:45 +02:00
}
2012-08-10 06:47:13 +02:00
private YaCySchema ( final SolrType type , final boolean indexed , final boolean stored , final boolean multiValued , final boolean omitNorms , final String comment ) {
2012-05-09 16:46:45 +02:00
this ( type , indexed , stored , multiValued , comment ) ;
this . omitNorms = omitNorms ;
2012-08-28 16:58:06 +02:00
assert type . appropriateName ( this . name ( ) , this . multiValued ) : " bad configuration: " + this . name ( ) ;
2012-05-09 16:46:45 +02:00
}
2012-05-15 22:34:02 +02:00
/ * *
* Returns the YaCy default or ( if available ) custom field name for Solr
* @return SolrFieldname String
* /
2012-06-28 13:27:45 +02:00
@Override
2012-05-15 22:34:02 +02:00
public final String getSolrFieldName ( ) {
return ( this . solrFieldName = = null ? this . name ( ) : this . solrFieldName ) ;
}
/ * *
* Set a custom Solr field name ( and converts it to lower case )
* @param theValue = the field name
* /
public final void setSolrFieldName ( String theValue ) {
// make sure no empty string is assigned
if ( ( theValue ! = null ) & & ( ! theValue . isEmpty ( ) ) ) {
this . solrFieldName = theValue . toLowerCase ( ) ;
} else {
this . solrFieldName = null ;
}
}
2012-06-28 13:27:45 +02:00
@Override
2012-05-09 16:46:45 +02:00
public final SolrType getType ( ) {
return this . type ;
}
2012-06-28 13:27:45 +02:00
@Override
2012-05-09 16:46:45 +02:00
public final boolean isIndexed ( ) {
return this . indexed ;
}
2012-06-28 13:27:45 +02:00
@Override
2012-05-09 16:46:45 +02:00
public final boolean isStored ( ) {
return this . stored ;
}
2012-06-28 13:27:45 +02:00
@Override
2012-05-09 16:46:45 +02:00
public final boolean isMultiValued ( ) {
return this . multiValued ;
}
2012-06-28 13:27:45 +02:00
@Override
2012-05-09 16:46:45 +02:00
public final boolean isOmitNorms ( ) {
return this . omitNorms ;
}
2012-06-28 13:27:45 +02:00
@Override
2012-05-09 16:46:45 +02:00
public final String getComment ( ) {
return this . comment ;
2012-08-27 14:41:33 +02:00
}
2012-08-23 09:51:45 +02:00
// Typed convenience writers: each one stores the given value in the document under
// the name returned by getSolrFieldName().
// NOTE(review): despite the name "add", every overload delegates to SolrInputDocument.setField,
// not addField — confirm the replace-vs-append semantics against the SolrJ version in use.

/** Writes a single string value for this field into {@code doc}. */
public final void add(final SolrInputDocument doc, final String value) {
    doc.setField(getSolrFieldName(), value);
}

/** Writes a date value for this field into {@code doc}. */
public final void add(final SolrInputDocument doc, final Date value) {
    doc.setField(getSolrFieldName(), value);
}

/** Writes an int value for this field into {@code doc}. */
public final void add(final SolrInputDocument doc, final int value) {
    doc.setField(getSolrFieldName(), value);
}

/** Writes a long value for this field into {@code doc}. */
public final void add(final SolrInputDocument doc, final long value) {
    doc.setField(getSolrFieldName(), value);
}

/** Writes a string array for this field into {@code doc}; the array is passed through as-is. */
public final void add(final SolrInputDocument doc, final String[] value) {
    doc.setField(getSolrFieldName(), value);
}

/** Writes a list of strings for this field into {@code doc}, converted to a {@code String[]} first. */
public final void add(final SolrInputDocument doc, final List<String> value) {
    final String[] asArray = value.toArray(new String[value.size()]);
    doc.setField(getSolrFieldName(), asArray);
}

/** Writes a float value for this field into {@code doc}. */
public final void add(final SolrInputDocument doc, final float value) {
    doc.setField(getSolrFieldName(), value);
}

/** Writes a double value for this field into {@code doc}. */
public final void add(final SolrInputDocument doc, final double value) {
    doc.setField(getSolrFieldName(), value);
}

/** Writes a boolean value for this field into {@code doc}. */
public final void add(final SolrInputDocument doc, final boolean value) {
    doc.setField(getSolrFieldName(), value);
}

/** Writes a single string value for this field into {@code doc}, passing {@code boost} along to the document. */
public final void add(final SolrInputDocument doc, final String value, final float boost) {
    doc.setField(getSolrFieldName(), value, boost);
}
2012-05-09 16:46:45 +02:00
}