2013-02-21 13:23:55 +01:00
## this is a list of all solr keys for the default index 'collection1', the fulltext search index
## this complete list of keys can be changed; the actual schema is stored in:
## DATA/SETTINGS/solr.collection.schema
2011-06-29 17:33:27 +02:00
## the syntax of this file:
## - all lines beginning with '##' are comments
## - all non-empty lines not beginning with '#' are keyword lines
## - all lines beginning with '#' and where the second character is not '#' are commented-out keyword lines
2012-08-21 23:52:56 +02:00
### mandatory values, do not disable them, YaCy won't work without them
2011-06-29 17:33:27 +02:00
2012-08-21 23:52:56 +02:00
## primary key of document, the URL hash, string (mandatory field)
2011-06-29 17:33:27 +02:00
id
2012-08-21 23:52:56 +02:00
## url of document, string (mandatory field)
sku
## last-modified from http header, date (mandatory field)
last_modified
2017-02-20 10:48:07 +01:00
## time when resource was loaded
load_date_dt
2012-08-21 23:52:56 +02:00
## mime-type of document, string (mandatory field)
content_type
## content of title tag, text (mandatory field)
title
## id of the host, a 6-byte hash that is part of the document id (mandatory field)
host_id_s
2017-02-20 10:48:07 +01:00
## host of the url, string
host_s
2012-08-21 23:52:56 +02:00
## the size of the raw source (mandatory field)
size_i
2013-05-06 16:45:54 +02:00
## fail reason if a page was not loaded. if the page was loaded then this field is empty, string (mandatory field)
failreason_s
2012-08-21 23:52:56 +02:00
2012-11-23 14:00:30 +01:00
## fail type if a page was not loaded. This field is either empty, 'excl' or 'fail'
failtype_s
2012-08-21 23:52:56 +02:00
## http status return code (i.e. "200" for ok), -1 if not loaded (see content of failreason_s for this case), int (mandatory field)
httpstatus_i
2017-02-20 10:48:07 +01:00
## the file name extension
url_file_ext_s
## either the second level domain or, if a ccSLD is used, the third level domain. Needed to search in the url
host_organization_s
## internal links, only the protocol. Needed for HostBrowser
inboundlinks_protocol_sxt
## internal links, the url only without the protocol. For correct assembly of inboundlinks inboundlinks_protocol_sxt + inboundlinks_urlstub_sxt is needed
inboundlinks_urlstub_sxt
## external links, only the protocol. For correct assembly of outboundlinks outboundlinks_protocol_sxt + outboundlinks_urlstub_sxt is needed
outboundlinks_protocol_sxt
## external links, the url only without the protocol. Needed to enhance the crawler
outboundlinks_urlstub_sxt
## all image links without the protocol and '://'. For correct assembly of image url images_protocol_sxt + images_urlstub_sxt is needed
images_urlstub_sxt
## all image link protocols
images_protocol_sxt
2017-02-24 11:08:18 +01:00
### optional but highly recommended values, part of the index distribution process
## date until resource shall be considered as fresh
fresh_date_dt
## id of the referrer to this document, discovered during crawling
referrer_id_s
## the name of the publisher of the document
publisher_t
## the language used in the document
language_s
## number of links to audio resources
audiolinkscount_i
## number of links to video resources
videolinkscount_i
## number of links to application resources
applinkscount_i
### optional but highly recommended values, not part of the index distribution process
2017-02-20 10:48:07 +01:00
## the 64 bit hash of the org.apache.solr.update.processor.Lookup3Signature of title, used to compute title_unique_b
#title_exact_signature_l
## flag shows if title is unique within all indexable documents of the same host with status code 200; if yes and another document appears with same title, the unique-flag is set to false, boolean
#title_unique_b
## counter for the number of documents which are not unique (== count of not-unique-flagged documents + 1)
#exact_signature_copycount_i
## intermediate data produced in EnhancedTextProfileSignature: a list of word frequencies
#fuzzy_signature_text_t
## counter for the number of documents which are not unique (== count of not-unique-flagged documents + 1)
#fuzzy_signature_copycount_i
2013-01-02 20:55:43 +01:00
## needed (post-)processing steps on this metadata set
2014-11-27 12:13:20 +01:00
#process_sxt
2013-01-02 20:55:43 +01:00
2017-02-24 11:08:18 +01:00
## if date expressions can be found in the content, these dates are listed here as date objects in order of the appearances
dates_in_content_dts
2013-09-25 14:38:24 +02:00
2017-02-24 11:08:18 +01:00
## the number of entries in dates_in_content_dts
dates_in_content_count_i
2013-01-02 20:55:43 +01:00
2017-02-24 11:08:18 +01:00
## content of itemprop attributes with content='startDate'
startDates_dts
2012-08-21 23:52:56 +02:00
2017-02-24 11:08:18 +01:00
## content of itemprop attributes with content='endDate'
endDates_dts
2012-08-21 23:52:56 +02:00
2017-02-24 11:08:18 +01:00
## number of unique http references, should be equal to references_internal_i + references_external_i
references_i
2012-08-21 23:52:56 +02:00
2017-02-24 11:08:18 +01:00
## number of unique http references from same host to referenced url
references_internal_i
2012-08-21 23:52:56 +02:00
2017-02-24 11:08:18 +01:00
## number of unique http references from external hosts
references_external_i
2012-08-21 23:52:56 +02:00
2017-02-24 11:08:18 +01:00
## number of external hosts which provide http references
references_exthosts_i
2012-08-21 23:52:56 +02:00
2017-02-24 11:08:18 +01:00
## crawl depth of web page according to the number of steps that the crawler did to get to this document; if the crawl was started at a root document, then this is equal to the clickdepth
crawldepth_i
2012-08-21 23:52:56 +02:00
2017-02-24 11:08:18 +01:00
## key from a harvest process (i.e. the crawl profile hash key) which is needed for near-realtime postprocessing. This shall be deleted as soon as postprocessing has been terminated.
harvestkey_s
2012-08-21 23:52:56 +02:00
2017-02-21 22:59:11 +01:00
## unique-field which is true when an url appears the first time. If the same url which was http then appears as https (or vice versa) then the field is false
http_unique_b
## unique-field which is true when an url appears the first time. If the same url within the subdomain www then appears without that subdomain (or vice versa) then the field is false
www_unique_b
## the 64 bit hash of the org.apache.solr.update.processor.Lookup3Signature of text_t
exact_signature_l
## flag shows if exact_signature_l is unique at the time of document creation, used for double-check during search
exact_signature_unique_b
## 64 bit of the Lookup3Signature from EnhancedTextProfileSignature of text_t
fuzzy_signature_l
## flag shows if fuzzy_signature_l is unique at the time of document creation, used for double-check during search
fuzzy_signature_unique_b
2012-09-26 13:38:04 +02:00
## tags that are attached to crawls/index generation to separate the search result into user-defined subsets
collection_sxt
2013-02-11 22:10:14 +01:00
## geospatial point in degrees of latitude,longitude as declared in WGS84, location; this creates two additional subfields, coordinate_p_0_coordinate (latitude) and coordinate_p_1_coordinate (longitude)
2012-08-27 14:41:33 +02:00
coordinate_p
2011-06-29 17:33:27 +02:00
## content of author-tag, text
author
2013-07-30 12:48:57 +02:00
## content of description-tag(s), text
description_txt
2011-06-29 17:33:27 +02:00
2013-04-16 01:35:15 +02:00
## the 64 bit hash of the org.apache.solr.update.processor.Lookup3Signature of description, used to compute description_unique_b
#description_exact_signature_l
2014-07-07 19:15:11 +02:00
## flag shows if description is unique within all indexable documents of the same host with status code 200; if yes and another document appears with same description, the unique-flag is set to false, boolean
2013-01-21 18:02:29 +01:00
#description_unique_b
2012-08-29 16:11:23 +02:00
## content of keywords tag; words are separated by space
2011-06-29 17:33:27 +02:00
keywords
## character encoding, string
charset_s
2012-08-21 23:52:56 +02:00
## number of words in visible area, int
wordcount_i
2014-03-27 23:36:08 +01:00
## number of all outgoing links; including linksnofollowcount_i, int
linkscount_i
## number of all outgoing links with nofollow tag, int
linksnofollowcount_i
## number of outgoing inbound (to same domain) links; including inboundlinksnofollowcount_i, int
2012-08-21 23:52:56 +02:00
inboundlinkscount_i
2014-03-27 23:36:08 +01:00
## number of outgoing inbound (to same domain) links with nofollow tag, int
2015-12-19 21:25:08 +01:00
#inboundlinksnofollowcount_i
2012-08-21 23:52:56 +02:00
2014-03-27 23:36:08 +01:00
## number of outgoing outbound (to other domain) links, including outboundlinksnofollowcount_i, int
2012-08-21 23:52:56 +02:00
outboundlinkscount_i
2014-03-27 23:36:08 +01:00
## number of outgoing outbound (to other domain) links with nofollow tag, int
2015-12-19 21:25:08 +01:00
#outboundlinksnofollowcount_i
2012-08-21 23:52:56 +02:00
## number of images, int
imagescount_i
## response time of target server in milliseconds, int
responsetime_i
## all visible text, text
text_t
2012-10-02 11:13:06 +02:00
## additional synonyms to the words in the text
synonyms_sxt
2012-10-02 00:02:50 +02:00
2012-08-29 16:11:23 +02:00
## h1 header
2012-08-28 16:58:06 +02:00
h1_txt
2012-08-29 16:11:23 +02:00
## h2 header
2012-08-28 16:58:06 +02:00
h2_txt
2012-08-29 16:11:23 +02:00
## h3 header
2012-08-28 16:58:06 +02:00
h3_txt
2012-08-29 16:11:23 +02:00
## h4 header
2012-08-28 16:58:06 +02:00
h4_txt
2012-08-29 16:11:23 +02:00
## h5 header
2012-08-28 16:58:06 +02:00
h5_txt
2012-08-29 16:11:23 +02:00
## h6 header
2012-08-28 16:58:06 +02:00
h6_txt
2017-02-21 22:59:11 +01:00
### unused, delete candidates
## the md5 of the raw source
#md5_s
## redirect url if the error code is 299 < httpstatus_i < 310
#httpstatus_redirect_s
2012-08-21 23:52:56 +02:00
### optional values, not part of standard YaCy handling (but useful for external applications)
2012-09-26 13:38:04 +02:00
## ip of host of url (after DNS lookup), string
#ip_s
2012-09-03 15:26:08 +02:00
2012-08-29 16:11:23 +02:00
## tags of css entries, normalized with absolute URL
2013-05-06 16:45:54 +02:00
#css_tag_sxt
2011-08-31 18:02:06 +02:00
2012-08-29 16:11:23 +02:00
## urls of css entries, normalized with absolute URL
2013-05-06 16:45:54 +02:00
#css_url_sxt
2011-06-29 17:33:27 +02:00
## number of css entries, int
2012-08-21 23:52:56 +02:00
#csscount_i
2011-06-29 17:33:27 +02:00
2012-08-29 16:11:23 +02:00
## urls of script entries, normalized with absolute URL
2013-05-06 16:45:54 +02:00
#scripts_sxt
2011-06-29 17:33:27 +02:00
2013-05-06 16:45:54 +02:00
## number of entries in scripts_sxt, int
2012-08-21 23:52:56 +02:00
#scriptscount_i
2011-06-29 17:33:27 +02:00
2013-07-02 14:29:13 +02:00
## noindex and nofollow attributes
## from HTML (meta-tag in HTML header: robots)
## and HTTP header (X-Robots-Tag property)
## coded as binary value:
2011-09-30 15:39:01 +02:00
## bit 0: "all" contained in html header meta
## bit 1: "index" contained in html header meta
2013-07-02 14:29:13 +02:00
## bit 2: "follow" contained in html header meta
## bit 3: "noindex" contained in html header meta
## bit 4: "nofollow" contained in html header meta
## bit 8: "all" contained in http header X-Robots-Tag
## bit 9: "noindex" contained in http header X-Robots-Tag
## bit 10: "nofollow" contained in http header X-Robots-Tag
## bit 11: "noarchive" contained in http header X-Robots-Tag
## bit 12: "nosnippet" contained in http header X-Robots-Tag
## bit 13: "noodp" contained in http header X-Robots-Tag
## bit 14: "notranslate" contained in http header X-Robots-Tag
## bit 15: "noimageindex" contained in http header X-Robots-Tag
## bit 16: "unavailable_after" contained in http header X-Robots-Tag
2012-08-21 23:52:56 +02:00
#robots_i
2011-06-29 17:33:27 +02:00
## content of <meta name="generator" content=#content#> tag, text
2012-08-21 23:52:56 +02:00
#metagenerator_t
2011-06-29 17:33:27 +02:00
2013-10-08 18:41:07 +02:00
## internal links, the visible anchor text
inboundlinks_anchortext_txt
## external links, the visible anchor text
outboundlinks_anchortext_txt
2016-02-02 09:57:54 +01:00
## all icon links without the protocol and '://'
icons_urlstub_sxt
## all icon links protocols : split from icons_urlstub to provide some compression, as http protocol is implied as default and not stored
icons_protocol_sxt
## all icon links relationships space separated (e.g. 'icon apple-touch-icon')
icons_rel_sxt
## all icon sizes space separated (e.g. '16x16 32x32')
icons_sizes_sxt
2013-06-18 13:28:30 +02:00
## all text/words appearing in image alt texts or the tokenized url
images_text_t
2011-08-31 18:02:06 +02:00
## all image link alt tag
2013-09-04 10:47:18 +02:00
images_alt_sxt
2013-06-18 13:28:30 +02:00
## size of images:height
images_height_val
## size of images:width
images_width_val
## size of images as number of pixels (easier for ranking than using width and height)
2014-01-10 10:26:45 +01:00
#images_pixel_val
2011-08-31 18:02:06 +02:00
2012-09-07 21:33:45 +02:00
## number of image links with alt tag
#images_withalt_i
2011-06-29 17:33:27 +02:00
## binary pattern for the existence of h1..h6 headlines, int
2012-08-21 23:52:56 +02:00
#htags_i
2011-06-29 17:33:27 +02:00
2011-07-01 18:38:01 +02:00
## url inside the canonical link element, string
2013-03-13 14:47:00 +01:00
#canonical_s
2011-07-01 18:38:01 +02:00
2013-01-21 18:02:29 +01:00
## flag shows if the url in canonical_s is equal to sku, boolean
#canonical_equal_sku_b
2012-06-28 13:27:45 +02:00
## link from the url property inside the refresh link element, string
2012-08-21 23:52:56 +02:00
#refresh_s
2012-06-28 13:27:45 +02:00
2012-08-29 16:11:23 +02:00
## all texts in <li> tags
2012-08-21 23:52:56 +02:00
#li_txt
2011-06-29 17:33:27 +02:00
## number of <li> tags, int
2012-08-21 23:52:56 +02:00
#licount_i
2011-06-29 17:33:27 +02:00
2015-04-12 22:02:45 +02:00
## all texts in <dt> tags
#dt_txt
## number of <dt> tags, int
#dtcount_i
## all texts in <dd> tags
#dd_txt
## number of <dd> tags, int
#ddcount_i
## all texts in <article> tags
#article_txt
## number of <article> tags, int
#articlecount_i
2012-08-29 16:11:23 +02:00
## all texts inside of <b> or <strong> tags. no doubles. listed in the order of number of occurrences in decreasing order
2012-01-13 11:25:15 +01:00
bold_txt
2011-06-29 17:33:27 +02:00
2012-08-29 16:11:23 +02:00
## number of occurrences of texts in bold_txt
2012-01-13 11:25:15 +01:00
#bold_val
2011-06-29 17:33:27 +02:00
## total number of occurrences of <b> or <strong>, int
2012-08-21 23:52:56 +02:00
#boldcount_i
2011-06-29 17:33:27 +02:00
2012-08-29 16:11:23 +02:00
## all texts inside of <i> tags. no doubles. listed in the order of number of occurrences in decreasing order
2012-01-13 11:25:15 +01:00
italic_txt
2011-06-29 17:33:27 +02:00
2012-08-29 16:11:23 +02:00
## number of occurrences of texts in italic_txt
2012-01-13 11:25:15 +01:00
#italic_val
2011-06-29 17:33:27 +02:00
## total number of occurrences of <i>, int
2012-08-21 23:52:56 +02:00
#italiccount_i
2011-06-29 17:33:27 +02:00
2012-10-01 14:16:49 +02:00
## all texts inside of <u> tags. no doubles. listed in the order of number of occurrences in decreasing order
underline_txt
## number of occurrences of texts in underline_txt
#underline_val
## total number of occurrences of <u>, int
#underlinecount_i
2011-06-30 17:49:21 +02:00
## flag that shows if a swf file is linked, boolean
2012-08-21 23:52:56 +02:00
#flash_b
2011-06-29 17:33:27 +02:00
2012-08-29 16:11:23 +02:00
## list of all links to frames
2013-03-13 14:47:00 +01:00
#frames_sxt
2011-06-29 17:33:27 +02:00
## number of entries in frames_sxt, int
2012-08-21 23:52:56 +02:00
#framesscount_i
2011-06-29 17:33:27 +02:00
2012-08-29 16:11:23 +02:00
## list of all links to iframes
2013-03-13 14:47:00 +01:00
#iframes_sxt
2011-06-29 17:33:27 +02:00
## number of entries in iframes_sxt, int
2012-08-21 23:52:56 +02:00
#iframesscount_i
2011-06-29 17:33:27 +02:00
2013-04-18 17:21:17 +02:00
## url of the hreflang link tag, see http://support.google.com/webmasters/bin/answer.py?hl=de&answer=189077
#hreflang_url_sxt
## country code of the hreflang link tag, see http://support.google.com/webmasters/bin/answer.py?hl=de&answer=189077
#hreflang_cc_sxt
## page navigation url, see http://googlewebmastercentral.blogspot.de/2011/09/pagination-with-relnext-and-relprev.html
#navigation_url_sxt
## page navigation rel property value, can contain one of {top,up,next,prev,first,last}
#navigation_type_sxt
## publisher url as defined in http://support.google.com/plus/answer/1713826?hl=de
#publisher_url_s
2012-08-29 16:11:23 +02:00
## the protocol of the url
2012-09-11 22:46:39 +02:00
url_protocol_s
2012-08-29 16:11:23 +02:00
2013-06-25 16:27:20 +02:00
## the file name (which is the string after the last '/' and before the query part from '?' on) without the file extension
url_file_name_s
2012-08-29 16:11:23 +02:00
2013-10-08 23:48:13 +02:00
## tokens generated from url_file_name_s which can be used for better matching and result boosting
2014-01-10 10:26:45 +01:00
#url_file_name_tokens_t
2013-10-08 23:48:13 +02:00
2014-10-13 23:51:19 +02:00
## number of all path elements in the url hpath (see: http://www.ietf.org/rfc/rfc1738.txt) without the file name
url_paths_count_i
2013-06-25 16:27:20 +02:00
## all path elements in the url hpath (see: http://www.ietf.org/rfc/rfc1738.txt) without the file name
url_paths_sxt
2012-08-29 16:11:23 +02:00
## number of key-value pairs in search part of the url
#url_parameter_i
## the keys from key-value pairs in the search part of the url
#url_parameter_key_sxt
## the values from key-value pairs in the search part of the url
#url_parameter_value_sxt
## number of all characters in the url == length of sku field
2012-12-02 16:53:02 +01:00
url_chars_i
2012-08-29 16:11:23 +02:00
2012-08-28 16:58:06 +02:00
## the Domain Class Name, either the TLD or a combination of ccSLD+TLD if a ccSLD is used.
#host_dnc_s
## the organization and dnc concatenated with '.'
#host_organizationdnc_s
## the remaining part of the host without organizationdnc
#host_subdomain_s
2013-04-14 20:52:40 +02:00
## number of documents from the same host; can be used to measure references_internal_i for likelihood computation, integer
host_extent_i
2012-08-31 10:30:43 +02:00
## number of titles (counting the 'title' field) in the document
#title_count_i
## number of characters for each title
#title_chars_val
## number of words in each title
#title_words_val
## number of descriptions in the document. It's not counting the 'description' field since there is only one. But it counts the number of descriptions that appear in the document (if any)
#description_count_i
## number of characters for each description
#description_chars_val
## number of words in each description
#description_words_val
2012-09-04 14:11:11 +02:00
## number of h1..h6 header lines
#h1_i
#h2_i
#h3_i
#h4_i
#h5_i
#h6_i
2012-10-09 13:02:43 +02:00
## breadcrumbs, see http://schema.org/WebPage; this is a counter how many itemprop="breadcrumb" properties in div tags appear within a page
#schema_org_breadcrumb_i
2012-10-09 17:28:48 +02:00
## Open Graph Metadata field, see http://ogp.me/ns#
#opengraph_title_t
#opengraph_type_s
#opengraph_url_s
#opengraph_image_s
2013-06-07 13:20:57 +02:00
## citation ranking
## the number of documents within a single host
2013-12-04 17:48:12 +01:00
#cr_host_count_i
2013-06-07 13:20:57 +02:00
## the chance to click on this page when randomly clicking on links within one host
2013-12-04 17:48:12 +01:00
#cr_host_chance_d
2013-06-07 13:20:57 +02:00
## normalization of chance: 0 for lower half of cr_host_count_i urls, 1 for 1/2 of the remaining and so on. the maximum number is 10
2013-12-04 17:48:12 +01:00
#cr_host_norm_i
2013-10-08 23:48:13 +02:00
## custom rating; to be set with external rating information
2014-01-10 10:26:45 +01:00
#rating_i
2013-06-07 13:20:57 +02:00
2012-08-29 16:11:23 +02:00
## names of cms attributes; if several are recognized then they are listed in decreasing order of number of matching criteria
2012-01-13 11:25:15 +01:00
#ext_cms_txt
2011-06-29 17:33:27 +02:00
2012-08-31 10:30:43 +02:00
## number of attributes that count for a specific cms in ext_cms_txt
2012-01-13 11:25:15 +01:00
#ext_cms_val
2011-06-29 17:33:27 +02:00
2012-08-29 16:11:23 +02:00
## names of ad-servers/ad-services
2012-01-13 11:25:15 +01:00
#ext_ads_txt
2011-06-29 17:33:27 +02:00
2012-08-29 16:11:23 +02:00
## number of attribute counts in ext_ads_txt
2012-01-13 11:25:15 +01:00
#ext_ads_val
2011-06-29 17:33:27 +02:00
2012-08-29 16:11:23 +02:00
## names of recognized community functions
2012-01-13 11:25:15 +01:00
#ext_community_txt
2011-06-29 17:33:27 +02:00
2012-08-29 16:11:23 +02:00
## number of attribute counts in ext_community_txt
2012-01-13 11:25:15 +01:00
#ext_community_val
2011-06-29 17:33:27 +02:00
2012-08-29 16:11:23 +02:00
## names of map services
2012-01-13 11:25:15 +01:00
#ext_maps_txt
2011-06-29 17:33:27 +02:00
2012-08-29 16:11:23 +02:00
## number of attribute counts in ext_maps_txt
2012-01-13 11:25:15 +01:00
#ext_maps_val
2011-06-29 17:33:27 +02:00
2012-08-29 16:11:23 +02:00
## names of tracker server
2012-01-13 11:25:15 +01:00
#ext_tracker_txt
2011-06-29 17:33:27 +02:00
2012-08-29 16:11:23 +02:00
## number of attribute counts in ext_tracker_txt
2012-01-13 11:25:15 +01:00
#ext_tracker_val
2011-06-29 17:33:27 +02:00
2012-08-29 16:11:23 +02:00
## names matching title expressions
2012-01-13 11:25:15 +01:00
#ext_title_txt
2011-08-31 18:02:06 +02:00
2012-08-29 16:11:23 +02:00
## number of matching title expressions
2012-01-13 11:25:15 +01:00
#ext_title_val
2014-11-18 15:02:34 +01:00
## collection of all vocabulary names that have a matcher in the document - use this to boost with vocabularies
vocabularies_sxt