2011-06-29 17:33:27 +02:00
## this is a list of all solr keys
## solr can be used as alternative index target, solr is NOT the primary indexing system of YaCy
## this complete list of keys can be reduced:
## reduced list of keys can be placed in DATA/SETTINGS/solr.keys.<profile>.list
## where they can be used as profiles for solr index transport
## the syntax of this file:
## - all lines beginning with '##' are comments
## - all non-empty lines not beginning with '#' are keyword lines
## - all lines beginning with '#' and where the second character is not '#' are commented-out keyword lines
2012-08-21 23:52:56 +02:00
### mandatory values, do not disable them, YaCy won't work without them
2011-06-29 17:33:27 +02:00
2012-08-21 23:52:56 +02:00
## primary key of document, the URL hash, string (mandatory field)
2011-06-29 17:33:27 +02:00
id
2012-08-21 23:52:56 +02:00
## url of document, string (mandatory field)
sku
## last-modified from http header, date (mandatory field)
last_modified
## mime-type of document, string (mandatory field)
content_type
## content of title tag, text (mandatory field)
title
## id of the host, a 6-byte hash that is part of the document id (mandatory field)
host_id_s
## the md5 of the raw source (mandatory field)
md5_s
## the size of the raw source (mandatory field)
size_i
## index creation comment (mandatory field)
process_s
## fail reason if a page was not loaded. if the page was loaded then this field is empty, text (mandatory field)
failreason_t
## http status return code (e.g. "200" for ok), -1 if not loaded (see content of failreason_t for this case), int (mandatory field)
httpstatus_i
2012-08-28 16:58:06 +02:00
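## redirect url if the http status code in httpstatus_i is a redirect (3xx), string (presumably; the previous text was a copy of the httpstatus_i description - verify against the schema)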
#httpstatus_redirect_s
2012-08-21 23:52:56 +02:00
### optional but highly recommended values, part of the index distribution process
## time when resource was loaded
load_date_dt
## date until resource shall be considered as fresh
fresh_date_dt
## ids of referrer to this document
referrer_id_txt
## the name of the publisher of the document
publisher_t
2012-08-28 16:58:06 +02:00
## the language used in the document
language_s
2012-08-21 23:52:56 +02:00
## number of links to audio resources
audiolinkscount_i
## number of links to video resources
videolinkscount_i
## number of links to application resources
applinkscount_i
### optional but highly recommended values, not part of the index distribution process
2012-09-26 13:38:04 +02:00
## tags that are attached to crawls/index generation to separate the search result into user-defined subsets
collection_sxt
2012-08-27 14:41:33 +02:00
## point in degrees of latitude,longitude as declared in WGS84, location
coordinate_p
2011-06-29 17:33:27 +02:00
## content of author-tag, texgen
author
## content of description-tag, text
description
2012-08-29 16:11:23 +02:00
## content of keywords tag; words are separated by space
2011-06-29 17:33:27 +02:00
keywords
## character encoding, string
charset_s
2012-08-21 23:52:56 +02:00
## number of words in visible area, int
wordcount_i
## total number of inbound links, int
inboundlinkscount_i
## number of inbound links with nofollow tag, int
inboundlinksnofollowcount_i
## total number of outbound (external) links, int
outboundlinkscount_i
## number of external links with nofollow tag, int
outboundlinksnofollowcount_i
## number of images, int
imagescount_i
## response time of target server in milliseconds, int
responsetime_i
## all visible text, text
text_t
2012-10-02 11:13:06 +02:00
## additional synonyms to the words in the text
synonyms_sxt
2012-10-02 00:02:50 +02:00
2012-08-29 16:11:23 +02:00
## h1 header
2012-08-28 16:58:06 +02:00
h1_txt
2012-08-29 16:11:23 +02:00
## h2 header
2012-08-28 16:58:06 +02:00
h2_txt
2012-08-29 16:11:23 +02:00
## h3 header
2012-08-28 16:58:06 +02:00
h3_txt
2012-08-29 16:11:23 +02:00
## h4 header
2012-08-28 16:58:06 +02:00
h4_txt
2012-08-29 16:11:23 +02:00
## h5 header
2012-08-28 16:58:06 +02:00
h5_txt
2012-08-29 16:11:23 +02:00
## h6 header
2012-08-28 16:58:06 +02:00
h6_txt
2012-08-21 23:52:56 +02:00
### optional values, not part of standard YaCy handling (but useful for external applications)
2012-09-26 13:38:04 +02:00
## ip of host of url (after DNS lookup), string
#ip_s
2012-09-03 15:26:08 +02:00
2012-08-29 16:11:23 +02:00
## tags of css entries, normalized with absolute URL
2012-08-21 23:52:56 +02:00
#css_tag_txt
2011-08-31 18:02:06 +02:00
2012-08-29 16:11:23 +02:00
## urls of css entries, normalized with absolute URL
2012-08-21 23:52:56 +02:00
#css_url_txt
2011-06-29 17:33:27 +02:00
## number of css entries, int
2012-08-21 23:52:56 +02:00
#csscount_i
2011-06-29 17:33:27 +02:00
2012-08-29 16:11:23 +02:00
## urls of script entries, normalized with absolute URL
2012-08-21 23:52:56 +02:00
#scripts_txt
2011-06-29 17:33:27 +02:00
## number of script entries, int
2012-08-21 23:52:56 +02:00
#scriptscount_i
2011-06-29 17:33:27 +02:00
2011-09-30 15:39:01 +02:00
## encoded as binary value into an integer:
## bit 0: "all" contained in html header meta
## bit 1: "index" contained in html header meta
## bit 2: "noindex" contained in html header meta
## bit 3: "nofollow" contained in html header meta
## bit 8: "noarchive" contained in http header properties
## bit 9: "nosnippet" contained in http header properties
## bit 10: "noindex" contained in http header properties
## bit 11: "nofollow" contained in http header properties
## bit 12: "unavailable_after" contained in http header properties
## content of <meta name="robots" content=#content#> tag and the "X-Robots-Tag" HTTP property
2012-08-21 23:52:56 +02:00
#robots_i
2011-06-29 17:33:27 +02:00
## content of <meta name="generator" content=#content#> tag, text
2012-08-21 23:52:56 +02:00
#metagenerator_t
2011-06-29 17:33:27 +02:00
2012-08-29 16:11:23 +02:00
## internal links, normalized (absolute URLs), as <a> - tag with anchor text and nofollow
2012-08-21 23:52:56 +02:00
#inboundlinks_tag_txt
2011-08-31 18:02:06 +02:00
## internal links, only the protocol
2012-09-28 22:45:16 +02:00
inboundlinks_protocol_sxt
2011-08-31 18:02:06 +02:00
## internal links, the url only without the protocol
2012-09-28 22:45:16 +02:00
inboundlinks_urlstub_txt
2011-08-31 18:02:06 +02:00
## internal links, the name property of the a-tag
2012-01-13 11:25:15 +01:00
#inboundlinks_name_txt
2011-08-31 18:02:06 +02:00
## internal links, the rel property of the a-tag
2012-08-28 16:58:06 +02:00
#inboundlinks_rel_sxt
2011-08-31 18:02:06 +02:00
## internal links, the rel property of the a-tag, coded binary
2012-08-31 10:30:43 +02:00
#inboundlinks_relflags_val
2011-08-31 18:02:06 +02:00
## internal links, the text content of the a-tag
2012-01-13 11:25:15 +01:00
#inboundlinks_text_txt
2011-06-29 17:33:27 +02:00
2012-09-07 22:06:51 +02:00
## internal links, the length of the a-tag as number of characters
#inboundlinks_text_chars_val
## internal links, the length of the a-tag as number of words
#inboundlinks_text_words_val
## if the link is an image link, this contains the alt tag if the image is also linked as img link
#inboundlinks_alttag_txt
2012-08-29 16:11:23 +02:00
## external links, normalized (absolute URLs), as <a> - tag with anchor text and nofollow
2012-08-21 23:52:56 +02:00
#outboundlinks_tag_txt
2011-06-29 17:33:27 +02:00
2011-08-31 18:02:06 +02:00
## external links, only the protocol
2012-09-28 22:45:16 +02:00
outboundlinks_protocol_sxt
2011-08-31 18:02:06 +02:00
## external links, the url only without the protocol
2012-09-28 22:45:16 +02:00
outboundlinks_urlstub_txt
2011-08-31 18:02:06 +02:00
## external links, the name property of the a-tag
2012-01-13 11:25:15 +01:00
#outboundlinks_name_txt
2011-08-31 18:02:06 +02:00
## external links, the rel property of the a-tag
2012-08-28 16:58:06 +02:00
#outboundlinks_rel_sxt
2011-08-31 18:02:06 +02:00
2011-09-26 23:42:28 +02:00
## external links, the rel property of the a-tag, coded binary
2012-08-31 10:30:43 +02:00
#outboundlinks_relflags_val
2011-09-26 23:42:28 +02:00
2011-08-31 18:02:06 +02:00
## external links, the text content of the a-tag
2012-01-13 11:25:15 +01:00
#outboundlinks_text_txt
2011-08-31 18:02:06 +02:00
2012-09-07 22:06:51 +02:00
## external links, the length of the a-tag as number of characters
#outboundlinks_text_chars_val
## external links, the length of the a-tag as number of words
#outboundlinks_text_words_val
## if the link is an image link, this contains the alt tag if the image is also linked as img link
#outboundlinks_alttag_txt
2012-08-29 16:11:23 +02:00
## all image tags, encoded as <img> tag inclusive alt- and title property
2012-08-21 23:52:56 +02:00
#images_tag_txt
2011-08-31 18:02:06 +02:00
## all image links without the protocol and '://'
2012-01-13 11:25:15 +01:00
#images_urlstub_txt
2011-08-31 18:02:06 +02:00
## all image link protocols
2012-08-28 16:58:06 +02:00
#images_protocol_sxt
2011-08-31 18:02:06 +02:00
## all image link alt tag
2012-01-13 11:25:15 +01:00
#images_alt_txt
2011-08-31 18:02:06 +02:00
2012-09-07 21:33:45 +02:00
## number of image links with alt tag
#images_withalt_i
2011-06-29 17:33:27 +02:00
## binary pattern for the existence of h1..h6 headlines, int
2012-08-21 23:52:56 +02:00
#htags_i
2011-06-29 17:33:27 +02:00
2011-07-01 18:38:01 +02:00
## url inside the canonical link element, string
2012-08-28 16:58:06 +02:00
#canonical_t
2011-07-01 18:38:01 +02:00
2012-06-28 13:27:45 +02:00
## link from the url property inside the refresh link element, string
2012-08-21 23:52:56 +02:00
#refresh_s
2012-06-28 13:27:45 +02:00
2012-08-29 16:11:23 +02:00
## all texts in <li> tags
2012-08-21 23:52:56 +02:00
#li_txt
2011-06-29 17:33:27 +02:00
## number of <li> tags, int
2012-08-21 23:52:56 +02:00
#licount_i
2011-06-29 17:33:27 +02:00
2012-08-29 16:11:23 +02:00
## all texts inside of <b> or <strong> tags. no doubles. listed in the order of number of occurrences in decreasing order
2012-01-13 11:25:15 +01:00
bold_txt
2011-06-29 17:33:27 +02:00
2012-08-29 16:11:23 +02:00
## number of occurrences of texts in bold_txt
2012-01-13 11:25:15 +01:00
#bold_val
2011-06-29 17:33:27 +02:00
## total number of occurrences of <b> or <strong>, int
2012-08-21 23:52:56 +02:00
#boldcount_i
2011-06-29 17:33:27 +02:00
2012-08-29 16:11:23 +02:00
## all texts inside of <i> tags. no doubles. listed in the order of number of occurrences in decreasing order
2012-01-13 11:25:15 +01:00
italic_txt
2011-06-29 17:33:27 +02:00
2012-08-29 16:11:23 +02:00
## number of occurrences of texts in italic_txt
2012-01-13 11:25:15 +01:00
#italic_val
2011-06-29 17:33:27 +02:00
## total number of occurrences of <i>, int
2012-08-21 23:52:56 +02:00
#italiccount_i
2011-06-29 17:33:27 +02:00
2012-10-01 14:16:49 +02:00
## all texts inside of <u> tags. no doubles. listed in the order of number of occurrences in decreasing order
underline_txt
## number of occurrences of texts in underline_txt
#underline_val
## total number of occurrences of <u>, int
#underlinecount_i
2011-06-30 17:49:21 +02:00
## flag that shows if a swf file is linked, boolean
2012-08-21 23:52:56 +02:00
#flash_b
2011-06-29 17:33:27 +02:00
2012-08-29 16:11:23 +02:00
## list of all links to frames
2012-08-21 23:52:56 +02:00
#frames_txt
2011-06-29 17:33:27 +02:00
## number of attr_frames, int
2012-08-21 23:52:56 +02:00
#framesscount_i
2011-06-29 17:33:27 +02:00
2012-08-29 16:11:23 +02:00
## list of all links to iframes
2012-08-21 23:52:56 +02:00
#iframes_txt
2011-06-29 17:33:27 +02:00
## number of attr_iframes, int
2012-08-21 23:52:56 +02:00
#iframesscount_i
2011-06-29 17:33:27 +02:00
2012-08-29 16:11:23 +02:00
## the protocol of the url
2012-09-11 22:46:39 +02:00
url_protocol_s
2012-08-29 16:11:23 +02:00
## all path elements in the url
2012-09-11 20:15:54 +02:00
url_paths_sxt
2012-08-29 16:11:23 +02:00
2012-09-11 22:46:39 +02:00
## the file name extension
url_file_ext_s
2012-08-29 16:11:23 +02:00
## number of key-value pairs in search part of the url
#url_parameter_i
## the keys from key-value pairs in the search part of the url
#url_parameter_key_sxt
## the values from key-value pairs in the search part of the url
#url_parameter_value_sxt
## number of all characters in the url == length of sku field
#url_chars_i
2012-08-28 16:58:06 +02:00
## host of the url, string
2012-09-11 20:15:54 +02:00
host_s
2012-08-28 16:58:06 +02:00
## the Domain Class Name, either the TLD or a combination of ccSLD+TLD if a ccSLD is used.
#host_dnc_s
## either the second level domain or, if a ccSLD is used, the third level domain
2012-09-28 22:45:16 +02:00
host_organization_s
2012-08-28 16:58:06 +02:00
## the organization and dnc concatenated with '.'
#host_organizationdnc_s
## the remaining part of the host without organizationdnc
#host_subdomain_s
2012-08-31 10:30:43 +02:00
## number of titles (counting the 'title' field) in the document
#title_count_i
## number of characters for each title
#title_chars_val
## number of words in each title
#title_words_val
## number of descriptions in the document. It does not count the 'description' field since there is only one, but it counts the number of descriptions that appear in the document (if any)
#description_count_i
## number of characters for each description
#description_chars_val
## number of words in each description
#description_words_val
2012-09-04 14:11:11 +02:00
## number of h1..h6 header lines
#h1_i
#h2_i
#h3_i
#h4_i
#h5_i
#h6_i
2012-10-09 13:02:43 +02:00
## breadcrumbs, see http://schema.org/WebPage; this counts how many itemprop="breadcrumb" properties in div tags appear within a page
#schema_org_breadcrumb_i
2012-10-09 17:28:48 +02:00
## Open Graph Metadata field, see http://ogp.me/ns#
#opengraph_title_t
#opengraph_type_s
#opengraph_url_s
#opengraph_image_s
2012-08-29 16:11:23 +02:00
## names of cms attributes; if several are recognized then they are listed in decreasing order of the number of matching criteria
2012-01-13 11:25:15 +01:00
#ext_cms_txt
2011-06-29 17:33:27 +02:00
2012-08-31 10:30:43 +02:00
## number of attributes that count for a specific cms in ext_cms_txt
2012-01-13 11:25:15 +01:00
#ext_cms_val
2011-06-29 17:33:27 +02:00
2012-08-29 16:11:23 +02:00
## names of ad-servers/ad-services
2012-01-13 11:25:15 +01:00
#ext_ads_txt
2011-06-29 17:33:27 +02:00
2012-08-29 16:11:23 +02:00
## number of attribute counts in ext_ads_txt
2012-01-13 11:25:15 +01:00
#ext_ads_val
2011-06-29 17:33:27 +02:00
2012-08-29 16:11:23 +02:00
## names of recognized community functions
2012-01-13 11:25:15 +01:00
#ext_community_txt
2011-06-29 17:33:27 +02:00
2012-08-29 16:11:23 +02:00
## number of attribute counts in ext_community_txt
2012-01-13 11:25:15 +01:00
#ext_community_val
2011-06-29 17:33:27 +02:00
2012-08-29 16:11:23 +02:00
## names of map services
2012-01-13 11:25:15 +01:00
#ext_maps_txt
2011-06-29 17:33:27 +02:00
2012-08-29 16:11:23 +02:00
## number of attribute counts in ext_maps_txt
2012-01-13 11:25:15 +01:00
#ext_maps_val
2011-06-29 17:33:27 +02:00
2012-08-29 16:11:23 +02:00
## names of tracker server
2012-01-13 11:25:15 +01:00
#ext_tracker_txt
2011-06-29 17:33:27 +02:00
2012-08-29 16:11:23 +02:00
## number of attribute counts in ext_tracker_txt
2012-01-13 11:25:15 +01:00
#ext_tracker_val
2011-06-29 17:33:27 +02:00
2012-08-29 16:11:23 +02:00
## names matching title expressions
2012-01-13 11:25:15 +01:00
#ext_title_txt
2011-08-31 18:02:06 +02:00
2012-08-29 16:11:23 +02:00
## number of matching title expressions
2012-01-13 11:25:15 +01:00
#ext_title_val