yacy_search_server/defaults/solr.keys.list

320 lines
7.8 KiB
Plaintext
Raw Normal View History

## this is a list of all solr keys
## solr can be used as alternative index target, solr is NOT the primary indexing system of YaCy
## this complete list of keys can be reduced:
## reduced list of keys can be placed in DATA/SETTINGS/solr.keys.<profile>.list
## where they can be used as profiles for solr index transport
## the syntax of this file:
## - all lines beginning with '##' are comments
## - all non-empty lines not beginning with '#' are keyword lines
## - all lines beginning with '#' and where the second character is not '#' are commented-out keyword lines
### mandatory values, do not disable them, YaCy won't work without them
## primary key of document, the URL hash, string (mandatory field)
id
## url of document, string (mandatory field)
sku
## last-modified from http header, date (mandatory field)
last_modified
## mime-type of document, string (mandatory field)
content_type
## content of title tag, text (mandatory field)
title
## id of the host, a 6-byte hash that is part of the document id (mandatory field)
host_id_s
## the md5 of the raw source (mandatory field)
md5_s
## the size of the raw source (mandatory field)
size_i
## index creation comment (mandatory field)
process_s
## fail reason if a page was not loaded. if the page was loaded then this field is empty, text (mandatory field)
failreason_t
## http status return code (e.g. "200" for ok), -1 if not loaded (see content of failreason_t for this case), int (mandatory field)
httpstatus_i
### optional but highly recommended values, part of the index distribution process
## time when resource was loaded
load_date_dt
## date until resource shall be considered as fresh
fresh_date_dt
## ids of referrer to this document
referrer_id_txt
## the name of the publisher of the document
publisher_t
## the language used in the document; starts with primary language
language_txt
## number of links to audio resources
audiolinkscount_i
## number of links to video resources
videolinkscount_i
## number of links to application resources
applinkscount_i
### optional but highly recommended values, not part of the index distribution process
## longitude of location as declared in WGS84, tdouble
lon_coordinate
## latitude of location as declared in WGS84, tdouble
lat_coordinate
## point in degrees of latitude,longitude as declared in WGS84, location
coordinate_p
## ip of host of url (after DNS lookup), string
ip_s
## content of author-tag, textgen
author
## content of description-tag, text
description
## content of keywords tag; words are separated by space, textgen
keywords
## character encoding, string
charset_s
## number of words in visible area, int
wordcount_i
## total number of inbound links, int
inboundlinkscount_i
## number of inbound links with nofollow tag, int
inboundlinksnofollowcount_i
## total number of outbound (external) links, int
outboundlinkscount_i
## number of external links with nofollow tag, int
outboundlinksnofollowcount_i
## number of images, int
imagescount_i
## response time of target server in milliseconds, int
responsetime_i
## all visible text, text
text_t
### optional values, not part of standard YaCy handling (but useful for external applications)
## tags of css entries, normalized with absolute URL, textgen
#css_tag_txt
## urls of css entries, normalized with absolute URL, textgen
#css_url_txt
## number of css entries, int
#csscount_i
## urls of script entries, normalized with absolute URL, textgen
#scripts_txt
## number of script entries, int
#scriptscount_i
## encoded as binary value into an integer:
## bit 0: "all" contained in html header meta
## bit 1: "index" contained in html header meta
## bit 2: "noindex" contained in html header meta
## bit 3: "nofollow" contained in html header meta
## bit 8: "noarchive" contained in http header properties
## bit 9: "nosnippet" contained in http header properties
## bit 10: "noindex" contained in http header properties
## bit 11: "nofollow" contained in http header properties
## bit 12: "unavailable_after" contained in http header properties
## content of <meta name="robots" content=#content#> tag and the "X-Robots-Tag" HTTP property
#robots_i
## content of <meta name="generator" content=#content#> tag, text
#metagenerator_t
## internal links, normalized (absolute URLs), as <a> - tag with anchor text and nofollow, textgen
#inboundlinks_tag_txt
## internal links, only the protocol
#inboundlinks_protocol_txt
## internal links, the url only without the protocol
#inboundlinks_urlstub_txt
## internal links, the name property of the a-tag
#inboundlinks_name_txt
## internal links, the rel property of the a-tag
#inboundlinks_rel_txt
## internal links, the rel property of the a-tag, coded binary
#inboundlinks_relflags_txt
## internal links, the text content of the a-tag
#inboundlinks_text_txt
## external links, normalized (absolute URLs), as <a> - tag with anchor text and nofollow, textgen
#outboundlinks_tag_txt
## external links, only the protocol
#outboundlinks_protocol_txt
## external links, the url only without the protocol
#outboundlinks_urlstub_txt
## external links, the name property of the a-tag
#outboundlinks_name_txt
## external links, the rel property of the a-tag
#outboundlinks_rel_txt
## external links, the rel property of the a-tag, coded binary
#outboundlinks_relflags_txt
## external links, the text content of the a-tag
#outboundlinks_text_txt
## all image tags, encoded as <img> tag including the alt and title properties, textgen
#images_tag_txt
## all image links without the protocol and '://'
#images_urlstub_txt
## all image link protocols
#images_protocol_txt
## all image link alt tag
#images_alt_txt
## h1 header, textgen
h1_txt
## h2 header, textgen
h2_txt
## h3 header, textgen
#h3_txt
## h4 header, textgen
#h4_txt
## h5 header, textgen
#h5_txt
## h6 header, textgen
#h6_txt
## binary pattern for the existence of h1..h6 headlines, int
#htags_i
## all path elements in the url, textgen
#paths_txt
## host of the url, string
#host_s
## url inside the canonical link element, string
#canonical_s
## link from the url property inside the refresh link element, string
#refresh_s
## all texts in <li> tags, textgen
#li_txt
## number of <li> tags, int
#licount_i
## all texts inside of <b> or <strong> tags, without duplicates, sorted by number of occurrences in decreasing order, textgen
bold_txt
## number of occurrences of texts in bold_txt, textgen
#bold_val
## total number of occurrences of <b> or <strong>, int
#boldcount_i
## all texts inside of <i> tags, without duplicates, sorted by number of occurrences in decreasing order, textgen
italic_txt
## number of occurrences of texts in italic_txt, textgen
#italic_val
## total number of occurrences of <i>, int
#italiccount_i
## flag that shows if a swf file is linked, boolean
#flash_b
## list of all links to frames, textgen
#frames_txt
## number of links to frames (entries in frames_txt), int
#framesscount_i
## list of all links to iframes, textgen
#iframes_txt
## number of links to iframes (entries in iframes_txt), int
#iframesscount_i
## names of cms attributes; if several are recognized then they are listed in decreasing order of the number of matching criteria, textgen
#ext_cms_txt
## number of attributes that count for a specific cms in ext_cms_txt, textgen
#ext_cms_val
## names of ad-servers/ad-services, textgen
#ext_ads_txt
## number of attribute counts in ext_ads_txt, textgen
#ext_ads_val
## names of recognized community functions, textgen
#ext_community_txt
## number of attribute counts in ext_community_txt, textgen
#ext_community_val
## names of map services, textgen
#ext_maps_txt
## number of attribute counts in ext_maps_txt, textgen
#ext_maps_val
## names of tracker server, textgen
#ext_tracker_txt
## number of attribute counts in ext_tracker_txt, textgen
#ext_tracker_val
## names matching title expressions, textgen
#ext_title_txt
## number of matching title expressions, textgen
#ext_title_val