mirror of
https://github.com/yacy/yacy_search_server.git
synced 2024-09-19 00:01:41 +02:00
- added a solr type definition verifier
- fixed type definition found by the verifier - added multivalue-string fields for solr with extension 'sxt' - added multivalue-integer fields for solr with extension 'val' - renamed some solr attributes from txt to sxt - changed solr query line to an explicit AND/OR structure - added a country code second level domain list to Domains class; with parser - added a host string parser to get domain class name, country-code second-level domain and subdomain out of it - removed old coordinate attributes
This commit is contained in:
parent
4521d63c92
commit
316b5fe116
|
@ -44,6 +44,9 @@ failreason_t
|
|||
## html status return code (i.e. "200" for ok), -1 if not loaded (see content of failreason_t for this case), int (mandatory field)
|
||||
httpstatus_i
|
||||
|
||||
## html status return code (i.e. \"200\" for ok), -1 if not loaded
|
||||
#httpstatus_redirect_s
|
||||
|
||||
|
||||
### optional but highly recommended values, part of the index distribution process
|
||||
|
||||
|
@ -59,8 +62,8 @@ referrer_id_txt
|
|||
## the name of the publisher of the document
|
||||
publisher_t
|
||||
|
||||
## the language used in the document; starts with primary language
|
||||
language_txt
|
||||
## the language used in the document
|
||||
language_s
|
||||
|
||||
## number of links to audio resources
|
||||
audiolinkscount_i
|
||||
|
@ -74,12 +77,6 @@ applinkscount_i
|
|||
|
||||
### optional but highly recommended values, not part of the index distribution process
|
||||
|
||||
## longitude of location as declared in WGS84, tdouble
|
||||
lon_coordinate
|
||||
|
||||
## latitude of location as declared in WGS84, tdouble
|
||||
lat_coordinate
|
||||
|
||||
## point in degrees of latitude,longitude as declared in WGS84, location
|
||||
coordinate_p
|
||||
|
||||
|
@ -122,6 +119,24 @@ responsetime_i
|
|||
## all visible text, text
|
||||
text_t
|
||||
|
||||
## h1 header, textgen
|
||||
h1_txt
|
||||
|
||||
## h2 header, textgen
|
||||
h2_txt
|
||||
|
||||
## h3 header, textgen
|
||||
h3_txt
|
||||
|
||||
## h4 header, textgen
|
||||
h4_txt
|
||||
|
||||
## h5 header, textgen
|
||||
h5_txt
|
||||
|
||||
## h6 header, textgen
|
||||
h6_txt
|
||||
|
||||
|
||||
### optional values, not part of standard YaCy handling (but useful for external applications)
|
||||
|
||||
|
@ -160,7 +175,7 @@ text_t
|
|||
#inboundlinks_tag_txt
|
||||
|
||||
## internal links, only the protocol
|
||||
#inboundlinks_protocol_txt
|
||||
#inboundlinks_protocol_sxt
|
||||
|
||||
## internal links, the url only without the protocol
|
||||
#inboundlinks_urlstub_txt
|
||||
|
@ -169,10 +184,10 @@ text_t
|
|||
#inboundlinks_name_txt
|
||||
|
||||
## internal links, the rel property of the a-tag
|
||||
#inboundlinks_rel_txt
|
||||
#inboundlinks_rel_sxt
|
||||
|
||||
## internal links, the rel property of the a-tag, coded binary
|
||||
#inboundlinks_relflags_txt
|
||||
#inboundlinks_relflags_sxt
|
||||
|
||||
## internal links, the text content of the a-tag
|
||||
#inboundlinks_text_txt
|
||||
|
@ -181,7 +196,7 @@ text_t
|
|||
#outboundlinks_tag_txt
|
||||
|
||||
## external links, only the protocol
|
||||
#outboundlinks_protocol_txt
|
||||
#outboundlinks_protocol_sxt
|
||||
|
||||
## external links, the url only without the protocol
|
||||
#outboundlinks_urlstub_txt
|
||||
|
@ -190,10 +205,10 @@ text_t
|
|||
#outboundlinks_name_txt
|
||||
|
||||
## external links, the rel property of the a-tag
|
||||
#outboundlinks_rel_txt
|
||||
#outboundlinks_rel_sxt
|
||||
|
||||
## external links, the rel property of the a-tag, coded binary
|
||||
#outboundlinks_relflags_txt
|
||||
#outboundlinks_relflags_sxt
|
||||
|
||||
## external links, the text content of the a-tag
|
||||
#outboundlinks_text_txt
|
||||
|
@ -205,40 +220,19 @@ text_t
|
|||
#images_urlstub_txt
|
||||
|
||||
## all image link protocols
|
||||
#images_protocol_txt
|
||||
#images_protocol_sxt
|
||||
|
||||
## all image link alt tag
|
||||
#images_alt_txt
|
||||
|
||||
## h1 header, textgen
|
||||
h1_txt
|
||||
|
||||
## h2 header, textgen
|
||||
h2_txt
|
||||
|
||||
## h3 header, textgen
|
||||
#h3_txt
|
||||
|
||||
## h4 header, textgen
|
||||
#h4_txt
|
||||
|
||||
## h5 header, textgen
|
||||
#h5_txt
|
||||
|
||||
## h6 header, textgen
|
||||
#h6_txt
|
||||
|
||||
## binary pattern for the existence of h1..h6 headlines, int
|
||||
#htags_i
|
||||
|
||||
## all path elements in the url, textgen
|
||||
#paths_txt
|
||||
|
||||
## host of the url, string
|
||||
#host_s
|
||||
|
||||
## url inside the canonical link element, string
|
||||
#canonical_s
|
||||
#canonical_t
|
||||
|
||||
## link from the url property inside the refresh link element, string
|
||||
#refresh_s
|
||||
|
@ -282,6 +276,24 @@ italic_txt
|
|||
## number of attr_iframes, int
|
||||
#iframesscount_i
|
||||
|
||||
## host of the url, string
|
||||
#host_s
|
||||
|
||||
## the protocol of the url
|
||||
#host_protocol_s
|
||||
|
||||
## the Domain Class Name, either the TLD or a combination of ccSLD+TLD if a ccSLD is used.
|
||||
#host_dnc_s
|
||||
|
||||
## either the second level domain or, if a ccSLD is used, the third level domain
|
||||
#host_organization_s
|
||||
|
||||
## the organization and dnc concatenated with '.'
|
||||
#host_organizationdnc_s
|
||||
|
||||
## the remaining part of the host without organizationdnc
|
||||
#host_subdomain_s
|
||||
|
||||
## names of cms attributes; if several are recognized then they are listed in decreasing order of number of matching criteria, textgen
|
||||
#ext_cms_txt
|
||||
|
||||
|
|
|
@ -154,9 +154,9 @@
|
|||
a "*" only at the start or the end. -->
|
||||
|
||||
<dynamicField name="*_i" type="int" indexed="true" stored="true"/>
|
||||
<dynamicField name="*_is" type="int" indexed="true" stored="true" multiValued="true"/>
|
||||
<dynamicField name="*_s" type="string" indexed="true" stored="true" />
|
||||
<dynamicField name="*_ss" type="string" indexed="true" stored="true" multiValued="true"/>
|
||||
<dynamicField name="*_val" type="int" indexed="true" stored="true" multiValued="true"/> <!-- YaCy special -->
|
||||
<dynamicField name="*_sxt" type="string" indexed="true" stored="true" multiValued="true"/> <!-- YaCy special -->
|
||||
<dynamicField name="*_l" type="long" indexed="true" stored="true"/>
|
||||
<dynamicField name="*_ls" type="long" indexed="true" stored="true" multiValued="true"/>
|
||||
<dynamicField name="*_t" type="text_general" indexed="true" stored="true"/>
|
||||
|
|
|
@ -38,6 +38,22 @@ public class schema_p {
|
|||
|
||||
// write scheme
|
||||
int c = 0;
|
||||
/*
|
||||
//<field name="#[solrname]#" type="#[type]#"#(indexedChecked)#:: indexed="true"#(/indexedChecked)##(storedChecked)#:: stored="true"#(/storedChecked)##(multiValuedChecked)#:: multiValued="true"#(/multiValuedChecked)##(omitNormsChecked)#:: omitNorms="true"#(/omitNormsChecked)#/>
|
||||
if (sb == null) {
|
||||
for (SolrType type : SolrType.values()) {
|
||||
prop.put("fields_" + c + "_solrname", field.getSolrFieldName());
|
||||
prop.put("fields_" + c + "_type", field.getType().printName());
|
||||
prop.put("fields_" + c + "_comment", field.getComment());
|
||||
prop.put("fields_" + c + "_indexedChecked", field.isIndexed() ? 1 : 0);
|
||||
prop.put("fields_" + c + "_storedChecked", field.isStored() ? 1 : 0);
|
||||
prop.put("fields_" + c + "_multiValuedChecked", field.isMultiValued() ? 1 : 0);
|
||||
prop.put("fields_" + c + "_omitNormsChecked", field.isOmitNorms() ? 1 : 0);
|
||||
c++;
|
||||
}
|
||||
prop.put("fields", c);
|
||||
} else {
|
||||
*/
|
||||
SolrConfiguration solrScheme = sb.index.fulltext().getSolrScheme();
|
||||
for (YaCySchema field : YaCySchema.values()) {
|
||||
if (solrScheme.contains(field.name())) {
|
||||
|
@ -52,6 +68,7 @@ public class schema_p {
|
|||
}
|
||||
}
|
||||
prop.put("fields", c);
|
||||
//}
|
||||
|
||||
prop.put("solruniquekey",YaCySchema.id.getSolrFieldName());
|
||||
prop.put("solrdefaultsearchfield",YaCySchema.text_t.getSolrFieldName());
|
||||
|
|
|
@ -68,14 +68,7 @@ public class Domains {
|
|||
|
||||
private static Class<?> InetAddressLocatorClass;
|
||||
private static Method InetAddressLocatorGetLocaleInetAddressMethod;
|
||||
|
||||
static {
|
||||
// using http://javainetlocator.sourceforge.net/ if library is present
|
||||
// this class is accessed via reflection so that the library can be removed; it is old and no longer maintained
|
||||
InetAddressLocatorClass = ClassProvider.load("net.sf.javainetlocator.InetAddressLocator", new File("lib/InetAddressLocator.jar"));
|
||||
InetAddressLocatorGetLocaleInetAddressMethod = ClassProvider.getStaticMethod(InetAddressLocatorClass, "getLocale", new Class[]{InetAddress.class});
|
||||
}
|
||||
|
||||
private static final Set<String> ccSLD_TLD = new HashSet<String>();
|
||||
private static final String PRESENT = "";
|
||||
private static final String LOCAL_PATTERNS = "10\\..*,127\\..*,172\\.(1[6-9]|2[0-9]|3[0-1])\\..*,169\\.254\\..*,192\\.168\\..*,localhost";
|
||||
private static final int MAX_NAME_CACHE_HIT_SIZE = 100000;
|
||||
|
@ -410,6 +403,153 @@ public class Domains {
|
|||
"GOPHER=OpenNIC",
|
||||
"MICRO=OpenNIC"
|
||||
};
|
||||
private static final String[] ccSLD_TLD_list = new String[] { "com.ac", "net.ac", "gov.ac", "org.ac", "mil.ac", "co.ae",
|
||||
"net.ae", "gov.ae", "ac.ae", "sch.ae", "org.ae", "mil.ae", "pro.ae", "name.ae", "com.af", "edu.af", "gov.af", "net.af", "org.af",
|
||||
"com.al", "edu.al", "gov.al", "mil.al", "net.al", "org.al", "ed.ao", "gv.ao", "og.ao", "co.ao", "pb.ao", "it.ao", "com.ar", "edu.ar",
|
||||
"gob.ar", "gov.ar", "int.ar", "mil.ar", "net.ar", "org.ar", "tur.ar", "gv.at", "ac.at", "co.at", "or.at", "com.au", "net.au", "org.au",
|
||||
"edu.au", "gov.au", "csiro.au", "asn.au", "id.au", "org.ba", "net.ba", "edu.ba", "gov.ba", "mil.ba", "unsa.ba", "untz.ba", "unmo.ba",
|
||||
"unbi.ba", "unze.ba", "co.ba", "com.ba", "rs.ba", "co.bb", "com.bb", "net.bb", "org.bb", "gov.bb", "edu.bb", "info.bb", "store.bb",
|
||||
"tv.bb", "biz.bb", "com.bh", "info.bh", "cc.bh", "edu.bh", "biz.bh", "net.bh", "org.bh", "gov.bh", "com.bn", "edu.bn", "gov.bn",
|
||||
"net.bn", "org.bn", "com.bo", "net.bo", "org.bo", "tv.bo", "mil.bo", "int.bo", "gob.bo", "gov.bo", "edu.bo", "adm.br", "adv.br",
|
||||
"agr.br", "am.br", "arq.br", "art.br", "ato.br", "b.br", "bio.br", "blog.br", "bmd.br", "cim.br", "cng.br", "cnt.br", "com.br",
|
||||
"coop.br", "ecn.br", "edu.br", "eng.br", "esp.br", "etc.br", "eti.br", "far.br", "flog.br", "fm.br", "fnd.br", "fot.br", "fst.br",
|
||||
"g12.br", "ggf.br", "gov.br", "imb.br", "ind.br", "inf.br", "jor.br", "jus.br", "lel.br", "mat.br", "med.br", "mil.br", "mus.br",
|
||||
"net.br", "nom.br", "not.br", "ntr.br", "odo.br", "org.br", "ppg.br", "pro.br", "psc.br", "psi.br", "qsl.br", "rec.br", "slg.br",
|
||||
"srv.br", "tmp.br", "trd.br", "tur.br", "tv.br", "vet.br", "vlog.br", "wiki.br", "zlg.br", "com.bs", "net.bs", "org.bs", "edu.bs",
|
||||
"gov.bs", "com.bz", "edu.bz", "gov.bz", "net.bz", "org.bz", "om.bz", "du.bz", "ov.bz", "et.bz", "rg.bz", "ab.ca", "bc.ca", "mb.ca",
|
||||
"nb.ca", "nf.ca", "nl.ca", "ns.ca", "nt.ca", "nu.ca", "on.ca", "pe.ca", "qc.ca", "sk.ca", "yk.ca", "co.ck", "org.ck", "edu.ck", "gov.ck",
|
||||
"net.ck", "gen.ck", "biz.ck", "info.ck", "ac.cn", "com.cn", "edu.cn", "gov.cn", "mil.cn", "net.cn", "org.cn", "ah.cn", "bj.cn", "cq.cn",
|
||||
"fj.cn", "gd.cn", "gs.cn", "gz.cn", "gx.cn", "ha.cn", "hb.cn", "he.cn", "hi.cn", "hl.cn", "hn.cn", "jl.cn", "js.cn", "jx.cn", "ln.cn",
|
||||
"nm.cn", "nx.cn", "qh.cn", "sc.cn", "sd.cn", "sh.cn", "sn.cn", "sx.cn", "tj.cn", "tw.cn", "xj.cn", "xz.cn", "yn.cn", "zj.cn", "com.co",
|
||||
"org.co", "edu.co", "gov.co", "net.co", "mil.co", "nom.co", "ac.cr", "co.cr", "ed.cr", "fi.cr", "go.cr", "or.cr", "sa.cr", "cr", "ac.cy",
|
||||
"net.cy", "gov.cy", "org.cy", "pro.cy", "name.cy", "ekloges.cy", "tm.cy", "ltd.cy", "biz.cy", "press.cy", "parliament.cy", "com.cy",
|
||||
"edu.do", "gob.do", "gov.do", "com.do", "sld.do", "org.do", "net.do", "web.do", "mil.do", "art.do", "com.dz", "org.dz", "net.dz",
|
||||
"gov.dz", "edu.dz", "asso.dz", "pol.dz", "art.dz", "com.ec", "info.ec", "net.ec", "fin.ec", "med.ec", "pro.ec", "org.ec", "edu.ec",
|
||||
"gov.ec", "mil.ec", "com.eg", "edu.eg", "eun.eg", "gov.eg", "mil.eg", "name.eg", "net.eg", "org.eg", "sci.eg", "com.er", "edu.er",
|
||||
"gov.er", "mil.er", "net.er", "org.er", "ind.er", "rochest.er", "w.er", "com.es", "nom.es", "org.es", "gob.es", "edu.es", "com.et",
|
||||
"gov.et", "org.et", "edu.et", "net.et", "biz.et", "name.et", "info.et", "ac.fj", "biz.fj", "com.fj", "info.fj", "mil.fj", "name.fj",
|
||||
"net.fj", "org.fj", "pro.fj", "co.fk", "org.fk", "gov.fk", "ac.fk", "nom.fk", "net.fk", "fr", "tm.fr", "asso.fr", "nom.fr", "prd.fr",
|
||||
"presse.fr", "com.fr", "gouv.fr", "co.gg", "net.gg", "org.gg", "com.gh", "edu.gh", "gov.gh", "org.gh", "mil.gh", "com.gn", "ac.gn",
|
||||
"gov.gn", "org.gn", "net.gn", "com.gr", "edu.gr", "net.gr", "org.gr", "gov.gr", "mil.gr", "com.gt", "edu.gt", "net.gt", "gob.gt",
|
||||
"org.gt", "mil.gt", "ind.gt", "com.gu", "net.gu", "gov.gu", "org.gu", "edu.gu", "com.hk", "edu.hk", "gov.hk", "idv.hk", "net.hk",
|
||||
"org.hk", "ac.id", "co.id", "net.id", "or.id", "web.id", "sch.id", "mil.id", "go.id", "war.net.id", "ac.il", "co.il", "org.il", "net.il",
|
||||
"k12.il", "gov.il", "muni.il", "idf.il", "in", "co.in", "firm.in", "net.in", "org.in", "gen.in", "ind.in", "ac.in", "edu.in", "res.in",
|
||||
"ernet.in", "gov.in", "mil.in", "nic.in", "iq", "gov.iq", "edu.iq", "com.iq", "mil.iq", "org.iq", "net.iq", "ir", "ac.ir", "co.ir",
|
||||
"gov.ir", "id.ir", "net.ir", "org.ir", "sch.ir", "dnssec.ir", "gov.it", "edu.it", "co.je", "net.je", "org.je", "com.jo", "net.jo",
|
||||
"gov.jo", "edu.jo", "org.jo", "mil.jo", "name.jo", "sch.jo", "ac.jp", "ad.jp", "co.jp", "ed.jp", "go.jp", "gr.jp", "lg.jp", "ne.jp",
|
||||
"or.jp", "co.ke", "or.ke", "ne.ke", "go.ke", "ac.ke", "sc.ke", "me.ke", "mobi.ke", "info.ke", "per.kh", "com.kh", "edu.kh", "gov.kh",
|
||||
"mil.kh", "net.kh", "org.kh", "com.ki", "biz.ki", "de.ki", "net.ki", "info.ki", "org.ki", "gov.ki", "edu.ki", "mob.ki", "tel.ki", "km",
|
||||
"com.km", "coop.km", "asso.km", "nom.km", "presse.km", "tm.km", "medecin.km", "notaires.km", "pharmaciens.km", "veterinaire.km",
|
||||
"edu.km", "gouv.km", "mil.km", "net.kn", "org.kn", "edu.kn", "gov.kn", "kr", "co.kr", "ne.kr", "or.kr", "re.kr", "pe.kr", "go.kr",
|
||||
"mil.kr", "ac.kr", "hs.kr", "ms.kr", "es.kr", "sc.kr", "kg.kr", "seoul.kr", "busan.kr", "daegu.kr", "incheon.kr", "gwangju.kr",
|
||||
"daejeon.kr", "ulsan.kr", "gyeonggi.kr", "gangwon.kr", "chungbuk.kr", "chungnam.kr", "jeonbuk.kr", "jeonnam.kr", "gyeongbuk.kr",
|
||||
"gyeongnam.kr", "jeju.kr", "edu.kw", "com.kw", "net.kw", "org.kw", "gov.kw", "com.ky", "org.ky", "net.ky", "edu.ky", "gov.ky", "com.kz",
|
||||
"edu.kz", "gov.kz", "mil.kz", "net.kz", "org.kz", "com.lb", "edu.lb", "gov.lb", "net.lb", "org.lb", "gov.lk", "sch.lk", "net.lk",
|
||||
"int.lk", "com.lk", "org.lk", "edu.lk", "ngo.lk", "soc.lk", "web.lk", "ltd.lk", "assn.lk", "grp.lk", "hotel.lk", "com.lr", "edu.lr",
|
||||
"gov.lr", "org.lr", "net.lr", "com.lv", "edu.lv", "gov.lv", "org.lv", "mil.lv", "id.lv", "net.lv", "asn.lv", "conf.lv", "com.ly",
|
||||
"net.ly", "gov.ly", "plc.ly", "edu.ly", "sch.ly", "med.ly", "org.ly", "id.ly", "ma", "net.ma", "ac.ma", "org.ma", "gov.ma", "press.ma",
|
||||
"co.ma", "tm.mc", "asso.mc", "co.me", "net.me", "org.me", "edu.me", "ac.me", "gov.me", "its.me", "priv.me", "org.mg", "nom.mg", "gov.mg",
|
||||
"prd.mg", "tm.mg", "edu.mg", "mil.mg", "com.mg", "com.mk", "org.mk", "net.mk", "edu.mk", "gov.mk", "inf.mk", "name.mk", "pro.mk",
|
||||
"com.ml", "net.ml", "org.ml", "edu.ml", "gov.ml", "presse.ml", "gov.mn", "edu.mn", "org.mn", "com.mo", "edu.mo", "gov.mo", "net.mo",
|
||||
"org.mo", "com.mt", "org.mt", "net.mt", "edu.mt", "gov.mt", "aero.mv", "biz.mv", "com.mv", "coop.mv", "edu.mv", "gov.mv", "info.mv",
|
||||
"int.mv", "mil.mv", "museum.mv", "name.mv", "net.mv", "org.mv", "pro.mv", "ac.mw", "co.mw", "com.mw", "coop.mw", "edu.mw", "gov.mw",
|
||||
"int.mw", "museum.mw", "net.mw", "org.mw", "com.mx", "net.mx", "org.mx", "edu.mx", "gob.mx", "com.my", "net.my", "org.my", "gov.my",
|
||||
"edu.my", "sch.my", "mil.my", "name.my", "com.nf", "net.nf", "arts.nf", "store.nf", "web.nf", "firm.nf", "info.nf", "other.nf", "per.nf",
|
||||
"rec.nf", "com.ng", "org.ng", "gov.ng", "edu.ng", "net.ng", "sch.ng", "name.ng", "mobi.ng", "biz.ng", "mil.ng", "gob.ni", "co.ni",
|
||||
"com.ni", "ac.ni", "edu.ni", "org.ni", "nom.ni", "net.ni", "mil.ni", "com.np", "edu.np", "gov.np", "org.np", "mil.np", "net.np",
|
||||
"edu.nr", "gov.nr", "biz.nr", "info.nr", "net.nr", "org.nr", "com.nr", "com.om", "co.om", "edu.om", "ac.om", "sch.om", "gov.om",
|
||||
"net.om", "org.om", "mil.om", "museum.om", "biz.om", "pro.om", "med.om", "edu.pe", "gob.pe", "nom.pe", "mil.pe", "sld.pe", "org.pe",
|
||||
"com.pe", "net.pe", "com.ph", "net.ph", "org.ph", "mil.ph", "ngo.ph", "i.ph", "gov.ph", "edu.ph", "com.pk", "net.pk", "edu.pk", "org.pk",
|
||||
"fam.pk", "biz.pk", "web.pk", "gov.pk", "gob.pk", "gok.pk", "gon.pk", "gop.pk", "gos.pk", "pwr.pl", "com.pl", "biz.pl", "net.pl",
|
||||
"art.pl", "edu.pl", "org.pl", "ngo.pl", "gov.pl", "info.pl", "mil.pl", "waw.pl", "warszawa.pl", "wroc.pl", "wroclaw.pl", "krakow.pl",
|
||||
"katowice.pl", "poznan.pl", "lodz.pl", "gda.pl", "gdansk.pl", "slupsk.pl", "radom.pl", "szczecin.pl", "lublin.pl", "bialystok.pl",
|
||||
"olsztyn.pl", "torun.pl", "gorzow.pl", "zgora.pl", "biz.pr", "com.pr", "edu.pr", "gov.pr", "info.pr", "isla.pr", "name.pr", "net.pr",
|
||||
"org.pr", "pro.pr", "est.pr", "prof.pr", "ac.pr", "com.ps", "net.ps", "org.ps", "edu.ps", "gov.ps", "plo.ps", "sec.ps", "co.pw", "ne.pw",
|
||||
"or.pw", "ed.pw", "go.pw", "belau.pw", "arts.ro", "com.ro", "firm.ro", "info.ro", "nom.ro", "nt.ro", "org.ro", "rec.ro", "store.ro",
|
||||
"tm.ro", "www.ro", "co.rs", "org.rs", "edu.rs", "ac.rs", "gov.rs", "in.rs", "com.sb", "net.sb", "edu.sb", "org.sb", "gov.sb", "com.sc",
|
||||
"net.sc", "edu.sc", "gov.sc", "org.sc", "co.sh", "com.sh", "org.sh", "gov.sh", "edu.sh", "net.sh", "nom.sh", "com.sl", "net.sl",
|
||||
"org.sl", "edu.sl", "gov.sl", "gov.st", "saotome.st", "principe.st", "consulado.st", "embaixada.st", "org.st", "edu.st", "net.st",
|
||||
"com.st", "store.st", "mil.st", "co.st", "edu.sv", "gob.sv", "com.sv", "org.sv", "red.sv", "co.sz", "ac.sz", "org.sz", "com.tr",
|
||||
"gen.tr", "org.tr", "biz.tr", "info.tr", "av.tr", "dr.tr", "pol.tr", "bel.tr", "tsk.tr", "bbs.tr", "k12.tr", "edu.tr", "name.tr",
|
||||
"net.tr", "gov.tr", "web.tr", "tel.tr", "tv.tr", "co.tt", "com.tt", "org.tt", "net.tt", "biz.tt", "info.tt", "pro.tt", "int.tt",
|
||||
"coop.tt", "jobs.tt", "mobi.tt", "travel.tt", "museum.tt", "aero.tt", "cat.tt", "tel.tt", "name.tt", "mil.tt", "edu.tt", "gov.tt",
|
||||
"edu.tw", "gov.tw", "mil.tw", "com.tw", "net.tw", "org.tw", "idv.tw", "game.tw", "ebiz.tw", "club.tw", "com.mu", "gov.mu", "net.mu",
|
||||
"org.mu", "ac.mu", "co.mu", "or.mu", "ac.mz", "co.mz", "edu.mz", "org.mz", "gov.mz", "com.na", "co.na", "ac.nz", "co.nz", "cri.nz",
|
||||
"geek.nz", "gen.nz", "govt.nz", "health.nz", "iwi.nz", "maori.nz", "mil.nz", "net.nz", "org.nz", "parliament.nz", "school.nz", "abo.pa",
|
||||
"ac.pa", "com.pa", "edu.pa", "gob.pa", "ing.pa", "med.pa", "net.pa", "nom.pa", "org.pa", "sld.pa", "com.pt", "edu.pt", "gov.pt",
|
||||
"int.pt", "net.pt", "nome.pt", "org.pt", "publ.pt", "com.py", "edu.py", "gov.py", "mil.py", "net.py", "org.py", "com.qa", "edu.qa",
|
||||
"gov.qa", "mil.qa", "net.qa", "org.qa", "asso.re", "com.re", "nom.re", "ac.ru", "adygeya.ru", "altai.ru", "amur.ru", "arkhangelsk.ru",
|
||||
"astrakhan.ru", "bashkiria.ru", "belgorod.ru", "bir.ru", "bryansk.ru", "buryatia.ru", "cbg.ru", "chel.ru", "chelyabinsk.ru", "chita.ru",
|
||||
"chukotka.ru", "chuvashia.ru", "com.ru", "dagestan.ru", "e-burg.ru", "edu.ru", "gov.ru", "grozny.ru", "int.ru", "irkutsk.ru",
|
||||
"ivanovo.ru", "izhevsk.ru", "jar.ru", "joshkar-ola.ru", "kalmykia.ru", "kaluga.ru", "kamchatka.ru", "karelia.ru", "kazan.ru", "kchr.ru",
|
||||
"kemerovo.ru", "khabarovsk.ru", "khakassia.ru", "khv.ru", "kirov.ru", "koenig.ru", "komi.ru", "kostroma.ru", "kranoyarsk.ru", "kuban.ru",
|
||||
"kurgan.ru", "kursk.ru", "lipetsk.ru", "magadan.ru", "mari.ru", "mari-el.ru", "marine.ru", "mil.ru", "mordovia.ru", "mosreg.ru",
|
||||
"msk.ru", "murmansk.ru", "nalchik.ru", "net.ru", "nnov.ru", "nov.ru", "novosibirsk.ru", "nsk.ru", "omsk.ru", "orenburg.ru", "org.ru",
|
||||
"oryol.ru", "penza.ru", "perm.ru", "pp.ru", "pskov.ru", "ptz.ru", "rnd.ru", "ryazan.ru", "sakhalin.ru", "samara.ru", "saratov.ru",
|
||||
"simbirsk.ru", "smolensk.ru", "spb.ru", "stavropol.ru", "stv.ru", "surgut.ru", "tambov.ru", "tatarstan.ru", "tom.ru", "tomsk.ru",
|
||||
"tsaritsyn.ru", "tsk.ru", "tula.ru", "tuva.ru", "tver.ru", "tyumen.ru", "udm.ru", "udmurtia.ru", "ulan-ude.ru", "vladikavkaz.ru",
|
||||
"vladimir.ru", "vladivostok.ru", "volgograd.ru", "vologda.ru", "voronezh.ru", "vrn.ru", "vyatka.ru", "yakutia.ru", "yamal.ru",
|
||||
"yekaterinburg.ru", "yuzhno-sakhalinsk.ru", "ac.rw", "co.rw", "com.rw", "edu.rw", "gouv.rw", "gov.rw", "int.rw", "mil.rw", "net.rw",
|
||||
"com.sa", "edu.sa", "gov.sa", "med.sa", "net.sa", "org.sa", "pub.sa", "sch.sa", "com.sd", "edu.sd", "gov.sd", "info.sd", "med.sd",
|
||||
"net.sd", "org.sd", "tv.sd", "a.se", "ac.se", "b.se", "bd.se", "c.se", "d.se", "e.se", "f.se", "g.se", "h.se", "i.se", "k.se", "l.se",
|
||||
"m.se", "n.se", "o.se", "org.se", "p.se", "parti.se", "pp.se", "press.se", "r.se", "s.se", "t.se", "tm.se", "u.se", "w.se", "x.se",
|
||||
"y.se", "z.se", "com.sg", "edu.sg", "gov.sg", "idn.sg", "net.sg", "org.sg", "per.sg", "art.sn", "com.sn", "edu.sn", "gouv.sn", "org.sn",
|
||||
"perso.sn", "univ.sn", "com.sy", "edu.sy", "gov.sy", "mil.sy", "net.sy", "news.sy", "org.sy", "ac.th", "co.th", "go.th", "in.th",
|
||||
"mi.th", "net.th", "or.th", "ac.tj", "biz.tj", "co.tj", "com.tj", "edu.tj", "go.tj", "gov.tj", "info.tj", "int.tj", "mil.tj", "name.tj",
|
||||
"net.tj", "nic.tj", "org.tj", "test.tj", "web.tj", "agrinet.tn", "com.tn", "defense.tn", "edunet.tn", "ens.tn", "fin.tn", "gov.tn",
|
||||
"ind.tn", "info.tn", "intl.tn", "mincom.tn", "nat.tn", "net.tn", "org.tn", "perso.tn", "rnrt.tn", "rns.tn", "rnu.tn", "tourism.tn",
|
||||
"ac.tz", "co.tz", "go.tz", "ne.tz", "or.tz", "biz.ua", "cherkassy.ua", "chernigov.ua", "chernovtsy.ua", "ck.ua", "cn.ua", "co.ua",
|
||||
"com.ua", "crimea.ua", "cv.ua", "dn.ua", "dnepropetrovsk.ua", "donetsk.ua", "dp.ua", "edu.ua", "gov.ua", "if.ua", "in.ua",
|
||||
"ivano-frankivsk.ua", "kh.ua", "kharkov.ua", "kherson.ua", "khmelnitskiy.ua", "kiev.ua", "kirovograd.ua", "km.ua", "kr.ua", "ks.ua",
|
||||
"kv.ua", "lg.ua", "lugansk.ua", "lutsk.ua", "lviv.ua", "me.ua", "mk.ua", "net.ua", "nikolaev.ua", "od.ua", "odessa.ua", "org.ua",
|
||||
"pl.ua", "poltava.ua", "pp.ua", "rovno.ua", "rv.ua", "sebastopol.ua", "sumy.ua", "te.ua", "ternopil.ua", "uzhgorod.ua", "vinnica.ua",
|
||||
"vn.ua", "zaporizhzhe.ua", "zhitomir.ua", "zp.ua", "zt.ua", "ac.ug", "co.ug", "go.ug", "ne.ug", "or.ug", "org.ug", "sc.ug", "ac.uk",
|
||||
"bl.uk", "british-library.uk", "co.uk", "cym.uk", "gov.uk", "govt.uk", "icnet.uk", "jet.uk", "lea.uk", "ltd.uk", "me.uk", "mil.uk",
|
||||
"mod.uk", "national-library-scotland.uk", "nel.uk", "net.uk", "nhs.uk", "nic.uk", "nls.uk", "org.uk", "orgn.uk", "parliament.uk",
|
||||
"plc.uk", "police.uk", "sch.uk", "scot.uk", "soc.uk", "dni.us", "fed.us", "isa.us", "kids.us", "nsn.us", "com.uy", "edu.uy", "gub.uy",
|
||||
"mil.uy", "net.uy", "org.uy", "co.ve", "com.ve", "edu.ve", "gob.ve", "info.ve", "mil.ve", "net.ve", "org.ve", "web.ve", "co.vi",
|
||||
"com.vi", "k12.vi", "net.vi", "org.vi", "ac.vn", "biz.vn", "com.vn", "edu.vn", "gov.vn", "health.vn", "info.vn", "int.vn", "name.vn",
|
||||
"net.vn", "org.vn", "pro.vn", "co.ye", "com.ye", "gov.ye", "ltd.ye", "me.ye", "net.ye", "org.ye", "plc.ye", "ac.yu", "co.yu", "edu.yu",
|
||||
"gov.yu", "org.yu", "ac.za", "agric.za", "alt.za", "bourse.za", "city.za", "co.za", "cybernet.za", "db.za", "ecape.school.za", "edu.za",
|
||||
"fs.school.za", "gov.za", "gp.school.za", "grondar.za", "iaccess.za", "imt.za", "inca.za", "kzn.school.za", "landesign.za", "law.za",
|
||||
"lp.school.za", "mil.za", "mpm.school.za", "ncape.school.za", "net.za", "ngo.za", "nis.za", "nom.za", "nw.school.za", "olivetti.za",
|
||||
"org.za", "pix.za", "school.za", "tm.za", "wcape.school.za", "web.za", "ac.zm", "co.zm", "com.zm", "edu.zm", "gov.zm", "net.zm",
|
||||
"org.zm", "sch.zm", "e164.arpa", "au.com", "br.com", "cn.com", "de.com", "eu.com", "gb.com", "hu.com", "no.com", "qc.com", "ru.com",
|
||||
"sa.com", "se.com", "uk.com", "us.com", "uy.com", "za.com", "de.net", "gb.net", "uk.net", "dk.org", "eu.org", "edu.ac", "com.ae",
|
||||
"com.ai", "edu.ai", "gov.ai", "org.ai", "uba.ar", "esc.edu.ar", "priv.at", "conf.au", "info.au", "otc.au", "oz.au", "telememo.au",
|
||||
"com.az", "net.az", "org.az", "ac.be", "belgie.be", "dns.be", "fgov.be", "com.bm", "edu.bm", "gov.bm", "net.bm", "org.bm", "sp.br",
|
||||
"hk.cn", "mo.cn", "arts.co", "firm.co", "info.co", "int.co", "rec.co", "store.co", "web.co", "com.cu", "net.cu", "org.cu", "co.dk",
|
||||
"ass.dz", "k12.ec", "gov.fj", "id.fj", "school.fj", "com.fk", "aeroport.fr", "assedic.fr", "avocat.fr", "avoues.fr", "barreau.fr",
|
||||
"cci.fr", "chambagri.fr", "chirurgiens-dentistes.fr", "experts-comptables.fr", "geometre-expert.fr", "greta.fr", "huissier-justice.fr",
|
||||
"medecin.fr", "notaires.fr", "pharmacien.fr", "port.fr", "veterinaire.fr", "com.ge", "edu.ge", "gov.ge", "mil.ge", "net.ge", "org.ge",
|
||||
"pvt.ge", "ac.gg", "alderney.gg", "gov.gg", "guernsey.gg", "ind.gg", "ltd.gg", "sark.gg", "sch.gg", "mil.gu", "2000.hu", "agrar.hu",
|
||||
"bolt.hu", "casino.hu", "city.hu", "co.hu", "erotica.hu", "erotika.hu", "film.hu", "forum.hu", "games.hu", "hotel.hu", "info.hu",
|
||||
"ingatlan.hu", "jogasz.hu", "konyvelo.hu", "lakas.hu", "media.hu", "news.hu", "org.hu", "priv.hu", "reklam.hu", "sex.hu", "shop.hu",
|
||||
"sport.hu", "suli.hu", "szex.hu", "tm.hu", "tozsde.hu", "utazas.hu", "video.hu", "ac.im", "co.im", "gov.im", "net.im", "nic.im",
|
||||
"org.im", "ac.je", "gov.je", "ind.je", "jersey.je", "ltd.je", "sch.je", "aichi.jp", "akita.jp", "aomori.jp", "chiba.jp", "ehime.jp",
|
||||
"fukui.jp", "fukuoka.jp", "fukushima.jp", "gifu.jp", "gov.jp", "gunma.jp", "hiroshima.jp", "hokkaido.jp", "hyogo.jp", "ibaraki.jp",
|
||||
"ishikawa.jp", "iwate.jp", "kagawa.jp", "kagoshima.jp", "kanagawa.jp", "kanazawa.jp", "kawasaki.jp", "kitakyushu.jp", "kobe.jp",
|
||||
"kochi.jp", "kumamoto.jp", "kyoto.jp", "matsuyama.jp", "mie.jp", "miyagi.jp", "miyazaki.jp", "nagano.jp", "nagasaki.jp", "nagoya.jp",
|
||||
"nara.jp", "net.jp", "niigata.jp", "oita.jp", "okayama.jp", "okinawa.jp", "org.jp", "osaka.jp", "saga.jp", "saitama.jp", "sapporo.jp",
|
||||
"sendai.jp", "shiga.jp", "shimane.jp", "shizuoka.jp", "takamatsu.jp", "tochigi.jp", "tokushima.jp", "tokyo.jp", "tottori.jp",
|
||||
"toyama.jp", "utsunomiya.jp", "wakayama.jp", "yamagata.jp", "yamaguchi.jp", "yamanashi.jp", "yokohama.jp", "kyonggi.kr", "com.la",
|
||||
"net.la", "org.la", "mil.lb", "com.lc", "edu.lc", "gov.lc", "net.lc", "org.lc", "com.mm", "edu.mm", "gov.mm", "net.mm", "org.mm",
|
||||
"tm.mt", "uu.mt", "alt.na", "cul.na", "edu.na", "net.na", "org.na", "telecom.na", "unam.na", "com.nc", "net.nc", "org.nc", "ac.ng",
|
||||
"tel.no", "fax.nr", "mob.nr", "mobil.nr", "mobile.nr", "tel.nr", "tlf.nr", "mod.om", "ac.pg", "com.pg", "net.pg", "agro.pl", "aid.pl",
|
||||
"atm.pl", "auto.pl", "gmina.pl", "gsm.pl", "mail.pl", "media.pl", "miasta.pl", "nieruchomosci.pl", "nom.pl", "pc.pl", "powiat.pl",
|
||||
"priv.pl", "realestate.pl", "rel.pl", "sex.pl", "shop.pl", "sklep.pl", "sos.pl", "szkola.pl", "targi.pl", "tm.pl", "tourism.pl",
|
||||
"travel.pl", "turystyka.pl", "sch.sd", "mil.sh", "mil.tr", "at.tt", "au.tt", "be.tt", "ca.tt", "de.tt", "dk.tt", "es.tt", "eu.tt",
|
||||
"fr.tt", "it.tt", "nic.tt", "se.tt", "uk.tt", "us.tt", "co.tv", "gove.tw", "edu.uk", "arts.ve", "bib.ve", "firm.ve", "gov.ve", "int.ve",
|
||||
"nom.ve", "rec.ve", "store.ve", "tec.ve", "ch.vu", "com.vu", "de.vu", "edu.vu", "fr.vu", "net.vu", "org.vu", "com.ws", "edu.ws",
|
||||
"gov.ws", "net.ws", "org.ws", "edu.ye", "mil.ye", "ac.zw", "co.zw", "gov.zw", "org.zw" };
|
||||
|
||||
static {
|
||||
// using http://javainetlocator.sourceforge.net/ if library is present
|
||||
// this class is accessed via reflection so that the library can be removed; it is old and no longer maintained
|
||||
InetAddressLocatorClass = ClassProvider.load("net.sf.javainetlocator.InetAddressLocator", new File("lib/InetAddressLocator.jar"));
|
||||
InetAddressLocatorGetLocaleInetAddressMethod = ClassProvider.getStaticMethod(InetAddressLocatorClass, "getLocale", new Class[]{InetAddress.class});
|
||||
ccSLD_TLD.addAll(Arrays.asList(ccSLD_TLD_list));
|
||||
}
|
||||
|
||||
private static Map<String, Integer> TLDID = new ConcurrentHashMap<String, Integer>(32);
|
||||
//private static HashMap<String, String> TLDName = new HashMap<String, String>();
|
||||
|
@ -1019,6 +1159,22 @@ public class Domains {
|
|||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* compute the Domain Class Name, which is either the top-level-domain or
|
||||
* a combination of the second-level-domain plus top-level-domain if the second-level-domain
|
||||
* is a ccSLD ("country code second-level domain"). Such names can be taken from a list of ccSLDs.
|
||||
* @param host
|
||||
* @return the TLD or ccSLD+TLD if that is on a list
|
||||
*/
|
||||
public static String getDNC(String host) {
|
||||
int p0 = host.lastIndexOf('.');
|
||||
if (p0 < 0) return host.toLowerCase();
|
||||
int p1 = host.lastIndexOf('.', p0 - 1);
|
||||
if (p1 < 0) return host.substring(p0 + 1).toLowerCase();
|
||||
String ccSLDTLD = host.substring(p1 + 1).toLowerCase();
|
||||
return ccSLD_TLD.contains(ccSLDTLD) ? ccSLDTLD : host.substring(p0 + 1).toLowerCase();
|
||||
}
|
||||
|
||||
public static void main(final String[] args) {
|
||||
/*
|
||||
try {
|
||||
|
|
|
@ -86,7 +86,7 @@ public class GSAResponseWriter implements QueryResponseWriter {
|
|||
};
|
||||
private static final Set<String> SOLR_FIELDS = new HashSet<String>();
|
||||
static {
|
||||
field2tag.put(YaCySchema.language_txt.name(), GSAToken.LANG.name());
|
||||
field2tag.put(YaCySchema.language_s.name(), GSAToken.LANG.name());
|
||||
SOLR_FIELDS.addAll(field2tag.keySet());
|
||||
for (YaCySchema field: extrafields) SOLR_FIELDS.add(field.name());
|
||||
}
|
||||
|
|
|
@ -26,25 +26,37 @@
|
|||
package net.yacy.cora.services.federated.solr;
|
||||
|
||||
/**
 * Solr field types used by the schema definition.
 *
 * Each type carries the name that is printed for it ({@link #printName()}) and the
 * field-name extensions (the part after the '_') that a schema field of this type is
 * expected to have when it is single-valued or multi-valued. An extension of
 * {@code null} means that no extension is declared for that arity.
 */
public enum SolrType {
    string("s", "sxt"),                  // The type is not analyzed, but indexed/stored verbatim
    text_general("t", "txt"),            // tokenizes with StandardTokenizer, removes stop words from case-insensitive "stopwords.txt", down cases, applies synonyms.
    text_en_splitting_tight(null, null), // can insert dashes in the wrong place and still match
    location("p", null),                 // lat,lon - format: specialized field for geospatial search. If indexed, this fieldType must not be multivalued.
    date("dt", null),                    // date format as in http://www.w3.org/TR/xmlschema-2/#dateTime with trailing 'Z'
    integer("i", "val", "int"),
    bool("b", null, "boolean"),
    tlong(null, null, "long"),           // not used in schema yet
    tfloat(null, null, "float"),         // not used in schema yet
    tdouble(null, null, "double");       // not used in schema yet

    private final String printName, singlevalExt, multivalExt;

    /**
     * Type whose printed name equals the enum constant name.
     *
     * @param singlevalExt field-name extension for single-valued fields, may be null
     * @param multivalExt  field-name extension for multi-valued fields, may be null
     */
    private SolrType(final String singlevalExt, final String multivalExt) {
        this.printName = this.name();
        this.singlevalExt = singlevalExt;
        this.multivalExt = multivalExt;
    }

    /**
     * Type with an explicit printed name that differs from the enum constant name.
     *
     * @param singlevalExt field-name extension for single-valued fields, may be null
     * @param multivalExt  field-name extension for multi-valued fields, may be null
     * @param printName    the name returned by {@link #printName()}
     */
    private SolrType(final String singlevalExt, final String multivalExt, final String printName) {
        this.printName = printName;
        this.singlevalExt = singlevalExt;
        this.multivalExt = multivalExt;
    }

    /**
     * @return the printed name of this type
     */
    public String printName() {
        return this.printName;
    }

    /**
     * Verifies that the extension of a field name matches this type.
     *
     * Names without '_' and names where more than three characters follow the first '_'
     * are treated as special names and always accepted. With assertions enabled a
     * mismatch raises an AssertionError that describes the conflict.
     *
     * @param field      the schema field name, e.g. "host_s"
     * @param multivalue true if the field is declared multi-valued
     * @return true if the name is accepted for this type and arity
     */
    public boolean appropriateName(final String field, final boolean multivalue) {
        // NOTE(review): the FIRST '_' is used, so names with several underscores
        // (e.g. "host_id_s") are never verified; lastIndexOf would be stricter but
        // would reject names that are accepted today - confirm before changing.
        final int p = field.indexOf('_');
        if (p < 0 || field.length() - p > 4) return true; // special names may have no type extension
        final String ext = field.substring(p + 1);
        final String expected = multivalue ? this.multivalExt : this.singlevalExt;
        // ext is never null; calling equals on it avoids a NullPointerException for
        // types that declare no extension for the requested arity
        final boolean ok = ext.equals(expected);
        assert ok : "SolrType = " + this.name() + ", field = " + field + ", ext = " + ext + ", multivalue = " + multivalue + ", singlevalExt = " + this.singlevalExt + ", multivalExt = " + this.multivalExt;
        return ok;
    }
}
|
|
@ -82,8 +82,8 @@ public class URIMetadataNode implements URIMetadata {
|
|||
this.audioc = getInt(YaCySchema.audiolinkscount_i);
|
||||
this.videoc = getInt(YaCySchema.videolinkscount_i);
|
||||
this.appc = getInt(YaCySchema.videolinkscount_i);
|
||||
this.lon = getDouble(YaCySchema.lon_coordinate);
|
||||
this.lat = getDouble(YaCySchema.lat_coordinate);
|
||||
this.lon = 0.0d;
|
||||
this.lat = 0.0d;
|
||||
String latlon = (String) this.doc.getFieldValue(YaCySchema.coordinate_p.name());
|
||||
if (latlon != null) {
|
||||
int p = latlon.indexOf(',');
|
||||
|
@ -120,13 +120,13 @@ public class URIMetadataNode implements URIMetadata {
|
|||
if (x == null) return 0;
|
||||
return x.intValue();
|
||||
}
|
||||
|
||||
/*
|
||||
private double getDouble(YaCySchema field) {
|
||||
Double x = (Double) this.doc.getFieldValue(field.name());
|
||||
if (x == null) return 0.0d;
|
||||
return x.doubleValue();
|
||||
}
|
||||
|
||||
*/
|
||||
private Date getDate(YaCySchema field) {
|
||||
Date x = (Date) this.doc.getFieldValue(field.name());
|
||||
if (x == null) return new Date(0);
|
||||
|
@ -239,9 +239,9 @@ public class URIMetadataNode implements URIMetadata {
|
|||
|
||||
@Override
|
||||
public byte[] language() {
|
||||
ArrayList<String> languages = getArrayList(YaCySchema.language_txt);
|
||||
if (languages == null || languages.size() == 0) return ASCII.getBytes("en");
|
||||
return UTF8.getBytes(languages.get(0));
|
||||
String language = getString(YaCySchema.language_s);
|
||||
if (language == null || language.length() == 0) return ASCII.getBytes("en");
|
||||
return UTF8.getBytes(language);
|
||||
}
|
||||
|
||||
|
||||
|
|
|
@ -41,6 +41,7 @@ import java.util.Set;
|
|||
import net.yacy.cora.document.ASCII;
|
||||
import net.yacy.cora.document.MultiProtocolURI;
|
||||
import net.yacy.cora.document.UTF8;
|
||||
import net.yacy.cora.protocol.Domains;
|
||||
import net.yacy.cora.protocol.HeaderFramework;
|
||||
import net.yacy.cora.protocol.ResponseHeader;
|
||||
import net.yacy.cora.storage.ConfigurationSet;
|
||||
|
@ -91,7 +92,7 @@ public class SolrConfiguration extends ConfigurationSet implements Serializable
|
|||
YaCySchema f = YaCySchema.valueOf(etr.key());
|
||||
f.setSolrFieldName(etr.getValue());
|
||||
} catch (IllegalArgumentException e) {
|
||||
Log.logWarning("SolrScheme", "solr scheme file " + configurationFile.getAbsolutePath() + " defines unknown attribute '" + etr.toString() + "'");
|
||||
Log.logFine("SolrScheme", "solr scheme file " + configurationFile.getAbsolutePath() + " defines unknown attribute '" + etr.toString() + "'");
|
||||
it.remove();
|
||||
}
|
||||
}
|
||||
|
@ -199,7 +200,20 @@ public class SolrConfiguration extends ConfigurationSet implements Serializable
|
|||
final InetAddress address = digestURI.getInetAddress();
|
||||
if (address != null) add(doc, YaCySchema.ip_s, address.getHostAddress());
|
||||
}
|
||||
if (digestURI.getHost() != null) add(doc, YaCySchema.host_s, digestURI.getHost());
|
||||
if (allAttr || contains(YaCySchema.host_protocol_s)) add(doc, YaCySchema.host_protocol_s, digestURI.getProtocol());
|
||||
String host = null;
|
||||
if ((host = digestURI.getHost()) != null) {
|
||||
String dnc = Domains.getDNC(host);
|
||||
String subdomOrga = host.substring(0, host.length() - dnc.length() - 1);
|
||||
int p = subdomOrga.lastIndexOf('.');
|
||||
String subdom = (p < 0) ? "" : subdomOrga.substring(0, p);
|
||||
String orga = (p < 0) ? subdomOrga : subdomOrga.substring(p + 1);
|
||||
if (allAttr || contains(YaCySchema.host_s)) add(doc, YaCySchema.host_s, host);
|
||||
if (allAttr || contains(YaCySchema.host_dnc_s)) add(doc, YaCySchema.host_dnc_s, dnc);
|
||||
if (allAttr || contains(YaCySchema.host_organization_s)) add(doc, YaCySchema.host_organization_s, orga);
|
||||
if (allAttr || contains(YaCySchema.host_organizationdnc_s)) add(doc, YaCySchema.host_organizationdnc_s, orga + '.' + dnc);
|
||||
if (allAttr || contains(YaCySchema.host_subdomain_s)) add(doc, YaCySchema.host_subdomain_s, subdom);
|
||||
}
|
||||
if (allAttr || contains(YaCySchema.title)) add(doc, YaCySchema.title, md.dc_title());
|
||||
if (allAttr || contains(YaCySchema.author)) add(doc, YaCySchema.author, md.dc_creator());
|
||||
if (allAttr || contains(YaCySchema.description)) add(doc, YaCySchema.description, md.snippet());
|
||||
|
@ -232,8 +246,6 @@ public class SolrConfiguration extends ConfigurationSet implements Serializable
|
|||
|
||||
// coordinates
|
||||
if (md.lat() != 0.0f && md.lon() != 0.0f) {
|
||||
if (allAttr || contains(YaCySchema.lat_coordinate)) add(doc, YaCySchema.lat_coordinate, md.lat());
|
||||
if (allAttr || contains(YaCySchema.lon_coordinate)) add(doc, YaCySchema.lon_coordinate, md.lon());
|
||||
if (allAttr || contains(YaCySchema.coordinate_p)) add(doc, YaCySchema.coordinate_p, Double.toString(md.lat()) + "," + Double.toString(md.lon()));
|
||||
}
|
||||
if (allAttr || contains(YaCySchema.httpstatus_i)) add(doc, YaCySchema.httpstatus_i, 200);
|
||||
|
@ -245,7 +257,7 @@ public class SolrConfiguration extends ConfigurationSet implements Serializable
|
|||
if ((allAttr || contains(YaCySchema.referrer_id_txt)) && md.referrerHash() != null) add(doc, YaCySchema.referrer_id_txt, new String[]{ASCII.String(md.referrerHash())});
|
||||
if (allAttr || contains(YaCySchema.md5_s)) add(doc, YaCySchema.md5_s, md.md5());
|
||||
if (allAttr || contains(YaCySchema.publisher_t)) add(doc, YaCySchema.publisher_t, md.dc_publisher());
|
||||
if ((allAttr || contains(YaCySchema.language_txt)) && md.language() != null) add(doc, YaCySchema.language_txt,new String[]{UTF8.String(md.language())});
|
||||
if ((allAttr || contains(YaCySchema.language_s)) && md.language() != null) add(doc, YaCySchema.language_s, UTF8.String(md.language()));
|
||||
if (allAttr || contains(YaCySchema.size_i)) add(doc, YaCySchema.size_i, md.size());
|
||||
if (allAttr || contains(YaCySchema.audiolinkscount_i)) add(doc, YaCySchema.audiolinkscount_i, md.laudio());
|
||||
if (allAttr || contains(YaCySchema.videolinkscount_i)) add(doc, YaCySchema.videolinkscount_i, md.lvideo());
|
||||
|
@ -285,7 +297,20 @@ public class SolrConfiguration extends ConfigurationSet implements Serializable
|
|||
final InetAddress address = digestURI.getInetAddress();
|
||||
if (address != null) add(doc, YaCySchema.ip_s, address.getHostAddress());
|
||||
}
|
||||
if (digestURI.getHost() != null) add(doc, YaCySchema.host_s, digestURI.getHost());
|
||||
if (allAttr || contains(YaCySchema.host_protocol_s)) add(doc, YaCySchema.host_protocol_s, digestURI.getProtocol());
|
||||
String host = null;
|
||||
if ((host = digestURI.getHost()) != null) {
|
||||
String dnc = Domains.getDNC(host);
|
||||
String subdomOrga = host.substring(0, host.length() - dnc.length() - 1);
|
||||
int p = subdomOrga.lastIndexOf('.');
|
||||
String subdom = (p < 0) ? "" : subdomOrga.substring(0, p);
|
||||
String orga = (p < 0) ? subdomOrga : subdomOrga.substring(p + 1);
|
||||
if (allAttr || contains(YaCySchema.host_s)) add(doc, YaCySchema.host_s, host);
|
||||
if (allAttr || contains(YaCySchema.host_dnc_s)) add(doc, YaCySchema.host_dnc_s, dnc);
|
||||
if (allAttr || contains(YaCySchema.host_organization_s)) add(doc, YaCySchema.host_organization_s, orga);
|
||||
if (allAttr || contains(YaCySchema.host_organizationdnc_s)) add(doc, YaCySchema.host_organizationdnc_s, orga + '.' + dnc);
|
||||
if (allAttr || contains(YaCySchema.host_subdomain_s)) add(doc, YaCySchema.host_subdomain_s, subdom);
|
||||
}
|
||||
if (allAttr || contains(YaCySchema.title)) add(doc, YaCySchema.title, yacydoc.dc_title());
|
||||
if (allAttr || contains(YaCySchema.author)) add(doc, YaCySchema.author, yacydoc.dc_creator());
|
||||
if (allAttr || contains(YaCySchema.description)) add(doc, YaCySchema.description, yacydoc.dc_description());
|
||||
|
@ -411,7 +436,7 @@ public class SolrConfiguration extends ConfigurationSet implements Serializable
|
|||
}
|
||||
if (allAttr || contains(YaCySchema.imagescount_i)) add(doc, YaCySchema.imagescount_i, imgtags.size());
|
||||
if (allAttr || contains(YaCySchema.images_tag_txt)) add(doc, YaCySchema.images_tag_txt, imgtags);
|
||||
if (allAttr || contains(YaCySchema.images_protocol_txt)) add(doc, YaCySchema.images_protocol_txt, protocolList2indexedList(imgprots));
|
||||
if (allAttr || contains(YaCySchema.images_protocol_sxt)) add(doc, YaCySchema.images_protocol_sxt, protocolList2indexedList(imgprots));
|
||||
if (allAttr || contains(YaCySchema.images_urlstub_txt)) add(doc, YaCySchema.images_urlstub_txt, imgstubs);
|
||||
if (allAttr || contains(YaCySchema.images_alt_txt)) add(doc, YaCySchema.images_alt_txt, imgalts);
|
||||
|
||||
|
@ -479,12 +504,12 @@ public class SolrConfiguration extends ConfigurationSet implements Serializable
|
|||
}
|
||||
|
||||
// canonical tag
|
||||
if (allAttr || contains(YaCySchema.canonical_s)) {
|
||||
if (allAttr || contains(YaCySchema.canonical_t)) {
|
||||
final MultiProtocolURI canonical = html.getCanonical();
|
||||
if (canonical != null) {
|
||||
inboundLinks.remove(canonical);
|
||||
ouboundLinks.remove(canonical);
|
||||
add(doc, YaCySchema.canonical_s, canonical.toNormalform(false, false));
|
||||
add(doc, YaCySchema.canonical_t, canonical.toNormalform(false, false));
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -565,11 +590,11 @@ public class SolrConfiguration extends ConfigurationSet implements Serializable
|
|||
c++;
|
||||
}
|
||||
if (allAttr || contains(YaCySchema.inboundlinks_tag_txt)) add(doc, YaCySchema.inboundlinks_tag_txt, inboundlinksTag);
|
||||
if (allAttr || contains(YaCySchema.inboundlinks_protocol_txt)) add(doc, YaCySchema.inboundlinks_protocol_txt, protocolList2indexedList(inboundlinksURLProtocol));
|
||||
if (allAttr || contains(YaCySchema.inboundlinks_protocol_sxt)) add(doc, YaCySchema.inboundlinks_protocol_sxt, protocolList2indexedList(inboundlinksURLProtocol));
|
||||
if (allAttr || contains(YaCySchema.inboundlinks_urlstub_txt)) add(doc, YaCySchema.inboundlinks_urlstub_txt, inboundlinksURLStub);
|
||||
if (allAttr || contains(YaCySchema.inboundlinks_name_txt)) add(doc, YaCySchema.inboundlinks_name_txt, inboundlinksName);
|
||||
if (allAttr || contains(YaCySchema.inboundlinks_rel_txt)) add(doc, YaCySchema.inboundlinks_rel_txt, inboundlinksRel);
|
||||
if (allAttr || contains(YaCySchema.inboundlinks_relflags_txt)) add(doc, YaCySchema.inboundlinks_relflags_txt, relEval(inboundlinksRel));
|
||||
if (allAttr || contains(YaCySchema.inboundlinks_rel_sxt)) add(doc, YaCySchema.inboundlinks_rel_sxt, inboundlinksRel);
|
||||
if (allAttr || contains(YaCySchema.inboundlinks_relflags_sxt)) add(doc, YaCySchema.inboundlinks_relflags_sxt, relEval(inboundlinksRel));
|
||||
if (allAttr || contains(YaCySchema.inboundlinks_text_txt)) add(doc, YaCySchema.inboundlinks_text_txt, inboundlinksText);
|
||||
|
||||
c = 0;
|
||||
|
@ -603,11 +628,11 @@ public class SolrConfiguration extends ConfigurationSet implements Serializable
|
|||
c++;
|
||||
}
|
||||
if (allAttr || contains(YaCySchema.outboundlinks_tag_txt)) add(doc, YaCySchema.outboundlinks_tag_txt, outboundlinksTag);
|
||||
if (allAttr || contains(YaCySchema.outboundlinks_protocol_txt)) add(doc, YaCySchema.outboundlinks_protocol_txt, protocolList2indexedList(outboundlinksURLProtocol));
|
||||
if (allAttr || contains(YaCySchema.outboundlinks_protocol_sxt)) add(doc, YaCySchema.outboundlinks_protocol_sxt, protocolList2indexedList(outboundlinksURLProtocol));
|
||||
if (allAttr || contains(YaCySchema.outboundlinks_urlstub_txt)) add(doc, YaCySchema.outboundlinks_urlstub_txt, outboundlinksURLStub);
|
||||
if (allAttr || contains(YaCySchema.outboundlinks_name_txt)) add(doc, YaCySchema.outboundlinks_name_txt, outboundlinksName);
|
||||
if (allAttr || contains(YaCySchema.outboundlinks_rel_txt)) add(doc, YaCySchema.outboundlinks_rel_txt, outboundlinksRel);
|
||||
if (allAttr || contains(YaCySchema.outboundlinks_relflags_txt)) add(doc, YaCySchema.outboundlinks_relflags_txt, relEval(inboundlinksRel));
|
||||
if (allAttr || contains(YaCySchema.outboundlinks_rel_sxt)) add(doc, YaCySchema.outboundlinks_rel_sxt, outboundlinksRel);
|
||||
if (allAttr || contains(YaCySchema.outboundlinks_relflags_sxt)) add(doc, YaCySchema.outboundlinks_relflags_sxt, relEval(inboundlinksRel));
|
||||
if (allAttr || contains(YaCySchema.outboundlinks_text_txt)) add(doc, YaCySchema.outboundlinks_text_txt, outboundlinksText);
|
||||
|
||||
// charset
|
||||
|
@ -615,8 +640,6 @@ public class SolrConfiguration extends ConfigurationSet implements Serializable
|
|||
|
||||
// coordinates
|
||||
if (yacydoc.lat() != 0.0f && yacydoc.lon() != 0.0f) {
|
||||
if (allAttr || contains(YaCySchema.lat_coordinate)) add(doc, YaCySchema.lat_coordinate, yacydoc.lat());
|
||||
if (allAttr || contains(YaCySchema.lon_coordinate)) add(doc, YaCySchema.lon_coordinate, yacydoc.lon());
|
||||
if (allAttr || contains(YaCySchema.coordinate_p)) add(doc, YaCySchema.coordinate_p, Double.toString(yacydoc.lat()) + "," + Double.toString(yacydoc.lon()));
|
||||
}
|
||||
if (allAttr || contains(YaCySchema.httpstatus_i)) add(doc, YaCySchema.httpstatus_i, header == null ? 200 : header.getStatusCode());
|
||||
|
@ -628,7 +651,7 @@ public class SolrConfiguration extends ConfigurationSet implements Serializable
|
|||
if ((allAttr || contains(YaCySchema.referrer_id_txt)) && metadata.referrerHash() != null) add(doc, YaCySchema.referrer_id_txt, new String[]{ASCII.String(metadata.referrerHash())});
|
||||
//if (allAttr || contains(SolrField.md5_s)) add(solrdoc, SolrField.md5_s, new byte[0]);
|
||||
if (allAttr || contains(YaCySchema.publisher_t)) add(doc, YaCySchema.publisher_t, yacydoc.dc_publisher());
|
||||
if ((allAttr || contains(YaCySchema.language_txt)) && metadata.language() != null) add(doc, YaCySchema.language_txt,new String[]{UTF8.String(metadata.language())});
|
||||
if ((allAttr || contains(YaCySchema.language_s)) && metadata.language() != null) add(doc, YaCySchema.language_s, UTF8.String(metadata.language()));
|
||||
if (allAttr || contains(YaCySchema.size_i)) add(doc, YaCySchema.size_i, metadata.size());
|
||||
if (allAttr || contains(YaCySchema.audiolinkscount_i)) add(doc, YaCySchema.audiolinkscount_i, yacydoc.getAudiolinks().size());
|
||||
if (allAttr || contains(YaCySchema.videolinkscount_i)) add(doc, YaCySchema.videolinkscount_i, yacydoc.getVideolinks().size());
|
||||
|
|
|
@ -35,52 +35,57 @@ import org.apache.solr.common.SolrInputDocument;
|
|||
public enum YaCySchema implements Schema {
|
||||
|
||||
// mandatory
|
||||
id(SolrType.string, true, true, "primary key of document, the URL hash **mandatory field**"),
|
||||
id(SolrType.string, true, true, false, "primary key of document, the URL hash **mandatory field**"),
|
||||
sku(SolrType.text_en_splitting_tight, true, true, false, true, "url of document"),
|
||||
last_modified(SolrType.date, true, true, "last-modified from http header"),
|
||||
last_modified(SolrType.date, true, true, false, "last-modified from http header"),
|
||||
content_type(SolrType.string, true, true, true, "mime-type of document"),
|
||||
title(SolrType.text_general, true, true, true, "content of title tag"),
|
||||
host_id_s(SolrType.string, true, true, "id of the host, a 6-byte hash that is part of the document id"),// String hosthash();
|
||||
md5_s(SolrType.string, true, true, "the md5 of the raw source"),// String md5();
|
||||
size_i(SolrType.integer, true, true, "the size of the raw source"),// int size();
|
||||
process_s(SolrType.string, true, true, "index creation comment"),
|
||||
failreason_t(SolrType.text_general, true, true, "fail reason if a page was not loaded. if the page was loaded then this field is empty"),
|
||||
httpstatus_i(SolrType.integer, true, true, "html status return code (i.e. \"200\" for ok), -1 if not loaded"),
|
||||
host_id_s(SolrType.string, true, true, false, "id of the host, a 6-byte hash that is part of the document id"),// String hosthash();
|
||||
md5_s(SolrType.string, true, true, false, "the md5 of the raw source"),// String md5();
|
||||
size_i(SolrType.integer, true, true, false, "the size of the raw source"),// int size();
|
||||
process_s(SolrType.string, true, true, false, "index creation comment"),
|
||||
failreason_t(SolrType.text_general, true, true, false, "fail reason if a page was not loaded. if the page was loaded then this field is empty"),
|
||||
httpstatus_i(SolrType.integer, true, true, false, "html status return code (i.e. \"200\" for ok), -1 if not loaded"),
|
||||
httpstatus_redirect_s(SolrType.integer, true, true, false, "html status return code (i.e. \"200\" for ok), -1 if not loaded"),
|
||||
|
||||
// optional but recommended, part of index distribution
|
||||
load_date_dt(SolrType.date, true, true, "time when resource was loaded"),
|
||||
fresh_date_dt(SolrType.date, true, true, "date until resource shall be considered as fresh"),
|
||||
load_date_dt(SolrType.date, true, true, false, "time when resource was loaded"),
|
||||
fresh_date_dt(SolrType.date, true, true, false, "date until resource shall be considered as fresh"),
|
||||
referrer_id_txt(SolrType.string, true, true, true, "ids of referrer to this document"),// byte[] referrerHash();
|
||||
publisher_t(SolrType.text_general, true, true, "the name of the publisher of the document"),// String dc_publisher();
|
||||
language_txt(SolrType.string, true, true, "the language used in the document; starts with primary language"),// byte[] language();
|
||||
audiolinkscount_i(SolrType.integer, true, true, "number of links to audio resources"),// int laudio();
|
||||
videolinkscount_i(SolrType.integer, true, true, "number of links to video resources"),// int lvideo();
|
||||
applinkscount_i(SolrType.integer, true, true, "number of links to application resources"),// int lapp();
|
||||
publisher_t(SolrType.text_general, true, true, false, "the name of the publisher of the document"),// String dc_publisher();
|
||||
language_s(SolrType.string, true, true, false, "the language used in the document"),// byte[] language();
|
||||
audiolinkscount_i(SolrType.integer, true, true, false, "number of links to audio resources"),// int laudio();
|
||||
videolinkscount_i(SolrType.integer, true, true, false, "number of links to video resources"),// int lvideo();
|
||||
applinkscount_i(SolrType.integer, true, true, false, "number of links to application resources"),// int lapp();
|
||||
|
||||
// optional but recommended
|
||||
lon_coordinate(SolrType.tdouble, true, true, "longitude of location as declared in WSG84"), // deprecated
|
||||
lat_coordinate(SolrType.tdouble, true, true, "latitude of location as declared in WSG84"), // deprecated
|
||||
coordinate_p(SolrType.location, true, true, "point in degrees of latitude,longitude as declared in WSG84"),
|
||||
ip_s(SolrType.string, true, true, "ip of host of url (after DNS lookup)"),
|
||||
author(SolrType.text_general, true, true, "content of author-tag"),
|
||||
description(SolrType.text_general, true, true, "content of description-tag"),
|
||||
keywords(SolrType.text_general, true, true, "content of keywords tag; words are separated by space"),
|
||||
charset_s(SolrType.string, true, true, "character encoding"),
|
||||
wordcount_i(SolrType.integer, true, true, "number of words in visible area"),
|
||||
inboundlinkscount_i(SolrType.integer, true, true, "total number of inbound links"),
|
||||
inboundlinksnofollowcount_i(SolrType.integer, true, true, "number of inbound links with nofollow tag"),
|
||||
outboundlinkscount_i(SolrType.integer, true, true, "external number of inbound links"),
|
||||
outboundlinksnofollowcount_i(SolrType.integer, true, true, "number of external links with nofollow tag"),
|
||||
imagescount_i(SolrType.integer, true, true, "number of images"),
|
||||
responsetime_i(SolrType.integer, true, true, "response time of target server in milliseconds"),
|
||||
text_t(SolrType.text_general, true, true, "all visible text"),
|
||||
coordinate_p(SolrType.location, true, true, false, "point in degrees of latitude,longitude as declared in WSG84"),
|
||||
ip_s(SolrType.string, true, true, false, "ip of host of url (after DNS lookup)"),
|
||||
author(SolrType.text_general, true, true, false, "content of author-tag"),
|
||||
description(SolrType.text_general, true, true, false, "content of description-tag"),
|
||||
keywords(SolrType.text_general, true, true, false, "content of keywords tag; words are separated by space"),
|
||||
charset_s(SolrType.string, true, true, false, "character encoding"),
|
||||
wordcount_i(SolrType.integer, true, true, false, "number of words in visible area"),
|
||||
inboundlinkscount_i(SolrType.integer, true, true, false, "total number of inbound links"),
|
||||
inboundlinksnofollowcount_i(SolrType.integer, true, true, false, "number of inbound links with nofollow tag"),
|
||||
outboundlinkscount_i(SolrType.integer, true, true, false, "external number of inbound links"),
|
||||
outboundlinksnofollowcount_i(SolrType.integer, true, true, false, "number of external links with nofollow tag"),
|
||||
imagescount_i(SolrType.integer, true, true, false, "number of images"),
|
||||
responsetime_i(SolrType.integer, true, true, false, "response time of target server in milliseconds"),
|
||||
text_t(SolrType.text_general, true, true, false, "all visible text"),
|
||||
h1_txt(SolrType.text_general, true, true, true, "h1 header"),
|
||||
h2_txt(SolrType.text_general, true, true, true, "h2 header"),
|
||||
h3_txt(SolrType.text_general, true, true, true, "h3 header"),
|
||||
h4_txt(SolrType.text_general, true, true, true, "h4 header"),
|
||||
h5_txt(SolrType.text_general, true, true, true, "h5 header"),
|
||||
h6_txt(SolrType.text_general, true, true, true, "h6 header"),
|
||||
|
||||
// optional values
|
||||
csscount_i(SolrType.integer, true, true, "number of entries in css_tag_txt and css_url_txt"),
|
||||
csscount_i(SolrType.integer, true, true, false, "number of entries in css_tag_txt and css_url_txt"),
|
||||
css_tag_txt(SolrType.text_general, true, true, true, "full css tag with normalized url"),
|
||||
css_url_txt(SolrType.text_general, true, true, true, "normalized urls within a css tag"),
|
||||
scripts_txt(SolrType.text_general, true, true, true, "normalized urls within a scripts tag"),
|
||||
scriptscount_i(SolrType.integer, true, true, "number of entries in scripts_txt"),
|
||||
scriptscount_i(SolrType.integer, true, true, false, "number of entries in scripts_txt"),
|
||||
// encoded as binary value into an integer:
|
||||
// bit 0: "all" contained in html header meta
|
||||
// bit 1: "index" contained in html header meta
|
||||
|
@ -91,50 +96,52 @@ public enum YaCySchema implements Schema {
|
|||
// bit 10: "noindex" contained in http header properties
|
||||
// bit 11: "nofollow" contained in http header properties
|
||||
// bit 12: "unavailable_after" contained in http header properties
|
||||
robots_i(SolrType.integer, true, true, "content of <meta name=\"robots\" content=#content#> tag and the \"X-Robots-Tag\" HTTP property"),
|
||||
metagenerator_t(SolrType.text_general, true, true, "content of <meta name=\"generator\" content=#content#> tag"),
|
||||
robots_i(SolrType.integer, true, true, false, "content of <meta name=\"robots\" content=#content#> tag and the \"X-Robots-Tag\" HTTP property"),
|
||||
metagenerator_t(SolrType.text_general, true, true, false, "content of <meta name=\"generator\" content=#content#> tag"),
|
||||
inboundlinks_tag_txt(SolrType.text_general, true, true, true, "internal links, normalized (absolute URLs), as <a> - tag with anchor text and nofollow"),
|
||||
inboundlinks_protocol_txt(SolrType.text_general, true, true, true, "internal links, only the protocol"),
|
||||
inboundlinks_protocol_sxt(SolrType.string, true, true, true, "internal links, only the protocol"),
|
||||
inboundlinks_urlstub_txt(SolrType.text_general, true, true, true, "internal links, the url only without the protocol"),
|
||||
inboundlinks_name_txt(SolrType.text_general, true, true, true, "internal links, the name property of the a-tag"),
|
||||
inboundlinks_rel_txt(SolrType.text_general, true, true, true, "internal links, the rel property of the a-tag"),
|
||||
inboundlinks_relflags_txt(SolrType.text_general, true, true, true, "internal links, the rel property of the a-tag, coded binary"),
|
||||
inboundlinks_rel_sxt(SolrType.string, true, true, true, "internal links, the rel property of the a-tag"),
|
||||
inboundlinks_relflags_sxt(SolrType.string, true, true, true, "internal links, the rel property of the a-tag, coded binary"),
|
||||
inboundlinks_text_txt(SolrType.text_general, true, true, true, "internal links, the text content of the a-tag"),
|
||||
outboundlinks_tag_txt(SolrType.text_general, true, true, true, "external links, normalized (absolute URLs), as <a> - tag with anchor text and nofollow"),
|
||||
outboundlinks_protocol_txt(SolrType.text_general, true, true, true, "external links, only the protocol"),
|
||||
outboundlinks_protocol_sxt(SolrType.string, true, true, true, "external links, only the protocol"),
|
||||
outboundlinks_urlstub_txt(SolrType.text_general, true, true, true, "external links, the url only without the protocol"),
|
||||
outboundlinks_name_txt(SolrType.text_general, true, true, true, "external links, the name property of the a-tag"),
|
||||
outboundlinks_rel_txt(SolrType.text_general, true, true, true, "external links, the rel property of the a-tag"),
|
||||
outboundlinks_relflags_txt(SolrType.text_general, true, true, true, "external links, the rel property of the a-tag, coded binary"),
|
||||
outboundlinks_rel_sxt(SolrType.string, true, true, true, "external links, the rel property of the a-tag"),
|
||||
outboundlinks_relflags_sxt(SolrType.string, true, true, true, "external links, the rel property of the a-tag, coded binary"),
|
||||
outboundlinks_text_txt(SolrType.text_general, true, true, true, "external links, the text content of the a-tag"),
|
||||
images_tag_txt(SolrType.text_general, true, true, true, " all image tags, encoded as <img> tag inclusive alt- and title property"),
|
||||
images_urlstub_txt(SolrType.text_general, true, true, true, "all image links without the protocol and '://'"),
|
||||
images_protocol_txt(SolrType.text_general, true, true, true, "all image link protocols"),
|
||||
images_protocol_sxt(SolrType.text_general, true, true, true, "all image link protocols"),
|
||||
images_alt_txt(SolrType.text_general, true, true, true, "all image link alt tag"),
|
||||
h1_txt(SolrType.text_general, true, true, true, "h1 header"),
|
||||
h2_txt(SolrType.text_general, true, true, true, "h2 header"),
|
||||
h3_txt(SolrType.text_general, true, true, true, "h3 header"),
|
||||
h4_txt(SolrType.text_general, true, true, true, "h4 header"),
|
||||
h5_txt(SolrType.text_general, true, true, true, "h5 header"),
|
||||
h6_txt(SolrType.text_general, true, true, true, "h6 header"),
|
||||
htags_i(SolrType.integer, true, true, "binary pattern for the existance of h1..h6 headlines"),
|
||||
htags_i(SolrType.integer, true, true, false, "binary pattern for the existance of h1..h6 headlines"),
|
||||
paths_txt(SolrType.text_general, true, true, true, "all path elements in the url"),
|
||||
host_s(SolrType.string, true, true, "host of the url"),
|
||||
canonical_s(SolrType.string, true, true, "url inside the canonical link element"),
|
||||
refresh_s(SolrType.string, true, true, "link from the url property inside the refresh link element"),
|
||||
canonical_t(SolrType.text_general, true, true, false, "url inside the canonical link element"),
|
||||
refresh_s(SolrType.string, true, true, false, "link from the url property inside the refresh link element"),
|
||||
li_txt(SolrType.text_general, true, true, true, "all texts in <li> tags"),
|
||||
licount_i(SolrType.integer, true, true, "number of <li> tags"),
|
||||
licount_i(SolrType.integer, true, true, false, "number of <li> tags"),
|
||||
bold_txt(SolrType.text_general, true, true, true, "all texts inside of <b> or <strong> tags. no doubles. listed in the order of number of occurrences in decreasing order"),
|
||||
bold_val(SolrType.integer, true, true, true, "number of occurrences of texts in bold_txt"),
|
||||
boldcount_i(SolrType.integer, true, true, "total number of occurrences of <b> or <strong>"),
|
||||
boldcount_i(SolrType.integer, true, true, false, "total number of occurrences of <b> or <strong>"),
|
||||
italic_txt(SolrType.text_general, true, true, true, "all texts inside of <i> tags. no doubles. listed in the order of number of occurrences in decreasing order"),
|
||||
italic_val(SolrType.integer, true, true, true, "number of occurrences of texts in italic_txt"),
|
||||
italiccount_i(SolrType.integer, true, true, "total number of occurrences of <i>"),
|
||||
flash_b(SolrType.bool, true, true, "flag that shows if a swf file is linked"),
|
||||
italiccount_i(SolrType.integer, true, true, false, "total number of occurrences of <i>"),
|
||||
flash_b(SolrType.bool, true, true, false, "flag that shows if a swf file is linked"),
|
||||
frames_txt(SolrType.text_general, true, true, true, "list of all links to frames"),
|
||||
framesscount_i(SolrType.integer, true, true, "number of frames_txt"),
|
||||
framesscount_i(SolrType.integer, true, true, false, "number of frames_txt"),
|
||||
iframes_txt(SolrType.text_general, true, true, true, "list of all links to iframes"),
|
||||
iframesscount_i(SolrType.integer, true, true, "number of iframes_txt"),
|
||||
iframesscount_i(SolrType.integer, true, true, false, "number of iframes_txt"),
|
||||
|
||||
host_s(SolrType.string, true, true, false, "host of the url"),
|
||||
host_protocol_s(SolrType.string, true, true, false, "the protocol of the url"),
|
||||
host_dnc_s(SolrType.string, true, true, false, "the Domain Class Name, either the TLD or a combination of ccSLD+TLD if a ccSLD is used."),
|
||||
host_organization_s(SolrType.string, true, true, false, "either the second level domain or, if a ccSLD is used, the third level domain"),
|
||||
host_organizationdnc_s(SolrType.string, true, true, false, "the organization and dnc concatenated with '.'"),
|
||||
host_subdomain_s(SolrType.string, true, true, false, "the remaining part of the host without organizationdnc"),
|
||||
|
||||
// special values; can only be used if '_val' type is defined in schema file; this is not standard
|
||||
bold_val(SolrType.integer, true, true, true, "number of occurrences of texts in bold_txt"),
|
||||
italic_val(SolrType.integer, true, true, true, "number of occurrences of texts in italic_txt"),
|
||||
ext_cms_txt(SolrType.text_general, true, true, true, "names of cms attributes; if several are recognized then they are listen in decreasing order of number of matching criterias"),
|
||||
ext_cms_val(SolrType.integer, true, true, true, "number of attributes that count for a specific cms in ext_cms_txt"),
|
||||
ext_ads_txt(SolrType.text_general, true, true, true, "names of ad-servers/ad-services"),
|
||||
|
@ -154,23 +161,20 @@ public enum YaCySchema implements Schema {
|
|||
private boolean multiValued, omitNorms;
|
||||
private String comment;
|
||||
|
||||
private YaCySchema(final SolrType type, final boolean indexed, final boolean stored, final String comment) {
|
||||
private YaCySchema(final SolrType type, final boolean indexed, final boolean stored, final boolean multiValued, final String comment) {
|
||||
this.type = type;
|
||||
this.indexed = indexed;
|
||||
this.stored = stored;
|
||||
this.multiValued = false;
|
||||
this.multiValued = multiValued;
|
||||
this.omitNorms = false;
|
||||
this.comment = comment;
|
||||
}
|
||||
|
||||
private YaCySchema(final SolrType type, final boolean indexed, final boolean stored, final boolean multiValued, final String comment) {
|
||||
this(type, indexed, stored, comment);
|
||||
this.multiValued = multiValued;
|
||||
assert type.appropriateName(this.name(), this.multiValued) : "bad configuration: " + this.name();
|
||||
}
|
||||
|
||||
/**
 * Creates a schema field definition with an explicit {@code omitNorms} setting.
 *
 * <p>All arguments except {@code omitNorms} are passed on to the five-argument
 * constructor; {@code omitNorms} is then assigned here.
 *
 * @param type        the Solr type of the field
 * @param indexed     passed on to the five-argument constructor
 * @param stored      passed on to the five-argument constructor
 * @param multiValued passed on to the five-argument constructor; also used by the name check below
 * @param omitNorms   value assigned to this constant's {@code omitNorms} field
 * @param comment     passed on to the five-argument constructor
 */
private YaCySchema(final SolrType type, final boolean indexed, final boolean stored, final boolean multiValued, final boolean omitNorms, final String comment) {
|
||||
this(type, indexed, stored, multiValued, comment);
|
||||
this.omitNorms = omitNorms;
|
||||
// Only evaluated when assertions are enabled (-ea): fails with "bad configuration: <name>"
// if type.appropriateName rejects this constant's name for the given multiValued flag.
// NOTE(review): presumably this verifies the field-name suffix convention (e.g. _i, _txt, _val) - confirm in SolrType.
// NOTE(review): the five-argument constructor in this hunk appears to run the same assertion - confirm whether this one is redundant.
assert type.appropriateName(this.name(), this.multiValued) : "bad configuration: " + this.name();
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
|
@ -467,22 +467,29 @@ public final class QueryParams {
|
|||
return ret;
|
||||
}
|
||||
|
||||
// Names of the index fields that solrQueryString searches: the combined word expression
// is emitted once per field as "(field:expression)" and the per-field clauses are joined with OR.
final static String[] fields = new String[]{"sku","title","h1_txt","h2_txt","author","description","keywords","text_t"};
|
||||
|
||||
public String solrQueryString(boolean urlencoded) {
|
||||
if (this.query_include_words == null || this.query_include_words.size() == 0) return null;
|
||||
final StringBuilder q = new StringBuilder(80);
|
||||
q.append("{!lucene q.op=AND}");
|
||||
|
||||
// add text query
|
||||
q.append("[sku,title,h1_txt,h2_txt,author,description,keywords,text_t]:");
|
||||
int wc = 0;
|
||||
StringBuilder w = new StringBuilder(80);
|
||||
for (String s: this.query_include_words) {
|
||||
if (wc > 0) q.append(urlencoded ? '+' : ' ');
|
||||
q.append(s);
|
||||
if (wc > 0) w.append(urlencoded ? "+AND+" : " AND ");
|
||||
w.append(s);
|
||||
wc++;
|
||||
}
|
||||
for (String s: this.query_exclude_words){
|
||||
if (wc > 0) q.append(urlencoded ? "+-" : " -");
|
||||
q.append(s);
|
||||
if (wc > 0) w.append(urlencoded ? "+AND+-" : " AND -");
|
||||
w.append(s);
|
||||
wc++;
|
||||
}
|
||||
wc = 0;
|
||||
for (String a: fields) {
|
||||
if (wc > 0) q.append(urlencoded ? "+OR+" : " OR ");
|
||||
q.append('(').append(a).append(':').append(w).append(')');
|
||||
wc++;
|
||||
}
|
||||
|
||||
|
|
Loading…
Reference in New Issue
Block a user