changes in ranking computation

- an existing ranking servlet for solr was extended. It is now possible
to set boost values for fields, boost functions and boost queries.
- The ranking can have different instances, but currently only the first
one is used
- added an abstraction layer for fields which can be used for search and
those fields can be edited in the solr ranking configuration
- the ranking value from solr within the field score is used to combine
remote search requests, which all are created using the same locally
defined boost values
- reduced the number of fields which are used for search (makes it
faster)
- replaced some text fields by string fields (makes indexing faster)
- removed classes which had no use
- made a large number of experiments for a better ranking and created a
temporary setting which prefers hits inside titles
- adjusted also the RWI-based ranking computation to 'prefer title'
- made special cases like for portal search where no post-processing and
post-ranking is wanted: this keeps the original ranking order as done by
Solr
- fixed many bugs with old settings for ranking
This commit is contained in:
Michael Peter Christen 2013-03-13 14:47:00 +01:00
parent 38f46eb33d
commit addba047e2
34 changed files with 684 additions and 752 deletions

View File

@ -242,7 +242,7 @@ outboundlinks_urlstub_txt
#htags_i
## url inside the canonical link element, string
#canonical_t
#canonical_s
## flag shows if the url in canonical_t is equal to sku, boolean
#canonical_equal_sku_b
@ -287,13 +287,13 @@ underline_txt
#flash_b
## list of all links to frames
#frames_txt
#frames_sxt
## number of attr_frames, int
#framesscount_i
## list of all links to iframes
#iframes_txt
#iframes_sxt
## number of attr_iframes, int
#iframesscount_i

View File

@ -132,9 +132,6 @@ target_urlstub_s
## the file name extension (target)
target_file_ext_s
## normalized (absolute URLs), as <a> - tag with anchor text and nofollow (target)
#target_tag_s
## number of all characters in the url (target)
#target_chars_i

View File

@ -156,11 +156,11 @@
<dynamicField name="*_i" type="int" indexed="true" stored="true"/>
<dynamicField name="*_s" type="string" indexed="true" stored="true" />
<dynamicField name="*_val" type="int" indexed="true" stored="true" multiValued="true"/> <!-- YaCy special -->
<dynamicField name="*_val" type="int" indexed="false" stored="true" multiValued="true"/> <!-- YaCy special -->
<dynamicField name="*_sxt" type="string" indexed="true" stored="true" multiValued="true"/> <!-- YaCy special -->
<dynamicField name="*_l" type="long" indexed="true" stored="true"/>
<dynamicField name="*_ls" type="long" indexed="true" stored="true" multiValued="true"/>
<dynamicField name="*_t" type="text_general" indexed="true" stored="true"/>
<dynamicField name="*_t" type="text_general" indexed="true" stored="true" compressed="true"/>
<dynamicField name="*_txt" type="text_general" indexed="true" stored="true" multiValued="true"/>
<dynamicField name="*_en" type="text_en" indexed="true" stored="true" multiValued="true"/>
<dynamicField name="*_b" type="boolean" indexed="true" stored="true"/>

View File

@ -949,10 +949,35 @@ WikiAccess = admin
# Search Profiles
# we will support different search profiles
# this is currently only a single default profile
# If this profile setting is empty, a hard-coded profile from plasmaSearchRanking is used
# If this profile setting is empty, a hard-coded profile is used to initialise the values
search.ranking.rwi.profile =
search.ranking.solr.boost.tmp2=
# The boost fields contain all fields which shall be searched, each together with a boost value. Fields which are not mentioned are not searched.
# Boost queries are added to all queries; boost functions evaluate a value which is either added to or multiplied with the ranking.
# The field boostfunctionmode can be either 'add' or 'multiply' to describe the mode.
# All boost methods > 0 must have names so that they can be selected by name within a query, using the syntax /name
search.ranking.solr.collection.boostname.tmp.0=_default
search.ranking.solr.collection.boostfields.tmp.0=text_t^2.0,url_paths_sxt^20.0,title^100.0,synonyms_sxt^1.0
search.ranking.solr.collection.boostquery.tmp.0=fuzzy_signature_unique_b:true^100000.0
search.ranking.solr.collection.boostfunction.tmp.0=
search.ranking.solr.collection.boostfunctionmode.tmp.0=add
# boost profile 1 ('_date'): the boost function below is multiplied with the ranking (boostfunctionmode=multiply)
search.ranking.solr.collection.boostname.tmp.1=_date
search.ranking.solr.collection.boostfields.tmp.1=text_t^1.0
search.ranking.solr.collection.boostquery.tmp.1=
search.ranking.solr.collection.boostfunction.tmp.1=recip(ms(NOW,last_modified),3.16e-11,1,1)
search.ranking.solr.collection.boostfunctionmode.tmp.1=multiply
search.ranking.solr.collection.boostname.tmp.2=_unused2
search.ranking.solr.collection.boostfields.tmp.2=text_t^1.0
search.ranking.solr.collection.boostquery.tmp.2=
search.ranking.solr.collection.boostfunction.tmp.2=div(add(1,references_i),add(url_chars_i,pow(clickdepth_i,3)))
search.ranking.solr.collection.boostfunctionmode.tmp.2=multiply
search.ranking.solr.collection.boostname.tmp.3=_unused3
search.ranking.solr.collection.boostfields.tmp.3=text_t^1.0
search.ranking.solr.collection.boostquery.tmp.3=
search.ranking.solr.collection.boostfunction.tmp.3=
search.ranking.solr.collection.boostfunctionmode.tmp.3=multiply
# the following values are used to identify duplicate content
search.ranking.solr.doubledetection.minlength=3
search.ranking.solr.doubledetection.quantrate=0.5f

View File

@ -0,0 +1,68 @@
/**
* ContentAnalysis_p
* Copyright 2013 by Michael Peter Christen
* First released 12.03.2013 at http://yacy.net
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program in the file lgpl21.txt
* If not, see <http://www.gnu.org/licenses/>.
*/
import net.yacy.cora.federate.solr.Ranking;
import net.yacy.cora.protocol.RequestHeader;
import net.yacy.search.Switchboard;
import net.yacy.search.SwitchboardConstants;
import net.yacy.search.query.SearchEventCache;
import net.yacy.server.serverObjects;
import net.yacy.server.serverSwitch;
/**
 * Servlet class behind the ContentAnalysis_p page.
 * It reads and updates the two double-content detection parameters
 * (minTokenLen and quantRate) which are held statically by {@link Ranking}
 * and mirrors them into the Switchboard configuration.
 */
public class ContentAnalysis_p {

    /**
     * Processes a request for the page.
     *
     * @param header the request header (not used)
     * @param post   the posted form values, may be null when the page is only displayed
     * @param env    the server environment; it is cast to {@link Switchboard}
     * @return the template properties "minTokenLen" and "quantRate" with the current values
     */
    public static serverObjects respond(@SuppressWarnings("unused") final RequestHeader header, final serverObjects post, final serverSwitch env) {
        final Switchboard sb = (Switchboard) env;

        // every time the ranking is changed we need to remove old orderings,
        // so all search events and the fulltext cache are dropped on each call
        SearchEventCache.cleanupEvents(true);
        sb.index.fulltext().clearCache();

        if (post != null) {
            if (post.containsKey("EnterDoublecheck")) {
                // take the submitted values; 3 and 0.5f are the fallbacks for missing entries
                Ranking.setMinTokenLen(post.getInt("minTokenLen", 3));
                Ranking.setQuantRate(post.getFloat("quantRate", 0.5f));
                storeDoubleDetection(sb);
            }
            if (post.containsKey("ResetDoublecheck")) {
                resetDoubleDetection(sb);
            }
            if (post.containsKey("ResetRanking")) {
                resetDoubleDetection(sb);
            }
        }

        final serverObjects prop = new serverObjects();
        prop.put("minTokenLen", Ranking.getMinTokenLen());
        prop.put("quantRate", Ranking.getQuantRate());
        return prop;
    }

    /** Sets minTokenLen to 3 and quantRate to 0.5f and writes both to the configuration. */
    private static void resetDoubleDetection(final Switchboard sb) {
        Ranking.setMinTokenLen(3);
        Ranking.setQuantRate(0.5f);
        storeDoubleDetection(sb);
    }

    /** Writes the current minTokenLen and quantRate values of {@link Ranking} to the configuration. */
    private static void storeDoubleDetection(final Switchboard sb) {
        sb.setConfig(SwitchboardConstants.SEARCH_RANKING_SOLR_DOUBLEDETECTION_MINLENGTH, Ranking.getMinTokenLen());
        sb.setConfig(SwitchboardConstants.SEARCH_RANKING_SOLR_DOUBLEDETECTION_QUANTRATE, Ranking.getQuantRate());
    }
}

View File

@ -8,28 +8,48 @@
#%env/templates/header.template%#
#%env/templates/submenuSearchConfiguration.template%#
<h2>Solr Ranking Configuration</h2>
<p>These are ranking attributes for Solr. This ranking applies for internal and remote Solr access.</p>
<form class="dsearch" action="RankingSolr_p.html" method="post" enctype="multipart/form-data">
<p>These are ranking attributes for Solr. This ranking applies for internal and remote (P2P or shard) Solr access.</p>
<form class="dsearch" action="RankingSolr_p.html" method="post" enctype="multipart/form-data">
<fieldset>
<legend>Solr Double Content Detection</legend><p>Double-Content detection is done using a ranking on a 'unique'-Field, named 'fuzzy_signature_unique_b'.
This field is set during parsing and is influenced by two attributes for the <a href="http://lucene.apache.org/solr/api-4_0_0-BETA/org/apache/solr/update/processor/TextProfileSignature.html">TextProfileSignature</a> class.</p>
<legend>Boost Function</legend>
A Boost Function can combine numeric values from the result document to produce a number which is either added or multiplied with the other boost value from the query result.
To see all available fields, see the <a href="IndexSchema_p.html">YaCy Solr Schema</a> and look for numeric values (these are names with suffix '_i').
To find out which kind of operations are possible, see the <a href="http://wiki.apache.org/solr/FunctionQuery">Solr Function Query</a> documentation.
Example: to order by date, use "recip(ms(NOW,last_modified),3.16e-11,1,1)", to order by clickdepth, use "div(100,add(clickdepth_i,1))".
<dl>
<dt style="width:260px"><label for="minTokenLen">minTokenLen</label></dt>
<dd style="width:360px; float:left; display:inline;" id="dd_minTokenLen">
<input name="minTokenLen" id="minTokenLen" type="text" align="right" size="10" value="#[minTokenLen]#" /><br />
This is the minimum length of a word which shall be considered as element of the signature. Should be either 2 or 3.
<dt style="width:260px;margin:0;padding:0;height:1.8em;"><label for="bf" id="bf_label">#[modeKey]#</label></dt>
<dd style="width:360px;margin:0;padding:0;height:1.8em;float:left;display:inline;" id="bf_dd">
<input name="bf" id="bf" type="text" align="left" size="100" value="#[bf]#" />
</dd>
<dt style="width:260px"><label for="quantRate">quantRate</label></dt>
<dd style="width:360px; float:left; display:inline;" id="dd_quantRate">
<input name="quantRate" id="quantRate" type="text" align="right" size="10" value="#[quantRate]#" /><br />
The quantRate is a measurement for the number of words that take part in a signature computation. The higher the number, the less
words are used for the signature.
For minTokenLen = 2 the quantRate value should not be below 0.24; for minTokenLen = 3 the quantRate value must be not below 0.5.
<dt style="width:260px;margin:0;padding:0;height:1.8em;"><label for="bq">mode</label></dt>
<dd style="width:360px;margin:0;padding:0;height:1.8em;float:left;display:inline;" id="bf_dd">
<input type="radio" name="mode" id="add" onclick="document.getElementById('bf_label').innerHTML='bf'" value="add" #(add.checked)#:: checked="checked"#(/add.checked)# />add&nbsp;&nbsp;&nbsp;
<input type="radio" name="mode" id="multiply" onclick="document.getElementById('bf_label').innerHTML='boost'" value="multiply" #(multiply.checked)#:: checked="checked"#(/multiply.checked)# />multiply
</dd>
<dt style="width:260px"></dt>
<dd style="width:360px; float:left; display:inline;">
<input type="submit" name="EnterDoublecheck" value="Set" />
<input type="submit" name="ResetDoublecheck" value="Re-Set to default" />
<dt style="width:260px;margin:0;padding:0;height:1.8em;"></dt>
<dd style="width:360px;margin:0;padding:0;height:1.8em;float:left;display:inline;">
<input type="submit" name="EnterBF" value="Set Boost Function" />
<input type="submit" name="ResetBF" value="Re-Set to default" />
</dd>
</dl>
</fieldset>
</form>
<form class="dsearch" action="RankingSolr_p.html" method="post" enctype="multipart/form-data">
<fieldset>
<legend>Boost Query</legend>
The Boost Query is attached to every query. Use this to statically boost specific content in the index.
Example: "fuzzy_signature_unique_b:true^100000.0f" means that documents, identified as 'double' are ranked very bad and appended to the end of all results (because the unique are ranked high).
To find appropriate fields for this query, see the <a href="IndexSchema_p.html">YaCy Solr Schema</a> and look for boolean values (with suffix '_b') or tags inside string fields (with suffix '_s' or '_sxt').
<dl>
<dt style="width:260px;margin:0;padding:0;height:1.8em;"><label for="bq" id="bq_label">bq</label></dt>
<dd style="width:360px;margin:0;padding:0;height:1.8em;float:left;display:inline;" id="bq_dd">
<input name="bq" id="bq" type="text" align="left" size="100" value="#[bq]#" />
</dd>
<dt style="width:260px;margin:0;padding:0;height:1.8em;"></dt>
<dd style="width:360px;margin:0;padding:0;height:1.8em;float:left;display:inline;">
<input type="submit" name="EnterBQ" value="Set Boost Query" />
<input type="submit" name="ResetBQ" value="Re-Set to default" />
</dd>
</dl>
</fieldset>
@ -37,15 +57,19 @@
<form class="dsearch" action="RankingSolr_p.html" method="post" enctype="multipart/form-data">
<fieldset>
<legend>Solr Boosts</legend>
<dl>#{boosts}#
<dt style="width:260px"><label for="boost_#[field]#">#[field]#</label></dt>
<dd style="width:360px; float:left; display:inline;" id="boost_dd_#[field]#">
This is the set of searchable fields. Entries without a boost value are not searched. Boost values make hits inside the corresponding field more important.
<dl style="margin:0;">#{boosts}#
<dt style="width:260px;margin:0;padding:0;height:1.8em;"><label for="boost_#[field]#">#[field]#</label>
<input type="checkbox" id="#[field]#" name="#[field]#"#(checked)#:: checked="checked"#(/checked)#
onclick="if (document.getElementById('#[field]#').checked) document.getElementById('boost_#[field]#').value='1.0'; else document.getElementById('boost_#[field]#').value='';"/>
</dt>
<dd style="width:360px;margin:0;padding:0;height:1.8em;float:left;display:inline;" id="boost_dd_#[field]#">
<input name="boost_#[field]#" id="boost_#[field]#" type="text" align="right" size="10" value="#[boost]#" />
</dd>#{/boosts}#
<dt style="width:260px"></dt>
<dd style="width:360px; float:left; display:inline;">
<input type="submit" name="EnterRanking" value="Set" />
<input type="submit" name="ResetRanking" value="Re-Set to default" />
<dt style="width:260px;margin:0;padding:0;height:1.8em;"></dt>
<dd style="width:360px;margin:0;padding:0;height:1.8em;float:left;display:inline;">
<input type="submit" name="EnterBoosts" value="Set Field Boosts" />
<input type="submit" name="ResetBoosts" value="Re-Set to default" />
</dd>
</dl>
</fieldset>

View File

@ -20,7 +20,8 @@
import java.util.Map;
import net.yacy.cora.federate.solr.Boost;
import net.yacy.cora.federate.solr.Ranking;
import net.yacy.cora.federate.solr.SchemaDeclaration;
import net.yacy.cora.protocol.RequestHeader;
import net.yacy.search.Switchboard;
import net.yacy.search.SwitchboardConstants;
@ -38,30 +39,18 @@ public class RankingSolr_p {
// clean up all search events
SearchEventCache.cleanupEvents(true);
sb.index.fulltext().clearCache(); // every time the ranking is changed we need to remove old orderings
if (post != null && post.containsKey("EnterDoublecheck")) {
Boost.RANKING.setMinTokenLen(post.getInt("minTokenLen", 3));
Boost.RANKING.setQuantRate(post.getFloat("quantRate", 0.5f));
sb.setConfig(SwitchboardConstants.SEARCH_RANKING_SOLR_DOUBLEDETECTION_MINLENGTH, Boost.RANKING.getMinTokenLen());
sb.setConfig(SwitchboardConstants.SEARCH_RANKING_SOLR_DOUBLEDETECTION_QUANTRATE, Boost.RANKING.getQuantRate());
}
if (post != null && post.containsKey("ResetDoublecheck")) {
Boost.RANKING.setMinTokenLen(3);
Boost.RANKING.setQuantRate(0.5f);
sb.setConfig(SwitchboardConstants.SEARCH_RANKING_SOLR_DOUBLEDETECTION_MINLENGTH, Boost.RANKING.getMinTokenLen());
sb.setConfig(SwitchboardConstants.SEARCH_RANKING_SOLR_DOUBLEDETECTION_QUANTRATE, Boost.RANKING.getQuantRate());
}
if (post != null && post.containsKey("EnterRanking")) {
if (post != null && post.containsKey("EnterBoosts")) {
StringBuilder boostString = new StringBuilder(); // SwitchboardConstants.SEARCH_RANKING_SOLR_BOOST;
for (Map.Entry<String, String> entry: post.entrySet()) {
if (entry.getKey().startsWith("boost")) {
String fieldName = entry.getKey().substring(6);
CollectionSchema field = CollectionSchema.valueOf(fieldName);
if (field == null) continue;
String fieldValue = entry.getValue();
if (fieldValue == null || fieldValue.length() == 0) continue;
try {
float boost = Float.parseFloat(entry.getValue());
float boost = Float.parseFloat(fieldValue);
if (boostString.length() > 0) boostString.append(',');
boostString.append(field.getSolrFieldName()).append('^').append(Float.toString(boost));
} catch (NumberFormatException e) {
@ -71,28 +60,75 @@ public class RankingSolr_p {
}
if (boostString.length() > 0) {
String s = boostString.toString();
sb.setConfig(SwitchboardConstants.SEARCH_RANKING_SOLR_BOOST, s);
Boost.RANKING.updateBoosts(s);
sb.setConfig(SwitchboardConstants.SEARCH_RANKING_SOLR_COLLECTION_BOOSTFIELDS_ + "0", s);
sb.index.fulltext().getDefaultConfiguration().getRanking(0).updateBoosts(s);
}
}
if (post != null && post.containsKey("ResetBoosts")) {
String s = "text_t^2.0,url_paths_sxt^20.0,title^100.0,synonyms_sxt^1.0";
sb.setConfig(SwitchboardConstants.SEARCH_RANKING_SOLR_COLLECTION_BOOSTFIELDS_ + "0", s);
sb.index.fulltext().getDefaultConfiguration().getRanking(0).updateBoosts(s);
}
if (post != null && post.containsKey("ResetRanking")) {
Boost.RANKING.initDefaults();
if (post != null && post.containsKey("EnterBQ")) {
String bq = post.get("bq");
if (bq != null) {
sb.setConfig(SwitchboardConstants.SEARCH_RANKING_SOLR_COLLECTION_BOOSTQUERY_ + "0", bq);
sb.index.fulltext().getDefaultConfiguration().getRanking(0).setBoostQuery(bq);
}
}
if (post != null && post.containsKey("ResetBQ")) {
String bq = "fuzzy_signature_unique_b:true^100000.0";
if (bq != null) {
sb.setConfig(SwitchboardConstants.SEARCH_RANKING_SOLR_COLLECTION_BOOSTQUERY_ + "0", bq);
sb.index.fulltext().getDefaultConfiguration().getRanking(0).setBoostQuery(bq);
}
}
if (post != null && post.containsKey("EnterBF")) {
String bf = post.get("bf");
String mode = post.get("mode");
if (bf != null) {
sb.setConfig(SwitchboardConstants.SEARCH_RANKING_SOLR_COLLECTION_BOOSTFUNCTION_ + "0", bf);
sb.setConfig(SwitchboardConstants.SEARCH_RANKING_SOLR_COLLECTION_BOOSTFUNCTIONMODE_ + "0", mode);
sb.index.fulltext().getDefaultConfiguration().getRanking(0).setBoostFunction(bf);
sb.index.fulltext().getDefaultConfiguration().getRanking(0).setMode(Ranking.BoostFunctionMode.valueOf(mode));
}
}
if (post != null && post.containsKey("ResetBF")) {
String bf = ""; //"div(add(1,references_i),pow(add(1,inboundlinkscount_i),1.6))";
String mode = "add";
if (bf != null) {
sb.setConfig(SwitchboardConstants.SEARCH_RANKING_SOLR_COLLECTION_BOOSTFUNCTION_ + "0", bf);
sb.setConfig(SwitchboardConstants.SEARCH_RANKING_SOLR_COLLECTION_BOOSTFUNCTIONMODE_ + "0", mode);
sb.index.fulltext().getDefaultConfiguration().getRanking(0).setBoostFunction(bf);
sb.index.fulltext().getDefaultConfiguration().getRanking(0).setMode(Ranking.BoostFunctionMode.valueOf(mode));
}
}
final serverObjects prop = new serverObjects();
prop.put("minTokenLen", Boost.RANKING.getMinTokenLen());
prop.put("quantRate", Boost.RANKING.getQuantRate());
int i = 0;
for (Map.Entry<CollectionSchema, Float> entry: Boost.RANKING.entrySet()) {
CollectionSchema field = entry.getKey();
float boost = entry.getValue();
Ranking ranking = sb.index.fulltext().getDefaultConfiguration().getRanking(0);
for (SchemaDeclaration field: CollectionSchema.values()) {
if (!field.isSearchable()) continue;
prop.put("boosts_" + i + "_field", field.getSolrFieldName());
prop.put("boosts_" + i + "_boost", Float.toString(boost));
Float boost = ranking.getFieldBoost(field);
if (boost == null || boost.floatValue() <= 0.0f) {
prop.put("boosts_" + i + "_checked", 0);
prop.put("boosts_" + i + "_boost", "");
} else {
prop.put("boosts_" + i + "_checked", 1);
prop.put("boosts_" + i + "_boost", boost.toString());
}
i++;
}
prop.put("boosts", i);
prop.put("bq", ranking.getBoostQuery());
prop.put("bf", ranking.getBoostFunction());
prop.put("modeKey", ranking.getMethod() == Ranking.BoostFunctionMode.add ? "bf" : "boost");
prop.put("add.checked", ranking.getMethod() == Ranking.BoostFunctionMode.add ? 1 : 0);
prop.put("multiply.checked", ranking.getMethod() == Ranking.BoostFunctionMode.add ? 0 : 1);
return prop;
}

View File

@ -11,8 +11,8 @@
<li><a href="/yacyinteractive.html" class="MenuItemLink">File Search</a></li>
<li><a href="/HostBrowser.html?hosts=" class="MenuItemLink">Host Browser</a></li>
<!--<li><a href="/yacysearch_location.html" class="MenuItemLink">Location Search</a></li>-->
<li><a href="/solr/select?q=*:*&start=0&rows=3&core=collection1" class="MenuItemLink">Solr Default Core</a></li>
<li><a href="/solr/select?q=*:*&start=0&rows=3&core=webgraph" class="MenuItemLink">Solr Webgraph Core</a></li>
<li><a href="/solr/select?q=*:*&defType=edismax&start=0&rows=3&core=collection1" class="MenuItemLink">Solr Default Core</a></li>
<li><a href="/solr/select?q=*:*&defType=edismax&start=0&rows=3&core=webgraph" class="MenuItemLink">Solr Webgraph Core</a></li>
<li><a href="/gsa/search?q=www&size=3" class="MenuItemLink">Google Appliance API</a></li>
<!--<li><a href="/yacy/ui/" accesskey="s" class="MenuItemLink">Rich Client Search</a></li>-->
<li><a href="/compare_yacy.html?display=1" class="MenuItemLink">Compare Search</a></li>

View File

@ -4,8 +4,9 @@
<li><a href="/IndexControlURLs_p.html" class="MenuItemLink lock">URL Database Administration</a></li>
<li><a href="/IndexFederated_p.html" class="MenuItemLink lock">Index Sources &amp; Targets</a></li>
<li><a href="/IndexSchema_p.html" class="MenuItemLink lock">Solr Schema Editor</a></li>
#(p2p)#::<li><a href="/IndexControlRWIs_p.html" class="MenuItemLink lock">Reverse Word Index Administration</a></li>#(/p2p)#
#(p2p)#::<li><a href="/IndexControlRWIs_p.html" class="MenuItemLink lock">Reverse Word Index</a></li>#(/p2p)#
<!--<li><a href="/IndexControlCleaner_p.html" class="MenuItemLink lock">Index Cleaner</a></li>-->
<li><a href="/ContentAnalysis_p.html" class="MenuItemLink lock">Content Analysis</a></li>
<li><a href="/ConfigHTCache_p.html" class="MenuItemLink lock">Web Cache</a></li>
<li><a href="/ConfigParser.html" class="MenuItemLink lock">Parser Configuration</a></li>
</ul>

View File

@ -26,7 +26,7 @@ import java.util.ArrayList;
import java.util.Map;
import net.yacy.cora.document.UTF8;
import net.yacy.cora.federate.solr.Boost;
import net.yacy.cora.federate.solr.Ranking;
import net.yacy.cora.federate.solr.connector.EmbeddedSolrConnector;
import net.yacy.cora.federate.solr.responsewriter.GSAResponseWriter;
import net.yacy.cora.protocol.HeaderFramework;
@ -34,7 +34,6 @@ import net.yacy.cora.protocol.RequestHeader;
import net.yacy.cora.util.CommonPattern;
import net.yacy.kelondro.logging.Log;
import net.yacy.search.Switchboard;
import net.yacy.search.SwitchboardConstants;
import net.yacy.search.query.AccessTracker;
import net.yacy.search.query.QueryGoal;
import net.yacy.search.query.SearchEvent;
@ -100,9 +99,6 @@ public class searchresult {
if (post == null) return null;
Log.logInfo("GSA Query", post.toString());
sb.intermissionAllThreads(3000); // tell all threads to do nothing for a specific time
// update the boost values
Boost.RANKING.updateBoosts(sb.getConfig(SwitchboardConstants.SEARCH_RANKING_SOLR_BOOST, ""));
// rename post fields according to result style
//post.put(CommonParams.Q, post.remove("q")); // same as solr
@ -114,13 +110,17 @@ public class searchresult {
// get a solr query string
QueryGoal qg = new QueryGoal(originalQuery, originalQuery);
StringBuilder solrQ = qg.solrQueryString(sb.index.fulltext().getDefaultConfiguration());
StringBuilder solrQ = qg.collectionQueryString(sb.index.fulltext().getDefaultConfiguration());
post.put("defType", "edismax");
post.put(CommonParams.Q, solrQ.toString());
post.put(CommonParams.ROWS, post.remove("num"));
post.put(CommonParams.ROWS, Math.min(post.getInt(CommonParams.ROWS, 10), (authenticated) ? 5000 : 100));
post.put("bq", Boost.RANKING.getBoostQuery()); // a boost query that moves double content to the back
post.put("bf", Boost.RANKING.getBoostFunction()); // a boost function extension
Ranking ranking = sb.index.fulltext().getDefaultConfiguration().getRanking(0);
String bq = ranking.getBoostQuery();
String bf = ranking.getBoostFunction();
if (bq.length() > 0) post.put("bq", bq); // a boost query that moves double content to the back
if (bf.length() > 0) post.put(ranking.getMethod() == Ranking.BoostFunctionMode.add ? "bf" : "boost", bf); // a boost function extension, see http://wiki.apache.org/solr/ExtendedDisMax#bf_.28Boost_Function.2C_additive.29
post.put(CommonParams.FL,
CollectionSchema.content_type.getSolrFieldName() + ',' +
CollectionSchema.id.getSolrFieldName() + ',' +

View File

@ -153,7 +153,7 @@ public class select {
querystring = modifier.parse(querystring);
modifier.apply(post);
QueryGoal qg = new QueryGoal(querystring, querystring);
StringBuilder solrQ = qg.solrQueryString(sb.index.fulltext().getDefaultConfiguration());
StringBuilder solrQ = qg.collectionQueryString(sb.index.fulltext().getDefaultConfiguration());
post.put(CommonParams.Q, solrQ.toString()); // sru patch
}
String q = post.get(CommonParams.Q, "");

View File

@ -188,7 +188,7 @@ public class yacysearchitem {
if (faviconURL != null && fileType == FileType.HTML) sb.loader.loadIfNotExistBackground(faviconURL, 1024 * 1024 * 10, null, TextSnippet.snippetMinLoadDelay);
prop.putHTML("content_faviconCode", URLLicense.aquireLicense(faviconURL)); // acquire license for favicon url loading
prop.put("content_urlhash", resulthashString);
prop.put("content_ranking", result.ranking);
prop.put("content_ranking", result.ranking());
prop.put("content_showMetadata_urlhash", resulthashString);
prop.put("content_showCache_link", resultUrlstring);
prop.put("content_showProxy_link", resultUrlstring);

View File

@ -1,174 +0,0 @@
/**
* Search
* Copyright 2010 by Michael Peter Christen
* First released 25.05.2010 at http://yacy.net
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program in the file lgpl21.txt
* If not, see <http://www.gnu.org/licenses/>.
*/
package net.yacy.cora.federate;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;
import net.yacy.cora.document.RSSMessage;
import net.yacy.cora.federate.opensearch.SRURSSConnector;
import net.yacy.cora.federate.yacy.CacheStrategy;
import net.yacy.cora.protocol.http.HTTPClient;
import net.yacy.cora.sorting.ConcurrentScoreMap;
import net.yacy.cora.sorting.ScoreMap;
/**
 * Collects the results of several search accumulator threads for one query
 * and combines them into a single score map.
 */
public class SearchHub {

    /** an empty hub: empty query string, time-out 0 */
    public final static SearchHub EMPTY = new SearchHub("", 0);

    /**
     * service list used by the main method. Disabled entries:
     *   http://192.168.1.51:8000/yacysearch.rss
     *   http://yacy.dyndns.org:8000/yacysearch.rss
     *   http://yacy.caloulinux.net:8085/yacysearch.rss
     *   http://algire.dyndns.org:8085/yacysearch.rss
     *   http://breyvogel.dyndns.org:8002/yacysearch.rss
     */
    private static final String[] SRURSSServicesList = {
        "http://127.0.0.1:8008/yacysearch.rss"
    };

    private final String query;
    private final int timeout;
    private final List<SearchAccumulator> threads;
    private final Map<RSSMessage, List<Integer>> result;

    /**
     * create a search hub for a query
     * @param query the original query string
     * @param timeout the time-out of the search request
     */
    public SearchHub(final String query, final int timeout) {
        this.query = query;
        this.timeout = timeout;
        this.threads = new ArrayList<SearchAccumulator>();
        this.result = new ConcurrentHashMap<RSSMessage, List<Integer>>();
    }

    /**
     * get the original query string
     * @return the query string as given to the constructor
     */
    public String getQuery() {
        return this.query;
    }

    /**
     * get the given time-out of the search request
     * @return the time-out as given to the constructor
     */
    public int getTimeout() {
        return this.timeout;
    }

    /**
     * get the result of the accumulation
     * @return the internal (not copied) map from messages to their list of integer values
     */
    public Map<RSSMessage, List<Integer>> getAccumulation() {
        return this.result;
    }

    /**
     * add an accumulator to the list of accumulation threads.
     * this is mainly used for waitTermination() and isTerminated()
     * @param a the accumulator to register
     */
    public void addAccumulator(final SearchAccumulator a) {
        this.threads.add(a);
    }

    /**
     * get the list of search results as scored map.
     * The results are combined using their appearance positions.
     * Every time this method is called the list is re-computed to reflect the latest results
     * @return a score map of the accumulated messages
     */
    public ScoreMap<RSSMessage> getResults() {
        final int accumulatorCount = this.threads.size();
        final ScoreMap<RSSMessage> scoreMap = new ConcurrentScoreMap<RSSMessage>();
        for (final Map.Entry<RSSMessage, List<Integer>> hit: this.result.entrySet()) {
            final List<Integer> positions = hit.getValue();
            int sum = 0;
            for (final Integer position: positions) {
                sum += position.intValue();
            }
            // integer arithmetic: (sum of values * number of accumulators) / number of values
            scoreMap.inc(hit.getKey(), sum * accumulatorCount / positions.size());
        }
        return scoreMap;
    }

    /**
     * wait until all accumulation threads have terminated
     */
    public void waitTermination() {
        for (final SearchAccumulator accumulator: this.threads) {
            try {
                accumulator.join();
            } catch (final InterruptedException e) {
                // an interruption is ignored here, the next accumulator is joined
            }
        }
    }

    /**
     * return true if all accumulation threads have terminated
     * @return false if at least one registered accumulator is still alive, true otherwise
     */
    public boolean isTerminated() {
        for (final SearchAccumulator accumulator: this.threads) {
            if (accumulator.isAlive()) {
                return false;
            }
        }
        return true;
    }

    /**
     * return a hash code of the search hub.
     * This is computed using only the query string because that identifies the object
     */
    @Override
    public int hashCode() {
        return this.query.hashCode();
    }

    /**
     * test method to add a list of SRU RSS services.
     * such services are provided by YaCy peers.
     * For every service a connector is created, started and registered at the search hub.
     * @param search the hub which collects the results
     * @param rssServices the service urls
     * @param count handed over to the connector
     * @param verify handed over to the connector
     * @param global handed over to the connector
     * @param userAgent handed over to the connector
     */
    public static void addSRURSSServices(final SearchHub search, final String[] rssServices, final int count, final CacheStrategy verify, final boolean global, final String userAgent) {
        for (int n = 0; n < rssServices.length; n++) {
            final SRURSSConnector connector = new SRURSSConnector(search, rssServices[n], count, verify, global, userAgent);
            connector.start();
            search.addAccumulator(connector);
        }
    }

    /**
     * command line test: the arguments are joined to a query, the query is sent
     * to the services in SRURSSServicesList and the scored results are printed.
     * @param args the words of the query
     */
    public static void main(final String[] args) {
        HTTPClient.setDefaultUserAgent("searchhub");
        HTTPClient.initConnectionManager();

        // join all arguments with a space; the trailing space is removed by trim()
        final StringBuilder words = new StringBuilder();
        for (final String word: args) {
            words.append(word).append(' ');
        }
        final String query = words.toString().trim();

        final SearchHub search = new SearchHub(query, 10000);
        addSRURSSServices(search, SRURSSServicesList, 100, CacheStrategy.CACHEONLY, false, "searchhub");
        try {
            Thread.sleep(100);
        } catch (final InterruptedException e1) {
            // ignored
        }
        search.waitTermination();

        final ScoreMap<RSSMessage> result = search.getResults();
        final Iterator<RSSMessage> messages = result.keys(true);
        while (messages.hasNext()) {
            final RSSMessage message = messages.next();
            System.out.println("[" + result.get(message) + "] " + message);
        }

        try {
            HTTPClient.closeConnectionManager();
        } catch (final InterruptedException e) {
            e.printStackTrace();
        }
    }
}

View File

@ -1,67 +0,0 @@
/**
* SearchResult
* Copyright 2011 by Michael Peter Christen
* First released 13.4.2011 at http://yacy.net
*
* This file is part of YaCy Content Integration
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program in the file lgpl21.txt
* If not, see <http://www.gnu.org/licenses/>.
*/
package net.yacy.cora.federate;
import net.yacy.cora.sorting.WeakPriorityBlockingQueue;
/**
 * A {@link WeakPriorityBlockingQueue} of result objects that additionally stores three
 * values describing the result set: a total hit count, a start offset and an optional
 * maximum score. The three values can only be set by this class, its subclasses and
 * classes of the same package.
 */
public class SearchResult extends WeakPriorityBlockingQueue<Object> {

    private static final long serialVersionUID = -4865225874936938082L;

    private long numFound = 0;
    private long start = 0;
    private Float maxScore = null; // null until a maximum score has been set

    /**
     * @param maxsize handed to the queue constructor together with the fixed flag <code>true</code>
     */
    public SearchResult(final int maxsize) {
        super(maxsize, true);
    }

    /**
     * @return the hit count that was set with {@link #setNumFound(long)}, 0 by default
     */
    public long getNumFound() {
        return this.numFound;
    }

    protected void setNumFound(final long numFound) {
        this.numFound = numFound;
    }

    /**
     * @return the offset that was set with {@link #setStart(long)}, 0 by default
     */
    public long getStart() {
        return this.start;
    }

    protected void setStart(final long start) {
        this.start = start;
    }

    /**
     * @return the maximum score or <code>null</code> if none was set
     */
    public Float getMaxScore() {
        return this.maxScore;
    }

    protected void setMaxScore(final Float maxScore) {
        this.maxScore = maxScore;
    }

    /**
     * Renders the stored values followed by the queue content; the maxScore part is
     * left out while no maximum score is known.
     */
    @Override
    public String toString() {
        final StringBuilder text = new StringBuilder();
        text.append("{count=").append(this.numFound);
        text.append(", offset=").append(this.start);
        if (this.maxScore != null) {
            text.append(", maxScore=").append(this.maxScore);
        }
        text.append(", docs=").append(super.toString()).append("}");
        return text.toString();
    }
}

View File

@ -36,7 +36,6 @@ import net.yacy.cora.document.RSSMessage;
import net.yacy.cora.document.RSSReader;
import net.yacy.cora.document.UTF8;
import net.yacy.cora.federate.SearchAccumulator;
import net.yacy.cora.federate.SearchHub;
import net.yacy.cora.federate.yacy.CacheStrategy;
import net.yacy.cora.protocol.ClientIdentification;
import net.yacy.cora.protocol.http.HTTPClient;
@ -78,24 +77,6 @@ public class SRURSSConnector extends Thread implements SearchAccumulator {
this.userAgent = userAgent;
}
/**
 * Creates a connector which takes its query, its timeout and its result accumulation
 * from a {@link SearchHub}; the remaining settings are stored as given.
 *
 * @param search             source of the accumulation object, the query and the timeout
 * @param urlBase            stored as the service url base of this connector
 * @param maximumRecordsInit stored as the initial maximum record count
 * @param verify             stored as the cache strategy
 * @param global             stored as the global flag
 * @param userAgent          stored as the user agent
 */
public SRURSSConnector(final SearchHub search, final String urlBase, final int maximumRecordsInit, final CacheStrategy verify, final boolean global, final String userAgent) {
    // values provided by the hub
    this.result = search.getAccumulation();
    this.query = search.getQuery();
    this.timeoutInit = search.getTimeout();

    // a fresh queue owned by this connector
    this.results = new LinkedBlockingQueue<RSSMessage>();

    // values handed in by the caller
    this.urlBase = urlBase;
    this.maximumRecordsInit = maximumRecordsInit;
    this.verify = verify;
    this.global = global;
    this.userAgent = userAgent;
}
@Override
public void run() {
searchSRURSS(this.results, this.urlBase, this.query, this.timeoutInit, this.maximumRecordsInit, this.verify, this.global, this.userAgent);

View File

@ -1,124 +0,0 @@
/**
* Boost
* Copyright 2012 by Michael Peter Christen
* First released 30.11.2012 at http://yacy.net
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program in the file lgpl21.txt
* If not, see <http://www.gnu.org/licenses/>.
*/
package net.yacy.cora.federate.solr;
import java.util.LinkedHashMap;
import net.yacy.cora.util.CommonPattern;
import net.yacy.search.schema.CollectionSchema;
/**
 * The Boost class is the solr ranking definition. It contains boost values in a LinkedHashMap; the 'linked' version is used
 * to maintain the order of the arguments, which shall be stable according to the iteration order within a configuration servlet.
 * Because the order is influenced by a double-check mechanism, the attributes used to compute a document signature are also
 * integrated into this class.
 */
public class Boost extends LinkedHashMap<CollectionSchema, Float> {

    private static final long serialVersionUID = 5248172257724571603L;

    /** the single instance of this class; the constructor is private */
    public final static Boost RANKING = new Boost();

    // for minTokenLen = 2 the quantRate value should not be below 0.24; for minTokenLen = 3 the quantRate value must be not below 0.5!
    private float quantRate = 0.5f; // to be filled with search.ranking.solr.doubledetection.quantrate
    private int minTokenLen = 3; // to be filled with search.ranking.solr.doubledetection.minlength

    private Boost() {
        super();
        this.initDefaults();
    }

    /**
     * remove all boost values and write the built-in default boosts
     */
    public void initDefaults() {
        this.clear();
        put(CollectionSchema.sku, 20.0f);
        put(CollectionSchema.url_paths_sxt, 20.0f);
        put(CollectionSchema.title, 15.0f);
        put(CollectionSchema.h1_txt, 11.0f);
        put(CollectionSchema.h2_txt, 10.0f);
        put(CollectionSchema.author, 8.0f);
        put(CollectionSchema.description, 5.0f);
        put(CollectionSchema.keywords, 2.0f);
        put(CollectionSchema.text_t, 1.0f);
        put(CollectionSchema.synonyms_sxt, 0.9f);
        put(CollectionSchema.references_i, 0.5f);
    }

    /**
     * override the get method to return 1.0f for each non-resolvable object
     * @param field the key to look up
     * @return the stored boost or 1.0f if the map has no value for the key
     */
    @Override
    public Float get(Object field) {
        Float boost = super.get(field);
        if (boost == null) return 1.0f;
        return boost;
    }

    /**
     * the boostDef is a definition string that comes from a configuration file.
     * It should be a comma-separated list of field^boost values.
     * This should be called with the field in search.ranking.solr.boost
     * Existing entries are kept; entries named in the definition are added or overwritten.
     * Entries without a '^', with an unknown field name or with an unparsable number are skipped,
     * so that one bad entry does not prevent the remaining entries from being applied.
     * @param boostDef the definition string; null or empty leaves the map unchanged
     */
    public void updateBoosts(String boostDef) {
        // call i.e. with "sku^20.0f,url_paths_sxt^20.0f,title^15.0f,h1_txt^11.0f,h2_txt^10.0f,author^8.0f,description^5.0f,keywords^2.0f,text_t^1.0f,fuzzy_signature_unique_b^100000.0f"
        if (boostDef == null || boostDef.length() == 0) return;
        String[] bf = CommonPattern.COMMA.split(boostDef);
        for (String boost: bf) {
            int p = boost.indexOf('^');
            if (p < 0) continue;
            try {
                // trim to tolerate blanks around the separators, e.g. "sku^20.0, title^15.0"
                CollectionSchema field = CollectionSchema.valueOf(boost.substring(0, p).trim());
                float factor = Float.parseFloat(boost.substring(p + 1).trim());
                this.put(field, factor);
            } catch (final IllegalArgumentException ignored) {
                // intentionally skipped: a NumberFormatException (which is an IllegalArgumentException) means the
                // number is not parsable; any other IllegalArgumentException is presumably an unknown field name
                // rejected by valueOf (assumes CollectionSchema is an enum)
            }
        }
    }

    public void setQuantRate(float quantRate) {
        this.quantRate = quantRate;
    }

    public void setMinTokenLen(int minTokenLen) {
        this.minTokenLen = minTokenLen;
    }

    public float getQuantRate() {
        return quantRate;
    }

    public int getMinTokenLen() {
        return minTokenLen;
    }

    /**
     * produce a string that can be added as a 'boost query' at the bq-attribute
     * @return a query on the fuzzy_signature_unique_b field with a fixed boost
     */
    public String getBoostQuery() {
        return CollectionSchema.fuzzy_signature_unique_b.getSolrFieldName() + ":true^100000.0f";
    }

    /**
     * produce a boost function
     * @return a fixed function expression over the references_i and inboundlinkscount_i fields
     */
    public String getBoostFunction() {
        return "div(add(1,references_i),pow(add(1,inboundlinkscount_i),1.6))^0.4";
    }
}

View File

@ -0,0 +1,150 @@
/**
* Ranking
* Copyright 2013 by Michael Peter Christen
* First released 12.03.2013 at http://yacy.net
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program in the file lgpl21.txt
* If not, see <http://www.gnu.org/licenses/>.
*/
package net.yacy.cora.federate.solr;
import java.util.LinkedHashMap;
import java.util.Map;
import java.util.Set;
import net.yacy.cora.util.CommonPattern;
import net.yacy.search.schema.CollectionSchema;
/**
 * The Ranking class is the solr ranking definition for boosts and query functions.
 * One instance holds a name, an ordered map of field boosts, a boost query, a boost function
 * and the mode in which the boost function is applied. The attributes of the double-detection
 * (quantRate, minTokenLen) are static and therefore shared by all instances.
 */
public class Ranking {

    // for minTokenLen = 2 the quantRate value should not be below 0.24; for minTokenLen = 3 the quantRate value must be not below 0.5!
    private static float quantRate = 0.5f; // to be filled with search.ranking.solr.doubledetection.quantrate
    private static int minTokenLen = 3; // to be filled with search.ranking.solr.doubledetection.minlength

    /**
     * the way the boost function is applied to the query
     */
    public static enum BoostFunctionMode {
        add, multiply;
    }

    private final Map<SchemaDeclaration, Float> fieldBoosts; // linked map: keeps the order of the definition string
    private String name, boostQuery, boostFunction;
    private BoostFunctionMode mode;

    /**
     * create a ranking with an empty name, no field boosts, an empty boost query,
     * an empty boost function and the mode 'add'
     */
    public Ranking() {
        this.name = "";
        this.fieldBoosts = new LinkedHashMap<SchemaDeclaration, Float>();
        this.boostQuery = "";
        this.boostFunction = "";
        this.mode = BoostFunctionMode.add;
    }

    public String getName() {
        return name;
    }

    public void setName(String name) {
        this.name = name;
    }

    public void putFieldBoost(SchemaDeclaration schema, float boost) {
        this.fieldBoosts.put(schema, boost);
    }

    /**
     * @param schema the field to look up
     * @return the boost of the field or null if no boost is defined for it
     */
    public Float getFieldBoost(SchemaDeclaration schema) {
        return this.fieldBoosts.get(schema);
    }

    /**
     * @return the entry set of the internal boost map (not a copy)
     */
    public Set<Map.Entry<SchemaDeclaration,Float>> getBoostMap() {
        return this.fieldBoosts.entrySet();
    }

    /**
     * the boostDef is a definition string that comes from a configuration file.
     * It should be a comma-separated list of field^boost values
     * This should be called with the field in search.ranking.solr.boost
     * All previous field boosts are replaced by the entries of the definition string.
     * Entries without a '^', with an unknown field name or with an unparsable number are skipped;
     * the definition is parsed completely before the existing boosts are touched, so the
     * boost map is never left half-filled.
     * @param boostDef the definition string; null or empty leaves the boosts unchanged
     */
    public void updateBoosts(String boostDef) {
        // call i.e. with "sku^20.0,url_paths_sxt^20.0,title^15.0,h1_txt^11.0,h2_txt^10.0,author^8.0,description^5.0,keywords^2.0,text_t^1.0,fuzzy_signature_unique_b^100000.0"
        if (boostDef == null || boostDef.length() == 0) return;
        String[] bf = CommonPattern.COMMA.split(boostDef);
        final Map<SchemaDeclaration, Float> parsed = new LinkedHashMap<SchemaDeclaration, Float>();
        for (String boost: bf) {
            int p = boost.indexOf('^');
            if (p < 0) continue;
            try {
                // trim to tolerate blanks around the separators, e.g. "sku^20.0, title^15.0"
                CollectionSchema field = CollectionSchema.valueOf(boost.substring(0, p).trim());
                Float factor = Float.parseFloat(boost.substring(p + 1).trim());
                parsed.put(field, factor);
            } catch (final IllegalArgumentException ignored) {
                // intentionally skipped: a stored definition may still name a field that was renamed or removed
                // from the schema (presumably rejected by valueOf, assuming CollectionSchema is an enum), or the
                // number is not parsable (NumberFormatException is an IllegalArgumentException)
            }
        }
        this.fieldBoosts.clear();
        this.fieldBoosts.putAll(parsed);
    }

    /**
     * @param boostQuery the new boost query; null is stored as empty string because
     *                   users of {@link #getBoostQuery()} test the length of the result
     */
    public void setBoostQuery(String boostQuery) {
        this.boostQuery = boostQuery == null ? "" : boostQuery;
    }

    /**
     * produce a string that can be added as a 'boost query' at the bq-attribute
     * @return the boost query, empty if none is defined
     */
    public String getBoostQuery() {
        return this.boostQuery;
    }

    /**
     * @param boostFunction the new boost function; null is stored as empty string because
     *                      users of {@link #getBoostFunction()} test the length of the result
     */
    public void setBoostFunction(String boostFunction) {
        this.boostFunction = boostFunction == null ? "" : boostFunction;
    }

    /**
     * produce a boost function
     * @return the boost function, empty if none is defined
     */
    public String getBoostFunction() {
        return this.boostFunction;
    }

    public void setMode(BoostFunctionMode method) {
        this.mode = method;
    }

    /**
     * @return the mode that was set with {@link #setMode(BoostFunctionMode)}
     */
    public BoostFunctionMode getMode() {
        return this.mode;
    }

    /**
     * same as {@link #getMode()}; kept because existing callers use this name
     * @return the mode that was set with {@link #setMode(BoostFunctionMode)}
     */
    public BoostFunctionMode getMethod() {
        return getMode();
    }

    /*
     * duplicate check static methods
     */

    public static void setQuantRate(float newquantRate) {
        quantRate = newquantRate;
    }

    public static void setMinTokenLen(int newminTokenLen) {
        minTokenLen = newminTokenLen;
    }

    public static float getQuantRate() {
        return quantRate;
    }

    public static int getMinTokenLen() {
        return minTokenLen;
    }
}

View File

@ -43,6 +43,8 @@ public interface SchemaDeclaration {
public boolean isMultiValued();
public boolean isSearchable();
public boolean isOmitNorms();
public String getComment();

View File

@ -90,6 +90,7 @@ public class CachedSolrConnector extends AbstractSolrConnector implements SolrCo
/**
 * Clears every hit/miss cache and the document cache of this connector and then,
 * if a wrapped connector is set, calls its commit method with <code>true</code>.
 */
public void clearCache() {
    for (final HitMissCache cache : hitMissCache.values()) {
        cache.clearCache();
    }
    this.documentCache.clear();
    if (this.solr == null) {
        return; // no wrapped connector to commit to
    }
    this.solr.commit(true);
}
@Override

View File

@ -159,6 +159,7 @@ public class InstanceMirror {
/**
 * Clears the cache of every cached connector of this mirror and calls commit with
 * <code>true</code> on every embedded connector.
 */
public void clearCache() {
    for (final CachedSolrConnector cached : this.connectorCache.values()) {
        cached.clearCache();
    }
    for (final EmbeddedSolrConnector embedded : this.embeddedCache.values()) {
        embedded.commit(true);
    }
}
}

View File

@ -45,7 +45,7 @@ import net.yacy.cora.document.WordCache;
import net.yacy.cora.document.analysis.Classification.ContentDomain;
import net.yacy.cora.document.analysis.EnhancedTextProfileSignature;
import net.yacy.cora.document.MultiProtocolURI;
import net.yacy.cora.federate.solr.Boost;
import net.yacy.cora.federate.solr.Ranking;
import net.yacy.cora.language.synonyms.SynonymLibrary;
import net.yacy.cora.lod.vocabulary.Tagging;
import net.yacy.document.language.Identificator;
@ -238,8 +238,8 @@ public final class Condenser {
// check dups with http://localhost:8090/solr/select?q=*:*&start=0&rows=3&fl=sku,fuzzy_signature_text_t,fuzzy_signature_l,fuzzy_signature_unique_b
EnhancedTextProfileSignature fuzzySignatureFactory = new EnhancedTextProfileSignature();
Map<String,String> sp = new HashMap<String,String>();
sp.put("quantRate", Float.toString(Boost.RANKING.getQuantRate())); // for minTokenLen = 2 the value should not be below 0.24; for minTokenLen = 3 the value must be not below 0.5!
sp.put("minTokenLen", Integer.toString(Boost.RANKING.getMinTokenLen()));
sp.put("quantRate", Float.toString(Ranking.getQuantRate())); // for minTokenLen = 2 the value should not be below 0.24; for minTokenLen = 3 the value must be not below 0.5!
sp.put("minTokenLen", Integer.toString(Ranking.getMinTokenLen()));
fuzzySignatureFactory.init(new MapSolrParams(sp));
fuzzySignatureFactory.add(text);
byte[] fuzzy_signature_hash = fuzzySignatureFactory.getSignature();

View File

@ -55,16 +55,6 @@ import org.apache.solr.common.SolrDocument;
* Future implementations should try to replace URIMetadata objects completely by SolrDocument objects
*/
public class URIMetadataNode {
public static CollectionSchema[] fieldList = new CollectionSchema[]{
CollectionSchema.audiolinkscount_i, CollectionSchema.author, CollectionSchema.collection_sxt, CollectionSchema.content_type,
CollectionSchema.coordinate_p, CollectionSchema.description, CollectionSchema.fresh_date_dt, CollectionSchema.host_id_s, CollectionSchema.id,
CollectionSchema.imagescount_i, CollectionSchema.inboundlinks_protocol_sxt, CollectionSchema.inboundlinks_urlstub_txt,
CollectionSchema.inboundlinkscount_i, CollectionSchema.keywords, CollectionSchema.language_s, CollectionSchema.last_modified, CollectionSchema.load_date_dt,
CollectionSchema.md5_s, CollectionSchema.outboundlinks_protocol_sxt, CollectionSchema.outboundlinks_urlstub_txt,
CollectionSchema.outboundlinkscount_i, CollectionSchema.publisher_t, CollectionSchema.referrer_id_txt, CollectionSchema.size_i, CollectionSchema.sku,
CollectionSchema.text_t, CollectionSchema.title, CollectionSchema.title_words_val, CollectionSchema.url_chars_i,
CollectionSchema.videolinkscount_i, CollectionSchema.videolinkscount_i, CollectionSchema.wordcount_i};
private byte[] hash = null;
private String urlRaw = null, keywords = null;
@ -72,7 +62,7 @@ public class URIMetadataNode {
private Bitfield flags = null;
private int imagec = -1, audioc = -1, videoc = -1, appc = -1;
private double lat = Double.NaN, lon = Double.NaN;
private long ranking = -1; // during generation of a search result this value is set
private long ranking = 0; // during generation of a search result this value is set
private SolrDocument doc = null;
private String snippet = null;
private WordReferenceVars word = null; // this is only used if the url is transported via remote search requests
@ -81,7 +71,8 @@ public class URIMetadataNode {
this.doc = doc;
this.snippet = "";
this.word = null;
this.ranking = Long.MIN_VALUE;
Float score = (Float) doc.getFieldValue("score"); // this is a special field containing the ranking score of a search result
this.ranking = score == null ? 0 : (long) (1000000.0f * score.floatValue()); // solr score values are sometimes very low
this.hash = ASCII.getBytes(getString(CollectionSchema.id));
this.urlRaw = getString(CollectionSchema.sku);
try {

View File

@ -1053,6 +1053,8 @@ public final class Protocol {
solrQuery.setHighlightSnippets(1);
for (CollectionSchema field: snippetFields) solrQuery.addHighlightField(field.getSolrFieldName());
solrQuery.setFields("*", "score"); // we need the score for post-ranking
boolean localsearch = target == null || target.equals(event.peers.mySeed());
if (localsearch && Switchboard.getSwitchboard().getConfigBool(SwitchboardConstants.DEBUG_SEARCH_REMOTE_SOLR_TESTLOCAL, false)) {
target = event.peers.mySeed();

View File

@ -96,7 +96,7 @@ import net.yacy.cora.document.RSSReader;
import net.yacy.cora.document.UTF8;
import net.yacy.cora.document.WordCache;
import net.yacy.cora.document.analysis.Classification;
import net.yacy.cora.federate.solr.Boost;
import net.yacy.cora.federate.solr.Ranking;
import net.yacy.cora.federate.solr.SchemaConfiguration;
import net.yacy.cora.federate.solr.ProcessType;
import net.yacy.cora.federate.solr.connector.AbstractSolrConnector;
@ -409,11 +409,6 @@ public final class Switchboard extends serverSwitch {
this.networkRoot.mkdirs();
this.queuesRoot.mkdirs();
// define boosts
Boost.RANKING.updateBoosts(this.getConfig(SwitchboardConstants.SEARCH_RANKING_SOLR_BOOST, "")); // must be called every time the boosts change
Boost.RANKING.setMinTokenLen(this.getConfigInt(SwitchboardConstants.SEARCH_RANKING_SOLR_DOUBLEDETECTION_MINLENGTH, 3));
Boost.RANKING.setQuantRate(this.getConfigFloat(SwitchboardConstants.SEARCH_RANKING_SOLR_DOUBLEDETECTION_QUANTRATE, 0.5f));
// prepare a solr index profile switch list
final File solrCollectionConfigurationInitFile = new File(getAppPath(), "defaults/" + SOLR_COLLECTION_CONFIGURATION_NAME);
final File solrCollectionConfigurationWorkFile = new File(getDataPath(), "DATA/SETTINGS/" + SOLR_COLLECTION_CONFIGURATION_NAME);
@ -421,19 +416,19 @@ public final class Switchboard extends serverSwitch {
final File solrWebgraphConfigurationWorkFile = new File(getDataPath(), "DATA/SETTINGS/" + SOLR_WEBGRAPH_CONFIGURATION_NAME);
CollectionConfiguration solrCollectionConfigurationWork = null;
WebgraphConfiguration solrWebgraphConfigurationWork = null;
// migrate the old Schema file path to a new one
final File solrCollectionConfigurationWorkOldFile = new File(getDataPath(), "DATA/SETTINGS/" + SOLR_COLLECTION_CONFIGURATION_NAME_OLD);
if (solrCollectionConfigurationWorkOldFile.exists() && !solrCollectionConfigurationWorkFile.exists()) solrCollectionConfigurationWorkOldFile.renameTo(solrCollectionConfigurationWorkFile);
// initialize the collection schema if it does not yet exist
if (!solrCollectionConfigurationWorkFile.exists()) try {
Files.copy(solrCollectionConfigurationInitFile, solrCollectionConfigurationWorkFile);
} catch (IOException e) {Log.logException(e);}
// lazy definition of schema: do not write empty fields
final boolean solrlazy = getConfigBool(SwitchboardConstants.FEDERATED_SERVICE_SOLR_INDEXING_LAZY, true);
// define collection schema
try {
final CollectionConfiguration solrCollectionConfigurationInit = new CollectionConfiguration(solrCollectionConfigurationInitFile, solrlazy);
@ -467,6 +462,19 @@ public final class Switchboard extends serverSwitch {
solrWebgraphConfigurationWork.commit();
} catch (IOException e) {Log.logException(e);}
// define boosts
Ranking.setMinTokenLen(this.getConfigInt(SwitchboardConstants.SEARCH_RANKING_SOLR_DOUBLEDETECTION_MINLENGTH, 3));
Ranking.setQuantRate(this.getConfigFloat(SwitchboardConstants.SEARCH_RANKING_SOLR_DOUBLEDETECTION_QUANTRATE, 0.5f));
for (int i = 0; i <= 3; i++) {
// must be done every time the boosts change
Ranking r = solrCollectionConfigurationWork.getRanking(i);
r.setName(this.getConfig(SwitchboardConstants.SEARCH_RANKING_SOLR_COLLECTION_BOOSTNAME_ + i, "_dummy" + i));
r.updateBoosts(this.getConfig(SwitchboardConstants.SEARCH_RANKING_SOLR_COLLECTION_BOOSTFIELDS_ + i, "text_t^1.0"));
r.setBoostQuery(this.getConfig(SwitchboardConstants.SEARCH_RANKING_SOLR_COLLECTION_BOOSTQUERY_ + i, ""));
r.setBoostFunction(this.getConfig(SwitchboardConstants.SEARCH_RANKING_SOLR_COLLECTION_BOOSTFUNCTION_ + i, ""));
r.setMode(Ranking.BoostFunctionMode.valueOf(this.getConfig(SwitchboardConstants.SEARCH_RANKING_SOLR_COLLECTION_BOOSTFUNCTIONMODE_ + i, "add")));
}
// initialize index
ReferenceContainer.maxReferences = getConfigInt("index.maxReferences", 0);
final File segmentsPath = new File(new File(indexPath, networkName), "SEGMENTS");
@ -482,7 +490,7 @@ public final class Switchboard extends serverSwitch {
try {this.index.fulltext().connectLocalSolr();} catch (IOException e) {Log.logException(e);}
}
this.index.writeWebgraph(this.getConfigBool(SwitchboardConstants.CORE_SERVICE_WEBGRAPH, false));
// set up the solr interface
final String solrurls = getConfig(SwitchboardConstants.FEDERATED_SERVICE_SOLR_INDEXING_URL, "http://127.0.0.1:8983/solr");
final boolean usesolr = getConfigBool(SwitchboardConstants.FEDERATED_SERVICE_SOLR_INDEXING_ENABLED, false) & solrurls.length() > 0;

View File

@ -481,13 +481,20 @@ public final class SwitchboardConstants {
public static final String SEARCH_VERIFY_DELETE = "search.verify.delete";
/**
* ranking
* ranking+evaluation
*/
public static final String SEARCH_RANKING_RWI_PROFILE = "search.ranking.rwi.profile"; // old rwi rankingProfile ranking
public static final String SEARCH_RANKING_SOLR_BOOST = "search.ranking.solr.boost.tmp2"; // temporary until we know best default values
public static final String SEARCH_RANKING_SOLR_DOUBLEDETECTION_MINLENGTH = "search.ranking.solr.doubledetection.minlength";
public static final String SEARCH_RANKING_SOLR_DOUBLEDETECTION_QUANTRATE = "search.ranking.solr.doubledetection.quantrate";
/**
* boosts for different cores (add a number to the end of the property name)
*/
public static final String SEARCH_RANKING_SOLR_COLLECTION_BOOSTNAME_ = "search.ranking.solr.collection.boostname.tmp."; // temporary until we know best default values; add the index number (0..3) to that string
public static final String SEARCH_RANKING_SOLR_COLLECTION_BOOSTFIELDS_ = "search.ranking.solr.collection.boostfields.tmp.";
public static final String SEARCH_RANKING_SOLR_COLLECTION_BOOSTQUERY_ = "search.ranking.solr.collection.boostquery.tmp.";
public static final String SEARCH_RANKING_SOLR_COLLECTION_BOOSTFUNCTION_ = "search.ranking.solr.collection.boostfunction.tmp.";
public static final String SEARCH_RANKING_SOLR_COLLECTION_BOOSTFUNCTIONMODE_ = "search.ranking.solr.collection.boostfunctionmode.tmp.";
/**
* system tray

View File

@ -27,7 +27,8 @@ import java.util.ArrayList;
import java.util.Map;
import java.util.SortedSet;
import net.yacy.cora.federate.solr.Boost;
import net.yacy.cora.federate.solr.Ranking;
import net.yacy.cora.federate.solr.SchemaDeclaration;
import net.yacy.cora.federate.solr.SolrType;
import net.yacy.cora.storage.HandleSet;
import net.yacy.document.parser.html.AbstractScraper;
@ -204,7 +205,7 @@ public class QueryGoal {
for (final byte[] b: blues) this.include_hashes.remove(b);
}
public StringBuilder solrQueryString(CollectionConfiguration configuration) {
public StringBuilder collectionQueryString(CollectionConfiguration configuration) {
final StringBuilder q = new StringBuilder(80);
// parse special requests
@ -231,15 +232,16 @@ public class QueryGoal {
// combine these queries for all relevant fields
wc = 0;
Float boost;
for (Map.Entry<CollectionSchema,Float> entry: Boost.RANKING.entrySet()) {
CollectionSchema field = entry.getKey();
if (entry.getValue().floatValue() < 0.0f) continue;
Ranking r = configuration.getRanking(0);
for (Map.Entry<SchemaDeclaration,Float> entry: r.getBoostMap()) {
SchemaDeclaration field = entry.getKey();
boost = entry.getValue();
if (boost == null || boost.floatValue() <= 0.0f) continue;
if (configuration != null && !configuration.contains(field.getSolrFieldName())) continue;
if (field.getType() == SolrType.num_integer) continue;
if (wc > 0) q.append(" OR ");
q.append('(');
q.append(field.getSolrFieldName()).append(':').append(w);
boost = Boost.RANKING.get(field);
if (boost != null) q.append('^').append(boost.toString());
q.append(')');
wc++;

View File

@ -42,7 +42,7 @@ import org.apache.solr.client.solrj.SolrQuery.ORDER;
import net.yacy.cora.document.ASCII;
import net.yacy.cora.document.analysis.Classification;
import net.yacy.cora.document.analysis.Classification.ContentDomain;
import net.yacy.cora.federate.solr.Boost;
import net.yacy.cora.federate.solr.Ranking;
import net.yacy.cora.federate.yacy.CacheStrategy;
import net.yacy.cora.geo.GeoLocation;
import net.yacy.cora.lod.vocabulary.Tagging;
@ -396,10 +396,13 @@ public final class QueryParams {
if (this.queryGoal.getIncludeStrings().size() == 0) return null;
// construct query
final SolrQuery params = new SolrQuery();
params.setQuery(this.queryGoal.solrQueryString(this.indexSegment.fulltext().getDefaultConfiguration()).toString());
params.setQuery(this.queryGoal.collectionQueryString(this.indexSegment.fulltext().getDefaultConfiguration()).toString());
params.setParam("defType", "edismax");
params.setParam("bq", Boost.RANKING.getBoostQuery()); // a boost query that moves double content to the back
params.setParam("bf", Boost.RANKING.getBoostFunction()); // a boost function extension
Ranking ranking = indexSegment.fulltext().getDefaultConfiguration().getRanking(0);
String bq = ranking.getBoostQuery();
String bf = ranking.getBoostFunction();
if (bq.length() > 0) params.setParam("bq", bq); // a boost query that moves double content to the back
if (bf.length() > 0) params.setParam(ranking.getMethod() == Ranking.BoostFunctionMode.add ? "bf" : "boost", bf); // a boost function extension, see http://wiki.apache.org/solr/ExtendedDisMax#bf_.28Boost_Function.2C_additive.29
params.setStart(this.offset);
params.setRows(this.itemsPerPage);
params.setFacet(false);

View File

@ -156,6 +156,7 @@ public final class SearchEvent {
private final WeakPriorityBlockingQueue<WordReferenceVars> rwiStack; // thats the bag where the RWI search process writes to
private final WeakPriorityBlockingQueue<URIMetadataNode> nodeStack; // thats the bag where the solr results are written to
private final WeakPriorityBlockingQueue<ResultEntry> resultList; // thats the result list where the actual search result is waiting to be displayed
private final boolean pollImmediately; // if this is true, then every entry in result List is polled immediately to prevent a re-ranking in the resultList. This is usefull if there is only one index source.
// the following values are filled during the search process as statistics for the search
public final AtomicInteger local_rwi_available; // the number of hits generated/ranked by the local search in rwi index
@ -274,6 +275,7 @@ public final class SearchEvent {
if (this.remote) {
// start global searches
this.pollImmediately = false;
final long timer = System.currentTimeMillis();
if (this.query.getQueryGoal().getIncludeHashes().isEmpty()) {
this.primarySearchThreadsL = null;
@ -312,6 +314,7 @@ public final class SearchEvent {
}
} else {
this.primarySearchThreadsL = null;
this.pollImmediately = !query.getSegment().connectedRWI();
if ( generateAbstracts ) {
// we need the results now
try {
@ -824,7 +827,8 @@ public final class SearchEvent {
this.urlhashes.putUnique(iEntry.hash());
rankingtryloop: while (true) {
try {
this.nodeStack.put(new ReverseElement<URIMetadataNode>(iEntry, this.order.cardinal(iEntry))); // inserts the element and removes the worst (which is smallest)
long score = iEntry.ranking();
this.nodeStack.put(new ReverseElement<URIMetadataNode>(iEntry, score == 0 ? this.order.cardinal(iEntry) : score)); // inserts the element and removes the worst (which is smallest)
break rankingtryloop;
} catch ( final ArithmeticException e ) {
// this may happen if the concurrent normalizer changes values during cardinal computation
@ -1168,10 +1172,10 @@ public final class SearchEvent {
*/
public void addResult(ResultEntry resultEntry) {
if (resultEntry == null) return;
long ranking = resultEntry.word() == null ? 0 : Long.valueOf(this.order.cardinal(resultEntry.word()));
long ranking = resultEntry.ranking();
ranking += postRanking(resultEntry, new ConcurrentScoreMap<String>() /*this.snippetProcess.rankingProcess.getTopicNavigator(10)*/);
resultEntry.ranking = ranking;
this.resultList.put(new ReverseElement<ResultEntry>(resultEntry, ranking)); // remove smallest in case of overflow
if (pollImmediately) this.resultList.poll(); // prevent re-ranking in case there is only a single index source which has already ranked entries.
this.addTopics(resultEntry);
}

View File

@ -31,7 +31,6 @@ import java.util.Map;
import java.util.SortedMap;
import java.util.concurrent.ConcurrentHashMap;
import net.yacy.cora.federate.solr.Boost;
import net.yacy.data.WorkTables;
import net.yacy.kelondro.logging.Log;
import net.yacy.kelondro.util.MemoryControl;
@ -142,7 +141,6 @@ public class SearchEventCache {
// start a new event
Switchboard sb = Switchboard.getSwitchboard();
final boolean delete = sb == null || Switchboard.getSwitchboard().getConfigBool(SwitchboardConstants.SEARCH_VERIFY_DELETE, true);
if (sb != null) Boost.RANKING.updateBoosts(sb.getConfig(SwitchboardConstants.SEARCH_RANKING_SOLR_BOOST, "")); // update the boost values
event = new SearchEvent(query, peers, workTables, preselectedPeerHashes, generateAbstracts, loader, remote_maxcount, remote_maxtime, burstRobinsonPercent, burstMultiwordPercent, delete);
MemoryControl.request(100 * 1024 * 1024, false); // this may trigger a short memory status which causes a reducing of cache space of other threads
}

View File

@ -90,18 +90,18 @@ public class RankingProfile {
public RankingProfile(final Classification.ContentDomain mediatype) {
// set default-values
this.coeff_appemph = 5;
this.coeff_appurl = 11;
this.coeff_appurl = 12;
this.coeff_app_dc_creator = 1;
this.coeff_app_dc_description = 8;
this.coeff_app_dc_description = 10;
this.coeff_app_dc_subject = 2;
this.coeff_app_dc_title = 12;
this.coeff_app_dc_title = 14;
this.coeff_authority = 5;
this.coeff_cathasapp = (mediatype == ContentDomain.APP) ? 15 : 0;
this.coeff_cathasaudio = (mediatype == ContentDomain.AUDIO) ? 15 : 0;
this.coeff_cathasimage = (mediatype == ContentDomain.IMAGE) ? 15 : 0;
this.coeff_cathasvideo = (mediatype == ContentDomain.VIDEO) ? 15 : 0;
this.coeff_catindexof = (mediatype == ContentDomain.TEXT) ? 0 : 15;
this.coeff_date = 7;
this.coeff_date = 9;
this.coeff_domlength = 10;
this.coeff_hitcount = 1;
this.coeff_language = 2;
@ -111,18 +111,18 @@ public class RankingProfile {
this.coeff_posinphrase = 0;
this.coeff_posintext = 4;
this.coeff_posofphrase = 0;
this.coeff_termfrequency = 14;
this.coeff_termfrequency = 8;
this.coeff_urlcomps = 7;
this.coeff_urllength = 6;
this.coeff_worddistance = 10;
this.coeff_wordsintext = 3;
this.coeff_wordsintitle = 2;
this.coeff_ybr = 8;
this.coeff_ybr = 0;
this.coeff_urlcompintoplist = 2;
this.coeff_descrcompintoplist = 2;
this.coeff_prefer = 0;
this.coeff_citation = 15;
this.coeff_citation = 10;
}
public RankingProfile(final String prefix, String profile) {

View File

@ -43,6 +43,7 @@ import java.util.Set;
import net.yacy.cora.document.ASCII;
import net.yacy.cora.document.MultiProtocolURI;
import net.yacy.cora.document.UTF8;
import net.yacy.cora.federate.solr.Ranking;
import net.yacy.cora.federate.solr.SchemaConfiguration;
import net.yacy.cora.federate.solr.FailType;
import net.yacy.cora.federate.solr.ProcessType;
@ -77,14 +78,7 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
private static final long serialVersionUID=-499100932212840385L;
/**
* initialize with an empty ConfigurationSet which will cause that all the index
* attributes are used
*/
public CollectionConfiguration() {
super();
}
private final ArrayList<Ranking> rankings;
/**
* initialize the schema with a given configuration file
@ -96,6 +90,8 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
public CollectionConfiguration(final File configurationFile, boolean lazy) throws IOException {
super(configurationFile);
super.lazy = lazy;
this.rankings = new ArrayList<Ranking>(4);
for (int i = 0; i <= 3; i++) rankings.add(new Ranking());
// check consistency: compare with YaCyField enum
if (this.isEmpty()) return;
Iterator<Entry> it = this.entryIterator();
@ -118,6 +114,19 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
}
}
}
/**
 * Get a ranking by its position in the ranking list.
 * @param idx the position in the list; the list get method throws an
 *            IndexOutOfBoundsException for positions outside of the list
 * @return the ranking stored at the given position
 */
public Ranking getRanking(int idx) {
    final Ranking selected = this.rankings.get(idx);
    return selected;
}
/**
 * Get a ranking by its name.
 * @param name the name to search for, compared with the name of each ranking
 * @return the first ranking in list order whose name equals the given name,
 *         or null if the name is null or no ranking has that name
 */
public Ranking getRanking(String name) {
    if (name == null) return null;
    for (final Ranking r : this.rankings) {
        // compare against the ranking's name; comparing the String with the Ranking object itself never matches
        if (name.equals(r.getName())) return r;
    }
    return null;
}
/**
* save configuration to file and update enum SolrFields
@ -335,7 +344,7 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
String docurl = digestURI.toNormalform(true);
add(doc, CollectionSchema.sku, docurl);
int clickdepth = -1;
int clickdepth = 999;
if ((allAttr || contains(CollectionSchema.clickdepth_i)) && citations != null) {
if (digestURI.probablyRootURL()) {
boolean lc = this.lazy; this.lazy = false;
@ -344,9 +353,9 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
} else {
// search the citations for references
//try {
clickdepth = -1; //getClickDepth(citations, digestURI);
clickdepth = 999; //getClickDepth(citations, digestURI);
//} catch (IOException e) {
// add(doc, CollectionSchema.clickdepth_i, -1);
// add(doc, CollectionSchema.clickdepth_i, 999);
//}
if (clickdepth < 0 || clickdepth > 1) {
processTypes.add(ProcessType.CLICKDEPTH); // postprocessing needed; this is also needed if the depth is positive; there could be a shortcut
@ -616,7 +625,7 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
}
// Frames
if (allAttr || contains(CollectionSchema.frames_txt)) {
if (allAttr || contains(CollectionSchema.frames_sxt)) {
final Set<DigestURI> framess = html.getFrames();
final String[] frames = new String[framess.size()];
c = 0;
@ -626,11 +635,11 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
frames[c++] = u.toNormalform(false);
}
add(doc, CollectionSchema.framesscount_i, frames.length);
if (frames.length > 0) add(doc, CollectionSchema.frames_txt, frames);
if (frames.length > 0) add(doc, CollectionSchema.frames_sxt, frames);
}
// IFrames
if (allAttr || contains(CollectionSchema.iframes_txt)) {
if (allAttr || contains(CollectionSchema.iframes_sxt)) {
final Set<DigestURI> iframess = html.getIFrames();
final String[] iframes = new String[iframess.size()];
c = 0;
@ -640,16 +649,16 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
iframes[c++] = u.toNormalform(false);
}
add(doc, CollectionSchema.iframesscount_i, iframes.length);
if (iframes.length > 0) add(doc, CollectionSchema.iframes_txt, iframes);
if (iframes.length > 0) add(doc, CollectionSchema.iframes_sxt, iframes);
}
// canonical tag
if (allAttr || contains(CollectionSchema.canonical_t)) {
if (allAttr || contains(CollectionSchema.canonical_s)) {
final DigestURI canonical = html.getCanonical();
if (canonical != null) {
inboundLinks.remove(canonical);
outboundLinks.remove(canonical);
add(doc, CollectionSchema.canonical_t, canonical.toNormalform(false));
add(doc, CollectionSchema.canonical_s, canonical.toNormalform(false));
// set a flag if this is equal to sku
if (contains(CollectionSchema.canonical_equal_sku_b) && canonical.equals(docurl)) {
add(doc, CollectionSchema.canonical_equal_sku_b, true);
@ -765,7 +774,7 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
* compute the click level using the citation reference database
* @param citations the citation database
* @param searchhash the hash of the url to be checked
* @return the clickdepth level or -1 if the root url cannot be found or a recursion limit is reached
* @return the clickdepth level or 999 if the root url cannot be found or a recursion limit is reached
* @throws IOException
*/
public static int getClickDepth(final IndexCell<CitationReference> citations, final DigestURI url) throws IOException {
@ -817,7 +826,7 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
levelhashes = checknext;
}
return -1;
return 999;
}
/**

View File

@ -31,72 +31,72 @@ import org.apache.solr.common.SolrInputDocument;
public enum CollectionSchema implements SchemaDeclaration {
// mandatory
id(SolrType.string, true, true, false, "primary key of document, the URL hash **mandatory field**"),
sku(SolrType.text_en_splitting_tight, true, true, false, true, "url of document"), // a 'sku' is a stock-keeping unit, a unique identifier and a default field in unmodified solr.
last_modified(SolrType.date, true, true, false, "last-modified from http header"),
content_type(SolrType.string, true, true, true, "mime-type of document"),
title(SolrType.text_general, true, true, true, "content of title tag"),
title_unique_b(SolrType.bool, true, true, false, "flag shows if title is unique in the whole index; if yes and another document appears with same title, the unique-flag is set to false"),
host_id_s(SolrType.string, true, true, false, "id of the host, a 6-byte hash that is part of the document id"),// String hosthash();
md5_s(SolrType.string, true, true, false, "the md5 of the raw source"),// String md5();
exact_signature_l(SolrType.num_long, true, true, false, "the 64 bit hash of the org.apache.solr.update.processor.Lookup3Signature of text_t"),
exact_signature_unique_b(SolrType.bool, true, true, false, "flag shows if exact_signature_l is unique at the time of document creation, used for double-check during search"),
fuzzy_signature_l(SolrType.num_long, true, true, false, "64 bit of the Lookup3Signature from EnhancedTextProfileSignature of text_t"),
fuzzy_signature_text_t(SolrType.text_general, true, true, false, "intermediate data produced in EnhancedTextProfileSignature: a list of word frequencies"),
fuzzy_signature_unique_b(SolrType.bool, true, true, false, "flag shows if fuzzy_signature_l is unique at the time of document creation, used for double-check during search"),
size_i(SolrType.num_integer, true, true, false, "the size of the raw source"),// int size();
failreason_t(SolrType.text_general, true, true, false, "fail reason if a page was not loaded. if the page was loaded then this field is empty"),
failtype_s(SolrType.string, true, true, false, "fail type if a page was not loaded. This field is either empty, 'excl' or 'fail'"),
httpstatus_i(SolrType.num_integer, true, true, false, "html status return code (i.e. \"200\" for ok), -1 if not loaded"),
httpstatus_redirect_s(SolrType.num_integer, true, true, false, "html status return code (i.e. \"200\" for ok), -1 if not loaded"),
references_i(SolrType.num_integer, true, true, false, "number of unique http references; used for ranking"),
clickdepth_i(SolrType.num_integer, true, true, false, "depth of web page according to number of clicks from the 'main' page, which is the page that appears if only the host is entered as url"),
process_sxt(SolrType.string, true, true, true, "needed (post-)processing steps on this metadata set"),
id(SolrType.string, true, true, false, false, false, "primary key of document, the URL hash **mandatory field**"),
sku(SolrType.text_en_splitting_tight, true, true, false, true, true, "url of document"), // a 'sku' is a stock-keeping unit, a unique identifier and a default field in unmodified solr.
last_modified(SolrType.date, true, true, false, false, false, "last-modified from http header"),
content_type(SolrType.string, true, true, true, false, false, "mime-type of document"),
title(SolrType.text_general, true, true, true, false, true, "content of title tag"),
title_unique_b(SolrType.bool, true, true, false, false, false, "flag shows if title is unique in the whole index; if yes and another document appears with same title, the unique-flag is set to false"),
host_id_s(SolrType.string, true, true, false, false, false, "id of the host, a 6-byte hash that is part of the document id"),// String hosthash();
md5_s(SolrType.string, true, true, false, false, false, "the md5 of the raw source"),// String md5();
exact_signature_l(SolrType.num_long, true, true, false, false, false, "the 64 bit hash of the org.apache.solr.update.processor.Lookup3Signature of text_t"),
exact_signature_unique_b(SolrType.bool, true, true, false, false, false, "flag shows if exact_signature_l is unique at the time of document creation, used for double-check during search"),
fuzzy_signature_l(SolrType.num_long, true, true, false, false, false, "64 bit of the Lookup3Signature from EnhancedTextProfileSignature of text_t"),
fuzzy_signature_text_t(SolrType.text_general, true, true, false, false, false, "intermediate data produced in EnhancedTextProfileSignature: a list of word frequencies"),
fuzzy_signature_unique_b(SolrType.bool, true, true, false, false, false, "flag shows if fuzzy_signature_l is unique at the time of document creation, used for double-check during search"),
size_i(SolrType.num_integer, true, true, false, false, false, "the size of the raw source"),// int size();
failreason_t(SolrType.text_general, true, true, false, false, false, "fail reason if a page was not loaded. if the page was loaded then this field is empty"),
failtype_s(SolrType.string, true, true, false, false, false, "fail type if a page was not loaded. This field is either empty, 'excl' or 'fail'"),
httpstatus_i(SolrType.num_integer, true, true, false, false, false, "html status return code (i.e. \"200\" for ok), -1 if not loaded"),
httpstatus_redirect_s(SolrType.num_integer, true, true, false, false, false, "html status return code (i.e. \"200\" for ok), -1 if not loaded"),
references_i(SolrType.num_integer, true, true, false, false, false, "number of unique http references; used for ranking"),
clickdepth_i(SolrType.num_integer, true, true, false, false, false, "depth of web page according to number of clicks from the 'main' page, which is the page that appears if only the host is entered as url"),
process_sxt(SolrType.string, true, true, true, false, false, "needed (post-)processing steps on this metadata set"),
// optional but recommended, part of index distribution
load_date_dt(SolrType.date, true, true, false, "time when resource was loaded"),
fresh_date_dt(SolrType.date, true, true, false, "date until resource shall be considered as fresh"),
referrer_id_txt(SolrType.string, true, true, true, "ids of referrer to this document"),// byte[] referrerHash();
publisher_t(SolrType.text_general, true, true, false, "the name of the publisher of the document"),// String dc_publisher();
language_s(SolrType.string, true, true, false, "the language used in the document"),// byte[] language();
audiolinkscount_i(SolrType.num_integer, true, true, false, "number of links to audio resources"),// int laudio();
videolinkscount_i(SolrType.num_integer, true, true, false, "number of links to video resources"),// int lvideo();
applinkscount_i(SolrType.num_integer, true, true, false, "number of links to application resources"),// int lapp();
load_date_dt(SolrType.date, true, true, false, false, false, "time when resource was loaded"),
fresh_date_dt(SolrType.date, true, true, false, false, false, "date until resource shall be considered as fresh"),
referrer_id_txt(SolrType.string, true, true, true, false, false, "ids of referrer to this document"),// byte[] referrerHash();
publisher_t(SolrType.text_general, true, true, false, false, true, "the name of the publisher of the document"),// String dc_publisher();
language_s(SolrType.string, true, true, false, false, false, "the language used in the document"),// byte[] language();
audiolinkscount_i(SolrType.num_integer, true, true, false, false, false, "number of links to audio resources"),// int laudio();
videolinkscount_i(SolrType.num_integer, true, true, false, false, false, "number of links to video resources"),// int lvideo();
applinkscount_i(SolrType.num_integer, true, true, false, false, false, "number of links to application resources"),// int lapp();
// optional but recommended
coordinate_p(SolrType.location, true, true, false, "point in degrees of latitude,longitude as declared in WSG84"),
coordinate_p_0_coordinate(SolrType.coordinate, true, true, false, "automatically created subfield, (latitude)"),
coordinate_p_1_coordinate(SolrType.coordinate, true, true, false, "automatically created subfield, (longitude)"),
ip_s(SolrType.string, true, true, false, "ip of host of url (after DNS lookup)"),
author(SolrType.text_general, true, true, false, "content of author-tag"),
author_sxt(SolrType.string, true, true, true, "content of author-tag as copy-field from author. This is used for facet generation"),
description(SolrType.text_general, true, true, false, "content of description-tag"),
description_unique_b(SolrType.bool, true, true, false, "flag shows if description is unique in the whole index; if yes and another document appears with same description, the unique-flag is set to false"),
keywords(SolrType.text_general, true, true, false, "content of keywords tag; words are separated by space"),
charset_s(SolrType.string, true, true, false, "character encoding"),
wordcount_i(SolrType.num_integer, true, true, false, "number of words in visible area"),
inboundlinkscount_i(SolrType.num_integer, true, true, false, "total number of inbound links"),
inboundlinksnofollowcount_i(SolrType.num_integer, true, true, false, "number of inbound links with nofollow tag"),
outboundlinkscount_i(SolrType.num_integer, true, true, false, "external number of inbound links"),
outboundlinksnofollowcount_i(SolrType.num_integer, true, true, false, "number of external links with nofollow tag"),
imagescount_i(SolrType.num_integer, true, true, false, "number of images"),
responsetime_i(SolrType.num_integer, true, true, false, "response time of target server in milliseconds"),
text_t(SolrType.text_general, true, true, false, "all visible text"),
synonyms_sxt(SolrType.string, true, true, true, "additional synonyms to the words in the text"),
h1_txt(SolrType.text_general, true, true, true, "h1 header"),
h2_txt(SolrType.text_general, true, true, true, "h2 header"),
h3_txt(SolrType.text_general, true, true, true, "h3 header"),
h4_txt(SolrType.text_general, true, true, true, "h4 header"),
h5_txt(SolrType.text_general, true, true, true, "h5 header"),
h6_txt(SolrType.text_general, true, true, true, "h6 header"),
coordinate_p(SolrType.location, true, true, false, false, false, "point in degrees of latitude,longitude as declared in WSG84"),
coordinate_p_0_coordinate(SolrType.coordinate, true, true, false, false, false, "automatically created subfield, (latitude)"),
coordinate_p_1_coordinate(SolrType.coordinate, true, true, false, false, false, "automatically created subfield, (longitude)"),
ip_s(SolrType.string, true, true, false, false, false, "ip of host of url (after DNS lookup)"),
author(SolrType.text_general, true, true, false, false, true, "content of author-tag"),
author_sxt(SolrType.string, true, true, true, false, false, "content of author-tag as copy-field from author. This is used for facet generation"),
description(SolrType.text_general, true, true, false, false, true, "content of description-tag"),
description_unique_b(SolrType.bool, true, true, false, false, false, "flag shows if description is unique in the whole index; if yes and another document appears with same description, the unique-flag is set to false"),
keywords(SolrType.text_general, true, true, false, false, true, "content of keywords tag; words are separated by space"),
charset_s(SolrType.string, true, true, false, false, false, "character encoding"),
wordcount_i(SolrType.num_integer, true, true, false, false, false, "number of words in visible area"),
inboundlinkscount_i(SolrType.num_integer, true, true, false, false, false, "total number of inbound links"),
inboundlinksnofollowcount_i(SolrType.num_integer, true, true, false, false, false, "number of inbound links with nofollow tag"),
outboundlinkscount_i(SolrType.num_integer, true, true, false, false, false, "external number of inbound links"),
outboundlinksnofollowcount_i(SolrType.num_integer, true, true, false, false, false, "number of external links with nofollow tag"),
imagescount_i(SolrType.num_integer, true, true, false, false, false, "number of images"),
responsetime_i(SolrType.num_integer, true, true, false, false, false, "response time of target server in milliseconds"),
text_t(SolrType.text_general, true, true, false, false, true, "all visible text"),
synonyms_sxt(SolrType.string, true, true, true, false, true, "additional synonyms to the words in the text"),
h1_txt(SolrType.text_general, true, true, true, false, true, "h1 header"),
h2_txt(SolrType.text_general, true, true, true, false, true, "h2 header"),
h3_txt(SolrType.text_general, true, true, true, false, true, "h3 header"),
h4_txt(SolrType.text_general, true, true, true, false, true, "h4 header"),
h5_txt(SolrType.text_general, true, true, true, false, true, "h5 header"),
h6_txt(SolrType.text_general, true, true, true, false, true, "h6 header"),
// optional values, not part of standard YaCy handling (but useful for external applications)
collection_sxt(SolrType.string, true, true, true, "tags that are attached to crawls/index generation to separate the search result into user-defined subsets"),
csscount_i(SolrType.num_integer, true, true, false, "number of entries in css_tag_txt and css_url_txt"),
css_tag_txt(SolrType.text_general, true, true, true, "full css tag with normalized url"),
css_url_txt(SolrType.text_general, true, true, true, "normalized urls within a css tag"),
scripts_txt(SolrType.text_general, true, true, true, "normalized urls within a scripts tag"),
scriptscount_i(SolrType.num_integer, true, true, false, "number of entries in scripts_txt"),
collection_sxt(SolrType.string, true, true, true, false, false, "tags that are attached to crawls/index generation to separate the search result into user-defined subsets"),
csscount_i(SolrType.num_integer, true, true, false, false, false, "number of entries in css_tag_txt and css_url_txt"),
css_tag_txt(SolrType.text_general, true, true, true, false, false, "full css tag with normalized url"),
css_url_txt(SolrType.text_general, true, true, true, false, false, "normalized urls within a css tag"),
scripts_txt(SolrType.text_general, true, true, true, false, false, "normalized urls within a scripts tag"),
scriptscount_i(SolrType.num_integer, true, true, false, false, false, "number of entries in scripts_txt"),
// encoded as binary value into an integer:
// bit 0: "all" contained in html header meta
// bit 1: "index" contained in html header meta
@ -107,104 +107,89 @@ public enum CollectionSchema implements SchemaDeclaration {
// bit 10: "noindex" contained in http header properties
// bit 11: "nofollow" contained in http header properties
// bit 12: "unavailable_after" contained in http header properties
robots_i(SolrType.num_integer, true, true, false, "content of <meta name=\"robots\" content=#content#> tag and the \"X-Robots-Tag\" HTTP property"),
metagenerator_t(SolrType.text_general, true, true, false, "content of <meta name=\"generator\" content=#content#> tag"),
inboundlinks_protocol_sxt(SolrType.string, true, true, true, "internal links, only the protocol"),
inboundlinks_urlstub_txt(SolrType.text_general, true, true, true, "internal links, the url only without the protocol"),
inboundlinks_tag_txt(SolrType.text_general, true, true, true, "internal links, normalized (absolute URLs), as <a> - tag with anchor text and nofollow"),
outboundlinks_protocol_sxt(SolrType.string, true, true, true, "external links, only the protocol"),
outboundlinks_urlstub_txt(SolrType.text_general, true, true, true, "external links, the url only without the protocol"),
outboundlinks_tag_txt(SolrType.text_general, true, true, true, "external links, normalized (absolute URLs), as <a> - tag with anchor text and nofollow"),
/*
inboundlinks_name_txt(SolrType.text_general, true, true, true, "internal links, the name property of the a-tag"),
inboundlinks_rel_sxt(SolrType.string, true, true, true, "internal links, the rel property of the a-tag"),
inboundlinks_relflags_val(SolrType.num_integer, true, true, true, "internal links, the rel property of the a-tag, coded binary"),
inboundlinks_text_txt(SolrType.text_general, true, true, true, "internal links, the text content of the a-tag"),
inboundlinks_text_chars_val(SolrType.num_integer, true, true, true, "internal links, the length of the a-tag as number of characters"),
inboundlinks_text_words_val(SolrType.num_integer, true, true, true, "internal links, the length of the a-tag as number of words"),
inboundlinks_alttag_txt(SolrType.text_general, true, true, true, "if the link is an image link, this contains the alt tag if the image is also liked as img link"),
outboundlinks_name_txt(SolrType.text_general, true, true, true, "external links, the name property of the a-tag"),
outboundlinks_rel_sxt(SolrType.string, true, true, true, "external links, the rel property of the a-tag"),
outboundlinks_relflags_val(SolrType.num_integer, true, true, true, "external links, the rel property of the a-tag, coded binary"),
outboundlinks_text_txt(SolrType.text_general, true, true, true, "external links, the text content of the a-tag"),
outboundlinks_text_chars_val(SolrType.num_integer, true, true, true, "external links, the length of the a-tag as number of characters"),
outboundlinks_text_words_val(SolrType.num_integer, true, true, true, "external links, the length of the a-tag as number of words"),
outboundlinks_alttag_txt(SolrType.text_general, true, true, true, "if the link is an image link, this contains the alt tag if the image is also liked as img link"),
*/
images_tag_txt(SolrType.text_general, true, true, true, " all image tags, encoded as <img> tag inclusive alt- and title property"),
images_urlstub_txt(SolrType.text_general, true, true, true, "all image links without the protocol and '://'"),
images_protocol_sxt(SolrType.text_general, true, true, true, "all image link protocols"),
images_alt_txt(SolrType.text_general, true, true, true, "all image link alt tag"),
images_withalt_i(SolrType.num_integer, true, true, false, "number of image links with alt tag"),
htags_i(SolrType.num_integer, true, true, false, "binary pattern for the existance of h1..h6 headlines"),
canonical_t(SolrType.text_general, true, true, false, "url inside the canonical link element"),
canonical_equal_sku_b(SolrType.bool, true, true, false, "flag shows if the url in canonical_t is equal to sku"),
refresh_s(SolrType.string, true, true, false, "link from the url property inside the refresh link element"),
li_txt(SolrType.text_general, true, true, true, "all texts in <li> tags"),
licount_i(SolrType.num_integer, true, true, false, "number of <li> tags"),
bold_txt(SolrType.text_general, true, true, true, "all texts inside of <b> or <strong> tags. no doubles. listed in the order of number of occurrences in decreasing order"),
boldcount_i(SolrType.num_integer, true, true, false, "total number of occurrences of <b> or <strong>"),
italic_txt(SolrType.text_general, true, true, true, "all texts inside of <i> tags. no doubles. listed in the order of number of occurrences in decreasing order"),
italiccount_i(SolrType.num_integer, true, true, false, "total number of occurrences of <i>"),
underline_txt(SolrType.text_general, true, true, true, "all texts inside of <u> tags. no doubles. listed in the order of number of occurrences in decreasing order"),
underlinecount_i(SolrType.num_integer, true, true, false, "total number of occurrences of <u>"),
flash_b(SolrType.bool, true, true, false, "flag that shows if a swf file is linked"),
frames_txt(SolrType.text_general, true, true, true, "list of all links to frames"),
framesscount_i(SolrType.num_integer, true, true, false, "number of frames_txt"),
iframes_txt(SolrType.text_general, true, true, true, "list of all links to iframes"),
iframesscount_i(SolrType.num_integer, true, true, false, "number of iframes_txt"),
robots_i(SolrType.num_integer, true, true, false, false, false, "content of <meta name=\"robots\" content=#content#> tag and the \"X-Robots-Tag\" HTTP property"),
metagenerator_t(SolrType.text_general, true, true, false, false, false, "content of <meta name=\"generator\" content=#content#> tag"),
inboundlinks_protocol_sxt(SolrType.string, true, true, true, false, false, "internal links, only the protocol"),
inboundlinks_urlstub_txt(SolrType.text_general, true, true, true, false, false, "internal links, the url only without the protocol"),
inboundlinks_tag_txt(SolrType.text_general, true, true, true, false, false, "internal links, normalized (absolute URLs), as <a> - tag with anchor text and nofollow"),
outboundlinks_protocol_sxt(SolrType.string, true, true, true, false, false, "external links, only the protocol"),
outboundlinks_urlstub_txt(SolrType.text_general, true, true, true, false, false, "external links, the url only without the protocol"),
outboundlinks_tag_txt(SolrType.text_general, true, true, true, false, false, "external links, normalized (absolute URLs), as <a> - tag with anchor text and nofollow"),
images_tag_txt(SolrType.text_general, true, true, true, false, true, " all image tags, encoded as <img> tag inclusive alt- and title property"),
images_urlstub_txt(SolrType.text_general, true, true, true, false, true, "all image links without the protocol and '://'"),
images_protocol_sxt(SolrType.text_general, true, true, true, false, false, "all image link protocols"),
images_alt_txt(SolrType.text_general, true, true, true, false, true, "all image link alt tag"),
images_withalt_i(SolrType.num_integer, true, true, false, false, false, "number of image links with alt tag"),
htags_i(SolrType.num_integer, true, true, false, false, false, "binary pattern for the existance of h1..h6 headlines"),
canonical_s(SolrType.string, true, true, false, false, false, "url inside the canonical link element"),
canonical_equal_sku_b(SolrType.bool, true, true, false, false, false, "flag shows if the url in canonical_t is equal to sku"),
refresh_s(SolrType.string, true, true, false, false, false, "link from the url property inside the refresh link element"),
li_txt(SolrType.text_general, true, true, true, false, true, "all texts in <li> tags"),
licount_i(SolrType.num_integer, true, true, false, false, false, "number of <li> tags"),
bold_txt(SolrType.text_general, true, true, true, false, true, "all texts inside of <b> or <strong> tags. no doubles. listed in the order of number of occurrences in decreasing order"),
boldcount_i(SolrType.num_integer, true, true, false, false, false, "total number of occurrences of <b> or <strong>"),
italic_txt(SolrType.text_general, true, true, true, false, true, "all texts inside of <i> tags. no doubles. listed in the order of number of occurrences in decreasing order"),
italiccount_i(SolrType.num_integer, true, true, false, false, false, "total number of occurrences of <i>"),
underline_txt(SolrType.text_general, true, true, true, false, true, "all texts inside of <u> tags. no doubles. listed in the order of number of occurrences in decreasing order"),
underlinecount_i(SolrType.num_integer, true, true, false, false, false, "total number of occurrences of <u>"),
flash_b(SolrType.bool, true, true, false, false, false, "flag that shows if a swf file is linked"),
frames_sxt(SolrType.string, true, true, true, false, false, "list of all links to frames"),
framesscount_i(SolrType.num_integer, true, true, false, false, false, "number of frames_txt"),
iframes_sxt(SolrType.string, true, true, true, false, false, "list of all links to iframes"),
iframesscount_i(SolrType.num_integer, true, true, false, false, false, "number of iframes_txt"),
url_protocol_s(SolrType.string, true, true, false, "the protocol of the url"),
url_paths_sxt(SolrType.string, true, true, true, "all path elements in the url"),
url_file_ext_s(SolrType.string, true, true, false, "the file name extension"),
url_parameter_i(SolrType.num_integer, true, true, false, "number of key-value pairs in search part of the url"),
url_parameter_key_sxt(SolrType.string, true, true, true, "the keys from key-value pairs in the search part of the url"),
url_parameter_value_sxt(SolrType.string, true, true, true, "the values from key-value pairs in the search part of the url"),
url_chars_i(SolrType.num_integer, true, true, false, "number of all characters in the url == length of sku field"),
url_protocol_s(SolrType.string, true, true, false, false, false, "the protocol of the url"),
url_paths_sxt(SolrType.string, true, true, true, false, true, "all path elements in the url"),
url_file_ext_s(SolrType.string, true, true, false, false, false, "the file name extension"),
url_parameter_i(SolrType.num_integer, true, true, false, false, false, "number of key-value pairs in search part of the url"),
url_parameter_key_sxt(SolrType.string, true, true, true, false, false, "the keys from key-value pairs in the search part of the url"),
url_parameter_value_sxt(SolrType.string, true, true, true, false, false, "the values from key-value pairs in the search part of the url"),
url_chars_i(SolrType.num_integer, true, true, false, false, false, "number of all characters in the url == length of sku field"),
host_s(SolrType.string, true, true, false, "host of the url"),
host_dnc_s(SolrType.string, true, true, false, "the Domain Class Name, either the TLD or a combination of ccSLD+TLD if a ccSLD is used."),
host_organization_s(SolrType.string, true, true, false, "either the second level domain or, if a ccSLD is used, the third level domain"),
host_organizationdnc_s(SolrType.string, true, true, false, "the organization and dnc concatenated with '.'"),
host_subdomain_s(SolrType.string, true, true, false, "the remaining part of the host without organizationdnc"),
host_s(SolrType.string, true, true, false, false, true, "host of the url"),
host_dnc_s(SolrType.string, true, true, false, false, true, "the Domain Class Name, either the TLD or a combination of ccSLD+TLD if a ccSLD is used."),
host_organization_s(SolrType.string, true, true, false, false, true, "either the second level domain or, if a ccSLD is used, the third level domain"),
host_organizationdnc_s(SolrType.string, true, true, false, false, true, "the organization and dnc concatenated with '.'"),
host_subdomain_s(SolrType.string, true, true, false, false, true, "the remaining part of the host without organizationdnc"),
title_count_i(SolrType.num_integer, true, true, false, "number of titles (counting the 'title' field) in the document"),
title_chars_val(SolrType.num_integer, true, true, true, "number of characters for each title"),
title_words_val(SolrType.num_integer, true, true, true, "number of words in each title"),
title_count_i(SolrType.num_integer, true, true, false, false, false, "number of titles (counting the 'title' field) in the document"),
title_chars_val(SolrType.num_integer, true, true, true, false, false, "number of characters for each title"),
title_words_val(SolrType.num_integer, true, true, true, false, false, "number of words in each title"),
description_count_i(SolrType.num_integer, true, true, false, "number of descriptions in the document. Its not counting the 'description' field since there is only one. But it counts the number of descriptions that appear in the document (if any)"),
description_chars_val(SolrType.num_integer, true, true, true, "number of characters for each description"),
description_words_val(SolrType.num_integer, true, true, true, "number of words in each description"),
description_count_i(SolrType.num_integer, true, true, false, false, false, "number of descriptions in the document. Its not counting the 'description' field since there is only one. But it counts the number of descriptions that appear in the document (if any)"),
description_chars_val(SolrType.num_integer, true, true, true, false, false, "number of characters for each description"),
description_words_val(SolrType.num_integer, true, true, true, false, false, "number of words in each description"),
h1_i(SolrType.num_integer, true, true, false, "number of h1 header lines"),
h2_i(SolrType.num_integer, true, true, false, "number of h2 header lines"),
h3_i(SolrType.num_integer, true, true, false, "number of h3 header lines"),
h4_i(SolrType.num_integer, true, true, false, "number of h4 header lines"),
h5_i(SolrType.num_integer, true, true, false, "number of h5 header lines"),
h6_i(SolrType.num_integer, true, true, false, "number of h6 header lines"),
h1_i(SolrType.num_integer, true, true, false, false, false, "number of h1 header lines"),
h2_i(SolrType.num_integer, true, true, false, false, false, "number of h2 header lines"),
h3_i(SolrType.num_integer, true, true, false, false, false, "number of h3 header lines"),
h4_i(SolrType.num_integer, true, true, false, false, false, "number of h4 header lines"),
h5_i(SolrType.num_integer, true, true, false, false, false, "number of h5 header lines"),
h6_i(SolrType.num_integer, true, true, false, false, false, "number of h6 header lines"),
schema_org_breadcrumb_i(SolrType.num_integer, true, true, false, "number of itemprop=\"breadcrumb\" appearances in div tags"),
opengraph_title_t(SolrType.text_general, true, true, false, "Open Graph Metadata from og:title metadata field, see http://ogp.me/ns#"),
opengraph_type_s(SolrType.text_general, true, true, false, "Open Graph Metadata from og:type metadata field, see http://ogp.me/ns#"),
opengraph_url_s(SolrType.text_general, true, true, false, "Open Graph Metadata from og:url metadata field, see http://ogp.me/ns#"),
opengraph_image_s(SolrType.text_general, true, true, false, "Open Graph Metadata from og:image metadata field, see http://ogp.me/ns#"),
schema_org_breadcrumb_i(SolrType.num_integer, true, true, false, false, false, "number of itemprop=\"breadcrumb\" appearances in div tags"),
opengraph_title_t(SolrType.text_general, true, true, false, false, true, "Open Graph Metadata from og:title metadata field, see http://ogp.me/ns#"),
opengraph_type_s(SolrType.text_general, true, true, false, false, false, "Open Graph Metadata from og:type metadata field, see http://ogp.me/ns#"),
opengraph_url_s(SolrType.text_general, true, true, false, false, false, "Open Graph Metadata from og:url metadata field, see http://ogp.me/ns#"),
opengraph_image_s(SolrType.text_general, true, true, false, false, false, "Open Graph Metadata from og:image metadata field, see http://ogp.me/ns#"),
// special values; can only be used if '_val' type is defined in schema file; this is not standard
bold_val(SolrType.num_integer, true, true, true, "number of occurrences of texts in bold_txt"),
italic_val(SolrType.num_integer, true, true, true, "number of occurrences of texts in italic_txt"),
underline_val(SolrType.num_integer, true, true, true, "number of occurrences of texts in underline_txt"),
ext_cms_txt(SolrType.text_general, true, true, true, "names of cms attributes; if several are recognized then they are listen in decreasing order of number of matching criterias"),
ext_cms_val(SolrType.num_integer, true, true, true, "number of attributes that count for a specific cms in ext_cms_txt"),
ext_ads_txt(SolrType.text_general, true, true, true, "names of ad-servers/ad-services"),
ext_ads_val(SolrType.num_integer, true, true, true, "number of attributes counts in ext_ads_txt"),
ext_community_txt(SolrType.text_general, true, true, true, "names of recognized community functions"),
ext_community_val(SolrType.num_integer, true, true, true, "number of attribute counts in attr_community"),
ext_maps_txt(SolrType.text_general, true, true, true, "names of map services"),
ext_maps_val(SolrType.num_integer, true, true, true, "number of attribute counts in ext_maps_txt"),
ext_tracker_txt(SolrType.text_general, true, true, true, "names of tracker server"),
ext_tracker_val(SolrType.num_integer, true, true, true, "number of attribute counts in ext_tracker_txt"),
ext_title_txt(SolrType.text_general, true, true, true, "names matching title expressions"),
ext_title_val(SolrType.num_integer, true, true, true, "number of matching title expressions");
bold_val(SolrType.num_integer, true, true, true, false, false, "number of occurrences of texts in bold_txt"),
italic_val(SolrType.num_integer, true, true, true, false, false, "number of occurrences of texts in italic_txt"),
underline_val(SolrType.num_integer, true, true, true, false, false, "number of occurrences of texts in underline_txt"),
ext_cms_txt(SolrType.text_general, true, true, true, false, false, "names of cms attributes; if several are recognized then they are listen in decreasing order of number of matching criterias"),
ext_cms_val(SolrType.num_integer, true, true, true, false, false, "number of attributes that count for a specific cms in ext_cms_txt"),
ext_ads_txt(SolrType.text_general, true, true, true, false, false, "names of ad-servers/ad-services"),
ext_ads_val(SolrType.num_integer, true, true, true, false, false, "number of attributes counts in ext_ads_txt"),
ext_community_txt(SolrType.text_general, true, true, true, false, false, "names of recognized community functions"),
ext_community_val(SolrType.num_integer, true, true, true, false, false, "number of attribute counts in attr_community"),
ext_maps_txt(SolrType.text_general, true, true, true, false, false, "names of map services"),
ext_maps_val(SolrType.num_integer, true, true, true, false, false, "number of attribute counts in ext_maps_txt"),
ext_tracker_txt(SolrType.text_general, true, true, true, false, false, "names of tracker server"),
ext_tracker_val(SolrType.num_integer, true, true, true, false, false, "number of attribute counts in ext_tracker_txt"),
ext_title_txt(SolrType.text_general, true, true, true, false, false, "names matching title expressions"),
ext_title_val(SolrType.num_integer, true, true, true, false, false, "number of matching title expressions");
public final static String CORE_NAME = "collection1";
@ -213,23 +198,17 @@ public enum CollectionSchema implements SchemaDeclaration {
private String solrFieldName = null; // solr field name in custom solr schema, defaults to solcell schema field name (= same as this.name() )
private final SolrType type;
private final boolean indexed, stored;
private boolean multiValued, omitNorms;
private final boolean indexed, stored, searchable, multiValued, omitNorms;
private String comment;
private CollectionSchema(final SolrType type, final boolean indexed, final boolean stored, final boolean multiValued, final String comment) {
private CollectionSchema(final SolrType type, final boolean indexed, final boolean stored, final boolean multiValued, final boolean omitNorms, final boolean searchable, final String comment) {
this.type = type;
this.indexed = indexed;
this.stored = stored;
this.multiValued = multiValued;
this.omitNorms = false;
this.comment = comment;
assert type.appropriateName(this.name(), this.multiValued) : "bad configuration: " + this.name();
}
private CollectionSchema(final SolrType type, final boolean indexed, final boolean stored, final boolean multiValued, final boolean omitNorms, final String comment) {
this(type, indexed, stored, multiValued, comment);
this.omitNorms = omitNorms;
this.searchable = searchable;
this.comment = comment;
assert type.appropriateName(this.name(), this.multiValued) : "bad configuration: " + this.name();
}
@ -280,6 +259,11 @@ public enum CollectionSchema implements SchemaDeclaration {
return this.omitNorms;
}
/**
 * Reports the {@code searchable} flag of this schema field.
 * The flag is the value passed to the enum constructor and stored in the
 * final {@code searchable} field.
 *
 * @return the stored {@code searchable} flag
 */
@Override
public final boolean isSearchable() {
    return searchable;
}
@Override
public final String getComment() {
return this.comment;

View File

@ -31,65 +31,65 @@ import org.apache.solr.common.SolrInputDocument;
public enum WebgraphSchema implements SchemaDeclaration {
// index organisation
id(SolrType.string, true, true, false, "primary key of document, a combination of <source-url-hash><target-url-hash><four-digit-hex-counter> (28 characters)"),
last_modified(SolrType.date, true, true, false, "last-modified from http header"),
load_date_dt(SolrType.date, true, true, false, "time when resource was loaded"),
collection_sxt(SolrType.string, true, true, true, "tags that are attached to crawls/index generation to separate the search result into user-defined subsets"),
id(SolrType.string, true, true, false, false, false, "primary key of document, a combination of <source-url-hash><target-url-hash><four-digit-hex-counter> (28 characters)"),
last_modified(SolrType.date, true, true, false, false, false, "last-modified from http header"),
load_date_dt(SolrType.date, true, true, false, false, false, "time when resource was loaded"),
collection_sxt(SolrType.string, true, true, true, false, false, "tags that are attached to crawls/index generation to separate the search result into user-defined subsets"),
// source information
source_id_s(SolrType.string, true, true, false, "primary key of document, the URL hash (source)"),
source_protocol_s(SolrType.string, true, true, false, "the protocol of the url (source)"),
source_urlstub_s(SolrType.string, true, true, false, "the url without the protocol (source)"),
source_file_ext_s(SolrType.string, true, true, false, "the file name extension (source)"),
source_chars_i(SolrType.num_integer, true, true, false, "number of all characters in the url (source)"),
source_path_s(SolrType.string, true, true, false, "path of the url (source)"),
source_path_folders_count_i(SolrType.num_integer, true, true, false, "count of all path elements in the url (source)"),
source_path_folders_sxt(SolrType.string, true, true, true, "all path elements in the url (source)"),
source_parameter_count_i(SolrType.num_integer, true, true, false, "number of key-value pairs in search part of the url (source)"),
source_parameter_key_sxt(SolrType.string, true, true, true, "the keys from key-value pairs in the search part of the url (source)"),
source_parameter_value_sxt(SolrType.string, true, true, true, "the values from key-value pairs in the search part of the url (source)"),
source_clickdepth_i(SolrType.num_integer, true, true, false, "depth of web page according to number of clicks from the 'main' page, which is the page that appears if only the host is entered as url (source)"),
source_id_s(SolrType.string, true, true, false, false, false, "primary key of document, the URL hash (source)"),
source_protocol_s(SolrType.string, true, true, false, false, false, "the protocol of the url (source)"),
source_urlstub_s(SolrType.string, true, true, false, false, false, "the url without the protocol (source)"),
source_file_ext_s(SolrType.string, true, true, false, false, false, "the file name extension (source)"),
source_chars_i(SolrType.num_integer, true, true, false, false, false, "number of all characters in the url (source)"),
source_path_s(SolrType.string, true, true, false, false, false, "path of the url (source)"),
source_path_folders_count_i(SolrType.num_integer, true, true, false, false, false, "count of all path elements in the url (source)"),
source_path_folders_sxt(SolrType.string, true, true, true, false, false, "all path elements in the url (source)"),
source_parameter_count_i(SolrType.num_integer, true, true, false, false, false, "number of key-value pairs in search part of the url (source)"),
source_parameter_key_sxt(SolrType.string, true, true, true, false, false, "the keys from key-value pairs in the search part of the url (source)"),
source_parameter_value_sxt(SolrType.string, true, true, true, false, false, "the values from key-value pairs in the search part of the url (source)"),
source_clickdepth_i(SolrType.num_integer, true, true, false, false, false, "depth of web page according to number of clicks from the 'main' page, which is the page that appears if only the host is entered as url (source)"),
source_host_s(SolrType.string, true, true, false, "host of the url (source)"),
source_host_id_s(SolrType.string, true, true, false, "id of the host (source)"),
source_host_dnc_s(SolrType.string, true, true, false, "the Domain Class Name, either the TLD or a combination of ccSLD+TLD if a ccSLD is used (source)"),
source_host_organization_s(SolrType.string, true, true, false, "either the second level domain or, if a ccSLD is used, the third level domain"),
source_host_organizationdnc_s(SolrType.string, true, true, false, "the organization and dnc concatenated with '.' (source)"),
source_host_subdomain_s(SolrType.string, true, true, false, "the remaining part of the host without organizationdnc (source)"),
source_host_s(SolrType.string, true, true, false, false, false, "host of the url (source)"),
source_host_id_s(SolrType.string, true, true, false, false, false, "id of the host (source)"),
source_host_dnc_s(SolrType.string, true, true, false, false, false, "the Domain Class Name, either the TLD or a combination of ccSLD+TLD if a ccSLD is used (source)"),
source_host_organization_s(SolrType.string, true, true, false, false, false, "either the second level domain or, if a ccSLD is used, the third level domain"),
source_host_organizationdnc_s(SolrType.string, true, true, false, false, false, "the organization and dnc concatenated with '.' (source)"),
source_host_subdomain_s(SolrType.string, true, true, false, false, false, "the remaining part of the host without organizationdnc (source)"),
// information in the source about the target
target_linktext_t(SolrType.text_general, true, true, false, "the text content of the a-tag (in source, but pointing to a target)"),
target_linktext_charcount_i(SolrType.num_integer, true, true, false, "the length of the a-tag content text as number of characters (in source, but pointing to a target)"),
target_linktext_wordcount_i(SolrType.num_integer, true, true, false, "the length of the a-tag content text as number of words (in source, but pointing to a target)"),
target_alt_t(SolrType.text_general, true, true, false, "if the link is an image link, this contains the alt tag if the image is also liked as img link (in source, but pointing to a target)"),
target_alt_charcount_i(SolrType.num_integer, true, true, false, "the length of the a-tag content text as number of characters (in source, but pointing to a target)"),
target_alt_wordcount_i(SolrType.num_integer, true, true, false, "the length of the a-tag content text as number of words (in source, but pointing to a target)"),
target_name_t(SolrType.text_general, true, true, false, "the name property of the a-tag (in source, but pointing to a target)"),
target_rel_s(SolrType.string, true, true, false, "the rel property of the a-tag (in source, but pointing to a target)"),
target_relflags_i(SolrType.num_integer, true, true, false, "the rel property of the a-tag, coded binary (in source, but pointing to a target)"),
target_linktext_t(SolrType.text_general, true, true, false, false, true, "the text content of the a-tag (in source, but pointing to a target)"),
target_linktext_charcount_i(SolrType.num_integer, true, true, false, false, false, "the length of the a-tag content text as number of characters (in source, but pointing to a target)"),
target_linktext_wordcount_i(SolrType.num_integer, true, true, false, false, false, "the length of the a-tag content text as number of words (in source, but pointing to a target)"),
target_alt_t(SolrType.text_general, true, true, false, false, true, "if the link is an image link, this contains the alt tag if the image is also liked as img link (in source, but pointing to a target)"),
target_alt_charcount_i(SolrType.num_integer, true, true, false, false, false, "the length of the a-tag content text as number of characters (in source, but pointing to a target)"),
target_alt_wordcount_i(SolrType.num_integer, true, true, false, false, false, "the length of the a-tag content text as number of words (in source, but pointing to a target)"),
target_name_t(SolrType.text_general, true, true, false, false, true, "the name property of the a-tag (in source, but pointing to a target)"),
target_rel_s(SolrType.string, true, true, false, false, false, "the rel property of the a-tag (in source, but pointing to a target)"),
target_relflags_i(SolrType.num_integer, true, true, false, false, false, "the rel property of the a-tag, coded binary (in source, but pointing to a target)"),
// target information
target_id_s(SolrType.string, true, true, false, "primary key of document, the URL hash (target)"),
target_protocol_s(SolrType.string, true, true, false, "the protocol of the url (target)"),
target_urlstub_s(SolrType.string, true, true, false, "the url without the protocol (target)"),
target_file_ext_s(SolrType.string, true, true, false, "the file name extension (target)"),
target_chars_i(SolrType.num_integer, true, true, false, "number of all characters in the url (target)"),
target_path_s(SolrType.string, true, true, false, "path of the url (target)"),
target_path_folders_count_i(SolrType.num_integer, true, true, false, "count of all path elements in the url (target)"),
target_path_folders_sxt(SolrType.string, true, true, true, "all path elements in the url (target)"),
target_parameter_count_i(SolrType.num_integer, true, true, false, "number of key-value pairs in search part of the url (target)"),
target_parameter_key_sxt(SolrType.string, true, true, true, "the keys from key-value pairs in the search part of the url (target)"),
target_parameter_value_sxt(SolrType.string, true, true, true, "the values from key-value pairs in the search part of the url (target)"),
target_clickdepth_i(SolrType.num_integer, true, true, false, "depth of web page according to number of clicks from the 'main' page, which is the page that appears if only the host is entered as url (target)"),
target_id_s(SolrType.string, true, true, false, false, false, "primary key of document, the URL hash (target)"),
target_protocol_s(SolrType.string, true, true, false, false, false, "the protocol of the url (target)"),
target_urlstub_s(SolrType.string, true, true, false, false, false, "the url without the protocol (target)"),
target_file_ext_s(SolrType.string, true, true, false, false, true, "the file name extension (target)"),
target_chars_i(SolrType.num_integer, true, true, false, false, false, "number of all characters in the url (target)"),
target_path_s(SolrType.string, true, true, false, false, false, "path of the url (target)"),
target_path_folders_count_i(SolrType.num_integer, true, true, false, false, false, "count of all path elements in the url (target)"),
target_path_folders_sxt(SolrType.string, true, true, true, false, true, "all path elements in the url (target)"),
target_parameter_count_i(SolrType.num_integer, true, true, false, false, false, "number of key-value pairs in search part of the url (target)"),
target_parameter_key_sxt(SolrType.string, true, true, true, false, false, "the keys from key-value pairs in the search part of the url (target)"),
target_parameter_value_sxt(SolrType.string, true, true, true, false, true, "the values from key-value pairs in the search part of the url (target)"),
target_clickdepth_i(SolrType.num_integer, true, true, false, false, false, "depth of web page according to number of clicks from the 'main' page, which is the page that appears if only the host is entered as url (target)"),
target_host_s(SolrType.string, true, true, false, "host of the url (target)"),
target_host_id_s(SolrType.string, true, true, false, "id of the host (target)"),
target_host_dnc_s(SolrType.string, true, true, false, "the Domain Class Name, either the TLD or a combination of ccSLD+TLD if a ccSLD is used (target)"),
target_host_organization_s(SolrType.string, true, true, false, "either the second level domain or, if a ccSLD is used, the third level domain (target)"),
target_host_organizationdnc_s(SolrType.string, true, true, false, "the organization and dnc concatenated with '.' (target)"),
target_host_subdomain_s(SolrType.string, true, true, false, "the remaining part of the host without organizationdnc (target)"),
target_host_s(SolrType.string, true, true, false, false, true, "host of the url (target)"),
target_host_id_s(SolrType.string, true, true, false, false, false, "id of the host (target)"),
target_host_dnc_s(SolrType.string, true, true, false, false, true, "the Domain Class Name, either the TLD or a combination of ccSLD+TLD if a ccSLD is used (target)"),
target_host_organization_s(SolrType.string, true, true, false, false, true, "either the second level domain or, if a ccSLD is used, the third level domain (target)"),
target_host_organizationdnc_s(SolrType.string, true, true, false, false, true, "the organization and dnc concatenated with '.' (target)"),
target_host_subdomain_s(SolrType.string, true, true, false, false, true, "the remaining part of the host without organizationdnc (target)"),
target_inbound_b(SolrType.bool, true, true, false, "flag shows if the target host is equal to the source host");
target_inbound_b(SolrType.bool, true, true, false, false, false, "flag shows if the target host is equal to the source host");
public final static String CORE_NAME = "webgraph";
@ -98,23 +98,18 @@ public enum WebgraphSchema implements SchemaDeclaration {
private String solrFieldName = null; // solr field name in custom solr schema
private final SolrType type;
private final boolean indexed, stored;
private boolean multiValued, omitNorms;
private final boolean indexed, stored, multiValued, omitNorms, searchable;
private String comment;
private WebgraphSchema(final SolrType type, final boolean indexed, final boolean stored, final boolean multiValued, final String comment) {
private WebgraphSchema(final SolrType type, final boolean indexed, final boolean stored, final boolean multiValued, final boolean omitNorms, final boolean searchable, final String comment) {
this.type = type;
this.indexed = indexed;
this.stored = stored;
this.multiValued = multiValued;
this.omitNorms = false;
this.comment = comment;
assert type.appropriateName(this.name(), this.multiValued) : "bad configuration: " + this.name();
}
private WebgraphSchema(final SolrType type, final boolean indexed, final boolean stored, final boolean multiValued, final boolean omitNorms, final String comment) {
this(type, indexed, stored, multiValued, comment);
this.omitNorms = omitNorms;
this.searchable = searchable;
this.comment = comment;
assert type.appropriateName(this.name(), this.multiValued) : "bad configuration: " + this.name();
}
@ -165,6 +160,11 @@ public enum WebgraphSchema implements SchemaDeclaration {
return this.omitNorms;
}
/**
 * Reports the {@code searchable} flag of this webgraph schema field.
 * The flag is the value passed to the enum constructor and stored in the
 * final {@code searchable} field.
 *
 * @return the stored {@code searchable} flag
 */
@Override
public final boolean isSearchable() {
    return searchable;
}
@Override
public final String getComment() {
return this.comment;

View File

@ -60,7 +60,7 @@ public class ResultEntry implements Comparable<ResultEntry>, Comparator<ResultEn
private final Segment indexSegment;
// statistic objects
public long snippetComputationTime, ranking;
public long snippetComputationTime;
public ResultEntry(final URIMetadataNode urlentry,
final Segment indexSegment,
@ -219,4 +219,7 @@ public class ResultEntry implements Comparable<ResultEntry>, Comparator<ResultEn
/**
 * Compares two result entries by the hash of their underlying URL entry,
 * delegating the ordering decision to {@code Base64Order.enhancedCoder}.
 *
 * @param o1 the first result entry
 * @param o2 the second result entry
 * @return the value returned by the enhanced coder for the two URL hashes
 */
public int compare(ResultEntry o1, ResultEntry o2) {
    return Base64Order.enhancedCoder.compare(
            o1.urlentry.hash(),
            o2.urlentry.hash());
}
/**
 * Returns the ranking of this result entry by delegating to the wrapped
 * URL entry; no ranking value is stored in this class itself.
 *
 * @return the value of {@code urlentry.ranking()}
 */
public long ranking() {
    return urlentry.ranking();
}
}