yacy_search_server/htroot/RankingSolr_p.java

231 lines
12 KiB
Java
Raw Normal View History

2012-12-05 12:26:42 +01:00
/**
* RankingSolr_p
* Copyright 2012 by Michael Peter Christen
* First released 30.11.2012 at http://yacy.net
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program in the file lgpl21.txt
* If not, see <http://www.gnu.org/licenses/>.
*/
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.LinkedHashMap;
import java.util.Map;
import java.util.TreeMap;
import java.util.Map.Entry;
import net.yacy.cora.federate.solr.Ranking;
import net.yacy.cora.federate.solr.SchemaDeclaration;
import net.yacy.cora.federate.solr.connector.AbstractSolrConnector;
import net.yacy.cora.protocol.RequestHeader;
import net.yacy.cora.sorting.ReversibleScoreMap;
import net.yacy.cora.util.ConcurrentLog;
import net.yacy.search.Switchboard;
import net.yacy.search.SwitchboardConstants;
import net.yacy.search.query.SearchEventCache;
import net.yacy.search.schema.CollectionConfiguration;
introduced a second core named 'webgraph'. This core will hold the link structure, but is not filled yet. To have the opportunity of a second core, multi-core functionality had to be implemented to the deep-embedded solr: - migrated the solr_40 directory content to a subdirectory 'collection1'; the previously used default core is now called collection1 - added solr_40/webgraph subdirectory as second core - added a servlet configuration for the second core 'webgraph' in /IndexSchema_p.html - added instance handling as addition to solr connections: all solr connectors are now instances of a solr 'instance' object; this required a complete re-design of the solr embedding - migrated also caching and sharding on top of new instance handling - migrated the search apis to handle now the access to a specific core, the default core named 'collection1' - migrated the remote solr search interface to access shards of cores; for the yacy remote search the default core is now called 'solr'; using the peer address as solr address - migrated the solr backup and restore process: old backups cannot be used after this migration! - redesign of solr instance handling in all methods which access the instances: they cannot hold copies of these instances any more; they must retrieve the actual connection object every time they want to write to it (this solves also some bugs when switching the index/network) - added another schema 'solr.webgraph.schema', the old solr.keys.list is replaced by solr.collection.schema
2013-02-21 13:23:55 +01:00
import net.yacy.search.schema.CollectionSchema;
import net.yacy.server.serverObjects;
import net.yacy.server.serverSwitch;
import org.apache.solr.common.params.CommonParams;
import org.apache.solr.common.params.DisMaxParams;
public class RankingSolr_p {
public static serverObjects respond(@SuppressWarnings("unused") final RequestHeader header, final serverObjects post, final serverSwitch env) {
final Switchboard sb = (Switchboard) env;
// clean up all search events
SearchEventCache.cleanupEvents(true);
2013-11-07 21:30:17 +01:00
sb.index.clearCaches(); // every time the ranking is changed we need to remove old orderings
int profileNr = 0;
if (post != null) profileNr = post.getInt("profileNr", profileNr);
if (post != null && post.containsKey("EnterBoosts")) {
StringBuilder boostString = new StringBuilder(); // SwitchboardConstants.SEARCH_RANKING_SOLR_BOOST;
for (Map.Entry<String, String> entry: post.entrySet()) {
if (entry.getKey().startsWith("boost")) {
String fieldName = entry.getKey().substring(6);
introduced a second core named 'webgraph'. This core will hold the link structure, but is not filled yet. To have the opportunity of a second core, multi-core functionality had to be implemented to the deep-embedded solr: - migrated the solr_40 directory content to a subdirectory 'collection1'; the previously used default core is now called collection1 - added solr_40/webgraph subdirectory as second core - added a servlet configuration for the second core 'webgraph' in /IndexSchema_p.html - added instance handling as addition to solr connections: all solr connectors are now instances of an solr 'instance' object; this required a complete re-design of the solr embedding - migrated also caching and sharding ontop of new instance handling - migrated the search apis to handle now the access to a specific core, the default core named 'collection1' - migrated the remote solr search interface to access shards of cores; for the yacy remote search the default core is now called 'solr'; using the peer address as solr address - migrated the solr backup and restore process: old backups cannot be used after this migration! - redesign of solr instance handling in all methods which access the instances: they cannot hold copies of these instances any more; the must retrieve the actuall connection object every time they want to write to it (this solves also some bugs when switching the index/network) - added another schema 'solr.webgraph.schema', the old solr.keys.list is replaced by solr.collection.schema
2013-02-21 13:23:55 +01:00
CollectionSchema field = CollectionSchema.valueOf(fieldName);
if (field == null) continue;
String fieldValue = entry.getValue();
if (fieldValue == null || fieldValue.length() == 0) continue;
try {
float boost = Float.parseFloat(fieldValue);
if (boost > 0.0f) { // don't allow <= 0
if (boostString.length() > 0) boostString.append(',');
boostString.append(field.getSolrFieldName()).append('^').append(Float.toString(boost));
}
} catch (final NumberFormatException e) {
continue;
}
}
}
if (boostString.length() > 0) {
String s = boostString.toString();
sb.setConfig(SwitchboardConstants.SEARCH_RANKING_SOLR_COLLECTION_BOOSTFIELDS_ + profileNr, s);
sb.index.fulltext().getDefaultConfiguration().getRanking(profileNr).updateBoosts(s);
}
}
if (post != null && post.containsKey("ResetBoosts")) {
String s = "url_paths_sxt^3.0,synonyms_sxt^0.5,title^5.0,text_t^1.0,host_s^6.0,h1_txt^5.0,url_file_name_tokens_t^4.0,h2_txt^3.0,keywords^2.0,description_txt^1.5,author^1.0";
sb.setConfig(SwitchboardConstants.SEARCH_RANKING_SOLR_COLLECTION_BOOSTFIELDS_ + profileNr, s);
sb.index.fulltext().getDefaultConfiguration().getRanking(profileNr).updateBoosts(s);
}
if (post != null && post.containsKey("EnterBQ")) {
String bq = post.get(DisMaxParams.BQ);
if (bq != null) {
sb.setConfig(SwitchboardConstants.SEARCH_RANKING_SOLR_COLLECTION_BOOSTQUERY_ + profileNr, bq);
sb.index.fulltext().getDefaultConfiguration().getRanking(profileNr).setBoostQuery(bq);
}
}
if (post != null && post.containsKey("ResetBQ")) {
String bq = "crawldepth_i:0^0.8 crawldepth_i:1^0.4";
if (bq != null) {
sb.setConfig(SwitchboardConstants.SEARCH_RANKING_SOLR_COLLECTION_BOOSTQUERY_ + profileNr, bq);
sb.index.fulltext().getDefaultConfiguration().getRanking(profileNr).setBoostQuery(bq);
}
}
if (post != null && post.containsKey("EnterFQ")) {
String fq = post.get(CommonParams.FQ);
if (fq != null) {
sb.setConfig(SwitchboardConstants.SEARCH_RANKING_SOLR_COLLECTION_FILTERQUERY_ + profileNr, fq);
sb.index.fulltext().getDefaultConfiguration().getRanking(profileNr).setFilterQuery(fq);
}
}
if (post != null && post.containsKey("ResetFQ")) {
String fq = ""; // i.e. "http_unique_b:true AND www_unique_b:true"
if (fq != null) {
sb.setConfig(SwitchboardConstants.SEARCH_RANKING_SOLR_COLLECTION_FILTERQUERY_ + profileNr, fq);
sb.index.fulltext().getDefaultConfiguration().getRanking(profileNr).setFilterQuery(fq);
}
}
if (post != null && post.containsKey("EnterBF")) {
String bf = post.get(DisMaxParams.BF);
if (bf != null) {
sb.setConfig(SwitchboardConstants.SEARCH_RANKING_SOLR_COLLECTION_BOOSTFUNCTION_ + profileNr, bf);
sb.index.fulltext().getDefaultConfiguration().getRanking(profileNr).setBoostFunction(bf);
}
}
if (post != null && post.containsKey("ResetBF")) {
String bf = "";
if (bf != null) {
sb.setConfig(SwitchboardConstants.SEARCH_RANKING_SOLR_COLLECTION_BOOSTFUNCTION_ + profileNr, bf);
sb.index.fulltext().getDefaultConfiguration().getRanking(profileNr).setBoostFunction(bf);
}
}
final serverObjects prop = new serverObjects();
int i = 0;
CollectionConfiguration colcfg = sb.index.fulltext().getDefaultConfiguration();
Ranking ranking = colcfg.getRanking(profileNr);
for (SchemaDeclaration field: CollectionSchema.values()) {
if (!field.isSearchable()) continue;
Float boost = ranking.getFieldBoost(field);
if (boost != null || colcfg.contains(field)) { // show only available or configured boost fields
prop.put("boosts_" + i + "_field", field.getSolrFieldName());
if (boost == null || boost.floatValue() <= 0.0f) {
prop.put("boosts_" + i + "_checked", 0);
prop.put("boosts_" + i + "_boost", "");
prop.put("boosts_" + i + "_notinindexwarning", "0");
} else {
prop.put("boosts_" + i + "_checked", 1);
prop.put("boosts_" + i + "_boost", boost.toString());
prop.put("boosts_" + i + "_notinindexwarning", (colcfg.contains(field.name()) ? "0" : "1"));
}
prop.putHTML("boosts_" + i + "_comment", field.getComment());
i++;
}
}
prop.put("boosts", i);
prop.put(CommonParams.FQ, ranking.getFilterQuery());
prop.put(DisMaxParams.BQ, ranking.getBoostQuery());
prop.put(DisMaxParams.BF, ranking.getBoostFunction());
for (int j = 0; j < 4; j++) {
prop.put("profiles_" + j + "_nr", j);
prop.put("profiles_" + j + "_name", sb.getConfig(SwitchboardConstants.SEARCH_RANKING_SOLR_COLLECTION_BOOSTNAME_ + j, "N.N."));
prop.put("profiles_" + j + "_selected", profileNr == j ? 1 : 0);
}
prop.put("profiles", 4);
prop.put("profileNr", profileNr);
// make boost hints for vocabularies
Map<String, ReversibleScoreMap<String>> vocabularyFacet;
try {
vocabularyFacet = sb.index.fulltext().getDefaultConnector().getFacets(CollectionSchema.vocabularies_sxt.getSolrFieldName() + AbstractSolrConnector.CATCHALL_DTERM, 100, CollectionSchema.vocabularies_sxt.getSolrFieldName());
} catch (IOException e) {
ConcurrentLog.logException(e);
vocabularyFacet = new HashMap<>();
}
if (vocabularyFacet.size() == 0) {
prop.put("boosthint", 0);
} else {
prop.put("boosthint", 1);
prop.putHTML("boosthint_vocabulariesfield", CollectionSchema.vocabularies_sxt.getSolrFieldName());
ReversibleScoreMap<String> vokcounts = vocabularyFacet.values().iterator().next();
Collection<String> vocnames = vokcounts.keyList(true);
prop.putHTML("boosthint_vocabulariesavailable", vocnames.toString());
ArrayList<String> voccountFields = new ArrayList<>();
ArrayList<String> voclogcountFields = new ArrayList<>();
ArrayList<String> voclogcountsFields = new ArrayList<>();
ArrayList<String> ff = new ArrayList<>();
for (String vocname: vocnames) {
voccountFields.add(CollectionSchema.VOCABULARY_PREFIX + vocname + CollectionSchema.VOCABULARY_COUNT_SUFFIX);
voclogcountFields.add(CollectionSchema.VOCABULARY_PREFIX + vocname + CollectionSchema.VOCABULARY_LOGCOUNT_SUFFIX);
voclogcountsFields.add(CollectionSchema.VOCABULARY_PREFIX + vocname + CollectionSchema.VOCABULARY_LOGCOUNTS_SUFFIX);
}
ff.addAll(voclogcountFields);
ff.addAll(voclogcountsFields);
prop.putHTML("boosthint_vocabulariesvoccount", voccountFields.toString());
prop.putHTML("boosthint_vocabulariesvoclogcount", voclogcountFields.toString());
prop.putHTML("boosthint_vocabulariesvoclogcounts", voclogcountsFields.toString());
String[] facetfields = ff.toArray(new String[ff.size()]);
int fc = 0;
if (facetfields.length > 0) try {
LinkedHashMap<String, ReversibleScoreMap<String>> facets = sb.index.fulltext().getDefaultConnector().getFacets("*:*", 100, facetfields);
facets.put(CollectionSchema.vocabularies_sxt.getSolrFieldName(), vokcounts);
for (Map.Entry<String, ReversibleScoreMap<String>> facetentry: facets.entrySet()) {
ReversibleScoreMap<String> facetfieldmap = facetentry.getValue();
if (facetfieldmap.size() == 0) continue;
TreeMap<String, Integer> statMap = new TreeMap<>();
for (String k: facetfieldmap) statMap.put(k, facetfieldmap.get(k));
prop.put("boosthint_facets_" + fc + "_facetname", facetentry.getKey());
int c = 0; for (Entry<String, Integer> entry: statMap.entrySet()) {
prop.put("boosthint_facets_" + fc + "_facet_" + c + "_key", entry.getKey());
prop.put("boosthint_facets_" + fc + "_facet_" + c + "_count", entry.getValue());
c++;
}
prop.put("boosthint_facets_" + fc + "_facet", c);
fc++;
}
} catch (IOException e) {
}
prop.put("boosthint_facets", fc);
}
return prop;
}
}