From a5707cd2eb7faa969e1db1e49598b43f83f08298 Mon Sep 17 00:00:00 2001 From: reger Date: Fri, 27 Jun 2014 23:05:06 +0200 Subject: [PATCH 1/2] enable proper Author navigator - author facet is based on omitted author_sxt field - adjust to make author nav available on exist of author field but keep using author_sxt to construct the facet (why!?) - add check for querymodifier author in searchevent --- source/net/yacy/search/query/QueryParams.java | 20 +++++++++---------- source/net/yacy/search/query/SearchEvent.java | 14 +++++++++++++ 2 files changed, 24 insertions(+), 10 deletions(-) diff --git a/source/net/yacy/search/query/QueryParams.java b/source/net/yacy/search/query/QueryParams.java index 17267b489..c5dd35a5e 100644 --- a/source/net/yacy/search/query/QueryParams.java +++ b/source/net/yacy/search/query/QueryParams.java @@ -35,11 +35,6 @@ import java.util.Set; import java.util.SortedSet; import java.util.regex.Pattern; import java.util.regex.PatternSyntaxException; - -import org.apache.solr.client.solrj.SolrQuery; -import org.apache.solr.client.solrj.SolrQuery.SortClause; -import org.apache.solr.common.params.FacetParams; - import net.yacy.cora.document.analysis.Classification; import net.yacy.cora.document.analysis.Classification.ContentDomain; import net.yacy.cora.document.encoding.ASCII; @@ -65,6 +60,10 @@ import net.yacy.search.index.Segment; import net.yacy.search.ranking.RankingProfile; import net.yacy.search.schema.CollectionConfiguration; import net.yacy.search.schema.CollectionSchema; +import org.apache.solr.client.solrj.SolrQuery; +import org.apache.solr.client.solrj.SolrQuery.SortClause; +import org.apache.solr.common.params.CommonParams; +import org.apache.solr.common.params.FacetParams; public final class QueryParams { @@ -227,7 +226,8 @@ public final class QueryParams { this.solrSchema = indexSegment.fulltext().getDefaultConfiguration(); for (String navkey: search_navigation) { CollectionSchema f = defaultfacetfields.get(navkey); - if (f != null && solrSchema.contains(f)) this.facetfields.add(f.getSolrFieldName()); + // handle special field, authors_sxt (add to facet w/o contains check, as authors_sxt is not enabled (is copyfield)) + if (f != null && (solrSchema.contains(f) || f.name().equals("author_sxt"))) this.facetfields.add(f.getSolrFieldName()); } for (Tagging v: LibraryProvider.autotagging.getVocabularies()) this.facetfields.add(CollectionSchema.VOCABULARY_PREFIX + v.getName() + CollectionSchema.VOCABULARY_SUFFIX); this.maxfacets = defaultmaxfacets; @@ -358,8 +358,8 @@ public final class QueryParams { bq += CollectionSchema.text_t.getSolrFieldName() + ":\"" + this.queryGoal.getIncludeString() + "\"^10"; } if (fq.length() > 0) { - String oldfq = params.get("fq"); - params.setParam("fq", oldfq == null || oldfq.length() == 0 ? fq : "(" + oldfq + ") AND (" + fq + ")"); + String oldfq = params.get(CommonParams.FQ); + params.setParam(CommonParams.FQ, oldfq == null || oldfq.length() == 0 ? fq : "(" + oldfq + ") AND (" + fq + ")"); } if (bq.length() > 0) params.setParam("bq", bq); if (bf.length() > 0) params.setParam("boost", bf); // a boost function extension, see http://wiki.apache.org/solr/ExtendedDisMax#bf_.28Boost_Function.2C_additive.29 @@ -465,8 +465,8 @@ public final class QueryParams { fq.append(" AND ").append(CollectionSchema.language_s.getSolrFieldName()).append(":\"").append(this.modifier.language).append('\"'); } - // add author facets - if (this.modifier.author != null && this.modifier.author.length() > 0 && this.solrSchema.contains(CollectionSchema.author_sxt)) { + // add author facets (check for contains(author) as author_sxt is omitted copyfield) + if (this.modifier.author != null && this.modifier.author.length() > 0 && this.solrSchema.contains(CollectionSchema.author)) { fq.append(" AND ").append(CollectionSchema.author_sxt.getSolrFieldName()).append(":\"").append(this.modifier.author).append('\"'); } diff --git a/source/net/yacy/search/query/SearchEvent.java b/source/net/yacy/search/query/SearchEvent.java index 849cf47a9..2a98b692d 100644 --- a/source/net/yacy/search/query/SearchEvent.java +++ b/source/net/yacy/search/query/SearchEvent.java @@ -911,6 +911,13 @@ public final class SearchEvent { continue pollloop; } } + + if (this.query.modifier.author != null) { + if (!this.query.modifier.author.equals(iEntry.dc_creator())) { + if (log.isFine()) log.fine ("dropped Node: author"); + continue pollloop; + } + } // finally extend the double-check and insert result to stack this.urlhashes.putUnique(iEntry.hash()); rankingtryloop: while (true) { @@ -1098,6 +1105,13 @@ public final class SearchEvent { continue; } + // check modifier constraint (author) + if (this.query.modifier.author != null && !page.dc_creator().toLowerCase().contains(this.query.modifier.author.toLowerCase()) /*!this.query.modifier.author.equalsIgnoreCase(page.dc_creator())*/) { + if (log.isFine()) log.fine("dropped RWI: author constraint = " + this.query.modifier.author); + if (page.word().local()) this.local_rwi_available.decrementAndGet(); else this.remote_rwi_available.decrementAndGet(); + continue; + } + // Check for blacklist if (Switchboard.urlBlacklist.isListed(BlacklistType.SEARCH, page.url())) { if (log.isFine()) log.fine("dropped RWI: url is blacklisted in url blacklist"); From cb2c17d236188693f99e8a8e0e8df6fdaee57b1f Mon Sep 17 00:00:00 2001 From: reger Date: Sun, 29 Jun 2014 02:54:09 +0200 Subject: [PATCH 2/2] extract author and keywords in .doc and .ppt parser --- source/net/yacy/document/parser/docParser.java | 12 ++++++++++-- source/net/yacy/document/parser/pptParser.java | 10 ++++++++-- 2 files changed, 18 insertions(+), 4 deletions(-) diff --git a/source/net/yacy/document/parser/docParser.java b/source/net/yacy/document/parser/docParser.java index 8c0263ac3..a09f9e391 100644 --- a/source/net/yacy/document/parser/docParser.java +++ b/source/net/yacy/document/parser/docParser.java @@ -86,6 +86,14 @@ public class docParser extends AbstractParser implements Parser { if (title.length() == l) break; l = title.length(); } + // get keywords (for yacy as array) + final String keywords = extractor.getSummaryInformation().getKeywords(); + final String[] keywlist; + if (keywords != null && !keywords.isEmpty()) { + keywlist = keywords.split(","); + } else { + keywlist = null; + } Document[] docs; docs = new Document[]{new Document( @@ -94,9 +102,9 @@ public class docParser extends AbstractParser implements Parser { "UTF-8", this, null, - null, + keywlist, singleList(title), - "", // TODO: AUTHOR + extractor.getSummaryInformation().getAuthor(), // constuctor can handle null extractor.getDocSummaryInformation().getCompany(), // publisher null, null, diff --git a/source/net/yacy/document/parser/pptParser.java b/source/net/yacy/document/parser/pptParser.java index f21f188bf..e0773f2ba 100644 --- a/source/net/yacy/document/parser/pptParser.java +++ b/source/net/yacy/document/parser/pptParser.java @@ -78,6 +78,12 @@ public class pptParser extends AbstractParser implements Parser { if (title.length() == l) break; l = title.length(); } + // get keywords (for yacy as array) + final String keywords = pptExtractor.getSummaryInformation().getKeywords(); + final String[] keywlist; + if (keywords != null && !keywords.isEmpty()) { + keywlist = keywords.split(","); + } else keywlist = null; /* * create the plasmaParserDocument for the database @@ -89,9 +95,9 @@ public class pptParser extends AbstractParser implements Parser { "UTF-8", this, null, - null, + keywlist, singleList(title), - "", // TODO: AUTHOR + pptExtractor.getSummaryInformation().getAuthor(), // may be null pptExtractor.getDocSummaryInformation().getCompany(), null, null,