mirror of
https://github.com/yacy/yacy_search_server.git
synced 2024-09-19 00:01:41 +02:00
- added display of author navigation (usage of that navigator not yet implemented)
- added a synchronization in the pdf parser which should help to avoid deadlocks that occur when displaying several search results pointing to pdf sources
- fixed smaller bugs in navigation

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@6036 6c8d7289-2bf4-0310-a012-ef5d649a1542
This commit is contained in:
parent
c879783008
commit
a9a8b8d161
|
@ -1,11 +1,8 @@
|
|||
#(nav-domains)#
|
||||
::
|
||||
#(nav-domains)#::
|
||||
<h3 style="padding-left:25px;">Domains</h3>
|
||||
<div>
|
||||
<ul style="padding-left: 0px;">#{element}#
|
||||
<div><ul style="padding-left: 0px;">#{element}#
|
||||
<li>#[url]#</li>
|
||||
#{/element}#
|
||||
</ul></div>
|
||||
#{/element}#</ul></div>
|
||||
#(/nav-domains)#
|
||||
#(nav-topics)#::
|
||||
<h3 style="padding-left:25px;">Topics</h3>
|
||||
|
@ -13,6 +10,12 @@
|
|||
<li>#[url]#</li>
|
||||
#{/element}#</ul></div>
|
||||
#(/nav-topics)#
|
||||
#(nav-authors)#::
|
||||
<h3 style="padding-left:25px;">Authors</h3>
|
||||
<div><ul style="padding-left: 0px;">#{element}#
|
||||
<li>#[url]#</li>
|
||||
#{/element}#</ul></div>
|
||||
#(/nav-authors)#
|
||||
|
||||
<h3 style="padding-left:25px;">Timeline</h3>
|
||||
<div>
|
||||
|
|
|
@ -55,8 +55,9 @@ public class yacysearchtrailer {
|
|||
}
|
||||
final plasmaSearchQuery theQuery = theSearch.getQuery();
|
||||
|
||||
|
||||
// compose search navigation
|
||||
|
||||
// host navigators
|
||||
ArrayList<NavigatorEntry> hostNavigator = theSearch.getHostNavigator(10);
|
||||
if (hostNavigator == null) {
|
||||
prop.put("nav-domains", 0);
|
||||
|
@ -66,10 +67,10 @@ public class yacysearchtrailer {
|
|||
int i;
|
||||
for (i = 0; i < hostNavigator.size(); i++) {
|
||||
entry = hostNavigator.get(i);
|
||||
prop.put("nav-domains_element_" + i + "_url", "<a href=\"" + plasmaSearchQuery.navurl("html", 0, display, theQuery, theQuery.urlMask, "site:" + entry.name, theQuery.navigators) + "\">" + entry.name + " (" + entry.count + ")</a>");
|
||||
prop.putJSON("nav_domains_element_" + i + "_url-json", plasmaSearchQuery.navurl("json", 0, display, theQuery, theQuery.urlMask, "site:" + entry.name, theQuery.navigators));
|
||||
prop.put("nav-domains_element_" + i + "_name", entry.name);
|
||||
prop.put("nav-domains_element_" + i + "_count", entry.count);
|
||||
prop.put("nav-domains_element_" + i + "_name", entry.name);
|
||||
prop.put("nav-domains_element_" + i + "_url", "<a href=\"" + plasmaSearchQuery.navurl("html", 0, display, theQuery, theQuery.urlMask, "site:" + entry.name, theQuery.navigators) + "\">" + entry.name + " (" + entry.count + ")</a>");
|
||||
prop.putJSON("nav-domains_element_" + i + "_url-json", plasmaSearchQuery.navurl("json", 0, display, theQuery, theQuery.urlMask, "site:" + entry.name, theQuery.navigators));
|
||||
prop.put("nav-domains_element_" + i + "_count", entry.count);
|
||||
prop.put("nav-domains_element_" + i + "_modifier", "site:" + entry.name);
|
||||
prop.put("nav-domains_element_" + i + "_nl", 1);
|
||||
}
|
||||
|
@ -95,11 +96,7 @@ public class yacysearchtrailer {
|
|||
prop.put("nav-topics_element_" + i + "_url", "<a href=\"" + plasmaSearchQuery.navurl("html", 0, display, theQuery, theQuery.urlMask, e.name, theQuery.navigators) + "\">" + e.name + " (" + e.count + ")</a>");
|
||||
prop.putJSON("nav-topics_element_" + i + "_url-json", plasmaSearchQuery.navurl("json", 0, display, theQuery, theQuery.urlMask, e.name, theQuery.navigators));
|
||||
prop.put("nav-topics_element_" + i + "_count", e.count);
|
||||
prop.put("nav-topics_element_" + i + "_offset", "0");
|
||||
prop.put("nav-topics_element_" + i + "_display", display);
|
||||
prop.put("nav-topics_element_" + i + "_modifier", e.name);
|
||||
prop.put("nav-topics_element_" + i + "_contentdom", theQuery.contentdom());
|
||||
prop.put("nav-topics_element_" + i + "_resource", ((theQuery.isLocal()) ? "local" : "global"));
|
||||
prop.put("nav-topics_element_" + i + "_nl", (iter.hasNext() && i < MAX_TOPWORDS) ? 1 : 0);
|
||||
}
|
||||
if (i++ > MAX_TOPWORDS) {
|
||||
|
@ -109,6 +106,29 @@ public class yacysearchtrailer {
|
|||
prop.put("nav-topics_element", i);
|
||||
prop.put("nav-topics", "1");
|
||||
}
|
||||
|
||||
// author navigators
|
||||
ArrayList<NavigatorEntry> authorNavigator = theSearch.getAuthorNavigator(10);
|
||||
if (authorNavigator == null) {
|
||||
prop.put("nav-authors", 0);
|
||||
} else {
|
||||
prop.put("nav-authors", 1);
|
||||
NavigatorEntry entry;
|
||||
int i;
|
||||
for (i = 0; i < authorNavigator.size(); i++) {
|
||||
entry = authorNavigator.get(i);
|
||||
prop.put("nav-authors_element_" + i + "_name", entry.name);
|
||||
prop.put("nav-authors_element_" + i + "_url", "<a href=\"" + plasmaSearchQuery.navurl("html", 0, display, theQuery, theQuery.urlMask, "author:'" + entry.name + "'", theQuery.navigators) + "\">" + entry.name + " (" + entry.count + ")</a>");
|
||||
prop.putJSON("nav-authors_element_" + i + "_url-json", plasmaSearchQuery.navurl("json", 0, display, theQuery, theQuery.urlMask, "author:'" + entry.name + "'", theQuery.navigators));
|
||||
prop.put("nav-authors_element_" + i + "_count", entry.count);
|
||||
prop.put("nav-authors_element_" + i + "_modifier", "author:'" + entry.name + "'");
|
||||
prop.put("nav-authors_element_" + i + "_nl", 1);
|
||||
}
|
||||
i--;
|
||||
prop.put("nav-authors_element_" + i + "_nl", 0);
|
||||
prop.put("nav-authors_element", authorNavigator.size());
|
||||
}
|
||||
|
||||
serverProfiling.update("SEARCH", new plasmaProfiling.searchEvent(theQuery.id(true), plasmaSearchEvent.FINALIZATION + "-" + "bottomline", 0, 0), false);
|
||||
|
||||
return prop;
|
||||
|
|
|
@ -99,6 +99,7 @@ public class pdfParser extends AbstractParser implements Parser {
|
|||
checkInterruption();
|
||||
|
||||
// creating a text stripper
|
||||
synchronized (SUPPORTED_MIME_TYPES) {
|
||||
final PDFTextStripper stripper = new PDFTextStripper();
|
||||
theDocument = parser.getPDDocument();
|
||||
|
||||
|
@ -168,6 +169,7 @@ public class pdfParser extends AbstractParser implements Parser {
|
|||
}
|
||||
|
||||
return theDoc;
|
||||
}
|
||||
}
|
||||
catch (final Exception e) {
|
||||
if (e instanceof InterruptedException) throw (InterruptedException) e;
|
||||
|
|
|
@ -586,6 +586,11 @@ public final class plasmaSearchEvent {
|
|||
return this.rankedCache.getTopicNavigator(maxentries);
|
||||
}
|
||||
|
||||
public ArrayList<NavigatorEntry> getAuthorNavigator(final int maxentries) {
|
||||
// returns a list of authors so far seen on result set
|
||||
return this.rankedCache.getAuthorNavigator(maxentries);
|
||||
}
|
||||
|
||||
public ResultEntry oneResult(final int item) {
|
||||
// check if we already retrieved this item (happens if a search
|
||||
// page is accessed a second time)
|
||||
|
|
|
@ -35,11 +35,13 @@ import java.util.HashMap;
|
|||
import java.util.Iterator;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
import java.util.TreeMap;
|
||||
import java.util.TreeSet;
|
||||
import java.util.concurrent.ConcurrentHashMap;
|
||||
|
||||
import de.anomic.htmlFilter.htmlFilterContentScraper;
|
||||
import de.anomic.kelondro.index.BinSearch;
|
||||
import de.anomic.kelondro.order.Base64Order;
|
||||
import de.anomic.kelondro.order.Digest;
|
||||
import de.anomic.kelondro.text.Reference;
|
||||
import de.anomic.kelondro.text.ReferenceContainer;
|
||||
|
@ -76,8 +78,9 @@ public final class plasmaSearchRankingProcess {
|
|||
private final Segment indexSegment;
|
||||
private HashMap<byte[], ReferenceContainer<WordReference>> localSearchInclusion;
|
||||
private final int[] domZones;
|
||||
private final ConcurrentHashMap<String, hoststat> hostNavigator;
|
||||
private final ConcurrentHashMap<String, HostInfo> hostNavigator;
|
||||
private final ConcurrentHashMap<String, Integer> ref; // reference score computation for the commonSense heuristic
|
||||
private final TreeMap<byte[], AuthorInfo> authorNavigator;
|
||||
|
||||
public plasmaSearchRankingProcess(
|
||||
final Segment indexSegment,
|
||||
|
@ -103,7 +106,8 @@ public final class plasmaSearchRankingProcess {
|
|||
this.indexSegment = indexSegment;
|
||||
this.flagcount = new int[32];
|
||||
for (int i = 0; i < 32; i++) {this.flagcount[i] = 0;}
|
||||
this.hostNavigator = new ConcurrentHashMap<String, hoststat>();
|
||||
this.hostNavigator = new ConcurrentHashMap<String, HostInfo>();
|
||||
this.authorNavigator = new TreeMap<byte[], AuthorInfo>(Base64Order.enhancedCoder);
|
||||
this.ref = new ConcurrentHashMap<String, Integer>();
|
||||
this.domZones = new int[8];
|
||||
for (int i = 0; i < 8; i++) {this.domZones[i] = 0;}
|
||||
|
@ -160,7 +164,7 @@ public final class plasmaSearchRankingProcess {
|
|||
final Iterator<WordReferenceVars> i = decodedEntries.iterator();
|
||||
WordReferenceVars iEntry;
|
||||
Long r;
|
||||
hoststat hs;
|
||||
HostInfo hs;
|
||||
String domhash;
|
||||
boolean nav_hosts = this.query.navigators.equals("all") || this.query.navigators.indexOf("hosts") >= 0;
|
||||
while (i.hasNext()) {
|
||||
|
@ -208,7 +212,7 @@ public final class plasmaSearchRankingProcess {
|
|||
domhash = iEntry.urlHash.substring(6);
|
||||
hs = this.hostNavigator.get(domhash);
|
||||
if (hs == null) {
|
||||
this.hostNavigator.put(domhash, new hoststat(iEntry.urlHash));
|
||||
this.hostNavigator.put(domhash, new HostInfo(iEntry.urlHash));
|
||||
} else {
|
||||
hs.inc();
|
||||
}
|
||||
|
@ -321,6 +325,24 @@ public final class plasmaSearchRankingProcess {
|
|||
final URLMetadataRow u = indexSegment.urlMetadata().load(obrwi.element.metadataHash(), obrwi.element, obrwi.weight.longValue());
|
||||
if (u != null) {
|
||||
final URLMetadataRow.Components metadata = u.metadata();
|
||||
|
||||
// evaluate information of metadata for navigation
|
||||
// author navigation:
|
||||
String author = metadata.dc_creator();
|
||||
if (author != null && author.length() > 0) {
|
||||
byte[] authorhash = Word.word2hash(author);
|
||||
//synchronized (this.authorNavigator) {
|
||||
AuthorInfo in = this.authorNavigator.get(authorhash);
|
||||
if (in == null) {
|
||||
this.authorNavigator.put(authorhash, new AuthorInfo(author));
|
||||
} else {
|
||||
in.inc();
|
||||
this.authorNavigator.put(authorhash, in);
|
||||
}
|
||||
//}
|
||||
}
|
||||
|
||||
// get the url
|
||||
if (metadata.url() != null) {
|
||||
String urlstring = metadata.url().toNormalform(true, true);
|
||||
if (urlstring == null || !urlstring.matches(query.urlMask)) continue;
|
||||
|
@ -383,10 +405,10 @@ public final class plasmaSearchRankingProcess {
|
|||
return this.misses.iterator();
|
||||
}
|
||||
|
||||
public class hoststat {
|
||||
public class HostInfo {
|
||||
public int count;
|
||||
public String hashsample;
|
||||
public hoststat(String urlhash) {
|
||||
public HostInfo(String urlhash) {
|
||||
this.count = 1;
|
||||
this.hashsample = urlhash;
|
||||
}
|
||||
|
@ -395,8 +417,28 @@ public final class plasmaSearchRankingProcess {
|
|||
}
|
||||
}
|
||||
|
||||
public static final Comparator<hoststat> hscomp = new Comparator<hoststat>() {
|
||||
public int compare(hoststat o1, hoststat o2) {
|
||||
/**
 * Occurrence counter for a single author name.
 * A new instance starts with a count of one; each call of {@link #inc()}
 * adds one more occurrence.
 */
public class AuthorInfo {

    /** the author name this counter belongs to */
    public String author;

    /** number of occurrences counted so far; creating the object counts as the first one */
    public int count = 1;

    /**
     * @param author the author name to remember
     */
    public AuthorInfo(String author) {
        this.author = author;
    }

    /** Counts one more occurrence of the author. */
    public void inc() {
        this.count += 1;
    }
}
|
||||
|
||||
public static final Comparator<HostInfo> hscomp = new Comparator<HostInfo>() {
|
||||
public int compare(HostInfo o1, HostInfo o2) {
|
||||
if (o1.count < o2.count) return 1;
|
||||
if (o2.count < o1.count) return -1;
|
||||
return 0;
|
||||
}
|
||||
};
|
||||
|
||||
public static final Comparator<AuthorInfo> aicomp = new Comparator<AuthorInfo>() {
|
||||
public int compare(AuthorInfo o1, AuthorInfo o2) {
|
||||
if (o1.count < o2.count) return 1;
|
||||
if (o2.count < o1.count) return -1;
|
||||
return 0;
|
||||
|
@ -415,7 +457,7 @@ public final class plasmaSearchRankingProcess {
|
|||
public ArrayList<NavigatorEntry> getHostNavigator(int count) {
|
||||
if (!this.query.navigators.equals("all") && this.query.navigators.indexOf("hosts") < 0) return new ArrayList<NavigatorEntry>(0);
|
||||
|
||||
hoststat[] hsa = this.hostNavigator.values().toArray(new hoststat[this.hostNavigator.size()]);
|
||||
HostInfo[] hsa = this.hostNavigator.values().toArray(new HostInfo[this.hostNavigator.size()]);
|
||||
Arrays.sort(hsa, hscomp);
|
||||
int rc = Math.min(count, hsa.length);
|
||||
ArrayList<NavigatorEntry> result = new ArrayList<NavigatorEntry>();
|
||||
|
@ -488,6 +530,24 @@ public final class plasmaSearchRankingProcess {
|
|||
addTopic(descrcomps);
|
||||
}
|
||||
|
||||
public ArrayList<NavigatorEntry> getAuthorNavigator(final int count) {
|
||||
// create a list of words that had been computed by statistics over all
|
||||
// words that appeared in the url or the description of all urls
|
||||
if (!this.query.navigators.equals("all") && this.query.navigators.indexOf("authors") < 0) return new ArrayList<NavigatorEntry>(0);
|
||||
|
||||
AuthorInfo[] a = this.authorNavigator.values().toArray(new AuthorInfo[this.authorNavigator.size()]);
|
||||
Arrays.sort(a, aicomp);
|
||||
int rc = Math.min(count, a.length);
|
||||
ArrayList<NavigatorEntry> result = new ArrayList<NavigatorEntry>();
|
||||
AuthorInfo e;
|
||||
for (int i = 0; i < rc; i++) {
|
||||
e = a[i];
|
||||
//System.out.println("*** DEBUG Author = " + e.author + ", count = " + e.count);
|
||||
result.add(new NavigatorEntry(e.author, e.count));
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
public ReferenceOrder getOrder() {
|
||||
return this.order;
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue
Block a user