- added display of author navigation (usage of that navigator not yet implemented)

- added a synchronization in pdf parser which should help to avoid deadlocks that occur when displaying several search results pointing to pdf sources
- fixed smaller bugs in navigation

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@6036 6c8d7289-2bf4-0310-a012-ef5d649a1542
This commit is contained in:
orbiter 2009-06-08 22:01:26 +00:00
parent c879783008
commit a9a8b8d161
5 changed files with 114 additions and 24 deletions

View File

@ -1,11 +1,8 @@
#(nav-domains)#
::
#(nav-domains)#::
<h3 style="padding-left:25px;">Domains</h3>
<div>
<ul style="padding-left: 0px;">#{element}#
<div><ul style="padding-left: 0px;">#{element}#
<li>#[url]#</li>
#{/element}#
</ul></div>
#{/element}#</ul></div>
#(/nav-domains)#
#(nav-topics)#::
<h3 style="padding-left:25px;">Topics</h3>
@ -13,6 +10,12 @@
<li>#[url]#</li>
#{/element}#</ul></div>
#(/nav-topics)#
#(nav-authors)#::
<h3 style="padding-left:25px;">Authors</h3>
<div><ul style="padding-left: 0px;">#{element}#
<li>#[url]#</li>
#{/element}#</ul></div>
#(/nav-authors)#
<h3 style="padding-left:25px;">Timeline</h3>
<div>

View File

@ -55,8 +55,9 @@ public class yacysearchtrailer {
}
final plasmaSearchQuery theQuery = theSearch.getQuery();
// compose search navigation
// host navigators
ArrayList<NavigatorEntry> hostNavigator = theSearch.getHostNavigator(10);
if (hostNavigator == null) {
prop.put("nav-domains", 0);
@ -66,10 +67,10 @@ public class yacysearchtrailer {
int i;
for (i = 0; i < hostNavigator.size(); i++) {
entry = hostNavigator.get(i);
prop.put("nav-domains_element_" + i + "_url", "<a href=\"" + plasmaSearchQuery.navurl("html", 0, display, theQuery, theQuery.urlMask, "site:" + entry.name, theQuery.navigators) + "\">" + entry.name + " (" + entry.count + ")</a>");
prop.putJSON("nav_domains_element_" + i + "_url-json", plasmaSearchQuery.navurl("json", 0, display, theQuery, theQuery.urlMask, "site:" + entry.name, theQuery.navigators));
prop.put("nav-domains_element_" + i + "_name", entry.name);
prop.put("nav-domains_element_" + i + "_count", entry.count);
prop.put("nav-domains_element_" + i + "_name", entry.name);
prop.put("nav-domains_element_" + i + "_url", "<a href=\"" + plasmaSearchQuery.navurl("html", 0, display, theQuery, theQuery.urlMask, "site:" + entry.name, theQuery.navigators) + "\">" + entry.name + " (" + entry.count + ")</a>");
prop.putJSON("nav-domains_element_" + i + "_url-json", plasmaSearchQuery.navurl("json", 0, display, theQuery, theQuery.urlMask, "site:" + entry.name, theQuery.navigators));
prop.put("nav-domains_element_" + i + "_count", entry.count);
prop.put("nav-domains_element_" + i + "_modifier", "site:" + entry.name);
prop.put("nav-domains_element_" + i + "_nl", 1);
}
@ -95,11 +96,7 @@ public class yacysearchtrailer {
prop.put("nav-topics_element_" + i + "_url", "<a href=\"" + plasmaSearchQuery.navurl("html", 0, display, theQuery, theQuery.urlMask, e.name, theQuery.navigators) + "\">" + e.name + " (" + e.count + ")</a>");
prop.putJSON("nav-topics_element_" + i + "_url-json", plasmaSearchQuery.navurl("json", 0, display, theQuery, theQuery.urlMask, e.name, theQuery.navigators));
prop.put("nav-topics_element_" + i + "_count", e.count);
prop.put("nav-topics_element_" + i + "_offset", "0");
prop.put("nav-topics_element_" + i + "_display", display);
prop.put("nav-topics_element_" + i + "_modifier", e.name);
prop.put("nav-topics_element_" + i + "_contentdom", theQuery.contentdom());
prop.put("nav-topics_element_" + i + "_resource", ((theQuery.isLocal()) ? "local" : "global"));
prop.put("nav-topics_element_" + i + "_nl", (iter.hasNext() && i < MAX_TOPWORDS) ? 1 : 0);
}
if (i++ > MAX_TOPWORDS) {
@ -109,6 +106,29 @@ public class yacysearchtrailer {
prop.put("nav-topics_element", i);
prop.put("nav-topics", "1");
}
// author navigators
// getAuthorNavigator may deliver an empty list instead of null (e.g. when the
// author navigator is not requested). An empty list is treated like a missing
// navigator here: otherwise "nav-authors" would be switched on without any
// element and a stray "nav-authors_element_-1_nl" property would be written.
ArrayList<NavigatorEntry> authorNavigator = theSearch.getAuthorNavigator(10);
if (authorNavigator == null || authorNavigator.isEmpty()) {
    prop.put("nav-authors", 0);
} else {
    prop.put("nav-authors", 1);
    NavigatorEntry entry;
    int i;
    for (i = 0; i < authorNavigator.size(); i++) {
        entry = authorNavigator.get(i);
        // NOTE(review): entry.name is written into the HTML anchor without escaping here;
        // confirm that author names are sanitized before they reach this point
        prop.put("nav-authors_element_" + i + "_name", entry.name);
        prop.put("nav-authors_element_" + i + "_url", "<a href=\"" + plasmaSearchQuery.navurl("html", 0, display, theQuery, theQuery.urlMask, "author:'" + entry.name + "'", theQuery.navigators) + "\">" + entry.name + " (" + entry.count + ")</a>");
        prop.putJSON("nav-authors_element_" + i + "_url-json", plasmaSearchQuery.navurl("json", 0, display, theQuery, theQuery.urlMask, "author:'" + entry.name + "'", theQuery.navigators));
        prop.put("nav-authors_element_" + i + "_count", entry.count);
        prop.put("nav-authors_element_" + i + "_modifier", "author:'" + entry.name + "'");
        // "_nl" is 1 for every element except the last one
        prop.put("nav-authors_element_" + i + "_nl", (i < authorNavigator.size() - 1) ? 1 : 0);
    }
    prop.put("nav-authors_element", authorNavigator.size());
}
serverProfiling.update("SEARCH", new plasmaProfiling.searchEvent(theQuery.id(true), plasmaSearchEvent.FINALIZATION + "-" + "bottomline", 0, 0), false);
return prop;

View File

@ -99,6 +99,7 @@ public class pdfParser extends AbstractParser implements Parser {
checkInterruption();
// creating a text stripper
synchronized (SUPPORTED_MIME_TYPES) {
final PDFTextStripper stripper = new PDFTextStripper();
theDocument = parser.getPDDocument();
@ -168,6 +169,7 @@ public class pdfParser extends AbstractParser implements Parser {
}
return theDoc;
}
}
catch (final Exception e) {
if (e instanceof InterruptedException) throw (InterruptedException) e;

View File

@ -586,6 +586,11 @@ public final class plasmaSearchEvent {
return this.rankedCache.getTopicNavigator(maxentries);
}
/**
 * Returns a list of the authors seen so far on the result set.
 * The request is handed over unchanged to the ranked cache.
 *
 * @param maxentries limit that is passed on to the ranked cache
 * @return the author navigator entries delivered by the ranked cache
 */
public ArrayList<NavigatorEntry> getAuthorNavigator(final int maxentries) {
    final ArrayList<NavigatorEntry> authors = this.rankedCache.getAuthorNavigator(maxentries);
    return authors;
}
public ResultEntry oneResult(final int item) {
// check if we already retrieved this item (happens if a search
// pages is accessed a second time)

View File

@ -35,11 +35,13 @@ import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;
import java.util.Set;
import java.util.TreeMap;
import java.util.TreeSet;
import java.util.concurrent.ConcurrentHashMap;
import de.anomic.htmlFilter.htmlFilterContentScraper;
import de.anomic.kelondro.index.BinSearch;
import de.anomic.kelondro.order.Base64Order;
import de.anomic.kelondro.order.Digest;
import de.anomic.kelondro.text.Reference;
import de.anomic.kelondro.text.ReferenceContainer;
@ -76,8 +78,9 @@ public final class plasmaSearchRankingProcess {
private final Segment indexSegment;
private HashMap<byte[], ReferenceContainer<WordReference>> localSearchInclusion;
private final int[] domZones;
private final ConcurrentHashMap<String, hoststat> hostNavigator;
private final ConcurrentHashMap<String, HostInfo> hostNavigator;
private final ConcurrentHashMap<String, Integer> ref; // reference score computation for the commonSense heuristic
private final TreeMap<byte[], AuthorInfo> authorNavigator;
public plasmaSearchRankingProcess(
final Segment indexSegment,
@ -103,7 +106,8 @@ public final class plasmaSearchRankingProcess {
this.indexSegment = indexSegment;
this.flagcount = new int[32];
for (int i = 0; i < 32; i++) {this.flagcount[i] = 0;}
this.hostNavigator = new ConcurrentHashMap<String, hoststat>();
this.hostNavigator = new ConcurrentHashMap<String, HostInfo>();
this.authorNavigator = new TreeMap<byte[], AuthorInfo>(Base64Order.enhancedCoder);
this.ref = new ConcurrentHashMap<String, Integer>();
this.domZones = new int[8];
for (int i = 0; i < 8; i++) {this.domZones[i] = 0;}
@ -160,7 +164,7 @@ public final class plasmaSearchRankingProcess {
final Iterator<WordReferenceVars> i = decodedEntries.iterator();
WordReferenceVars iEntry;
Long r;
hoststat hs;
HostInfo hs;
String domhash;
boolean nav_hosts = this.query.navigators.equals("all") || this.query.navigators.indexOf("hosts") >= 0;
while (i.hasNext()) {
@ -208,7 +212,7 @@ public final class plasmaSearchRankingProcess {
domhash = iEntry.urlHash.substring(6);
hs = this.hostNavigator.get(domhash);
if (hs == null) {
this.hostNavigator.put(domhash, new hoststat(iEntry.urlHash));
this.hostNavigator.put(domhash, new HostInfo(iEntry.urlHash));
} else {
hs.inc();
}
@ -321,6 +325,24 @@ public final class plasmaSearchRankingProcess {
final URLMetadataRow u = indexSegment.urlMetadata().load(obrwi.element.metadataHash(), obrwi.element, obrwi.weight.longValue());
if (u != null) {
final URLMetadataRow.Components metadata = u.metadata();
// evaluate information of metadata for navigation
// author navigation: count the occurrences of each author (dc_creator),
// keyed by the word hash of the author string
String author = metadata.dc_creator();
if (author != null && author.length() > 0) {
    byte[] authorhash = Word.word2hash(author);
    // NOTE(review): authorNavigator is a plain TreeMap and this access is not
    // synchronized -- confirm that this code is never executed by several threads at once
    AuthorInfo in = this.authorNavigator.get(authorhash);
    if (in == null) {
        // first occurrence: the new entry starts with a count of 1
        this.authorNavigator.put(authorhash, new AuthorInfo(author));
    } else {
        // the entry is already stored in the map, so incrementing it in place
        // is sufficient; putting the same object again is not needed
        in.inc();
    }
}
// get the url
if (metadata.url() != null) {
String urlstring = metadata.url().toNormalform(true, true);
if (urlstring == null || !urlstring.matches(query.urlMask)) continue;
@ -383,10 +405,10 @@ public final class plasmaSearchRankingProcess {
return this.misses.iterator();
}
public class hoststat {
public class HostInfo {
public int count;
public String hashsample;
public hoststat(String urlhash) {
public HostInfo(String urlhash) {
this.count = 1;
this.hashsample = urlhash;
}
@ -395,8 +417,28 @@ public final class plasmaSearchRankingProcess {
}
}
public static final Comparator<hoststat> hscomp = new Comparator<hoststat>() {
public int compare(hoststat o1, hoststat o2) {
/**
 * Occurrence counter for a single author.
 * A new instance starts with a count of one.
 */
public class AuthorInfo {
    /** the author string as handed to the constructor */
    public String author;
    /** number of occurrences counted so far */
    public int count;

    /**
     * Creates the counter for the first occurrence of an author.
     *
     * @param author the author string to remember
     */
    public AuthorInfo(String author) {
        this.author = author;
        this.count = 1;
    }

    /** Counts one more occurrence. */
    public void inc() {
        this.count += 1;
    }
}
/**
 * Orders HostInfo objects by descending count: the object with the
 * higher count sorts first, equal counts compare as equal.
 */
public static final Comparator<HostInfo> hscomp = new Comparator<HostInfo>() {
    public int compare(HostInfo left, HostInfo right) {
        if (left.count == right.count) return 0;
        return (left.count < right.count) ? 1 : -1;
    }
};
public static final Comparator<AuthorInfo> aicomp = new Comparator<AuthorInfo>() {
public int compare(AuthorInfo o1, AuthorInfo o2) {
if (o1.count < o2.count) return 1;
if (o2.count < o1.count) return -1;
return 0;
@ -415,7 +457,7 @@ public final class plasmaSearchRankingProcess {
public ArrayList<NavigatorEntry> getHostNavigator(int count) {
if (!this.query.navigators.equals("all") && this.query.navigators.indexOf("hosts") < 0) return new ArrayList<NavigatorEntry>(0);
hoststat[] hsa = this.hostNavigator.values().toArray(new hoststat[this.hostNavigator.size()]);
HostInfo[] hsa = this.hostNavigator.values().toArray(new HostInfo[this.hostNavigator.size()]);
Arrays.sort(hsa, hscomp);
int rc = Math.min(count, hsa.length);
ArrayList<NavigatorEntry> result = new ArrayList<NavigatorEntry>();
@ -488,6 +530,24 @@ public final class plasmaSearchRankingProcess {
addTopic(descrcomps);
}
/**
 * Creates the author navigator: the authors collected in the author
 * navigator map, ordered by descending occurrence count and cut off
 * after the given number of entries.
 *
 * @param count maximum number of entries in the result
 * @return a list with one NavigatorEntry (author, count) per author; an empty
 *         list if the query's navigators are neither "all" nor contain "authors"
 */
public ArrayList<NavigatorEntry> getAuthorNavigator(final int count) {
    // the author navigator was not requested for this query: deliver nothing
    if (!this.query.navigators.equals("all") && this.query.navigators.indexOf("authors") < 0) return new ArrayList<NavigatorEntry>(0);

    // NOTE(review): authorNavigator is a plain TreeMap that is read here without
    // synchronization -- confirm that it cannot be modified while this snapshot is taken
    AuthorInfo[] a = this.authorNavigator.values().toArray(new AuthorInfo[this.authorNavigator.size()]);
    Arrays.sort(a, aicomp); // highest count first

    // clamp to zero so that a negative count still yields an empty list
    int rc = Math.max(0, Math.min(count, a.length));
    ArrayList<NavigatorEntry> result = new ArrayList<NavigatorEntry>(rc);
    AuthorInfo e;
    for (int i = 0; i < rc; i++) {
        e = a[i];
        result.add(new NavigatorEntry(e.author, e.count));
    }
    return result;
}
public ReferenceOrder getOrder() {
return this.order;
}