mirror of
https://github.com/yacy/yacy_search_server.git
synced 2024-09-19 00:01:41 +02:00
- added display of author navigation (usage of that navigator not yet implemented)
- added a synchronization in the pdf parser which should help to avoid deadlocks that occur when displaying several search results pointing to pdf sources
- fixed smaller bugs in navigation
git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@6036 6c8d7289-2bf4-0310-a012-ef5d649a1542
This commit is contained in:
parent
c879783008
commit
a9a8b8d161
|
@ -1,11 +1,8 @@
|
||||||
#(nav-domains)#
|
#(nav-domains)#::
|
||||||
::
|
|
||||||
<h3 style="padding-left:25px;">Domains</h3>
|
<h3 style="padding-left:25px;">Domains</h3>
|
||||||
<div>
|
<div><ul style="padding-left: 0px;">#{element}#
|
||||||
<ul style="padding-left: 0px;">#{element}#
|
|
||||||
<li>#[url]#</li>
|
<li>#[url]#</li>
|
||||||
#{/element}#
|
#{/element}#</ul></div>
|
||||||
</ul></div>
|
|
||||||
#(/nav-domains)#
|
#(/nav-domains)#
|
||||||
#(nav-topics)#::
|
#(nav-topics)#::
|
||||||
<h3 style="padding-left:25px;">Topics</h3>
|
<h3 style="padding-left:25px;">Topics</h3>
|
||||||
|
@ -13,6 +10,12 @@
|
||||||
<li>#[url]#</li>
|
<li>#[url]#</li>
|
||||||
#{/element}#</ul></div>
|
#{/element}#</ul></div>
|
||||||
#(/nav-topics)#
|
#(/nav-topics)#
|
||||||
|
#(nav-authors)#::
|
||||||
|
<h3 style="padding-left:25px;">Authors</h3>
|
||||||
|
<div><ul style="padding-left: 0px;">#{element}#
|
||||||
|
<li>#[url]#</li>
|
||||||
|
#{/element}#</ul></div>
|
||||||
|
#(/nav-authors)#
|
||||||
|
|
||||||
<h3 style="padding-left:25px;">Timeline</h3>
|
<h3 style="padding-left:25px;">Timeline</h3>
|
||||||
<div>
|
<div>
|
||||||
|
|
|
@ -55,8 +55,9 @@ public class yacysearchtrailer {
|
||||||
}
|
}
|
||||||
final plasmaSearchQuery theQuery = theSearch.getQuery();
|
final plasmaSearchQuery theQuery = theSearch.getQuery();
|
||||||
|
|
||||||
|
|
||||||
// compose search navigation
|
// compose search navigation
|
||||||
|
|
||||||
|
// host navigators
|
||||||
ArrayList<NavigatorEntry> hostNavigator = theSearch.getHostNavigator(10);
|
ArrayList<NavigatorEntry> hostNavigator = theSearch.getHostNavigator(10);
|
||||||
if (hostNavigator == null) {
|
if (hostNavigator == null) {
|
||||||
prop.put("nav-domains", 0);
|
prop.put("nav-domains", 0);
|
||||||
|
@ -66,10 +67,10 @@ public class yacysearchtrailer {
|
||||||
int i;
|
int i;
|
||||||
for (i = 0; i < hostNavigator.size(); i++) {
|
for (i = 0; i < hostNavigator.size(); i++) {
|
||||||
entry = hostNavigator.get(i);
|
entry = hostNavigator.get(i);
|
||||||
prop.put("nav-domains_element_" + i + "_url", "<a href=\"" + plasmaSearchQuery.navurl("html", 0, display, theQuery, theQuery.urlMask, "site:" + entry.name, theQuery.navigators) + "\">" + entry.name + " (" + entry.count + ")</a>");
|
prop.put("nav-domains_element_" + i + "_name", entry.name);
|
||||||
prop.putJSON("nav_domains_element_" + i + "_url-json", plasmaSearchQuery.navurl("json", 0, display, theQuery, theQuery.urlMask, "site:" + entry.name, theQuery.navigators));
|
prop.put("nav-domains_element_" + i + "_url", "<a href=\"" + plasmaSearchQuery.navurl("html", 0, display, theQuery, theQuery.urlMask, "site:" + entry.name, theQuery.navigators) + "\">" + entry.name + " (" + entry.count + ")</a>");
|
||||||
prop.put("nav-domains_element_" + i + "_name", entry.name);
|
prop.putJSON("nav-domains_element_" + i + "_url-json", plasmaSearchQuery.navurl("json", 0, display, theQuery, theQuery.urlMask, "site:" + entry.name, theQuery.navigators));
|
||||||
prop.put("nav-domains_element_" + i + "_count", entry.count);
|
prop.put("nav-domains_element_" + i + "_count", entry.count);
|
||||||
prop.put("nav-domains_element_" + i + "_modifier", "site:" + entry.name);
|
prop.put("nav-domains_element_" + i + "_modifier", "site:" + entry.name);
|
||||||
prop.put("nav-domains_element_" + i + "_nl", 1);
|
prop.put("nav-domains_element_" + i + "_nl", 1);
|
||||||
}
|
}
|
||||||
|
@ -95,11 +96,7 @@ public class yacysearchtrailer {
|
||||||
prop.put("nav-topics_element_" + i + "_url", "<a href=\"" + plasmaSearchQuery.navurl("html", 0, display, theQuery, theQuery.urlMask, e.name, theQuery.navigators) + "\">" + e.name + " (" + e.count + ")</a>");
|
prop.put("nav-topics_element_" + i + "_url", "<a href=\"" + plasmaSearchQuery.navurl("html", 0, display, theQuery, theQuery.urlMask, e.name, theQuery.navigators) + "\">" + e.name + " (" + e.count + ")</a>");
|
||||||
prop.putJSON("nav-topics_element_" + i + "_url-json", plasmaSearchQuery.navurl("json", 0, display, theQuery, theQuery.urlMask, e.name, theQuery.navigators));
|
prop.putJSON("nav-topics_element_" + i + "_url-json", plasmaSearchQuery.navurl("json", 0, display, theQuery, theQuery.urlMask, e.name, theQuery.navigators));
|
||||||
prop.put("nav-topics_element_" + i + "_count", e.count);
|
prop.put("nav-topics_element_" + i + "_count", e.count);
|
||||||
prop.put("nav-topics_element_" + i + "_offset", "0");
|
|
||||||
prop.put("nav-topics_element_" + i + "_display", display);
|
|
||||||
prop.put("nav-topics_element_" + i + "_modifier", e.name);
|
prop.put("nav-topics_element_" + i + "_modifier", e.name);
|
||||||
prop.put("nav-topics_element_" + i + "_contentdom", theQuery.contentdom());
|
|
||||||
prop.put("nav-topics_element_" + i + "_resource", ((theQuery.isLocal()) ? "local" : "global"));
|
|
||||||
prop.put("nav-topics_element_" + i + "_nl", (iter.hasNext() && i < MAX_TOPWORDS) ? 1 : 0);
|
prop.put("nav-topics_element_" + i + "_nl", (iter.hasNext() && i < MAX_TOPWORDS) ? 1 : 0);
|
||||||
}
|
}
|
||||||
if (i++ > MAX_TOPWORDS) {
|
if (i++ > MAX_TOPWORDS) {
|
||||||
|
@ -109,6 +106,29 @@ public class yacysearchtrailer {
|
||||||
prop.put("nav-topics_element", i);
|
prop.put("nav-topics_element", i);
|
||||||
prop.put("nav-topics", "1");
|
prop.put("nav-topics", "1");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// author navigators
|
||||||
|
ArrayList<NavigatorEntry> authorNavigator = theSearch.getAuthorNavigator(10);
|
||||||
|
if (authorNavigator == null) {
|
||||||
|
prop.put("nav-authors", 0);
|
||||||
|
} else {
|
||||||
|
prop.put("nav-authors", 1);
|
||||||
|
NavigatorEntry entry;
|
||||||
|
int i;
|
||||||
|
for (i = 0; i < authorNavigator.size(); i++) {
|
||||||
|
entry = authorNavigator.get(i);
|
||||||
|
prop.put("nav-authors_element_" + i + "_name", entry.name);
|
||||||
|
prop.put("nav-authors_element_" + i + "_url", "<a href=\"" + plasmaSearchQuery.navurl("html", 0, display, theQuery, theQuery.urlMask, "author:'" + entry.name + "'", theQuery.navigators) + "\">" + entry.name + " (" + entry.count + ")</a>");
|
||||||
|
prop.putJSON("nav-authors_element_" + i + "_url-json", plasmaSearchQuery.navurl("json", 0, display, theQuery, theQuery.urlMask, "author:'" + entry.name + "'", theQuery.navigators));
|
||||||
|
prop.put("nav-authors_element_" + i + "_count", entry.count);
|
||||||
|
prop.put("nav-authors_element_" + i + "_modifier", "author:'" + entry.name + "'");
|
||||||
|
prop.put("nav-authors_element_" + i + "_nl", 1);
|
||||||
|
}
|
||||||
|
i--;
|
||||||
|
prop.put("nav-authors_element_" + i + "_nl", 0);
|
||||||
|
prop.put("nav-authors_element", authorNavigator.size());
|
||||||
|
}
|
||||||
|
|
||||||
serverProfiling.update("SEARCH", new plasmaProfiling.searchEvent(theQuery.id(true), plasmaSearchEvent.FINALIZATION + "-" + "bottomline", 0, 0), false);
|
serverProfiling.update("SEARCH", new plasmaProfiling.searchEvent(theQuery.id(true), plasmaSearchEvent.FINALIZATION + "-" + "bottomline", 0, 0), false);
|
||||||
|
|
||||||
return prop;
|
return prop;
|
||||||
|
|
|
@ -99,6 +99,7 @@ public class pdfParser extends AbstractParser implements Parser {
|
||||||
checkInterruption();
|
checkInterruption();
|
||||||
|
|
||||||
// creating a text stripper
|
// creating a text stripper
|
||||||
|
synchronized (SUPPORTED_MIME_TYPES) {
|
||||||
final PDFTextStripper stripper = new PDFTextStripper();
|
final PDFTextStripper stripper = new PDFTextStripper();
|
||||||
theDocument = parser.getPDDocument();
|
theDocument = parser.getPDDocument();
|
||||||
|
|
||||||
|
@ -168,6 +169,7 @@ public class pdfParser extends AbstractParser implements Parser {
|
||||||
}
|
}
|
||||||
|
|
||||||
return theDoc;
|
return theDoc;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
catch (final Exception e) {
|
catch (final Exception e) {
|
||||||
if (e instanceof InterruptedException) throw (InterruptedException) e;
|
if (e instanceof InterruptedException) throw (InterruptedException) e;
|
||||||
|
|
|
@ -586,6 +586,11 @@ public final class plasmaSearchEvent {
|
||||||
return this.rankedCache.getTopicNavigator(maxentries);
|
return this.rankedCache.getTopicNavigator(maxentries);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public ArrayList<NavigatorEntry> getAuthorNavigator(final int maxentries) {
|
||||||
|
// returns the list of authors seen so far in the result set; delegates to this.rankedCache, passing maxentries through
|
||||||
|
return this.rankedCache.getAuthorNavigator(maxentries);
|
||||||
|
}
|
||||||
|
|
||||||
public ResultEntry oneResult(final int item) {
|
public ResultEntry oneResult(final int item) {
|
||||||
// check if we already retrieved this item (happens if a search
|
// check if we already retrieved this item (happens if a search
|
||||||
// pages is accessed a second time)
|
// pages is accessed a second time)
|
||||||
|
|
|
@ -35,11 +35,13 @@ import java.util.HashMap;
|
||||||
import java.util.Iterator;
|
import java.util.Iterator;
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
import java.util.Set;
|
import java.util.Set;
|
||||||
|
import java.util.TreeMap;
|
||||||
import java.util.TreeSet;
|
import java.util.TreeSet;
|
||||||
import java.util.concurrent.ConcurrentHashMap;
|
import java.util.concurrent.ConcurrentHashMap;
|
||||||
|
|
||||||
import de.anomic.htmlFilter.htmlFilterContentScraper;
|
import de.anomic.htmlFilter.htmlFilterContentScraper;
|
||||||
import de.anomic.kelondro.index.BinSearch;
|
import de.anomic.kelondro.index.BinSearch;
|
||||||
|
import de.anomic.kelondro.order.Base64Order;
|
||||||
import de.anomic.kelondro.order.Digest;
|
import de.anomic.kelondro.order.Digest;
|
||||||
import de.anomic.kelondro.text.Reference;
|
import de.anomic.kelondro.text.Reference;
|
||||||
import de.anomic.kelondro.text.ReferenceContainer;
|
import de.anomic.kelondro.text.ReferenceContainer;
|
||||||
|
@ -76,8 +78,9 @@ public final class plasmaSearchRankingProcess {
|
||||||
private final Segment indexSegment;
|
private final Segment indexSegment;
|
||||||
private HashMap<byte[], ReferenceContainer<WordReference>> localSearchInclusion;
|
private HashMap<byte[], ReferenceContainer<WordReference>> localSearchInclusion;
|
||||||
private final int[] domZones;
|
private final int[] domZones;
|
||||||
private final ConcurrentHashMap<String, hoststat> hostNavigator;
|
private final ConcurrentHashMap<String, HostInfo> hostNavigator;
|
||||||
private final ConcurrentHashMap<String, Integer> ref; // reference score computation for the commonSense heuristic
|
private final ConcurrentHashMap<String, Integer> ref; // reference score computation for the commonSense heuristic
|
||||||
|
private final TreeMap<byte[], AuthorInfo> authorNavigator;
|
||||||
|
|
||||||
public plasmaSearchRankingProcess(
|
public plasmaSearchRankingProcess(
|
||||||
final Segment indexSegment,
|
final Segment indexSegment,
|
||||||
|
@ -103,7 +106,8 @@ public final class plasmaSearchRankingProcess {
|
||||||
this.indexSegment = indexSegment;
|
this.indexSegment = indexSegment;
|
||||||
this.flagcount = new int[32];
|
this.flagcount = new int[32];
|
||||||
for (int i = 0; i < 32; i++) {this.flagcount[i] = 0;}
|
for (int i = 0; i < 32; i++) {this.flagcount[i] = 0;}
|
||||||
this.hostNavigator = new ConcurrentHashMap<String, hoststat>();
|
this.hostNavigator = new ConcurrentHashMap<String, HostInfo>();
|
||||||
|
this.authorNavigator = new TreeMap<byte[], AuthorInfo>(Base64Order.enhancedCoder);
|
||||||
this.ref = new ConcurrentHashMap<String, Integer>();
|
this.ref = new ConcurrentHashMap<String, Integer>();
|
||||||
this.domZones = new int[8];
|
this.domZones = new int[8];
|
||||||
for (int i = 0; i < 8; i++) {this.domZones[i] = 0;}
|
for (int i = 0; i < 8; i++) {this.domZones[i] = 0;}
|
||||||
|
@ -160,7 +164,7 @@ public final class plasmaSearchRankingProcess {
|
||||||
final Iterator<WordReferenceVars> i = decodedEntries.iterator();
|
final Iterator<WordReferenceVars> i = decodedEntries.iterator();
|
||||||
WordReferenceVars iEntry;
|
WordReferenceVars iEntry;
|
||||||
Long r;
|
Long r;
|
||||||
hoststat hs;
|
HostInfo hs;
|
||||||
String domhash;
|
String domhash;
|
||||||
boolean nav_hosts = this.query.navigators.equals("all") || this.query.navigators.indexOf("hosts") >= 0;
|
boolean nav_hosts = this.query.navigators.equals("all") || this.query.navigators.indexOf("hosts") >= 0;
|
||||||
while (i.hasNext()) {
|
while (i.hasNext()) {
|
||||||
|
@ -208,7 +212,7 @@ public final class plasmaSearchRankingProcess {
|
||||||
domhash = iEntry.urlHash.substring(6);
|
domhash = iEntry.urlHash.substring(6);
|
||||||
hs = this.hostNavigator.get(domhash);
|
hs = this.hostNavigator.get(domhash);
|
||||||
if (hs == null) {
|
if (hs == null) {
|
||||||
this.hostNavigator.put(domhash, new hoststat(iEntry.urlHash));
|
this.hostNavigator.put(domhash, new HostInfo(iEntry.urlHash));
|
||||||
} else {
|
} else {
|
||||||
hs.inc();
|
hs.inc();
|
||||||
}
|
}
|
||||||
|
@ -321,6 +325,24 @@ public final class plasmaSearchRankingProcess {
|
||||||
final URLMetadataRow u = indexSegment.urlMetadata().load(obrwi.element.metadataHash(), obrwi.element, obrwi.weight.longValue());
|
final URLMetadataRow u = indexSegment.urlMetadata().load(obrwi.element.metadataHash(), obrwi.element, obrwi.weight.longValue());
|
||||||
if (u != null) {
|
if (u != null) {
|
||||||
final URLMetadataRow.Components metadata = u.metadata();
|
final URLMetadataRow.Components metadata = u.metadata();
|
||||||
|
|
||||||
|
// evaluate information of metadata for navigation
|
||||||
|
// author navigation:
|
||||||
|
String author = metadata.dc_creator();
|
||||||
|
if (author != null && author.length() > 0) {
|
||||||
|
byte[] authorhash = Word.word2hash(author);
|
||||||
|
//synchronized (this.authorNavigator) {
|
||||||
|
AuthorInfo in = this.authorNavigator.get(authorhash);
|
||||||
|
if (in == null) {
|
||||||
|
this.authorNavigator.put(authorhash, new AuthorInfo(author));
|
||||||
|
} else {
|
||||||
|
in.inc();
|
||||||
|
this.authorNavigator.put(authorhash, in);
|
||||||
|
}
|
||||||
|
//}
|
||||||
|
}
|
||||||
|
|
||||||
|
// get the url
|
||||||
if (metadata.url() != null) {
|
if (metadata.url() != null) {
|
||||||
String urlstring = metadata.url().toNormalform(true, true);
|
String urlstring = metadata.url().toNormalform(true, true);
|
||||||
if (urlstring == null || !urlstring.matches(query.urlMask)) continue;
|
if (urlstring == null || !urlstring.matches(query.urlMask)) continue;
|
||||||
|
@ -383,10 +405,10 @@ public final class plasmaSearchRankingProcess {
|
||||||
return this.misses.iterator();
|
return this.misses.iterator();
|
||||||
}
|
}
|
||||||
|
|
||||||
public class hoststat {
|
public class HostInfo {
|
||||||
public int count;
|
public int count;
|
||||||
public String hashsample;
|
public String hashsample;
|
||||||
public hoststat(String urlhash) {
|
public HostInfo(String urlhash) {
|
||||||
this.count = 1;
|
this.count = 1;
|
||||||
this.hashsample = urlhash;
|
this.hashsample = urlhash;
|
||||||
}
|
}
|
||||||
|
@ -395,8 +417,28 @@ public final class plasmaSearchRankingProcess {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
public static final Comparator<hoststat> hscomp = new Comparator<hoststat>() {
|
public class AuthorInfo {
|
||||||
public int compare(hoststat o1, hoststat o2) {
|
public int count;
|
||||||
|
public String author;
|
||||||
|
public AuthorInfo(String author) {
|
||||||
|
this.count = 1;
|
||||||
|
this.author = author;
|
||||||
|
}
|
||||||
|
public void inc() {
|
||||||
|
this.count++;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
public static final Comparator<HostInfo> hscomp = new Comparator<HostInfo>() {
|
||||||
|
public int compare(HostInfo o1, HostInfo o2) {
|
||||||
|
if (o1.count < o2.count) return 1;
|
||||||
|
if (o2.count < o1.count) return -1;
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
public static final Comparator<AuthorInfo> aicomp = new Comparator<AuthorInfo>() {
|
||||||
|
public int compare(AuthorInfo o1, AuthorInfo o2) {
|
||||||
if (o1.count < o2.count) return 1;
|
if (o1.count < o2.count) return 1;
|
||||||
if (o2.count < o1.count) return -1;
|
if (o2.count < o1.count) return -1;
|
||||||
return 0;
|
return 0;
|
||||||
|
@ -415,7 +457,7 @@ public final class plasmaSearchRankingProcess {
|
||||||
public ArrayList<NavigatorEntry> getHostNavigator(int count) {
|
public ArrayList<NavigatorEntry> getHostNavigator(int count) {
|
||||||
if (!this.query.navigators.equals("all") && this.query.navigators.indexOf("hosts") < 0) return new ArrayList<NavigatorEntry>(0);
|
if (!this.query.navigators.equals("all") && this.query.navigators.indexOf("hosts") < 0) return new ArrayList<NavigatorEntry>(0);
|
||||||
|
|
||||||
hoststat[] hsa = this.hostNavigator.values().toArray(new hoststat[this.hostNavigator.size()]);
|
HostInfo[] hsa = this.hostNavigator.values().toArray(new HostInfo[this.hostNavigator.size()]);
|
||||||
Arrays.sort(hsa, hscomp);
|
Arrays.sort(hsa, hscomp);
|
||||||
int rc = Math.min(count, hsa.length);
|
int rc = Math.min(count, hsa.length);
|
||||||
ArrayList<NavigatorEntry> result = new ArrayList<NavigatorEntry>();
|
ArrayList<NavigatorEntry> result = new ArrayList<NavigatorEntry>();
|
||||||
|
@ -488,6 +530,24 @@ public final class plasmaSearchRankingProcess {
|
||||||
addTopic(descrcomps);
|
addTopic(descrcomps);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public ArrayList<NavigatorEntry> getAuthorNavigator(final int count) {
|
||||||
|
// create a list of the authors collected in this.authorNavigator, sorted with aicomp
|
||||||
|
// (highest count first) and cut to at most count entries; empty if the query does not request the "authors" navigator
|
||||||
|
if (!this.query.navigators.equals("all") && this.query.navigators.indexOf("authors") < 0) return new ArrayList<NavigatorEntry>(0);
|
||||||
|
|
||||||
|
AuthorInfo[] a = this.authorNavigator.values().toArray(new AuthorInfo[this.authorNavigator.size()]);
|
||||||
|
Arrays.sort(a, aicomp);
|
||||||
|
int rc = Math.min(count, a.length);
|
||||||
|
ArrayList<NavigatorEntry> result = new ArrayList<NavigatorEntry>();
|
||||||
|
AuthorInfo e;
|
||||||
|
for (int i = 0; i < rc; i++) {
|
||||||
|
e = a[i];
|
||||||
|
//System.out.println("*** DEBUG Author = " + e.author + ", count = " + e.count);
|
||||||
|
result.add(new NavigatorEntry(e.author, e.count));
|
||||||
|
}
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
|
||||||
public ReferenceOrder getOrder() {
|
public ReferenceOrder getOrder() {
|
||||||
return this.order;
|
return this.order;
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue
Block a user