mirror of
https://github.com/yacy/yacy_search_server.git
synced 2024-09-19 00:01:41 +02:00
first attempt to add 'real' Navigation to yacy search results: host navigation
- after a search is started, it is analysed how many hits are in each site - this can be done very efficiently, because the navigation information is hidden in the url hash and can be computed very fast - the search result shows a column on the right with the hosts and the hits per host - after a click on a host the search is modified using the efficient "site:" operator git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@5976 6c8d7289-2bf4-0310-a012-ef5d649a1542
This commit is contained in:
parent
54b9e99c01
commit
f246928c20
|
@ -120,17 +120,18 @@ document.getElementById("Enter").value = "search again";
|
|||
var progressbar = new Progressbar(#[results]#, document.getElementById("results"));
|
||||
</script>
|
||||
|
||||
#(display)#
|
||||
::
|
||||
::
|
||||
#(navigation)#
|
||||
::
|
||||
<div id="sidebar" style="float: right;">
|
||||
<h3><a href="#">Sidebar-1</a></h3>
|
||||
<p>Sidebar-1 TEXT TEXT</p>
|
||||
<h3><a href="#">Sidebar-2</a></h3>
|
||||
<p>Sidebar-2 TEXT TEXT</p>
|
||||
<h3><a href="#">Navigation</a></h3>
|
||||
<h4><a href="#">Domains</a></h4>
|
||||
<ul>
|
||||
#{domains}#
|
||||
<li>#[domain]#</li>
|
||||
#{/domains}#
|
||||
</ul>
|
||||
</div>
|
||||
#(/display)#
|
||||
#(/navigation)#
|
||||
|
||||
<!-- linklist begin -->
|
||||
#(resultTable)#::<table width="100%"><tr class="TableHeader"><td width="30%">Media</td><td width="70%">URL</tr>#(/resultTable)#
|
||||
|
|
|
@ -28,6 +28,7 @@
|
|||
// if the shell's current path is HTROOT
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.HashMap;
|
||||
import java.util.TreeSet;
|
||||
|
||||
|
@ -47,6 +48,7 @@ import de.anomic.plasma.plasmaSwitchboard;
|
|||
import de.anomic.plasma.plasmaSwitchboardConstants;
|
||||
import de.anomic.plasma.parser.Word;
|
||||
import de.anomic.plasma.parser.Condenser;
|
||||
import de.anomic.plasma.plasmaSearchRankingProcess.hostnaventry;
|
||||
import de.anomic.server.serverCore;
|
||||
import de.anomic.server.serverDomains;
|
||||
import de.anomic.server.serverObjects;
|
||||
|
@ -464,7 +466,7 @@ public class yacysearch {
|
|||
resnav.append(navurla(thispage - 1, display, theQuery, originalUrlMask));
|
||||
resnav.append("<strong><</strong></a> ");
|
||||
*/
|
||||
resnav.append(navurla(thispage - 1, display, theQuery, originalUrlMask));
|
||||
resnav.append(navurla(thispage - 1, display, theQuery, originalUrlMask, null));
|
||||
resnav.append("<img src=\"env/grafics/navdl.gif\" width=\"16\" height=\"16\"></a> ");
|
||||
}
|
||||
final int numberofpages = Math.min(10, Math.max(thispage + 2, totalcount / theQuery.displayResults()));
|
||||
|
@ -484,7 +486,7 @@ public class yacysearch {
|
|||
resnav.append(i + 1);
|
||||
resnav.append("</a> ");
|
||||
*/
|
||||
resnav.append(navurla(i, display, theQuery, originalUrlMask));
|
||||
resnav.append(navurla(i, display, theQuery, originalUrlMask, null));
|
||||
resnav.append("<img src=\"env/grafics/navd");
|
||||
resnav.append(i + 1);
|
||||
resnav.append(".gif\" width=\"16\" height=\"16\"></a> ");
|
||||
|
@ -498,12 +500,26 @@ public class yacysearch {
|
|||
resnav.append(navurla(thispage + 1, display, theQuery, originalUrlMask));
|
||||
resnav.append("<strong>></strong></a>");
|
||||
*/
|
||||
resnav.append(navurla(thispage + 1, display, theQuery, originalUrlMask));
|
||||
resnav.append(navurla(thispage + 1, display, theQuery, originalUrlMask, null));
|
||||
resnav.append("<img src=\"env/grafics/navdr.gif\" width=\"16\" height=\"16\"></a>");
|
||||
}
|
||||
prop.put("num-results_resnav", resnav.toString());
|
||||
|
||||
// compose search navigation
|
||||
ArrayList<hostnaventry> hostNavigator = theSearch.getHostNavigator(10);
|
||||
if (hostNavigator == null) {
|
||||
prop.put("navigation", 0);
|
||||
} else {
|
||||
prop.put("navigation", 1);
|
||||
hostnaventry entry;
|
||||
for (int i = 0; i < hostNavigator.size(); i++) {
|
||||
entry = hostNavigator.get(i);
|
||||
prop.put("navigation_domains_" + i + "_domain", navurla(thispage, display, theQuery, originalUrlMask, "site:" + entry.host) + entry.host + " (" + entry.count + ")</a>");
|
||||
}
|
||||
prop.put("navigation_domains", hostNavigator.size());
|
||||
}
|
||||
|
||||
// generate the search result lines; they will be produced by another servlet
|
||||
// generate the search result lines; the content will be produced by another servlet
|
||||
for (int i = 0; i < theQuery.displayResults(); i++) {
|
||||
prop.put("results_" + i + "_item", offset + i);
|
||||
prop.put("results_" + i + "_eventID", theQuery.id(false));
|
||||
|
@ -573,10 +589,10 @@ public class yacysearch {
|
|||
/**
|
||||
* generates the page navigation bar
|
||||
*/
|
||||
private static String navurla(final int page, final int display, final plasmaSearchQuery theQuery, final String originalUrlMask) {
|
||||
private static String navurla(final int page, final int display, final plasmaSearchQuery theQuery, final String originalUrlMask, String addToQuery) {
|
||||
return
|
||||
"<a href=\"yacysearch.html?display=" + display +
|
||||
"&search=" + theQuery.queryString(true) +
|
||||
"&search=" + theQuery.queryString(true) + ((addToQuery == null) ? "" : "+" + addToQuery) +
|
||||
"&maximumRecords="+ theQuery.displayResults() +
|
||||
"&startRecord=" + (page * theQuery.displayResults()) +
|
||||
"&resource=" + ((theQuery.isLocal()) ? "local" : "global") +
|
||||
|
|
|
@ -52,6 +52,7 @@ import de.anomic.kelondro.util.SortStore;
|
|||
import de.anomic.kelondro.util.Log;
|
||||
import de.anomic.plasma.parser.Word;
|
||||
import de.anomic.plasma.parser.Condenser;
|
||||
import de.anomic.plasma.plasmaSearchRankingProcess.hostnaventry;
|
||||
import de.anomic.plasma.plasmaSnippetCache.MediaSnippet;
|
||||
import de.anomic.server.serverProfiling;
|
||||
import de.anomic.yacy.yacySearch;
|
||||
|
@ -94,7 +95,8 @@ public final class plasmaSearchEvent {
|
|||
TreeSet<byte[]> snippetFetchWordHashes; // a set of word hashes that are used to match with the snippets
|
||||
long urlRetrievalAllTime;
|
||||
long snippetComputationAllTime;
|
||||
ResultURLs crawlResults;
|
||||
public ResultURLs crawlResults;
|
||||
public ArrayList<hostnaventry> hostNavigator;
|
||||
|
||||
@SuppressWarnings("unchecked")
|
||||
private plasmaSearchEvent(final plasmaSearchQuery query,
|
||||
|
@ -135,6 +137,7 @@ public final class plasmaSearchEvent {
|
|||
(query.domType == plasmaSearchQuery.SEARCHDOM_CLUSTERALL)) {
|
||||
// do a global search
|
||||
this.rankedCache = new plasmaSearchRankingProcess(wordIndex, query, max_results_preparation, 16);
|
||||
this.hostNavigator = null;
|
||||
|
||||
final int fetchpeers = 12;
|
||||
|
||||
|
@ -171,6 +174,7 @@ public final class plasmaSearchEvent {
|
|||
// do a local search
|
||||
this.rankedCache = new plasmaSearchRankingProcess(wordIndex, query, max_results_preparation, 2);
|
||||
this.rankedCache.execQuery();
|
||||
this.hostNavigator = rankedCache.getHostNavigator(10);
|
||||
//plasmaWordIndex.Finding finding = wordIndex.retrieveURLs(query, false, 2, ranking, process);
|
||||
|
||||
if (generateAbstracts) {
|
||||
|
@ -230,6 +234,7 @@ public final class plasmaSearchEvent {
|
|||
// so following sortings together with the global results will be fast
|
||||
try {
|
||||
rankedCache.execQuery();
|
||||
hostNavigator = rankedCache.getHostNavigator(10);
|
||||
} catch (final Exception e) {
|
||||
e.printStackTrace();
|
||||
}
|
||||
|
@ -563,52 +568,16 @@ public final class plasmaSearchEvent {
|
|||
Log.logInfo("search", "sorted out hash " + urlhash + " during search: " + reason);
|
||||
}
|
||||
|
||||
/*
|
||||
public ResultEntry oneResult(final int item) {
|
||||
return oneResult(item, System.currentTimeMillis() + 100);
|
||||
public ArrayList<hostnaventry> getHostNavigator(int maxentries) {
|
||||
if (this.hostNavigator != null) return this.hostNavigator;
|
||||
if (localSearchThread != null && localSearchThread.isAlive()) {
|
||||
try {Thread.sleep(100L);} catch (final InterruptedException e) {}
|
||||
}
|
||||
this.hostNavigator = rankedCache.getHostNavigator(10);
|
||||
if (this.hostNavigator.size() == 0) this.hostNavigator = null;
|
||||
return this.hostNavigator;
|
||||
}
|
||||
|
||||
public ResultEntry oneResult(final int item, long timeout) {
|
||||
// check if we already retrieved this item (happens if a search pages is accessed a second time)
|
||||
serverProfiling.update("SEARCH", new plasmaProfiling.searchEvent(query.id(true), "obtain one result entry - start", 0, 0));
|
||||
if (this.result.sizeStore() > item) {
|
||||
// we have the wanted result already in the result array .. return that
|
||||
return this.result.element(item).element;
|
||||
}
|
||||
|
||||
if ((query.domType == plasmaSearchQuery.SEARCHDOM_GLOBALDHT) ||
|
||||
(query.domType == plasmaSearchQuery.SEARCHDOM_CLUSTERALL)) {
|
||||
// this is a search using remote search threads. Also the local search thread is started as background process
|
||||
while (
|
||||
localSearchThread != null &&
|
||||
localSearchThread.isAlive() &&
|
||||
System.currentTimeMillis() < timeout) {
|
||||
// in case that the local search takes longer than some other remote search requests,
|
||||
// do some sleeps to give the local process a chance to contribute
|
||||
try {Thread.sleep(10);} catch (final InterruptedException e) {}
|
||||
}
|
||||
// now wait until as many remote worker threads have finished, as we want to display results
|
||||
while (
|
||||
this.primarySearchThreads != null &&
|
||||
anyWorkerAlive() &&
|
||||
countWorkerFinished() <= item &&
|
||||
System.currentTimeMillis() < timeout &&
|
||||
(result.size() <= item || countFinishedRemoteSearch() <= item)) {
|
||||
try {Thread.sleep(10);} catch (final InterruptedException e) {}
|
||||
}
|
||||
|
||||
}
|
||||
// finally wait until enough results are there produced from the snippet fetch process
|
||||
while (anyWorkerAlive() && result.size() <= item) {
|
||||
try {Thread.sleep(10);} catch (final InterruptedException e) {}
|
||||
}
|
||||
|
||||
// finally, if there is something, return the result
|
||||
if (this.result.size() <= item) return null;
|
||||
return this.result.element(item).element;
|
||||
}
|
||||
*/
|
||||
|
||||
public ResultEntry oneResult(final int item) {
|
||||
// check if we already retrieved this item (happens if a search
|
||||
// pages is accessed a second time)
|
||||
|
|
|
@ -74,6 +74,7 @@ public final class plasmaSearchRankingProcess {
|
|||
private final plasmaWordIndex wordIndex;
|
||||
private HashMap<byte[], ReferenceContainer<WordReference>>[] localSearchContainerMaps;
|
||||
private final int[] domZones;
|
||||
private HashMap<String, hoststat> hostNavigator;
|
||||
|
||||
public plasmaSearchRankingProcess(
|
||||
final plasmaWordIndex wordIndex,
|
||||
|
@ -101,6 +102,7 @@ public final class plasmaSearchRankingProcess {
|
|||
this.flagcount = new int[32];
|
||||
for (int i = 0; i < 32; i++) {this.flagcount[i] = 0;}
|
||||
this.domZones = new int[8];
|
||||
this.hostNavigator = new HashMap<String, hoststat>();
|
||||
for (int i = 0; i < 8; i++) {this.domZones[i] = 0;}
|
||||
}
|
||||
|
||||
|
@ -158,6 +160,8 @@ public final class plasmaSearchRankingProcess {
|
|||
final Iterator<WordReferenceVars> i = decodedEntries.iterator();
|
||||
WordReferenceVars iEntry;
|
||||
Long r;
|
||||
hoststat hs;
|
||||
String domhash;
|
||||
while (i.hasNext()) {
|
||||
iEntry = i.next();
|
||||
assert (iEntry.metadataHash().length() == index.row().primaryKeyLength);
|
||||
|
@ -196,13 +200,17 @@ public final class plasmaSearchRankingProcess {
|
|||
}
|
||||
|
||||
// count domZones
|
||||
/*
|
||||
indexURLEntry uentry = wordIndex.loadedURL.load(iEntry.urlHash, iEntry, 0); // this eats up a lot of time!!!
|
||||
yacyURL uurl = (uentry == null) ? null : uentry.comp().url();
|
||||
System.out.println("DEBUG domDomain dom=" + ((uurl == null) ? "null" : uurl.getHost()) + ", zone=" + yacyURL.domDomain(iEntry.urlHash()));
|
||||
*/
|
||||
this.domZones[yacyURL.domDomain(iEntry.metadataHash())]++;
|
||||
|
||||
// get statistics for host navigator
|
||||
domhash = iEntry.urlHash.substring(6);
|
||||
hs = this.hostNavigator.get(domhash);
|
||||
if (hs == null) {
|
||||
this.hostNavigator.put(domhash, new hoststat(iEntry.urlHash));
|
||||
} else {
|
||||
hs.inc();
|
||||
}
|
||||
|
||||
// insert
|
||||
if ((maxentries < 0) || (stack.size() < maxentries)) {
|
||||
// in case that we don't have enough yet, accept any new entry
|
||||
|
@ -225,6 +233,51 @@ public final class plasmaSearchRankingProcess {
|
|||
//if ((query.neededResults() > 0) && (container.size() > query.neededResults())) remove(true, true);
|
||||
serverProfiling.update("SEARCH", new plasmaProfiling.searchEvent(query.id(true), plasmaSearchEvent.PRESORT, index.size(), System.currentTimeMillis() - timer), false);
|
||||
}
|
||||
|
||||
/**
 * Hit counter for one host, together with one url hash that belongs to it.
 */
public class hoststat {

    /** number of hits counted so far; starts at 1 for the hash given at creation */
    public int count;

    /** the url hash that was passed to the constructor; never changed afterwards */
    public final String hashsample;

    /**
     * Creates the statistic with a count of one.
     *
     * @param urlhash the url hash that is stored as sample
     */
    public hoststat(String urlhash) {
        this.count = 1;
        this.hashsample = urlhash;
    }

    /**
     * Counts one more hit.
     */
    public void inc() {
        this.count++;
    }
}
|
||||
|
||||
/**
 * One line of the host navigation: a host name and its number of hits.
 * Both values are set once at creation.
 */
public class hostnaventry {

    /** number of hits for the host */
    public final int count;

    /** the host name */
    public final String host;

    /**
     * @param host the host name
     * @param count the number of hits for this host
     */
    public hostnaventry(String host, int count) {
        this.host = host;
        this.count = count;
    }
}
|
||||
|
||||
public ArrayList<hostnaventry> getHostNavigator(int maxentries) {
|
||||
ScoreCluster<String> score = new ScoreCluster<String>();
|
||||
for (Map.Entry<String, hoststat> hsentry: this.hostNavigator.entrySet()) {
|
||||
score.addScore(hsentry.getKey(), hsentry.getValue().count);
|
||||
}
|
||||
int rc = Math.min(maxentries, score.size());
|
||||
ArrayList<hostnaventry> result = new ArrayList<hostnaventry>();
|
||||
String hosthash;
|
||||
hoststat hs;
|
||||
URLMetadataRow mr;
|
||||
yacyURL url;
|
||||
for (int i = 0; i < rc; i++) {
|
||||
hosthash = score.getMaxObject();
|
||||
hs = this.hostNavigator.get(hosthash);
|
||||
mr = wordIndex.metadata().load(hs.hashsample, null, 0);
|
||||
if (mr == null) continue;
|
||||
url = mr.metadata().url();
|
||||
if (url == null) continue;
|
||||
result.add(new hostnaventry(url.getHost(), score.getScore(hosthash)));
|
||||
score.deleteScore(hosthash);
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
private boolean testFlags(final WordReference ientry) {
|
||||
if (query.constraint == null) return true;
|
||||
|
|
|
@ -21,25 +21,6 @@
|
|||
// You should have received a copy of the GNU General Public License
|
||||
// along with this program; if not, write to the Free Software
|
||||
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
||||
//
|
||||
// Using this software in any meaning (reading, learning, copying, compiling,
|
||||
// running) means that you agree that the Author(s) is (are) not responsible
|
||||
// for cost, loss of data or any harm that may be caused directly or indirectly
|
||||
// by usage of this softare or this documentation. The usage of this software
|
||||
// is on your own risk. The installation and usage (starting/running) of this
|
||||
// software may allow other people or application to access your computer and
|
||||
// any attached devices and is highly dependent on the configuration of the
|
||||
// software which must be done by the user of the software; the author(s) is
|
||||
// (are) also not responsible for proper configuration and usage of the
|
||||
// software, even if provoked by documentation provided together with
|
||||
// the software.
|
||||
//
|
||||
// Any changes to this file according to the GPL as documented in the file
|
||||
// gpl.txt aside this file in the shipment you received can be done to the
|
||||
// lines that follows this copyright notice here, but changes must not be
|
||||
// done inside the copyright notice above. A re-distribution must contain
|
||||
// the intact and unchanged copyright notice.
|
||||
// Contributions and changes to the program code must be marked as such.
|
||||
|
||||
package de.anomic.yacy;
|
||||
|
||||
|
|
Loading…
Reference in New Issue
Block a user