mirror of
https://github.com/yacy/yacy_search_server.git
synced 2024-09-19 00:01:41 +02:00
removed unused code (HostStat)
This commit is contained in:
parent
d3a88eaecb
commit
76c53faeb2
|
@ -32,7 +32,6 @@ import java.util.ArrayList;
|
|||
import java.util.Collection;
|
||||
import java.util.Date;
|
||||
import java.util.HashMap;
|
||||
import java.util.Iterator;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
|
@ -56,7 +55,6 @@ import net.yacy.cora.federate.solr.instance.RemoteInstance;
|
|||
import net.yacy.cora.federate.solr.instance.ShardInstance;
|
||||
import net.yacy.cora.protocol.HeaderFramework;
|
||||
import net.yacy.cora.sorting.ReversibleScoreMap;
|
||||
import net.yacy.cora.sorting.ScoreMap;
|
||||
import net.yacy.cora.sorting.WeakPriorityBlockingQueue;
|
||||
import net.yacy.cora.storage.ZIPReader;
|
||||
import net.yacy.cora.storage.ZIPWriter;
|
||||
|
@ -87,7 +85,6 @@ public final class Fulltext {
|
|||
private final File segmentPath;
|
||||
private final File archivePath;
|
||||
private Export exportthread; // will have a export thread assigned if exporter is running
|
||||
private ArrayList<HostStat> statsDump;
|
||||
private InstanceMirror solrInstances;
|
||||
private final CollectionConfiguration collectionConfiguration;
|
||||
private final WebgraphConfiguration webgraphConfiguration;
|
||||
|
@ -98,7 +95,6 @@ public final class Fulltext {
|
|||
this.segmentPath = segmentPath;
|
||||
this.archivePath = archivePath;
|
||||
this.exportthread = null; // will have a export thread assigned if exporter is running
|
||||
this.statsDump = null;
|
||||
this.solrInstances = new InstanceMirror();
|
||||
this.collectionConfiguration = collectionConfiguration;
|
||||
this.webgraphConfiguration = webgraphConfiguration;
|
||||
|
@ -206,9 +202,7 @@ public final class Fulltext {
|
|||
}
|
||||
|
||||
/**
 * Drops cached state held by this object: empties and releases the cached
 * host statistics list ({@code statsDump}) and asks the Solr instance mirror
 * ({@code solrInstances}) to clear its caches.
 */
public void clearCaches() {
|
||||
// empty the list first (if one exists); the reference itself is released below
if (this.statsDump != null) this.statsDump.clear();
|
||||
this.solrInstances.clearCaches();
|
||||
this.statsDump = null;
|
||||
}
|
||||
|
||||
public void clearLocalSolr() throws IOException {
|
||||
|
@ -261,7 +255,6 @@ public final class Fulltext {
|
|||
}
|
||||
|
||||
public void close() {
|
||||
this.statsDump = null;
|
||||
try {
|
||||
this.solrInstances.close();
|
||||
} catch (Throwable e) {}
|
||||
|
@ -347,7 +340,6 @@ public final class Fulltext {
|
|||
} catch (final SolrException e) {
|
||||
throw new IOException(e.getMessage(), e);
|
||||
}
|
||||
this.statsDump = null;
|
||||
if (MemoryControl.shortStatus()) clearCaches();
|
||||
}
|
||||
|
||||
|
@ -359,7 +351,6 @@ public final class Fulltext {
|
|||
} catch (final SolrException e) {
|
||||
throw new IOException(e.getMessage(), e);
|
||||
}
|
||||
this.statsDump = null;
|
||||
if (MemoryControl.shortStatus()) clearCaches();
|
||||
}
|
||||
|
||||
|
@ -378,7 +369,6 @@ public final class Fulltext {
|
|||
} catch (final SolrException e) {
|
||||
throw new IOException(e.getMessage(), e);
|
||||
}
|
||||
this.statsDump = null;
|
||||
if (MemoryControl.shortStatus()) clearCaches();
|
||||
}
|
||||
|
||||
|
@ -398,16 +388,6 @@ public final class Fulltext {
|
|||
if (this.writeWebgraph) deleteDomainWithConstraint(this.getWebgraphConnector(), WebgraphSchema.source_host_id_s.getSolrFieldName(), hosthashes,
|
||||
(freshdate == null || freshdate.after(now)) ? null :
|
||||
(WebgraphSchema.load_date_dt.getSolrFieldName() + ":[* TO " + ISO8601Formatter.FORMATTER.format(freshdate) + "]"));
|
||||
|
||||
// remove the line with statistics
|
||||
if (Fulltext.this.statsDump != null) {
|
||||
final Iterator<HostStat> hsi = Fulltext.this.statsDump.iterator();
|
||||
HostStat hs;
|
||||
while (hsi.hasNext()) {
|
||||
hs = hsi.next();
|
||||
if (hosthashes.contains(hs.hosthash)) hsi.remove();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
public void deleteStaleDomainNames(final Set<String> hostnames, Date freshdate) {
|
||||
|
@ -419,16 +399,6 @@ public final class Fulltext {
|
|||
if (this.writeWebgraph) deleteDomainWithConstraint(this.getWebgraphConnector(), WebgraphSchema.source_host_s.getSolrFieldName(), hostnames,
|
||||
(freshdate == null || freshdate.after(now)) ? null :
|
||||
(WebgraphSchema.load_date_dt.getSolrFieldName() + ":[* TO " + ISO8601Formatter.FORMATTER.format(freshdate) + "]"));
|
||||
|
||||
// finally remove the line with statistics
|
||||
if (Fulltext.this.statsDump != null) {
|
||||
final Iterator<HostStat> hsi = Fulltext.this.statsDump.iterator();
|
||||
HostStat hs;
|
||||
while (hsi.hasNext()) {
|
||||
hs = hsi.next();
|
||||
if (hostnames.contains(hs.hostname)) hsi.remove();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -790,42 +760,5 @@ public final class Fulltext {
|
|||
}
|
||||
|
||||
}
|
||||
|
||||
/**
 * Produces an iterator over per-host statistics derived from a score map of url hashes.
 * If a previously computed list ({@code statsDump}) already holds at least {@code count}
 * entries, that list is iterated directly. Otherwise the list is rebuilt: the keys of
 * {@code domainScore} are walked, each url hash is resolved to a URL via {@code getURL},
 * and one {@link HostStat} is stored per resolvable URL that has a host.
 *
 * @param count       the wanted number of entries; 10 is added internally, and the value is
 *                    replaced by {@code domainScore.size()} when it is negative or when
 *                    {@code domainScore.sizeSmaller(count)} holds
 * @param domainScore score map whose keys are url hashes and whose values become
 *                    {@link HostStat#count}
 * @return an iterator over the (newly built or cached) statistics list; an empty iterator
 *         if {@code statsDump} was set to null while this method was running
 */
public Iterator<HostStat> statistics(int count, final ScoreMap<String> domainScore) {
|
||||
// prevent too heavy IO.
|
||||
if (this.statsDump != null && count <= this.statsDump.size()) return this.statsDump.iterator();
|
||||
|
||||
// fetch urls from the database to determine the host in clear text
|
||||
final Iterator<String> j = domainScore.keys(false); // iterate urlhash-examples in reverse order (biggest first)
|
||||
String urlhash;
|
||||
count += 10; // make some more to prevent that we have to do this again after deletions too soon.
|
||||
if (count < 0 || domainScore.sizeSmaller(count)) count = domainScore.size();
|
||||
this.statsDump = new ArrayList<HostStat>();
|
||||
DigestURL url;
|
||||
while (j.hasNext()) {
|
||||
urlhash = j.next();
|
||||
if (urlhash == null) continue;
|
||||
url = this.getURL(ASCII.getBytes(urlhash));
|
||||
// entries that cannot be resolved to a URL with a host are skipped and do not consume the count budget
|
||||
if (url == null || url.getHost() == null) continue;
|
||||
if (this.statsDump == null) return new ArrayList<HostStat>().iterator(); // some other operation has destroyed the object
|
||||
// the part of the url hash starting at index 6 is stored as the host hash (HostStat asserts it has length 6)
|
||||
this.statsDump.add(new HostStat(url.getHost(), url.getPort(), urlhash.substring(6), domainScore.get(urlhash)));
|
||||
count--;
|
||||
if (count == 0) break;
|
||||
|
||||
}
|
||||
// finally return an iterator for the result array
|
||||
return (this.statsDump == null) ? new ArrayList<HostStat>().iterator() : this.statsDump.iterator();
|
||||
}
|
||||
|
||||
/**
 * Plain data holder for one host entry of the statistics list: host name, port,
 * host hash and a count value. All fields are public and mutable; instances are
 * created only through the private constructor.
 */
public static class HostStat {
|
||||
public String hostname, hosthash;
|
||||
public int port;
|
||||
public int count;
|
||||
/**
 * @param host            value stored in {@link #hostname}
 * @param port            value stored in {@link #port}
 * @param urlhashfragment value stored in {@link #hosthash}; asserted to be 6 characters long
 * @param count           value stored in {@link #count}
 */
private HostStat(final String host, final int port, final String urlhashfragment, final int count) {
|
||||
// only checked when assertions are enabled (-ea)
assert urlhashfragment.length() == 6;
|
||||
this.hostname = host;
|
||||
this.port = port;
|
||||
this.hosthash = urlhashfragment;
|
||||
this.count = count;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -29,13 +29,8 @@ import java.io.File;
|
|||
import java.io.IOException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Iterator;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
import net.yacy.cora.document.encoding.ASCII;
|
||||
import net.yacy.cora.order.Base64Order;
|
||||
import net.yacy.cora.sorting.OrderedScoreMap;
|
||||
import net.yacy.cora.sorting.ScoreMap;
|
||||
import net.yacy.cora.util.ConcurrentLog;
|
||||
import net.yacy.cora.util.SpaceExceededException;
|
||||
import net.yacy.kelondro.index.BinSearch;
|
||||
|
@ -47,10 +42,8 @@ import net.yacy.peers.Seed;
|
|||
import net.yacy.peers.SeedDB;
|
||||
import net.yacy.peers.graphics.WebStructureGraph;
|
||||
import net.yacy.peers.graphics.WebStructureGraph.HostReference;
|
||||
import net.yacy.search.index.Fulltext.HostStat;
|
||||
import net.yacy.search.index.Segment;
|
||||
|
||||
|
||||
public class BlockRank {
|
||||
|
||||
/**
|
||||
|
@ -149,64 +142,6 @@ public class BlockRank {
|
|||
return index;
|
||||
}
|
||||
|
||||
/**
 * Builds an array of ranking tables from a host reference index.
 * <p>
 * Each non-empty container of {@code index} gets a score keyed by its term hash.
 * Without a {@code referenceTable} the score is
 * {@code container.size() * maxHostCount / hostCount}; with a {@code referenceTable}
 * it is the sum over all entries of
 * {@code (17 - ranking(entry.urlhash(), referenceTable)) * maxHostCount / hostCount}.
 * {@code hostCount} is the {@link HostStat#count} found in {@code hostHashResolver}
 * (at least 1), or 6 when the resolver has no entry.
 * <p>
 * The scored hashes are then split repeatedly: while more than 10 remain, the result
 * of {@code hostScore.lowerHalf()} becomes one {@code BinSearch} partition and is removed
 * from the score map; whatever remains forms the last partition. At most 16 partitions
 * are returned, taken from the end of the partition list backwards, so index 0 holds the
 * partition that was computed last.
 *
 * @param index            the host reference containers to evaluate
 * @param hostHashResolver lookup from hash string to {@link HostStat}; only {@code count} is read
 * @param referenceTable   ranking tables from a previous pass, or null for the first pass
 * @param recusions        number of additional passes; each pass feeds its result in as
 *                         the next {@code referenceTable}
 * @return the ranking tables of the final pass (length at most 16)
 */
public static BinSearch[] evaluate(final ReferenceContainerCache<HostReference> index, final Map<String, HostStat> hostHashResolver, final BinSearch[] referenceTable, int recusions) {
|
||||
|
||||
// first find out the maximum count of the hostHashResolver
|
||||
// starts at 1, so the maximum is never below 1 even for an empty resolver
int maxHostCount = 1;
|
||||
for (final HostStat stat: hostHashResolver.values()) {
|
||||
if (stat.count > maxHostCount) maxHostCount = stat.count;
|
||||
}
|
||||
|
||||
// then just count the number of references. all other information from the index is not used because they cannot be trusted
|
||||
final ScoreMap<byte[]> hostScore = new OrderedScoreMap<byte[]>(index.termKeyOrdering());
|
||||
HostStat hostStat;
|
||||
int hostCount;
|
||||
for (final ReferenceContainer<HostReference> container: index) {
|
||||
if (container.isEmpty()) continue;
|
||||
if (referenceTable == null) {
|
||||
// first pass: score depends only on the number of references in the container
hostStat = hostHashResolver.get(ASCII.String(container.getTermHash()));
|
||||
hostCount = hostStat == null ? 6 /* high = a penalty for 'i do not know this', this may not be fair*/ : Math.max(1, hostStat.count);
|
||||
hostScore.set(container.getTermHash(), container.size() * maxHostCount / hostCount);
|
||||
} else {
|
||||
// later passes: every reference contributes according to its rank in the previous tables
int score = 0;
|
||||
final Iterator<HostReference> hri = container.entries();
|
||||
HostReference hr;
|
||||
while (hri.hasNext()) {
|
||||
hr = hri.next();
|
||||
hostStat = hostHashResolver.get(ASCII.String(hr.urlhash()));
|
||||
hostCount = hostStat == null ? 6 /* high = a penalty for 'i do not know this', this may not be fair*/ : Math.max(1, hostStat.count);
|
||||
// all operands are int, so the division truncates
score += (17 - ranking(hr.urlhash(), referenceTable)) * maxHostCount / hostCount;
|
||||
}
|
||||
hostScore.set(container.getTermHash(), score);
|
||||
}
|
||||
}
|
||||
|
||||
// now divide the scores into two halves until the score map is empty
|
||||
final List<BinSearch> table = new ArrayList<BinSearch>();
|
||||
while (hostScore.size() > 10) {
|
||||
final List<byte[]> smallest = hostScore.lowerHalf();
|
||||
if (smallest.isEmpty()) break; // should never happen but this ensures termination of the loop
|
||||
ConcurrentLog.info("BlockRank", "index evaluation: computed partition of size " + smallest.size());
|
||||
// NOTE(review): the 6 is presumably the host hash length in bytes - confirm against BinSearch
table.add(new BinSearch(smallest, 6));
|
||||
for (final byte[] host: smallest) hostScore.delete(host);
|
||||
}
|
||||
if (!hostScore.isEmpty()) {
|
||||
final ArrayList<byte[]> list = new ArrayList<byte[]>();
|
||||
for (final byte[] entry: hostScore) list.add(entry);
|
||||
ConcurrentLog.info("BlockRank", "index evaluation: computed last partition of size " + list.size());
|
||||
table.add(new BinSearch(list, 6));
|
||||
}
|
||||
|
||||
// the last table entry has now a list of host hashes that has the most references
|
||||
final int binTables = Math.min(16, table.size());
|
||||
final BinSearch[] newTables = new BinSearch[binTables];
|
||||
// copy in reverse: newTables[0] is the partition that was added last
for (int i = 0; i < binTables; i++) newTables[i] = table.get(table.size() - i - 1);
|
||||
|
||||
// re-use the new table for a recursion
|
||||
// NOTE(review): only the exact value 0 stops the recursion; a negative argument would not match this check
if (recusions == 0) return newTables;
|
||||
return evaluate(index, hostHashResolver, newTables, --recusions); // one recursion step
|
||||
}
|
||||
|
||||
public static int ranking(final byte[] hash, final BinSearch[] rankingTable) {
|
||||
if (rankingTable == null) return 16;
|
||||
byte[] hosthash;
|
||||
|
|
Loading…
Reference in New Issue
Block a user