mirror of
https://github.com/yacy/yacy_search_server.git
synced 2024-09-21 00:00:13 +02:00
b45701d20f
This time it works like this: - each peer provides its ranking information using the yacy/idx.json servlet - peers with more than 1 GB RAM will load this information from all other peers, combine it into one ranking table and store it locally. This happens concurrently during the start-up of the peer. The newly generated file with the ranking information is at DATA/INDEX/<network>/QUEUES/hostIndex.blob - this index is then processed to generate a fresh ranking table. Peers which can calculate their own ranking table will do that at every start-up to get the latest feature updates until the feature is stable - I computed new ranking tables as part of the distribution and committed them here as well - the YBR feature must be enabled manually by setting the YBR value in the ranking servlet to level 15. A default configuration for that is also in the commit, but it does not affect your current installation, only fresh peers - a recursive block rank refinement is implemented but disabled at this point; it needs more testing. Please play around with the ranking settings and see if this helps to make search results better. git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@7729 6c8d7289-2bf4-0310-a012-ef5d649a1542
665 lines
27 KiB
Java
665 lines
27 KiB
Java
// plasmaWebStructure.java
|
|
// -----------------------------
|
|
// (C) 2007 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany
|
|
// first published 15.05.2007 on http://yacy.net
|
|
//
|
|
// This is a part of YaCy, a peer-to-peer based web search engine
|
|
//
|
|
// $LastChangedDate$
|
|
// $LastChangedRevision$
|
|
// $LastChangedBy$
|
|
//
|
|
// LICENSE
|
|
//
|
|
// This program is free software; you can redistribute it and/or modify
|
|
// it under the terms of the GNU General Public License as published by
|
|
// the Free Software Foundation; either version 2 of the License, or
|
|
// (at your option) any later version.
|
|
//
|
|
// This program is distributed in the hope that it will be useful,
|
|
// but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
// GNU General Public License for more details.
|
|
//
|
|
// You should have received a copy of the GNU General Public License
|
|
// along with this program; if not, write to the Free Software
|
|
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
|
|
|
package de.anomic.yacy.graphics;
|
|
|
|
import java.io.File;
|
|
import java.io.IOException;
|
|
import java.text.ParseException;
|
|
import java.util.ArrayList;
|
|
import java.util.Collection;
|
|
import java.util.HashMap;
|
|
import java.util.HashSet;
|
|
import java.util.Iterator;
|
|
import java.util.Map;
|
|
import java.util.Set;
|
|
import java.util.SortedMap;
|
|
import java.util.TreeMap;
|
|
import java.util.TreeSet;
|
|
import java.util.concurrent.BlockingQueue;
|
|
import java.util.concurrent.LinkedBlockingQueue;
|
|
|
|
import net.yacy.cora.date.GenericFormatter;
|
|
import net.yacy.cora.document.MultiProtocolURI;
|
|
import net.yacy.cora.document.UTF8;
|
|
import net.yacy.document.Condenser;
|
|
import net.yacy.document.Document;
|
|
import net.yacy.kelondro.data.meta.DigestURI;
|
|
import net.yacy.kelondro.index.Row;
|
|
import net.yacy.kelondro.index.Row.Entry;
|
|
import net.yacy.kelondro.index.RowSpaceExceededException;
|
|
import net.yacy.kelondro.logging.Log;
|
|
import net.yacy.kelondro.order.Base64Order;
|
|
import net.yacy.kelondro.order.MicroDate;
|
|
import net.yacy.kelondro.rwi.AbstractReference;
|
|
import net.yacy.kelondro.rwi.Reference;
|
|
import net.yacy.kelondro.rwi.ReferenceContainer;
|
|
import net.yacy.kelondro.rwi.ReferenceContainerCache;
|
|
import net.yacy.kelondro.rwi.ReferenceFactory;
|
|
import net.yacy.kelondro.util.FileUtils;
|
|
import net.yacy.kelondro.util.LookAheadIterator;
|
|
|
|
|
|
public class WebStructureGraph {
|
|
|
|
public static int maxref = 300; // maximum number of references, to avoid overflow when a large link farm occurs (i.e. wikipedia)
|
|
public static int maxhosts = 20000; // maximum number of hosts in web structure map
|
|
|
|
private final static Log log = new Log("WebStructureGraph");
|
|
|
|
private final File structureFile;
|
|
private final TreeMap<String, String> structure_old; // <b64hash(6)>','<host> to <date-yyyymmdd(8)>{<target-b64hash(6)><target-count-hex(4)>}*
|
|
private final TreeMap<String, String> structure_new;
|
|
private final BlockingQueue<leanrefObject> publicRefDNSResolvingQueue;
|
|
private final PublicRefDNSResolvingProcess publicRefDNSResolvingWorker;
|
|
|
|
private final static leanrefObject leanrefObjectPOISON = new leanrefObject(null, null);
|
|
|
|
private static class leanrefObject {
|
|
private final DigestURI url;
|
|
private final Set<MultiProtocolURI> globalRefURLs;
|
|
private leanrefObject(final DigestURI url, final Set<MultiProtocolURI> globalRefURLs) {
|
|
this.url = url;
|
|
this.globalRefURLs = globalRefURLs;
|
|
}
|
|
}
|
|
|
|
/**
 * Creates the graph, loads previously stored structure data from the given
 * file (if it exists), trims the loaded data when it holds more than
 * {@code maxhosts} entries and starts the worker thread that drains the
 * resolving queue.
 *
 * @param structureFile file the structure is loaded from and later saved to
 */
public WebStructureGraph(final File structureFile) {
    this.structureFile = structureFile;
    this.structure_old = new TreeMap<String, String>();
    this.structure_new = new TreeMap<String, String>();
    this.publicRefDNSResolvingQueue = new LinkedBlockingQueue<leanrefObject>();

    // load web structure; start empty if the file is missing or does not fit into memory
    Map<String, String> stored;
    try {
        if (this.structureFile.exists()) {
            stored = FileUtils.loadMap(this.structureFile);
        } else {
            stored = new TreeMap<String, String>();
        }
    } catch (OutOfMemoryError oom) {
        stored = new TreeMap<String, String>();
    }
    if (stored != null) {
        this.structure_old.putAll(stored);
    }

    // delete out-dated entries in case the structure is too big
    if (this.structure_old.size() > maxhosts) {
        // sort all keys by the date prefix (first 8 characters) of their value
        final TreeSet<String> byDate = new TreeSet<String>();
        for (final Map.Entry<String, String> stat : this.structure_old.entrySet()) {
            final String refs = stat.getValue();
            if (refs.length() >= 8) {
                byDate.add(refs.substring(0, 8) + stat.getKey());
            }
        }
        // remove the entries with the smallest dates until 90% of maxhosts is reached
        int remaining = this.structure_old.size() - (maxhosts * 9 / 10);
        for (final Iterator<String> oldest = byDate.iterator(); (remaining > 0) && oldest.hasNext(); remaining--) {
            this.structure_old.remove(oldest.next().substring(8));
        }
    }

    this.publicRefDNSResolvingWorker = new PublicRefDNSResolvingProcess();
    this.publicRefDNSResolvingWorker.start();
}
|
|
|
|
private class PublicRefDNSResolvingProcess extends Thread {
|
|
private PublicRefDNSResolvingProcess() {
|
|
}
|
|
public void run() {
|
|
leanrefObject lro;
|
|
try {
|
|
while ((lro = publicRefDNSResolvingQueue.take()) != leanrefObjectPOISON) {
|
|
learnrefs(lro);
|
|
}
|
|
} catch (InterruptedException e) {
|
|
}
|
|
}
|
|
}
|
|
|
|
public void generateCitationReference(final DigestURI url, final Document document, final Condenser condenser) {
|
|
// generate citation reference
|
|
final Map<MultiProtocolURI, String> hl = document.getHyperlinks();
|
|
final Iterator<MultiProtocolURI> it = hl.keySet().iterator();
|
|
final HashSet<MultiProtocolURI> globalRefURLs = new HashSet<MultiProtocolURI>();
|
|
final String refhost = url.getHost();
|
|
MultiProtocolURI u;
|
|
while (it.hasNext()) {
|
|
u = it.next();
|
|
if (u == null) continue;
|
|
if (refhost != null && u.getHost() != null && !u.getHost().equals(refhost)) {
|
|
// this is a global link
|
|
globalRefURLs.add(u);
|
|
}
|
|
}
|
|
leanrefObject lro = new leanrefObject(url, globalRefURLs);
|
|
if (globalRefURLs.size() > 0) try {
|
|
if (this.publicRefDNSResolvingWorker.isAlive()) {
|
|
this.publicRefDNSResolvingQueue.put(lro);
|
|
} else {
|
|
this.learnrefs(lro);
|
|
}
|
|
} catch (InterruptedException e) {
|
|
this.learnrefs(lro);
|
|
}
|
|
}
|
|
|
|
private void learnrefs(final leanrefObject lro) {
|
|
final StringBuilder cpg = new StringBuilder(240);
|
|
assert cpg.length() % 12 == 0 : "cpg.length() = " + cpg.length() + ", cpg = " + cpg.toString();
|
|
final String refhashp = UTF8.String(lro.url.hash(), 6, 6); // ref hash part
|
|
String nexturlhash;
|
|
for (MultiProtocolURI u: lro.globalRefURLs) {
|
|
byte[] nexturlhashb = new DigestURI(u).hash();
|
|
assert nexturlhashb != null;
|
|
if (nexturlhashb != null) {
|
|
nexturlhash = UTF8.String(nexturlhashb);
|
|
assert nexturlhash.length() == 12 : "nexturlhash.length() = " + nexturlhash.length() + ", nexturlhash = " + nexturlhash;
|
|
assert !nexturlhash.substring(6).equals(refhashp);
|
|
// this is a global link
|
|
cpg.append(nexturlhash); // store complete hash
|
|
assert cpg.length() % 12 == 0 : "cpg.length() = " + cpg.length() + ", cpg = " + cpg.toString();
|
|
}
|
|
}
|
|
assert cpg.length() % 12 == 0 : "cpg.length() = " + cpg.length() + ", cpg = " + cpg.toString();
|
|
learn(lro.url, cpg);
|
|
}
|
|
|
|
private static int refstr2count(final String refs) {
|
|
if ((refs == null) || (refs.length() <= 8)) return 0;
|
|
assert (refs.length() - 8) % 10 == 0 : "refs = " + refs + ", length = " + refs.length();
|
|
return (refs.length() - 8) / 10;
|
|
}
|
|
|
|
static Map<String, Integer> refstr2map(final String refs) {
|
|
if ((refs == null) || (refs.length() <= 8)) return new HashMap<String, Integer>();
|
|
final Map<String, Integer> map = new HashMap<String, Integer>();
|
|
String c;
|
|
final int refsc = refstr2count(refs);
|
|
int d;
|
|
for (int i = 0; i < refsc; i++) {
|
|
c = refs.substring(8 + i * 10, 8 + (i + 1) * 10);
|
|
try {
|
|
d = Integer.valueOf(c.substring(6), 16);
|
|
} catch (NumberFormatException e) {
|
|
d = 1;
|
|
}
|
|
map.put(c.substring(0, 6), d);
|
|
}
|
|
return map;
|
|
}
|
|
|
|
private static String map2refstr(final Map<String, Integer> map) {
|
|
final StringBuilder s = new StringBuilder(map.size() * 10);
|
|
s.append(GenericFormatter.SHORT_DAY_FORMATTER.format());
|
|
String h;
|
|
for (final Map.Entry<String, Integer> entry : map.entrySet()) {
|
|
s.append(entry.getKey());
|
|
h = Integer.toHexString(entry.getValue().intValue());
|
|
if (h.length() == 0) {
|
|
s.append("0000");
|
|
} else if (h.length() == 1) {
|
|
s.append("000").append(h);
|
|
} else if (h.length() == 2) {
|
|
s.append("00").append(h);
|
|
} else if (h.length() == 3) {
|
|
s.append('0').append(h);
|
|
} else if (h.length() == 4) {
|
|
s.append(h);
|
|
} else {
|
|
s.append("FFFF");
|
|
}
|
|
}
|
|
return s.toString();
|
|
}
|
|
|
|
public StructureEntry outgoingReferences(final String hosthash) {
|
|
// returns a map with a hosthash(String):refcount(Integer) relation
|
|
assert hosthash.length() == 6;
|
|
SortedMap<String, String> tailMap;
|
|
Map<String, Integer> h = new HashMap<String, Integer>();
|
|
String hostname = "";
|
|
String date = "";
|
|
String ref;
|
|
synchronized (structure_old) {
|
|
tailMap = structure_old.tailMap(hosthash);
|
|
if (!tailMap.isEmpty()) {
|
|
final String key = tailMap.firstKey();
|
|
if (key.startsWith(hosthash)) {
|
|
hostname = key.substring(7);
|
|
ref = tailMap.get(key);
|
|
date = ref.substring(0, 8);
|
|
h = refstr2map(ref);
|
|
}
|
|
}
|
|
}
|
|
synchronized (structure_new) {
|
|
tailMap = structure_new.tailMap(hosthash);
|
|
if (!tailMap.isEmpty()) {
|
|
final String key = tailMap.firstKey();
|
|
if (key.startsWith(hosthash)) {
|
|
ref = tailMap.get(key);
|
|
if (hostname.length() == 0) hostname = key.substring(7);
|
|
if (date.length() == 0) date = ref.substring(0, 8);
|
|
h.putAll(refstr2map(ref));
|
|
}
|
|
}
|
|
}
|
|
if (h.isEmpty()) return null;
|
|
return new StructureEntry(hosthash, hostname, date, h);
|
|
}
|
|
|
|
public StructureEntry incomingReferences(final String hosthash) {
|
|
String hostname = hostHash2hostName(hosthash);
|
|
if (hostname == null) return null;
|
|
// collect the references
|
|
WebStructureGraph.StructureEntry sentry;
|
|
HashMap<String, Integer> hosthashes = new HashMap<String, Integer>();
|
|
Iterator<WebStructureGraph.StructureEntry> i = new StructureIterator(false);
|
|
while (i.hasNext()) {
|
|
sentry = i.next();
|
|
if (sentry.references.containsKey(hosthash)) hosthashes.put(sentry.hosthash, sentry.references.get(hosthash));
|
|
}
|
|
i = new StructureIterator(true);
|
|
while (i.hasNext()) {
|
|
sentry = i.next();
|
|
if (sentry.references.containsKey(hosthash)) hosthashes.put(sentry.hosthash, sentry.references.get(hosthash));
|
|
}
|
|
// construct a new structureEntry Object
|
|
return new StructureEntry(
|
|
hosthash,
|
|
hostname,
|
|
GenericFormatter.SHORT_DAY_FORMATTER.format(),
|
|
hosthashes);
|
|
}
|
|
|
|
public static class HostReferenceFactory implements ReferenceFactory<HostReference> {
|
|
|
|
private static final Row hostReferenceRow = new Row("String h-6, Cardinal m-4 {b256}, Cardinal c-4 {b256}", Base64Order.enhancedCoder);
|
|
|
|
public HostReferenceFactory() {
|
|
}
|
|
|
|
public Row getRow() {
|
|
return hostReferenceRow;
|
|
}
|
|
|
|
public HostReference produceSlow(Entry e) {
|
|
return new HostReference(e);
|
|
}
|
|
|
|
public HostReference produceFast(HostReference e) {
|
|
return e;
|
|
}
|
|
|
|
}
|
|
|
|
public static class HostReference extends AbstractReference implements Reference {
|
|
|
|
private final Row.Entry entry;
|
|
|
|
public HostReference(final byte[] hostHash, final long modified, final int count) {
|
|
assert (hostHash.length == 6) : "hostHash = " + UTF8.String(hostHash);
|
|
this.entry = hostReferenceFactory.getRow().newEntry();
|
|
this.entry.setCol(0, hostHash);
|
|
this.entry.setCol(1, MicroDate.microDateDays(modified));
|
|
this.entry.setCol(2, count);
|
|
}
|
|
|
|
public HostReference(final String json) {
|
|
this.entry = hostReferenceFactory.getRow().newEntry(json, true);
|
|
}
|
|
|
|
public HostReference(Row.Entry entry) {
|
|
this.entry = entry;
|
|
}
|
|
|
|
public String toPropertyForm() {
|
|
return this.entry.toPropertyForm(':', true, true, false, true);
|
|
}
|
|
|
|
public Entry toKelondroEntry() {
|
|
return this.entry;
|
|
}
|
|
|
|
public byte[] metadataHash() {
|
|
return this.entry.getPrimaryKeyBytes();
|
|
}
|
|
|
|
public int count() {
|
|
return (int) this.entry.getColLong(2);
|
|
}
|
|
|
|
public long lastModified() {
|
|
return MicroDate.reverseMicroDateDays((int) this.entry.getColLong(1));
|
|
}
|
|
|
|
public void join(final Reference r) {
|
|
// joins two entries into one entry
|
|
HostReference oe = (HostReference) r;
|
|
|
|
// combine date
|
|
long o = oe.lastModified();
|
|
if (this.lastModified() < o) this.entry.setCol(1, MicroDate.microDateDays(o));
|
|
|
|
// combine count
|
|
int c = oe.count();
|
|
if (this.count() < c) this.entry.setCol(2, c);
|
|
}
|
|
|
|
public Collection<Integer> positions() {
|
|
return new ArrayList<Integer>(0);
|
|
}
|
|
}
|
|
|
|
public static final HostReferenceFactory hostReferenceFactory = new HostReferenceFactory();
|
|
public static ReferenceContainerCache<HostReference> hostReferenceIndexCache = null;
|
|
public static long hostReferenceIndexCacheTime = 0;
|
|
public static final long hostReferenceIndexCacheTTL = 1000 * 60 * 60 * 12; // 12 hours time to live for cache
|
|
|
|
public synchronized ReferenceContainerCache<HostReference> incomingReferences() {
|
|
// we return a cache if the cache is filled and not stale
|
|
if (hostReferenceIndexCache != null &&
|
|
hostReferenceIndexCacheTime + hostReferenceIndexCacheTTL > System.currentTimeMillis()) return hostReferenceIndexCache;
|
|
|
|
// collect the references
|
|
ReferenceContainerCache<HostReference> idx = new ReferenceContainerCache<HostReference>(hostReferenceFactory, Base64Order.enhancedCoder, 6);
|
|
|
|
// we iterate over all structure entries.
|
|
// one structure entry has information that a specific host links to a list of other hosts
|
|
incomingReferencesEnrich(idx, new StructureIterator(false), 3000);
|
|
incomingReferencesEnrich(idx, new StructureIterator(true), 3000);
|
|
|
|
// fill the cache again and set fill time
|
|
hostReferenceIndexCache = idx;
|
|
hostReferenceIndexCacheTime = System.currentTimeMillis();
|
|
//incomingReferencesTest(hostReferenceIndexCache);
|
|
return hostReferenceIndexCache;
|
|
}
|
|
|
|
private void incomingReferencesEnrich(
|
|
ReferenceContainerCache<HostReference> idx,
|
|
Iterator<WebStructureGraph.StructureEntry> structureIterator,
|
|
long time) {
|
|
// we iterate over all structure entries.
|
|
// one structure entry has information that a specific host links to a list of other hosts
|
|
long timeout = System.currentTimeMillis() + time;
|
|
byte[] term;
|
|
HostReference hr;
|
|
WebStructureGraph.StructureEntry sentry;
|
|
structureLoop: while (structureIterator.hasNext()) {
|
|
sentry = structureIterator.next();
|
|
// then we loop over all the hosts that are linked from sentry.hosthash
|
|
refloop: for (Map.Entry<String, Integer> refhosthashandcounter: sentry.references.entrySet()) {
|
|
term = UTF8.getBytes(refhosthashandcounter.getKey());
|
|
try {
|
|
hr = new HostReference(UTF8.getBytes(sentry.hosthash), GenericFormatter.SHORT_DAY_FORMATTER.parse(sentry.date).getTime(), refhosthashandcounter.getValue().intValue());
|
|
} catch (ParseException e) {
|
|
continue refloop;
|
|
}
|
|
// each term refers to an index entry. look if we already have such an entry
|
|
ReferenceContainer<HostReference> r = idx.get(term, null);
|
|
try {
|
|
if (r == null) {
|
|
r = new ReferenceContainer<HostReference>(hostReferenceFactory, term);
|
|
r.add(hr);
|
|
idx.add(r);
|
|
} else {
|
|
r.put(hr);
|
|
}
|
|
} catch (RowSpaceExceededException e) {
|
|
continue refloop;
|
|
}
|
|
}
|
|
if (System.currentTimeMillis() > timeout) break structureLoop;
|
|
}
|
|
}
|
|
|
|
/*
|
|
private void incomingReferencesTest(ReferenceContainerCache<HostReference> idx) {
|
|
for (ReferenceContainer<HostReference> references: idx) {
|
|
log.logInfo("Term-Host: " + hostHash2hostName(UTF8.String(references.getTermHash())));
|
|
Iterator<HostReference> referenceIterator = references.entries();
|
|
StringBuilder s = new StringBuilder();
|
|
HostReference reference;
|
|
while (referenceIterator.hasNext()) {
|
|
reference = referenceIterator.next();
|
|
s.append(reference.toPropertyForm());
|
|
log.logInfo(" ... referenced by " + hostHash2hostName(UTF8.String(reference.metadataHash())) + ", " + reference.count() + " references");
|
|
}
|
|
}
|
|
}
|
|
*/
|
|
|
|
public int referencesCount(final String hosthash) {
|
|
// returns the number of hosts that are referenced by this hosthash
|
|
assert hosthash.length() == 6 : "hosthash = " + hosthash;
|
|
if (hosthash == null || hosthash.length() != 6) return 0;
|
|
SortedMap<String, String> tailMap;
|
|
int c = 0;
|
|
synchronized (structure_old) {
|
|
tailMap = structure_old.tailMap(hosthash);
|
|
if (!tailMap.isEmpty()) {
|
|
final String key = tailMap.firstKey();
|
|
if (key.startsWith(hosthash)) {
|
|
c = refstr2count(tailMap.get(key));
|
|
}
|
|
}
|
|
}
|
|
synchronized (structure_new) {
|
|
tailMap = structure_new.tailMap(hosthash);
|
|
if (!tailMap.isEmpty()) {
|
|
final String key = tailMap.firstKey();
|
|
if (key.startsWith(hosthash)) {
|
|
c += refstr2count(tailMap.get(key));
|
|
}
|
|
}
|
|
}
|
|
return c;
|
|
}
|
|
|
|
public String hostHash2hostName(final String hosthash) {
|
|
// returns the host as string, null if unknown
|
|
assert hosthash.length() == 6;
|
|
SortedMap<String, String> tailMap;
|
|
synchronized(structure_old) {
|
|
tailMap = structure_old.tailMap(hosthash);
|
|
if (!tailMap.isEmpty()) {
|
|
final String key = tailMap.firstKey();
|
|
if (key.startsWith(hosthash)) {
|
|
return key.substring(7);
|
|
}
|
|
}
|
|
}
|
|
synchronized(structure_new) {
|
|
tailMap = structure_new.tailMap(hosthash);
|
|
if (!tailMap.isEmpty()) {
|
|
final String key = tailMap.firstKey();
|
|
if (key.startsWith(hosthash)) {
|
|
return key.substring(7);
|
|
}
|
|
}
|
|
}
|
|
return null;
|
|
}
|
|
|
|
private void learn(final DigestURI url, final StringBuilder reference /*string of b64(12digits)-hashes*/) {
|
|
final String hosthash = UTF8.String(url.hash(), 6, 6);
|
|
|
|
// parse the new reference string and join it with the stored references
|
|
StructureEntry structure = outgoingReferences(hosthash);
|
|
final Map<String, Integer> refs = (structure == null) ? new HashMap<String, Integer>() : structure.references;
|
|
assert reference.length() % 12 == 0 : "reference.length() = " + reference.length() + ", reference = " + reference.toString();
|
|
String dom;
|
|
int c;
|
|
for (int i = 0; i < reference.length() / 12; i++) {
|
|
dom = reference.substring(i * 12 + 6, (i + 1) * 12);
|
|
c = 0;
|
|
if (refs.containsKey(dom)) {
|
|
c = (refs.get(dom)).intValue();
|
|
}
|
|
refs.put(dom, Integer.valueOf(++c));
|
|
}
|
|
|
|
// check if the maxref is exceeded
|
|
if (refs.size() > maxref) {
|
|
int shrink = refs.size() - (maxref * 9 / 10);
|
|
delloop: while (shrink > 0) {
|
|
// shrink the references: the entry with the smallest number of references is removed
|
|
int minrefcount = Integer.MAX_VALUE;
|
|
String minrefkey = null;
|
|
findloop: for (final Map.Entry<String, Integer> entry : refs.entrySet()) {
|
|
if (entry.getValue().intValue() < minrefcount) {
|
|
minrefcount = entry.getValue().intValue();
|
|
minrefkey = entry.getKey();
|
|
}
|
|
if (minrefcount == 1) break findloop;
|
|
}
|
|
// remove the smallest
|
|
if (minrefkey == null) break delloop;
|
|
refs.remove(minrefkey);
|
|
shrink--;
|
|
}
|
|
}
|
|
|
|
// store the map back to the structure
|
|
synchronized(structure_new) {
|
|
structure_new.put(hosthash + "," + url.getHost(), map2refstr(refs));
|
|
}
|
|
}
|
|
|
|
private static void joinStructure(final TreeMap<String, String> into, final TreeMap<String, String> from) {
|
|
for (final Map.Entry<String, String> e: from.entrySet()) {
|
|
if (into.containsKey(e.getKey())) {
|
|
final Map<String, Integer> s0 = refstr2map(into.get(e.getKey()));
|
|
final Map<String, Integer> s1 = refstr2map(e.getValue());
|
|
for (final Map.Entry<String, Integer> r: s1.entrySet()) {
|
|
if (s0.containsKey(r.getKey())) {
|
|
s0.put(r.getKey(), s0.get(r.getKey()).intValue() + r.getValue().intValue());
|
|
} else {
|
|
s0.put(r.getKey(), r.getValue().intValue());
|
|
}
|
|
}
|
|
into.put(e.getKey(), map2refstr(s0));
|
|
} else {
|
|
into.put(e.getKey(), e.getValue());
|
|
}
|
|
}
|
|
}
|
|
|
|
public void joinOldNew() {
|
|
synchronized(structure_new) {
|
|
joinStructure(this.structure_old, this.structure_new);
|
|
this.structure_new.clear();
|
|
}
|
|
}
|
|
|
|
private void saveWebStructure() {
|
|
joinOldNew();
|
|
try {
|
|
synchronized(structure_old) {
|
|
FileUtils.saveMap(this.structureFile, this.structure_old, "Web Structure Syntax: <b64hash(6)>','<host> to <date-yyyymmdd(8)>{<target-b64hash(6)><target-count-hex(4)>}*");
|
|
}
|
|
} catch (final IOException e) {
|
|
Log.logException(e);
|
|
}
|
|
}
|
|
|
|
public String hostWithMaxReferences() {
|
|
// find host with most references
|
|
String maxhost = null;
|
|
int refsize, maxref = 0;
|
|
joinOldNew();
|
|
synchronized(structure_new) {
|
|
for (final Map.Entry<String, String> entry : structure_old.entrySet()) {
|
|
refsize = entry.getValue().length();
|
|
if (refsize > maxref) {
|
|
maxref = refsize;
|
|
maxhost = entry.getKey().substring(7);
|
|
}
|
|
}
|
|
}
|
|
return maxhost;
|
|
}
|
|
|
|
public Iterator<StructureEntry> structureEntryIterator(final boolean latest) {
|
|
return new StructureIterator(latest);
|
|
}
|
|
|
|
private class StructureIterator extends LookAheadIterator<StructureEntry> implements Iterator<StructureEntry> {
|
|
|
|
private final Iterator<Map.Entry<String, String>> i;
|
|
|
|
private StructureIterator(final boolean latest) {
|
|
i = ((latest) ? structure_new : structure_old).entrySet().iterator();
|
|
}
|
|
|
|
public StructureEntry next0() {
|
|
Map.Entry<String, String> entry = null;
|
|
String dom = null, ref = "";
|
|
while (i.hasNext()) {
|
|
entry = i.next();
|
|
ref = entry.getValue();
|
|
if ((ref.length() - 8) % 10 != 0) continue;
|
|
dom = entry.getKey();
|
|
if (dom.length() >= 8) break;
|
|
dom = null;
|
|
}
|
|
if (entry == null || dom == null) return null;
|
|
assert (ref.length() - 8) % 10 == 0 : "refs = " + ref + ", length = " + ref.length();
|
|
return new StructureEntry(dom.substring(0, 6), dom.substring(7), ref.substring(0, 8), refstr2map(ref));
|
|
}
|
|
}
|
|
|
|
public static class StructureEntry {
|
|
public String hosthash; // the tail of the host hash
|
|
public String hostname; // the host name
|
|
public String date; // date of latest change
|
|
public Map<String, Integer> references; // a map from the referenced host hash to the number of referenced to that host
|
|
private StructureEntry(
|
|
final String hosthash,
|
|
final String hostname,
|
|
final String date,
|
|
final Map<String, Integer> references) {
|
|
this.hosthash = hosthash;
|
|
this.hostname = hostname;
|
|
this.date = date;
|
|
this.references = references;
|
|
}
|
|
}
|
|
|
|
public void close() {
|
|
if (this.publicRefDNSResolvingWorker.isAlive()) {
|
|
log.logInfo("Waiting for the DNS Resolving Queue to terminate");
|
|
try {
|
|
this.publicRefDNSResolvingQueue.put(leanrefObjectPOISON);
|
|
this.publicRefDNSResolvingWorker.join(5000);
|
|
} catch (InterruptedException e) {
|
|
}
|
|
}
|
|
log.logInfo("Saving Web Structure File");
|
|
saveWebStructure();
|
|
}
|
|
}
|