// plasmaWebStructure.java // ----------------------------- // (C) 2007 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany // first published 15.05.2007 on http://yacy.net // // This is a part of YaCy, a peer-to-peer based web search engine // // $LastChangedDate$ // $LastChangedRevision$ // $LastChangedBy$ // // LICENSE // // This program is free software; you can redistribute it and/or modify // it under the terms of the GNU General Public License as published by // the Free Software Foundation; either version 2 of the License, or // (at your option) any later version. // // This program is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU General Public License for more details. // // You should have received a copy of the GNU General Public License // along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA package net.yacy.peers.graphics; import java.io.File; import java.io.Serializable; import java.text.ParseException; import java.util.ArrayList; import java.util.Collection; import java.util.HashMap; import java.util.HashSet; import java.util.Iterator; import java.util.Map; import java.util.Set; import java.util.SortedMap; import java.util.TreeMap; import java.util.TreeSet; import java.util.concurrent.BlockingQueue; import java.util.concurrent.LinkedBlockingQueue; import net.yacy.cora.date.GenericFormatter; import net.yacy.cora.document.ASCII; import net.yacy.cora.document.MultiProtocolURI; import net.yacy.cora.document.UTF8; import net.yacy.document.Condenser; import net.yacy.document.Document; import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.kelondro.index.Row; import net.yacy.kelondro.index.Row.Entry; import net.yacy.kelondro.index.RowSpaceExceededException; import net.yacy.kelondro.logging.Log; import net.yacy.kelondro.order.Base64Order; import net.yacy.kelondro.order.MicroDate; import net.yacy.kelondro.rwi.AbstractReference; import net.yacy.kelondro.rwi.Reference; import net.yacy.kelondro.rwi.ReferenceContainer; import net.yacy.kelondro.rwi.ReferenceContainerCache; import net.yacy.kelondro.rwi.ReferenceFactory; import net.yacy.kelondro.util.FileUtils; import net.yacy.kelondro.util.LookAheadIterator; import net.yacy.search.Switchboard; public class WebStructureGraph { public static int maxref = 300; // maximum number of references, to avoid overflow when a large link farm occurs (i.e. wikipedia) public static int maxhosts = 50000; // maximum number of hosts in web structure map private final static Log log = new Log("WebStructureGraph"); private final File structureFile; private final TreeMap structure_old; // ',' to {}* private final TreeMap structure_new; private final BlockingQueue publicRefDNSResolvingQueue; private final PublicRefDNSResolvingProcess publicRefDNSResolvingWorker; private final static leanrefObject leanrefObjectPOISON = new leanrefObject(null, null); private static class leanrefObject { private final DigestURI url; private final Set globalRefURLs; private leanrefObject(final DigestURI url, final Set globalRefURLs) { this.url = url; this.globalRefURLs = globalRefURLs; } } public WebStructureGraph(final File structureFile) { this.structure_old = new TreeMap(); this.structure_new = new TreeMap(); this.structureFile = structureFile; this.publicRefDNSResolvingQueue = new LinkedBlockingQueue(); // load web structure Map loadedStructure; try { loadedStructure = (this.structureFile.exists()) ? FileUtils.loadMap(this.structureFile) : new TreeMap(); } catch ( final OutOfMemoryError e ) { loadedStructure = new TreeMap(); } if ( loadedStructure != null ) { this.structure_old.putAll(loadedStructure); } // delete out-dated entries in case the structure is too big if ( this.structure_old.size() > maxhosts ) { // fill a set with last-modified - dates of the structure final TreeSet delset = new TreeSet(); String key, value; for ( final Map.Entry entry : this.structure_old.entrySet() ) { key = entry.getKey(); value = entry.getValue(); if ( value.length() >= 8 ) { delset.add(value.substring(0, 8) + key); } } int delcount = this.structure_old.size() - (maxhosts * 9 / 10); final Iterator j = delset.iterator(); while ( (delcount > 0) && (j.hasNext()) ) { this.structure_old.remove(j.next().substring(8)); delcount--; } } this.publicRefDNSResolvingWorker = new PublicRefDNSResolvingProcess(); this.publicRefDNSResolvingWorker.start(); } private class PublicRefDNSResolvingProcess extends Thread { private PublicRefDNSResolvingProcess() { } @Override public void run() { leanrefObject lro; try { while ( (lro = WebStructureGraph.this.publicRefDNSResolvingQueue.take()) != leanrefObjectPOISON ) { learnrefs(lro); } } catch ( final InterruptedException e ) { } } } public void generateCitationReference( final DigestURI url, final Document document, final Condenser condenser) { // generate citation reference if ( url.isLocal() ) { return; // we do this only for global urls } final Map hl = document.getHyperlinks(); final Iterator it = hl.keySet().iterator(); final HashSet globalRefURLs = new HashSet(); final String refhost = url.getHost(); MultiProtocolURI u; int maxref = 1000; while ( it.hasNext() && maxref-- > 0 ) { u = it.next(); if ( u == null ) { continue; } if ( refhost != null && u.getHost() != null && !u.getHost().equals(refhost) ) { // this is a global link globalRefURLs.add(u); } } final leanrefObject lro = new leanrefObject(url, globalRefURLs); if ( globalRefURLs.size() > 0 ) { try { if ( this.publicRefDNSResolvingWorker.isAlive() ) { this.publicRefDNSResolvingQueue.put(lro); } else { learnrefs(lro); } } catch ( final InterruptedException e ) { learnrefs(lro); } } } private void learnrefs(final leanrefObject lro) { final StringBuilder cpg = new StringBuilder(240); assert cpg.length() % 12 == 0 : "cpg.length() = " + cpg.length() + ", cpg = " + cpg.toString(); //final String refhashp = ASCII.String(lro.url.hash(), 6, 6); // ref hash part String nexturlhash; for ( final MultiProtocolURI u : lro.globalRefURLs ) { if (Switchboard.getSwitchboard().shallTerminate()) break; final byte[] nexturlhashb = new DigestURI(u).hash(); assert nexturlhashb != null; if ( nexturlhashb != null ) { nexturlhash = ASCII.String(nexturlhashb); assert nexturlhash.length() == 12 : "nexturlhash.length() = " + nexturlhash.length() + ", nexturlhash = " + nexturlhash; //assert !nexturlhash.substring(6).equals(refhashp); // this is a global link cpg.append(nexturlhash); // store complete hash assert cpg.length() % 12 == 0 : "cpg.length() = " + cpg.length() + ", cpg = " + cpg.toString(); } } assert cpg.length() % 12 == 0 : "cpg.length() = " + cpg.length() + ", cpg = " + cpg.toString(); learn(lro.url, cpg); } private static int refstr2count(final String refs) { if ( (refs == null) || (refs.length() <= 8) ) { return 0; } assert (refs.length() - 8) % 10 == 0 : "refs = " + refs + ", length = " + refs.length(); return (refs.length() - 8) / 10; } static Map refstr2map(final String refs) { if ( (refs == null) || (refs.length() <= 8) ) { return new HashMap(); } final Map map = new HashMap(); String c; final int refsc = refstr2count(refs); int d; for ( int i = 0; i < refsc; i++ ) { c = refs.substring(8 + i * 10, 8 + (i + 1) * 10); try { d = Integer.valueOf(c.substring(6), 16); } catch ( final NumberFormatException e ) { d = 1; } map.put(c.substring(0, 6), d); } return map; } private static String map2refstr(final Map map) { final StringBuilder s = new StringBuilder(map.size() * 10); s.append(GenericFormatter.SHORT_DAY_FORMATTER.format()); String h; for ( final Map.Entry entry : map.entrySet() ) { s.append(entry.getKey()); h = Integer.toHexString(entry.getValue().intValue()); final int hl = h.length(); if ( hl == 0 ) { s.append("0000"); } else if ( hl == 1 ) { s.append("000").append(h); } else if ( hl == 2 ) { s.append("00").append(h); } else if ( hl == 3 ) { s.append('0').append(h); } else if ( hl == 4 ) { s.append(h); } else { s.append("FFFF"); } } return s.toString(); } public StructureEntry outgoingReferences(final String hosthash) { // returns a map with a hosthash(String):refcount(Integer) relation assert hosthash.length() == 6; SortedMap tailMap; Map h = new HashMap(); String hostname = ""; String date = ""; String ref; synchronized ( this.structure_old ) { tailMap = this.structure_old.tailMap(hosthash); if ( !tailMap.isEmpty() ) { final String key = tailMap.firstKey(); if ( key.startsWith(hosthash) ) { hostname = key.substring(7); ref = tailMap.get(key); date = ref.substring(0, 8); h = refstr2map(ref); } } } synchronized ( this.structure_new ) { tailMap = this.structure_new.tailMap(hosthash); if ( !tailMap.isEmpty() ) { final String key = tailMap.firstKey(); if ( key.startsWith(hosthash) ) { ref = tailMap.get(key); if ( hostname.length() == 0 ) { hostname = key.substring(7); } if ( date.length() == 0 ) { date = ref.substring(0, 8); } h.putAll(refstr2map(ref)); } } } if ( h.isEmpty() ) { return null; } return new StructureEntry(hosthash, hostname, date, h); } public StructureEntry incomingReferences(final String hosthash) { final String hostname = hostHash2hostName(hosthash); if ( hostname == null ) { return null; } // collect the references WebStructureGraph.StructureEntry sentry; final HashMap hosthashes = new HashMap(); Iterator i = new StructureIterator(false); while ( i.hasNext() ) { sentry = i.next(); if ( sentry.references.containsKey(hosthash) ) { hosthashes.put(sentry.hosthash, sentry.references.get(hosthash)); } } i = new StructureIterator(true); while ( i.hasNext() ) { sentry = i.next(); if ( sentry.references.containsKey(hosthash) ) { hosthashes.put(sentry.hosthash, sentry.references.get(hosthash)); } } // construct a new structureEntry Object return new StructureEntry( hosthash, hostname, GenericFormatter.SHORT_DAY_FORMATTER.format(), hosthashes); } public static class HostReferenceFactory implements ReferenceFactory, Serializable { private static final long serialVersionUID=7461135579006223155L; private static final Row hostReferenceRow = new Row( "String h-6, Cardinal m-4 {b256}, Cardinal c-4 {b256}", Base64Order.enhancedCoder); public HostReferenceFactory() { } @Override public Row getRow() { return hostReferenceRow; } @Override public HostReference produceSlow(final Entry e) { return new HostReference(e); } @Override public HostReference produceFast(final HostReference e) { return e; } } public static class HostReference extends AbstractReference implements Reference, Serializable { private static final long serialVersionUID=-9170091435821206765L; private final Row.Entry entry; public HostReference(final byte[] hostHash, final long modified, final int count) { assert (hostHash.length == 6) : "hostHash = " + ASCII.String(hostHash); this.entry = hostReferenceFactory.getRow().newEntry(); this.entry.setCol(0, hostHash); this.entry.setCol(1, MicroDate.microDateDays(modified)); this.entry.setCol(2, count); } public HostReference(final String json) { this.entry = hostReferenceFactory.getRow().newEntry(json, true); } public HostReference(final Row.Entry entry) { this.entry = entry; } @Override public String toPropertyForm() { return this.entry.toPropertyForm(':', true, true, false, true); } @Override public Entry toKelondroEntry() { return this.entry; } @Override public byte[] urlhash() { return this.entry.getPrimaryKeyBytes(); } public int count() { return (int) this.entry.getColLong(2); } @Override public long lastModified() { return MicroDate.reverseMicroDateDays((int) this.entry.getColLong(1)); } @Override public void join(final Reference r) { // joins two entries into one entry final HostReference oe = (HostReference) r; // combine date final long o = oe.lastModified(); if ( lastModified() < o ) { this.entry.setCol(1, MicroDate.microDateDays(o)); } // combine count final int c = oe.count(); if ( count() < c ) { this.entry.setCol(2, c); } } @Override public Collection positions() { return new ArrayList(0); } } public static final HostReferenceFactory hostReferenceFactory = new HostReferenceFactory(); public static ReferenceContainerCache hostReferenceIndexCache = null; public static long hostReferenceIndexCacheTime = 0; public static final long hostReferenceIndexCacheTTL = 1000 * 60 * 60 * 12; // 12 hours time to live for cache public synchronized ReferenceContainerCache incomingReferences() { // we return a cache if the cache is filled and not stale if ( hostReferenceIndexCache != null && hostReferenceIndexCacheTime + hostReferenceIndexCacheTTL > System.currentTimeMillis() ) { return hostReferenceIndexCache; } // collect the references final ReferenceContainerCache idx = new ReferenceContainerCache(hostReferenceFactory, Base64Order.enhancedCoder, 6); // we iterate over all structure entries. // one structure entry has information that a specific host links to a list of other hosts incomingReferencesEnrich(idx, new StructureIterator(false), 3000); incomingReferencesEnrich(idx, new StructureIterator(true), 3000); // fill the cache again and set fill time hostReferenceIndexCache = idx; hostReferenceIndexCacheTime = System.currentTimeMillis(); //incomingReferencesTest(hostReferenceIndexCache); return hostReferenceIndexCache; } private void incomingReferencesEnrich( final ReferenceContainerCache idx, final Iterator structureIterator, final long time) { // we iterate over all structure entries. // one structure entry has information that a specific host links to a list of other hosts final long timeout = System.currentTimeMillis() + time; byte[] term; HostReference hr; WebStructureGraph.StructureEntry sentry; structureLoop: while ( structureIterator.hasNext() ) { sentry = structureIterator.next(); // then we loop over all the hosts that are linked from sentry.hosthash refloop: for ( final Map.Entry refhosthashandcounter : sentry.references .entrySet() ) { term = UTF8.getBytes(refhosthashandcounter.getKey()); try { hr = new HostReference( ASCII.getBytes(sentry.hosthash), GenericFormatter.SHORT_DAY_FORMATTER.parse(sentry.date).getTime(), refhosthashandcounter.getValue().intValue()); } catch ( final ParseException e ) { continue refloop; } // each term refers to an index entry. look if we already have such an entry ReferenceContainer r = idx.get(term, null); try { if ( r == null ) { r = new ReferenceContainer(hostReferenceFactory, term); r.add(hr); idx.add(r); } else { r.put(hr); } } catch ( final RowSpaceExceededException e ) { continue refloop; } } if ( System.currentTimeMillis() > timeout ) { break structureLoop; } } } /* private void incomingReferencesTest(ReferenceContainerCache idx) { for (ReferenceContainer references: idx) { log.logInfo("Term-Host: " + hostHash2hostName(UTF8.String(references.getTermHash()))); Iterator referenceIterator = references.entries(); StringBuilder s = new StringBuilder(); HostReference reference; while (referenceIterator.hasNext()) { reference = referenceIterator.next(); s.append(reference.toPropertyForm()); log.logInfo(" ... referenced by " + hostHash2hostName(UTF8.String(reference.metadataHash())) + ", " + reference.count() + " references"); } } } */ public int referencesCount(final String hosthash) { // returns the number of hosts that are referenced by this hosthash assert hosthash.length() == 6 : "hosthash = " + hosthash; if ( hosthash == null || hosthash.length() != 6 ) { return 0; } SortedMap tailMap; int c = 0; synchronized ( this.structure_old ) { tailMap = this.structure_old.tailMap(hosthash); if ( !tailMap.isEmpty() ) { final String key = tailMap.firstKey(); if ( key.startsWith(hosthash) ) { c = refstr2count(tailMap.get(key)); } } } synchronized ( this.structure_new ) { tailMap = this.structure_new.tailMap(hosthash); if ( !tailMap.isEmpty() ) { final String key = tailMap.firstKey(); if ( key.startsWith(hosthash) ) { c += refstr2count(tailMap.get(key)); } } } return c; } public String hostHash2hostName(final String hosthash) { // returns the host as string, null if unknown assert hosthash.length() == 6; SortedMap tailMap; synchronized ( this.structure_old ) { tailMap = this.structure_old.tailMap(hosthash); if ( !tailMap.isEmpty() ) { final String key = tailMap.firstKey(); if ( key.startsWith(hosthash) ) { return key.substring(7); } } } synchronized ( this.structure_new ) { tailMap = this.structure_new.tailMap(hosthash); if ( !tailMap.isEmpty() ) { final String key = tailMap.firstKey(); if ( key.startsWith(hosthash) ) { return key.substring(7); } } } return null; } private void learn(final DigestURI url, final StringBuilder reference /*string of b64(12digits)-hashes*/) { final String hosthash = ASCII.String(url.hash(), 6, 6); // parse the new reference string and join it with the stored references final StructureEntry structure = outgoingReferences(hosthash); final Map refs = (structure == null) ? new HashMap() : structure.references; assert reference.length() % 12 == 0 : "reference.length() = " + reference.length() + ", reference = " + reference.toString(); String dom; int c; for ( int i = 0; i < reference.length() / 12; i++ ) { dom = reference.substring(i * 12 + 6, (i + 1) * 12); c = 0; if ( refs.containsKey(dom) ) { c = (refs.get(dom)).intValue(); } refs.put(dom, Integer.valueOf(++c)); } // check if the maxref is exceeded if ( refs.size() > maxref ) { int shrink = refs.size() - (maxref * 9 / 10); delloop: while ( shrink > 0 ) { // shrink the references: the entry with the smallest number of references is removed int minrefcount = Integer.MAX_VALUE; String minrefkey = null; findloop: for ( final Map.Entry entry : refs.entrySet() ) { if ( entry.getValue().intValue() < minrefcount ) { minrefcount = entry.getValue().intValue(); minrefkey = entry.getKey(); } if ( minrefcount == 1 ) { break findloop; } } // remove the smallest if ( minrefkey == null ) { break delloop; } refs.remove(minrefkey); shrink--; } } // store the map back to the structure synchronized ( this.structure_new ) { this.structure_new.put(hosthash + "," + url.getHost(), map2refstr(refs)); } } private static void joinStructure(final TreeMap into, final TreeMap from) { for ( final Map.Entry e : from.entrySet() ) { if ( into.containsKey(e.getKey()) ) { final Map s0 = refstr2map(into.get(e.getKey())); final Map s1 = refstr2map(e.getValue()); for ( final Map.Entry r : s1.entrySet() ) { if ( s0.containsKey(r.getKey()) ) { s0.put(r.getKey(), s0.get(r.getKey()).intValue() + r.getValue().intValue()); } else { s0.put(r.getKey(), r.getValue().intValue()); } } into.put(e.getKey(), map2refstr(s0)); } else { into.put(e.getKey(), e.getValue()); } } } public void joinOldNew() { synchronized ( this.structure_new ) { joinStructure(this.structure_old, this.structure_new); this.structure_new.clear(); } } public String hostWithMaxReferences() { // find host with most references String maxhost = null; int refsize, maxref = 0; synchronized ( this.structure_old ) { for ( final Map.Entry entry : this.structure_old.entrySet() ) { refsize = entry.getValue().length(); if ( refsize > maxref ) { maxref = refsize; maxhost = entry.getKey().substring(7); } } } synchronized ( this.structure_new ) { for ( final Map.Entry entry : this.structure_new.entrySet() ) { refsize = entry.getValue().length(); if ( refsize > maxref ) { maxref = refsize; maxhost = entry.getKey().substring(7); } } } return maxhost; } public Iterator structureEntryIterator(final boolean latest) { return new StructureIterator(latest); } private class StructureIterator extends LookAheadIterator implements Iterator { private final Iterator> i; private StructureIterator(final boolean latest) { this.i = ((latest) ? WebStructureGraph.this.structure_new : WebStructureGraph.this.structure_old) .entrySet() .iterator(); } @Override public StructureEntry next0() { Map.Entry entry = null; String dom = null, ref = ""; while ( this.i.hasNext() ) { entry = this.i.next(); ref = entry.getValue(); if ( (ref.length() - 8) % 10 != 0 ) { continue; } dom = entry.getKey(); if ( dom.length() >= 8 ) { break; } dom = null; } if ( entry == null || dom == null ) { return null; } assert (ref.length() - 8) % 10 == 0 : "refs = " + ref + ", length = " + ref.length(); return new StructureEntry( dom.substring(0, 6), dom.substring(7), ref.substring(0, 8), refstr2map(ref)); } } public static class StructureEntry { public String hosthash; // the tail of the host hash public String hostname; // the host name public String date; // date of latest change public Map references; // a map from the referenced host hash to the number of referenced to that host private StructureEntry( final String hosthash, final String hostname, final String date, final Map references) { this.hosthash = hosthash; this.hostname = hostname; this.date = date; this.references = references; } } public synchronized void close() { // finish dns resolving queue if ( this.publicRefDNSResolvingWorker.isAlive() ) { log.logInfo("Waiting for the DNS Resolving Queue to terminate"); try { this.publicRefDNSResolvingQueue.put(leanrefObjectPOISON); this.publicRefDNSResolvingWorker.join(5000); } catch ( final InterruptedException e ) { } } // save to web structure file log.logInfo("Saving Web Structure File: new = " + this.structure_new.size() + " entries, old = " + this.structure_old.size() + " entries"); final long time = System.currentTimeMillis(); joinOldNew(); if ( this.structure_old.size() > 0 ) { synchronized ( this.structure_old ) { if ( this.structure_old.size() > 0 ) { FileUtils .saveMap( this.structureFile, this.structure_old, "Web Structure Syntax: ',' to {}*"); final long t = Math.max(1, System.currentTimeMillis() - time); log.logInfo("Saved Web Structure File: " + this.structure_old.size() + " entries in " + t + " milliseconds, " + (this.structure_old.size() * 1000 / t) + " entries/second"); } this.structure_old.clear(); } } } }