mirror of
https://github.com/yacy/yacy_search_server.git
synced 2024-09-21 00:00:13 +02:00
daf0f74361
Search profiling showed that a major amount of time is wasted computing URL hashes. The computation does an intranet check, which needs a DNS lookup. As a result, each URL hash computation needed 100-200 milliseconds, which delayed remote searches by at least 1 second more than necessary. The solution to this problem is to attach the URL hash to the URL data structure, so that the hash value can be filled in after the URL has been retrieved from the database. This redesign of the url/urlhash management required a major redesign of many parts of the software. Since it had already been decided to give up some parts, they were removed during this change to avoid unnecessary maintenance of unused code. git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@4074 6c8d7289-2bf4-0310-a012-ef5d649a1542
413 lines
16 KiB
Java
413 lines
16 KiB
Java
// plasmaWebStructure.java
|
|
// -----------------------------
|
|
// (C) 2007 by Michael Peter Christen; mc@anomic.de, Frankfurt a. M., Germany
|
|
// first published 15.05.2007 on http://yacy.net
|
|
//
|
|
// This is a part of YaCy, a peer-to-peer based web search engine
|
|
//
|
|
// $LastChangedDate: 2006-04-02 22:40:07 +0200 (So, 02 Apr 2006) $
|
|
// $LastChangedRevision: 1986 $
|
|
// $LastChangedBy: orbiter $
|
|
//
|
|
// LICENSE
|
|
//
|
|
// This program is free software; you can redistribute it and/or modify
|
|
// it under the terms of the GNU General Public License as published by
|
|
// the Free Software Foundation; either version 2 of the License, or
|
|
// (at your option) any later version.
|
|
//
|
|
// This program is distributed in the hope that it will be useful,
|
|
// but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
// GNU General Public License for more details.
|
|
//
|
|
// You should have received a copy of the GNU General Public License
|
|
// along with this program; if not, write to the Free Software
|
|
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
|
|
|
package de.anomic.plasma;
|
|
|
|
import java.io.File;
|
|
import java.io.IOException;
|
|
import java.net.MalformedURLException;
|
|
import java.util.ConcurrentModificationException;
|
|
import java.util.Date;
|
|
import java.util.Iterator;
|
|
import java.util.Map;
|
|
import java.util.HashMap;
|
|
import java.util.TreeMap;
|
|
import java.util.SortedMap;
|
|
import java.util.TreeSet;
|
|
|
|
import de.anomic.kelondro.kelondroBase64Order;
|
|
import de.anomic.server.serverDate;
|
|
import de.anomic.server.serverFileUtils;
|
|
import de.anomic.server.logging.serverLog;
|
|
import de.anomic.yacy.yacyURL;
|
|
|
|
public class plasmaWebStructure {
|
|
|
|
public static int maxCRLDump = 500000;
|
|
public static int maxCRGDump = 200000;
|
|
public static int maxref = 200; // maximum number of references, to avoid overflow when a large link farm occurs (i.e. wikipedia)
|
|
public static int maxhosts = 4000; // maximum number of hosts in web structure map
|
|
|
|
private StringBuffer crg; // global citation references
|
|
private serverLog log;
|
|
private File rankingPath, structureFile;
|
|
private String crlFile, crgFile;
|
|
private TreeMap structure; // String2String with <b64hash(6)>','<host> to <date-yyyymmdd(8)>{<target-b64hash(6)><target-count-hex(4)>}*
|
|
|
|
public plasmaWebStructure(serverLog log, File rankingPath, String crlFile, String crgFile, File structureFile) {
|
|
this.log = log;
|
|
this.rankingPath = rankingPath;
|
|
this.crlFile = crlFile;
|
|
this.crgFile = crgFile;
|
|
this.crg = new StringBuffer(maxCRGDump);
|
|
this.structure = new TreeMap();
|
|
this.structureFile = structureFile;
|
|
|
|
// load web structure
|
|
Map loadedStructure = serverFileUtils.loadHashMap(this.structureFile);
|
|
if (loadedStructure != null) this.structure.putAll(loadedStructure);
|
|
|
|
// delete outdated entries in case the structure is too big
|
|
if (this.structure.size() > maxhosts) {
|
|
// fill a set with last-modified - dates of the structure
|
|
TreeSet delset = new TreeSet();
|
|
Map.Entry entry;
|
|
Iterator i = this.structure.entrySet().iterator();
|
|
String key, value;
|
|
while (i.hasNext()) {
|
|
entry = (Map.Entry) i.next();
|
|
key = (String) entry.getKey();
|
|
value = (String) entry.getValue();
|
|
delset.add(value.substring(0, 8) + key);
|
|
}
|
|
int delcount = this.structure.size() - (maxhosts * 9 / 10);
|
|
i = delset.iterator();
|
|
while ((delcount > 0) && (i.hasNext())) {
|
|
this.structure.remove(((String) i.next()).substring(8));
|
|
delcount--;
|
|
}
|
|
}
|
|
}
|
|
|
|
public Integer[] /*(outlinksSame, outlinksOther)*/ generateCitationReference(yacyURL url, String baseurlhash, Date docDate, plasmaParserDocument document, plasmaCondenser condenser) {
|
|
assert url.hash().equals(baseurlhash);
|
|
|
|
// generate citation reference
|
|
Map hl = document.getHyperlinks();
|
|
Iterator it = hl.entrySet().iterator();
|
|
String nexturlhash;
|
|
StringBuffer cpg = new StringBuffer(12 * (hl.size() + 1) + 1);
|
|
StringBuffer cpl = new StringBuffer(12 * (hl.size() + 1) + 1);
|
|
String lhp = baseurlhash.substring(6); // local hash part
|
|
int GCount = 0;
|
|
int LCount = 0;
|
|
while (it.hasNext()) {
|
|
try {
|
|
nexturlhash = (new yacyURL((String) ((Map.Entry) it.next()).getKey(), null)).hash();
|
|
if (nexturlhash != null) {
|
|
if (nexturlhash.substring(6).equals(lhp)) {
|
|
// this is a inbound link
|
|
cpl.append(nexturlhash.substring(0, 6)); // store only local part
|
|
LCount++;
|
|
} else {
|
|
// this is a outbound link
|
|
cpg.append(nexturlhash); // store complete hash
|
|
GCount++;
|
|
}
|
|
}
|
|
} catch (MalformedURLException e) {}
|
|
}
|
|
|
|
// append this reference to buffer
|
|
// generate header info
|
|
String head = baseurlhash + "=" +
|
|
plasmaWordIndex.microDateHoursStr(docDate.getTime()) + // latest update timestamp of the URL
|
|
plasmaWordIndex.microDateHoursStr(System.currentTimeMillis()) + // last visit timestamp of the URL
|
|
kelondroBase64Order.enhancedCoder.encodeLongSmart(LCount, 2) + // count of links to local resources
|
|
kelondroBase64Order.enhancedCoder.encodeLongSmart(GCount, 2) + // count of links to global resources
|
|
kelondroBase64Order.enhancedCoder.encodeLongSmart(document.getImages().size(), 2) + // count of Images in document
|
|
kelondroBase64Order.enhancedCoder.encodeLongSmart(0, 2) + // count of links to other documents
|
|
kelondroBase64Order.enhancedCoder.encodeLongSmart(document.getTextLength(), 3) + // length of plain text in bytes
|
|
kelondroBase64Order.enhancedCoder.encodeLongSmart(condenser.RESULT_NUMB_WORDS, 3) + // count of all appearing words
|
|
kelondroBase64Order.enhancedCoder.encodeLongSmart(condenser.words().size(), 3) + // count of all unique words
|
|
kelondroBase64Order.enhancedCoder.encodeLongSmart(0, 1); // Flags (update, popularity, attention, vote)
|
|
|
|
//crl.append(head); crl.append ('|'); crl.append(cpl); crl.append((char) 13); crl.append((char) 10);
|
|
crg.append(head); crg.append('|'); crg.append(cpg); crg.append((char) 13); crg.append((char) 10);
|
|
|
|
learn(url, cpg);
|
|
|
|
// if buffer is full, flush it.
|
|
/*
|
|
if (crl.length() > maxCRLDump) {
|
|
flushCitationReference(crl, "crl");
|
|
crl = new StringBuffer(maxCRLDump);
|
|
}
|
|
**/
|
|
if (crg.length() > maxCRGDump) {
|
|
flushCitationReference("crg");
|
|
crg = new StringBuffer(maxCRGDump);
|
|
}
|
|
|
|
return new Integer[] {new Integer(LCount), new Integer(GCount)};
|
|
}
|
|
|
|
public void flushCitationReference(String type) {
|
|
if (crg.length() < 12) return;
|
|
String filename = type.toUpperCase() + "-A-" + new serverDate().toShortString(true) + "." + crg.substring(0, 12) + ".cr.gz";
|
|
File path = new File(rankingPath, (type.equals("crl")) ? crlFile : crgFile);
|
|
path.mkdirs();
|
|
File file = new File(path, filename);
|
|
|
|
// generate header
|
|
StringBuffer header = new StringBuffer(200);
|
|
header.append("# Name=YaCy " + ((type.equals("crl")) ? "Local" : "Global") + " Citation Reference Ticket"); header.append((char) 13); header.append((char) 10);
|
|
header.append("# Created=" + System.currentTimeMillis()); header.append((char) 13); header.append((char) 10);
|
|
header.append("# Structure=<Referee-12>,'=',<UDate-3>,<VDate-3>,<LCount-2>,<GCount-2>,<ICount-2>,<DCount-2>,<TLength-3>,<WACount-3>,<WUCount-3>,<Flags-1>,'|',*<Anchor-" + ((type.equals("crl")) ? "6" : "12") + ">"); header.append((char) 13); header.append((char) 10);
|
|
header.append("# ---"); header.append((char) 13); header.append((char) 10);
|
|
crg.insert(0, header.toString());
|
|
try {
|
|
serverFileUtils.writeAndGZip(crg.toString().getBytes(), file);
|
|
log.logFine("wrote citation reference dump " + file.toString());
|
|
} catch (IOException e) {
|
|
e.printStackTrace();
|
|
}
|
|
}
|
|
|
|
private static int refstr2count(String refs) {
|
|
if ((refs == null) || (refs.length() <= 8)) return 0;
|
|
assert (refs.length() - 8) % 10 == 0;
|
|
return (refs.length() - 8) / 10;
|
|
}
|
|
|
|
private static Map refstr2map(String refs) {
|
|
if ((refs == null) || (refs.length() <= 8)) return new HashMap();
|
|
Map map = new HashMap();
|
|
String c;
|
|
int refsc = refstr2count(refs);
|
|
for (int i = 0; i < refsc; i++) {
|
|
c = refs.substring(8 + i * 10, 8 + (i + 1) * 10);
|
|
map.put(c.substring(0, 6), new Integer(Integer.parseInt(c.substring(6), 16)));
|
|
}
|
|
return map;
|
|
}
|
|
|
|
private static String map2refstr(Map map) {
|
|
StringBuffer s = new StringBuffer(map.size() * 10);
|
|
s.append(serverDate.shortDayFormatter.format(new Date()));
|
|
Iterator i = map.entrySet().iterator();
|
|
Map.Entry entry;
|
|
String h;
|
|
while (i.hasNext()) {
|
|
entry = (Map.Entry) i.next();
|
|
s.append((String) entry.getKey());
|
|
h = Integer.toHexString(((Integer) entry.getValue()).intValue());
|
|
if (h.length() == 0) {
|
|
s.append("0000");
|
|
} else if (h.length() == 1) {
|
|
s.append("000").append(h);
|
|
} else if (h.length() == 2) {
|
|
s.append("00").append(h);
|
|
} else if (h.length() == 3) {
|
|
s.append('0').append(h);
|
|
} else if (h.length() == 4) {
|
|
s.append(h);
|
|
} else {
|
|
s.append("FFFF");
|
|
}
|
|
}
|
|
return s.toString();
|
|
}
|
|
|
|
public Map references(String domhash) {
|
|
// returns a map with a domhash(String):refcount(Integer) relation
|
|
assert domhash.length() == 6;
|
|
SortedMap tailMap = structure.tailMap(domhash);
|
|
if ((tailMap == null) || (tailMap.size() == 0)) return new HashMap();
|
|
String key = (String) tailMap.firstKey();
|
|
if (key.startsWith(domhash)) {
|
|
return refstr2map((String) tailMap.get(key));
|
|
} else {
|
|
return new HashMap();
|
|
}
|
|
}
|
|
|
|
public int referencesCount(String domhash) {
|
|
// returns the number of domains that are referenced by this domhash
|
|
assert domhash.length() == 6 : "domhash = " + domhash;
|
|
try {
|
|
SortedMap tailMap = structure.tailMap(domhash);
|
|
if ((tailMap == null) || (tailMap.size() == 0)) return 0;
|
|
String key = (String) tailMap.firstKey();
|
|
if (key.startsWith(domhash)) {
|
|
return refstr2count((String) tailMap.get(key));
|
|
} else {
|
|
return 0;
|
|
}
|
|
} catch (ConcurrentModificationException e) {
|
|
return 0;
|
|
}
|
|
}
|
|
|
|
public String resolveDomHash2DomString(String domhash) {
|
|
// returns the domain as string, null if unknown
|
|
assert domhash.length() == 6;
|
|
try {
|
|
SortedMap tailMap = structure.tailMap(domhash);
|
|
if ((tailMap == null) || (tailMap.size() == 0)) return null;
|
|
String key = (String) tailMap.firstKey();
|
|
if (key.startsWith(domhash)) {
|
|
return key.substring(7);
|
|
} else {
|
|
return null;
|
|
}
|
|
} catch (ConcurrentModificationException e) {
|
|
// we dont want to implement a synchronization here,
|
|
// because this is 'only' used for a graphics application
|
|
// just return null
|
|
return null;
|
|
}
|
|
}
|
|
|
|
private void learn(yacyURL url, StringBuffer reference /*string of b64(12digits)-hashes*/) {
|
|
String domhash = url.hash().substring(6);
|
|
|
|
// parse the new reference string and join it with the stored references
|
|
Map refs = references(domhash);
|
|
assert reference.length() % 12 == 0;
|
|
String dom;
|
|
int c;
|
|
for (int i = 0; i < reference.length() / 12; i++) {
|
|
dom = reference.substring(i * 12 + 6, (i + 1) * 12);
|
|
c = 0;
|
|
if (refs.containsKey(dom)) {
|
|
c = ((Integer) refs.get(dom)).intValue();
|
|
}
|
|
refs.put(dom, new Integer(++c));
|
|
}
|
|
|
|
// check if the maxref is exceeded
|
|
if (refs.size() > maxref) {
|
|
int shrink = refs.size() - (maxref * 9 / 10);
|
|
delloop: while (shrink > 0) {
|
|
// shrink the references: the entry with the smallest number of references is removed
|
|
int minrefcount = Integer.MAX_VALUE;
|
|
String minrefkey = null;
|
|
Iterator i = refs.entrySet().iterator();
|
|
Map.Entry entry;
|
|
findloop: while (i.hasNext()) {
|
|
entry = (Map.Entry) i.next();
|
|
if (((Integer) entry.getValue()).intValue() < minrefcount) {
|
|
minrefcount = ((Integer) entry.getValue()).intValue();
|
|
minrefkey = (String) entry.getKey();
|
|
}
|
|
if (minrefcount == 1) break findloop;
|
|
}
|
|
// remove the smallest
|
|
if (minrefkey == null) break delloop;
|
|
refs.remove(minrefkey);
|
|
shrink--;
|
|
}
|
|
}
|
|
|
|
// store the map back to the structure
|
|
structure.put(domhash + "," + url.getHost(), map2refstr(refs));
|
|
}
|
|
|
|
public void saveWebStructure() {
|
|
try {
|
|
serverFileUtils.saveMap(this.structureFile, this.structure, "Web Structure Syntax: <b64hash(6)>','<host> to <date-yyyymmdd(8)>{<target-b64hash(6)><target-count-hex(4)>}*");
|
|
} catch (IOException e) {
|
|
e.printStackTrace();
|
|
}
|
|
}
|
|
|
|
public String hostWithMaxReferences() {
|
|
// find domain with most references
|
|
Iterator i = structure.entrySet().iterator();
|
|
int refsize, maxref = 0;
|
|
String maxhost = null;
|
|
Map.Entry entry;
|
|
while (i.hasNext()) {
|
|
entry = (Map.Entry) i.next();
|
|
refsize = ((String) entry.getValue()).length();
|
|
if (refsize > maxref) {
|
|
maxref = refsize;
|
|
maxhost = ((String) entry.getKey()).substring(7);
|
|
}
|
|
}
|
|
return maxhost;
|
|
}
|
|
|
|
public Iterator structureEntryIterator() {
|
|
// iterates objects of type structureEntry
|
|
return new structureIterator();
|
|
}
|
|
|
|
public class structureIterator implements Iterator {
|
|
|
|
private Iterator i;
|
|
private structureEntry nextentry;
|
|
|
|
public structureIterator() {
|
|
i = structure.entrySet().iterator();
|
|
next0();
|
|
}
|
|
|
|
public boolean hasNext() {
|
|
return nextentry != null;
|
|
}
|
|
|
|
private void next0() {
|
|
Map.Entry entry = null;
|
|
String dom = null, ref;
|
|
while (i.hasNext()) {
|
|
entry = (Map.Entry) i.next();
|
|
dom = (String) entry.getKey();
|
|
if (dom.length() >= 8) break;
|
|
if (!i.hasNext()) {
|
|
nextentry = null;
|
|
return;
|
|
}
|
|
}
|
|
if ((entry == null) || (dom == null)) {
|
|
nextentry = null;
|
|
return;
|
|
}
|
|
ref = (String) entry.getValue();
|
|
nextentry = new structureEntry(dom.substring(0, 6), dom.substring(7), ref.substring(0, 8), refstr2map(ref));
|
|
}
|
|
|
|
public Object next() {
|
|
structureEntry r = nextentry;
|
|
next0();
|
|
return r;
|
|
}
|
|
|
|
public void remove() {
|
|
throw new UnsupportedOperationException("not implemented");
|
|
}
|
|
|
|
}
|
|
|
|
/**
 * Plain container for one host of the web structure.
 */
public class structureEntry {

    // hash of the domain
    public String domhash;

    // name of the domain
    public String domain;

    // date of the entry
    public String date;

    // the references of the domain
    public Map references;

    /**
     * Stores the given values in the fields of the same name.
     */
    public structureEntry(String domhash, String domain, String date, Map references) {
        this.references = references;
        this.date = date;
        this.domain = domain;
        this.domhash = domhash;
    }
}
|
|
|
|
public void close() {
|
|
log.logInfo("Saving Web Structure File");
|
|
saveWebStructure();
|
|
}
|
|
}
|