Added web structure analysis for a specific domain, which can be requested from the API.

Example:
http://localhost:8080/api/webstructure.xml?about=www.yacy.net
returns an XML document with the following content:

<?xml version="1.0"?>
<webstructure>
<domains reference="reverse" count="1" maxref="300">
<domain host="www.yacy.net" id="FXg39Q" date="20090401">
  <citation host="java.sun.com" id="o-R3yY" count="1" />
  <citation host="yacy-suche.de" id="-KCLaB" count="1" />
  <citation host="suma-ev.de" id="VRAHIA" count="1" />
  <citation host="www.kit.edu" id="EMaLDQ" count="1" />
  <citation host="yacy.net" id="Fh1hyQ" count="1" />
  <citation host="www.fzk.de" id="V2Kl-A" count="1" />
  <citation host="en.wikipedia.org" id="rwtdfR" count="3" />
  <citation host="vimeo.com" id="MmdQDY" count="3" />
  <citation host="liebel.fzk.de" id="sX4ozA" count="6" />
</domain>
</domains>
</webstructure>


git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@5766 6c8d7289-2bf4-0310-a012-ef5d649a1542
This commit is contained in:
orbiter 2009-04-01 14:53:23 +00:00
parent b6c2167143
commit bd409fb7ba
3 changed files with 78 additions and 34 deletions

View File

@ -27,6 +27,7 @@
import java.net.MalformedURLException;
import java.util.ArrayList;
import java.util.Date;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;
@ -124,7 +125,8 @@ public class WebStructurePicture_p {
if (nextlayer == maxlayer) return mynodes;
nextlayer++;
final double radius = 1.0 / (1 << nextlayer);
final Map<String, Integer> next = structure.references(centerhash);
plasmaWebStructure.structureEntry sr = structure.references(centerhash);
final Map<String, Integer> next = (sr == null) ? new HashMap<String, Integer>() : sr.references;
Map.Entry<String, Integer> entry;
String targethash, targethost;
// first set points to next hosts

View File

@ -22,6 +22,7 @@
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
import java.net.MalformedURLException;
import java.util.Iterator;
import java.util.Map;
@ -30,6 +31,7 @@ import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.plasma.plasmaWebStructure;
import de.anomic.server.serverObjects;
import de.anomic.server.serverSwitch;
import de.anomic.yacy.yacyURL;
public class webstructure {
@ -37,39 +39,67 @@ public class webstructure {
final serverObjects prop = new serverObjects();
final plasmaSwitchboard sb = (plasmaSwitchboard) env;
final boolean latest = ((post == null) ? false : post.containsKey("latest"));
final Iterator<plasmaWebStructure.structureEntry> i = sb.webStructure.structureEntryIterator(latest);
int c = 0, d;
plasmaWebStructure.structureEntry sentry;
Map.Entry<String, Integer> refentry;
String refdom, refhash;
Integer refcount;
Iterator<Map.Entry<String, Integer>> k;
while (i.hasNext()) {
sentry = i.next();
prop.put("domains_" + c + "_hash", sentry.domhash);
prop.put("domains_" + c + "_domain", sentry.domain);
prop.put("domains_" + c + "_date", sentry.date);
k = sentry.references.entrySet().iterator();
d = 0;
refloop: while (k.hasNext()) {
refentry = k.next();
refhash = refentry.getKey();
refdom = sb.webStructure.resolveDomHash2DomString(refhash);
if (refdom == null) continue refloop;
prop.put("domains_" + c + "_citations_" + d + "_refhash", refhash);
prop.put("domains_" + c + "_citations_" + d + "_refdom", refdom);
refcount = refentry.getValue();
prop.put("domains_" + c + "_citations_" + d + "_refcount", refcount.intValue());
d++;
String about = ((post == null) ? null : post.get("about", null));
if (about != null) {
yacyURL url = null;
if (about.length() > 6) {
try {
url = new yacyURL(about, null);
about = url.hash().substring(6);
} catch (MalformedURLException e) {
about = null;
}
}
prop.put("domains_" + c + "_citations", d);
c++;
if (about != null) {
plasmaWebStructure.structureEntry sentry = sb.webStructure.references(about);
if (sentry != null) {
reference(prop, 0, sentry, sb.webStructure);
prop.put("domains", 1);
} else {
prop.put("domains", 0);
}
} else {
prop.put("domains", 0);
}
} else {
final Iterator<plasmaWebStructure.structureEntry> i = sb.webStructure.structureEntryIterator(latest);
int c = 0;
plasmaWebStructure.structureEntry sentry;
while (i.hasNext()) {
sentry = i.next();
reference(prop, c, sentry, sb.webStructure);
c++;
}
prop.put("domains", c);
if (latest) sb.webStructure.joinOldNew();
}
prop.put("domains", c);
prop.put("maxref", plasmaWebStructure.maxref);
if (latest) sb.webStructure.joinOldNew();
// return rewrite properties
return prop;
}
/**
 * Writes one structure entry into the template properties.
 *
 * For the entry at index {@code c} this sets the hash, domain and date
 * properties, then one group of refhash/refdom/refcount properties per
 * reference, and finally the number of citations that were written.
 * References whose hash cannot be resolved to a domain string
 * ({@code resolveDomHash2DomString} returns null) are skipped and do not
 * consume a citation index.
 *
 * @param prop   the property container that receives the values
 * @param c      index of this domain inside the "domains" list
 * @param sentry the structure entry to be written
 * @param ws     the web structure used to resolve reference hashes to domain names
 */
public static void reference(serverObjects prop, int c, plasmaWebStructure.structureEntry sentry, plasmaWebStructure ws) {
    // all keys of this entry share the same "domains_<c>" prefix
    final String domainKey = "domains_" + c;
    prop.put(domainKey + "_hash", sentry.domhash);
    prop.put(domainKey + "_domain", sentry.domain);
    prop.put(domainKey + "_date", sentry.date);

    int written = 0;
    for (final Map.Entry<String, Integer> citation : sentry.references.entrySet()) {
        final String citedHash = citation.getKey();
        final String citedDomain = ws.resolveDomHash2DomString(citedHash);
        if (citedDomain != null) {
            final String citationKey = domainKey + "_citations_" + written;
            prop.put(citationKey + "_refhash", citedHash);
            prop.put(citationKey + "_refdom", citedDomain);
            prop.put(citationKey + "_refcount", citation.getValue().intValue());
            written++;
        }
    }

    // number of citations actually emitted (unresolvable hashes excluded)
    prop.put(domainKey + "_citations", written);
}
}

View File

@ -221,17 +221,23 @@ public class plasmaWebStructure {
return s.toString();
}
public Map<String, Integer> references(final String domhash) {
public structureEntry references(final String domhash) {
// returns a map with a domhash(String):refcount(Integer) relation
assert domhash.length() == 6;
SortedMap<String, String> tailMap;
Map<String, Integer> h = new HashMap<String, Integer>();
String domain = "";
String date = "";
String ref;
synchronized (structure_old) {
tailMap = structure_old.tailMap(domhash);
if (!tailMap.isEmpty()) {
final String key = tailMap.firstKey();
if (key.startsWith(domhash)) {
h = refstr2map(tailMap.get(key));
domain = key.substring(7);
ref = tailMap.get(key);
date = ref.substring(0, 8);
h = refstr2map(ref);
}
}
}
@ -240,11 +246,16 @@ public class plasmaWebStructure {
if (!tailMap.isEmpty()) {
final String key = tailMap.firstKey();
if (key.startsWith(domhash)) {
h.putAll(refstr2map(tailMap.get(key)));
ref = tailMap.get(key);
if (domain.length() == 0) domain = key.substring(7);
if (date.length() == 0) date = ref.substring(0, 8);
assert domain == key.substring(7) : "domain = " + domain + ", key = " + key;
h.putAll(refstr2map(ref));
}
}
}
return h;
if (h.size() == 0) return null;
return new structureEntry(domhash, domain, date, h);
}
public int referencesCount(final String domhash) {
@ -302,7 +313,8 @@ public class plasmaWebStructure {
final String domhash = url.hash().substring(6);
// parse the new reference string and join it with the stored references
final Map<String, Integer> refs = references(domhash);
structureEntry structure = references(domhash);
final Map<String, Integer> refs = (structure == null) ? new HashMap<String, Integer>() : structure.references;
assert reference.length() % 12 == 0;
String dom;
int c;