mirror of
https://github.com/yacy/yacy_search_server.git
synced 2024-09-19 00:01:41 +02:00
e048e74072
This new "documentStructure" parameter can be set to false to only get hosts accumulated references on a resource and thus prevent scraping the specified URL and getting citations references. Also set WebStructureGraph constants as final and updated the Javadoc with example api call URLs.
315 lines
15 KiB
Java
315 lines
15 KiB
Java
// webstructure.java
|
|
// ------------
|
|
// (C) 2009 by Michael Peter Christen; mc@yacy.net
|
|
// first published 01.05.2008 on http://yacy.net
|
|
//
|
|
// $LastChangedDate: 2009-03-16 19:08:43 +0100 (Mo, 16 Mrz 2009) $
|
|
// $LastChangedRevision: 5723 $
|
|
// $LastChangedBy: borg-0300 $
|
|
//
|
|
// This program is free software; you can redistribute it and/or modify
|
|
// it under the terms of the GNU General Public License as published by
|
|
// the Free Software Foundation; either version 2 of the License, or
|
|
// (at your option) any later version.
|
|
//
|
|
// This program is distributed in the hope that it will be useful,
|
|
// but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
// GNU General Public License for more details.
|
|
//
|
|
// You should have received a copy of the GNU General Public License
|
|
// along with this program; if not, write to the Free Software
|
|
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
|
|
|
|
|
import java.io.IOException;
|
|
import java.net.MalformedURLException;
|
|
import java.util.Date;
|
|
import java.util.HashSet;
|
|
import java.util.Iterator;
|
|
import java.util.Map;
|
|
import java.util.Set;
|
|
|
|
import net.yacy.cora.date.GenericFormatter;
|
|
import net.yacy.cora.document.encoding.ASCII;
|
|
import net.yacy.cora.document.id.DigestURL;
|
|
import net.yacy.cora.federate.yacy.CacheStrategy;
|
|
import net.yacy.cora.order.Base64Order;
|
|
import net.yacy.cora.protocol.ClientIdentification;
|
|
import net.yacy.cora.protocol.RequestHeader;
|
|
import net.yacy.cora.storage.HandleSet;
|
|
import net.yacy.cora.util.ConcurrentLog;
|
|
import net.yacy.cora.util.SpaceExceededException;
|
|
import net.yacy.peers.graphics.WebStructureGraph;
|
|
import net.yacy.search.Switchboard;
|
|
import net.yacy.search.index.Segment.ReferenceReport;
|
|
import net.yacy.search.index.Segment.ReferenceReportCache;
|
|
import net.yacy.server.serverObjects;
|
|
import net.yacy.server.serverSwitch;
|
|
|
|
/**
|
|
* Retrieval of a web links structure.
|
|
*/
|
|
public class webstructure {
|
|
|
|
/**
|
|
* <p>Retrieve the locally known web links structure of a specified resource ("about" parameter supplied) or
|
|
* the whole computed links structure since install (no parameter supplied)
|
|
* or since last start or last call ("latest" parameter supplied).</p>
|
|
* <p>Returned object contains the following information :
|
|
* <ul>
|
|
* <li>in all cases :
|
|
* <ul>
|
|
* <li>accumulated list of outgoing links to other domains (per host accumulated anchors)</li>
|
|
* </ul>
|
|
* </li>
|
|
* <li>when "about" parameter is filled :
|
|
* <ul>
|
|
* <li>accumulated list of incoming links from other domains (per host accumulated references)</li>
|
|
* <li>detailed list of outgoing links (anchors) from document at "about" URL to references</li>
|
|
* <li>detailed list of incoming links (citations) from other documents (their references) - reverse link structure</li>
|
|
* </ul>
|
|
* </li>
|
|
* </ul>
|
|
* <p>
|
|
* Remarks :
|
|
* <ul>
|
|
* <li>Information detail is limited by {@link WebStructureGraph#maxhosts}, {@link WebStructureGraph#maxref} and {@link WebStructureGraph#MAX_PARSED_ANCHORS} constants.</li>
|
|
* <li>Requesting client must be authenticated (as admin or requesting from localhost enabled) otherwise results will be empty</li>
|
|
* </ul>
|
|
* </p>
|
|
*
|
|
* <p>
|
|
* Example API calls :
|
|
* <ul>
|
|
* <li>domain name and index page structure : http://localhost:8090/api/webstructure.xml?about=yacy.net</li>
|
|
* <li>domain name structure : http://localhost:8090/api/webstructure.xml?about=yacy.net&documentStructure=false</li>
|
|
* <li>hosts accumulated structure and specific resource structure : http://localhost:8090/api/webstructure.xml?about=http://yacy.net/fr/API.html</li>
|
|
* <li>whole locally known hosts web structure : http://localhost:8090/api/webstructure.xml</li>
|
|
* <li>recently locally computed hosts web structure : http://localhost:8090/api/webstructure.xml?latest=</li>
|
|
* </ul>
|
|
* </p>
|
|
*
|
|
*
|
|
* @param header
|
|
* servlet request header
|
|
* @param post
|
|
* request parameters. Supported keys :
|
|
* <ul>
|
|
* <li>about : get only links structure about the resource
|
|
* specified as value. Supported values : host hash, URL hash,
|
|
* host name or URL</li>
|
|
* <li>latest (ignored when about parameter is valued): get the structure that have been computed during
|
|
* the current run-time of YaCy, and with each next call only an
|
|
* update to the next list of references.</li>
|
|
* <li>agentName : name of the user agent string used to load the "about" resource</li>
|
|
* <li>documentStructure : set to false when you only want the hosts accumulated references for the "about" resource</li>
|
|
* </ul>
|
|
* @param env
|
|
* server environment
|
|
* @return the servlet answer object
|
|
* @see WebStructureGraph
|
|
*/
|
|
public static serverObjects respond(final RequestHeader header, final serverObjects post, final serverSwitch env) {
|
|
final serverObjects prop = new serverObjects();
|
|
final Switchboard sb = (Switchboard) env;
|
|
String about = post == null ? null : post.get("about", null); // may be a URL, a URL hash or a domain hash
|
|
prop.put("out", 0);
|
|
prop.put("in", 0);
|
|
prop.put("references", 0);
|
|
prop.put("citations", 0);
|
|
boolean authenticated = sb.adminAuthenticated(header) >= 2;
|
|
if (about != null) {
|
|
DigestURL url = null;
|
|
byte[] urlhash = null;
|
|
Set<String> hostHashes = new HashSet<>();
|
|
if (about.length() == 6 && Base64Order.enhancedCoder.wellformed(ASCII.getBytes(about))) {
|
|
hostHashes.add(about);
|
|
} else if (about.length() == 12 && Base64Order.enhancedCoder.wellformed(ASCII.getBytes(about))) {
|
|
urlhash = ASCII.getBytes(about);
|
|
hostHashes.add(about.substring(6));
|
|
try {
|
|
url = authenticated ? sb.getURL(urlhash) : null;
|
|
} catch (IOException e) {
|
|
url = null;
|
|
ConcurrentLog.logException(e);
|
|
}
|
|
} else if (about.length() > 0) {
|
|
// consider "about" as url or hostname
|
|
try {
|
|
url = new DigestURL(about.indexOf("://") >= 0 ? about : "http://" + about); // accept also domains
|
|
urlhash = url.hash();
|
|
if(about.indexOf("://") >= 0) {
|
|
hostHashes.add(url.hosthash());
|
|
} else {
|
|
hostHashes.addAll(sb.webStructure.hostName2HostHashes(about));
|
|
}
|
|
} catch (final MalformedURLException e) {
|
|
}
|
|
}
|
|
if (!hostHashes.isEmpty()) {
|
|
prop.put("out", 1);
|
|
prop.put("in", 1);
|
|
int inCount = 0, outCount = 0;
|
|
for(final String hostHash: hostHashes) {
|
|
WebStructureGraph.StructureEntry sentry = sb.webStructure.outgoingReferences(hostHash);
|
|
if (sentry != null && sentry.references.size() > 0) {
|
|
reference(prop, "out", outCount, sentry, sb.webStructure);
|
|
outCount++;
|
|
} else {
|
|
prop.put("out_domains", 0);
|
|
}
|
|
sentry = sb.webStructure.incomingReferences(hostHash);
|
|
if (sentry != null && sentry.references.size() > 0) {
|
|
reference(prop, "in", inCount, sentry, sb.webStructure);
|
|
prop.put("in_domains", 1);
|
|
inCount++;
|
|
} else {
|
|
prop.put("in_domains", 0);
|
|
}
|
|
}
|
|
prop.put("out_domains", outCount);
|
|
prop.put("in_domains", inCount);
|
|
}
|
|
|
|
/*
|
|
* It is possible not to scrape document and look for citations by
|
|
* setting documentStructure parameter to "false"
|
|
*/
|
|
boolean documentStructure = true;
|
|
if (post != null && "false".equals(post.get("documentStructure", "true"))) {
|
|
documentStructure = false;
|
|
}
|
|
|
|
if (urlhash != null && documentStructure) {
|
|
// anchors
|
|
prop.put("references", 1);
|
|
net.yacy.document.Document scraper = null;
|
|
if (url != null) try {
|
|
ClientIdentification.Agent agent = ClientIdentification.getAgent(post.get("agentName", ClientIdentification.yacyInternetCrawlerAgentName));
|
|
scraper = sb.loader.loadDocument(url, CacheStrategy.IFEXIST, null, agent);
|
|
} catch (final IOException e) {
|
|
ConcurrentLog.logException(e);
|
|
}
|
|
if (scraper != null) {
|
|
prop.put("references_count", 1);
|
|
prop.put("references_documents", 1);
|
|
prop.put("references_documents_0_hash", urlhash);
|
|
prop.put("references_documents_0_count", scraper.inboundLinks().size() + scraper.outboundLinks().size());
|
|
prop.put("references_documents_0_date", GenericFormatter.SHORT_DAY_FORMATTER.format(new Date()));
|
|
prop.put("references_documents_0_urle", url == null ? 0 : 1);
|
|
if (url != null) prop.putXML("references_documents_0_urle_url", url.toNormalform(true));
|
|
int d = 0;
|
|
Iterator<DigestURL> i = scraper.inboundLinks().keySet().iterator();
|
|
while (i.hasNext()) {
|
|
DigestURL refurl = i.next();
|
|
byte[] refhash = refurl.hash();
|
|
prop.putXML("references_documents_0_anchors_" + d + "_url", refurl.toNormalform(true));
|
|
prop.put("references_documents_0_anchors_" + d + "_hash", refhash);
|
|
prop.put("references_documents_0_anchors_" + d + "_outbound", 0);
|
|
d++;
|
|
}
|
|
i = scraper.outboundLinks().keySet().iterator();
|
|
while (i.hasNext()) {
|
|
DigestURL refurl = i.next();
|
|
byte[] refhash = refurl.hash();
|
|
prop.putXML("references_documents_0_anchors_" + d + "_url", refurl.toNormalform(true));
|
|
prop.put("references_documents_0_anchors_" + d + "_hash", refhash);
|
|
prop.put("references_documents_0_anchors_" + d + "_outbound", 1);
|
|
d++;
|
|
}
|
|
prop.put("references_documents_0_count", d);
|
|
prop.put("references_documents_0_anchors", d);
|
|
} else {
|
|
prop.put("references_count", 0);
|
|
prop.put("references_documents", 0);
|
|
}
|
|
|
|
// citations
|
|
prop.put("citations", 1);
|
|
ReferenceReportCache rrc = sb.index.getReferenceReportCache();
|
|
ReferenceReport rr = null;
|
|
try {rr = rrc.getReferenceReport(ASCII.String(urlhash), true);} catch (IOException e) {}
|
|
if (rr != null && rr.getInternalCount() > 0 && rr.getExternalCount() > 0) {
|
|
prop.put("citations_count", 1);
|
|
prop.put("citations_documents", 1);
|
|
prop.put("citations_documents_0_hash", urlhash);
|
|
prop.put("citations_documents_0_count", rr.getInternalCount() + rr.getExternalCount());
|
|
prop.put("citations_documents_0_date", GenericFormatter.SHORT_DAY_FORMATTER.format(new Date())); // superfluous?
|
|
prop.put("citations_documents_0_urle", url == null ? 0 : 1);
|
|
if (url != null) prop.putXML("citations_documents_0_urle_url", url.toNormalform(true));
|
|
int d = 0;
|
|
HandleSet ids = rr.getInternallIDs();
|
|
try {ids.putAll(rr.getExternalIDs());} catch (SpaceExceededException e) {}
|
|
Iterator<byte[]> i = ids.iterator();
|
|
while (i.hasNext()) {
|
|
byte[] refhash = i.next();
|
|
DigestURL refurl;
|
|
try {
|
|
refurl = authenticated ? sb.getURL(refhash) : null;
|
|
prop.put("citations_documents_0_anchors_" + d + "_urle", refurl == null ? 0 : 1);
|
|
if (refurl != null) prop.putXML("citations_documents_0_anchors_" + d + "_urle_url", refurl.toNormalform(true));
|
|
prop.put("citations_documents_0_anchors_" + d + "_urle_hash", refhash);
|
|
prop.put("citations_documents_0_anchors_" + d + "_urle_date", GenericFormatter.SHORT_DAY_FORMATTER.format(new Date())); // superfluous?
|
|
d++;
|
|
} catch (IOException e) {
|
|
ConcurrentLog.logException(e);
|
|
}
|
|
}
|
|
prop.put("citations_documents_0_count", d);
|
|
prop.put("citations_documents_0_anchors", d);
|
|
} else {
|
|
prop.put("citations_count", 0);
|
|
prop.put("citations_documents", 0);
|
|
}
|
|
}
|
|
} else if (authenticated) {
|
|
// show a complete list of link structure informations in case that the user is authenticated
|
|
final boolean latest = ((post == null) ? false : post.containsKey("latest"));
|
|
final Iterator<WebStructureGraph.StructureEntry> i = sb.webStructure.structureEntryIterator(latest);
|
|
int c = 0;
|
|
WebStructureGraph.StructureEntry sentry;
|
|
while (i.hasNext()) {
|
|
sentry = i.next();
|
|
reference(prop, "out", c, sentry, sb.webStructure);
|
|
c++;
|
|
}
|
|
prop.put("out_domains", c);
|
|
prop.put("out", 1);
|
|
if (latest) sb.webStructure.joinOldNew();
|
|
} else {
|
|
// not-authenticated users show nothing
|
|
prop.put("out_domains", 0);
|
|
prop.put("out", 1);
|
|
}
|
|
prop.put("out_maxref", WebStructureGraph.maxref);
|
|
prop.put("maxhosts", WebStructureGraph.maxhosts);
|
|
|
|
// return rewrite properties
|
|
return prop;
|
|
}
|
|
|
|
public static void reference(serverObjects prop, String prefix, int c, WebStructureGraph.StructureEntry sentry, WebStructureGraph ws) {
|
|
prop.put(prefix + "_domains_" + c + "_hash", sentry.hosthash);
|
|
prop.putXML(prefix + "_domains_" + c + "_domain", sentry.hostname);
|
|
prop.put(prefix + "_domains_" + c + "_date", sentry.date);
|
|
Iterator<Map.Entry<String, Integer>> k = sentry.references.entrySet().iterator();
|
|
Map.Entry<String, Integer> refentry;
|
|
String refdom, refhash;
|
|
Integer refcount;
|
|
int d = 0;
|
|
refloop: while (k.hasNext()) {
|
|
refentry = k.next();
|
|
refhash = refentry.getKey();
|
|
refdom = ws.hostHash2hostName(refhash);
|
|
if (refdom == null) continue refloop;
|
|
prop.put(prefix + "_domains_" + c + "_citations_" + d + "_refhash", refhash);
|
|
prop.putXML(prefix + "_domains_" + c + "_citations_" + d + "_refdom", refdom);
|
|
refcount = refentry.getValue();
|
|
prop.put(prefix + "_domains_" + c + "_citations_" + d + "_refcount", refcount.intValue());
|
|
d++;
|
|
}
|
|
prop.put(prefix + "_domains_" + c + "_citations", d);
|
|
}
|
|
}
|