yacy_search_server/source/net/yacy/search/schema/HyperlinkGraph.java
Michael Peter Christen 910a496c9f
Some checks failed
CI Script to build on self-hosted server / build (push) Has been cancelled
replaced http links with https
2024-07-21 18:02:58 +02:00

235 lines
10 KiB
Java

/**
* HyperlinkGraph
* Copyright 2014 by Michael Peter Christen
* First released 08.04.2014 at https://yacy.net
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program in the file lgpl21.txt
* If not, see <http://www.gnu.org/licenses/>.
*/
package net.yacy.search.schema;
import java.net.MalformedURLException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.ArrayBlockingQueue;
import java.util.concurrent.BlockingQueue;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.document.id.MultiProtocolURL;
import net.yacy.cora.federate.solr.FailType;
import net.yacy.cora.federate.solr.connector.AbstractSolrConnector;
import net.yacy.cora.federate.solr.connector.SolrConnector;
import net.yacy.cora.util.ConcurrentLog;
import net.yacy.kelondro.data.meta.URIMetadataNode;
import net.yacy.search.index.Segment;
import org.apache.solr.common.SolrDocument;
/**
 * A set of hyperlink edges belonging to one host, together with the logic to
 * load those edges from a Solr index ({@link #fill}) and to assign a link
 * depth to the nodes of the graph ({@link #findLinkDepth()}).
 */
public class HyperlinkGraph implements Iterable<HyperlinkEdge> {

    /**
     * URL paths that {@link #findLinkDepth()} treats as root pages of a host:
     * sources with one of these paths get depth 0.
     */
    public final static Set<String> ROOTFNS = new HashSet<String>();
    static {
        for (String s: new String[]{"/", "/index.htm", "/index.html", "/index.php", "/home.htm", "/home.html", "/home.php", "/default.htm", "/default.html", "/default.php"}) {
            ROOTFNS.add(s);
        }
    }

    /** all edges of the graph; filled by {@link #fill} */
    HyperlinkEdges edges;

    /** the host name as it was passed to {@link #fill}; null as long as fill was not called */
    String hostname;

    /**
     * Create an empty graph without a host name.
     */
    public HyperlinkGraph() {
        this.edges = new HyperlinkEdges();
        this.hostname = null;
    }

    /**
     * Load the edges of a host from the index. Documents are queried for the
     * given host name with and without a leading "www." and are requested
     * sorted by the url_chars_i field in ascending order. For every document
     * without a fail type, its inbound and outbound links are added as edges.
     * Edges pointing to a document which has a fail type are marked as
     * {@link HyperlinkType#Dead}.
     * If the calling thread gets interrupted, the retrieval is stopped and the
     * edges collected so far are NOT added to this graph.
     *
     * @param solrConnector the connector used to query the documents
     * @param hostname the host to load; if null, a warning is logged and nothing is loaded
     * @param stopURL when not null, retrieval stops as soon as an edge with this target was added
     * @param maxtime time limit handed over to the solr query task
     * @param maxnodes limit handed over to the solr query task; retrieval also stops
     *        when the number of collected inbound plus outbound edges exceeds this value
     */
    public void fill(final SolrConnector solrConnector, String hostname, final MultiProtocolURL stopURL, final long maxtime, final int maxnodes) {
        this.hostname = hostname;
        if (hostname == null) {
            // path() hands over the result of getHost() unchecked; without a host
            // name no query can be built, so leave the graph empty instead of
            // failing with a NullPointerException
            ConcurrentLog.warn("HyperlinkGraph", "cannot fill the graph: no hostname given");
            return;
        }
        if (hostname.startsWith("www.")) hostname = hostname.substring(4);

        // query for the host in both variants: with and without leading "www."
        final String hostField = CollectionSchema.host_s.getSolrFieldName();
        final StringBuilder q = new StringBuilder();
        q.append(hostField).append(':').append(hostname).append(" OR ").append(hostField).append(':').append("www.").append(hostname);

        final int pageSize = 100;
        final BlockingQueue<SolrDocument> docs = new ArrayBlockingQueue<>(pageSize);
        final List<String> queries = new ArrayList<>();
        queries.add(q.toString());
        // NOTE(review): the literal arguments 0 and 1 are presumably the result offset and the
        // number of concurrent producers - confirm against SolrConnector.newDocumentsByQueriesTask
        final Thread solrQueryTask = new Thread(solrConnector.newDocumentsByQueriesTask(docs, queries, CollectionSchema.url_chars_i.getSolrFieldName() + " asc", 0, maxnodes, maxtime, pageSize, 1,
                CollectionSchema.id.getSolrFieldName(),
                CollectionSchema.sku.getSolrFieldName(),
                CollectionSchema.failreason_s.getSolrFieldName(),
                CollectionSchema.failtype_s.getSolrFieldName(),
                CollectionSchema.inboundlinks_protocol_sxt.getSolrFieldName(),
                CollectionSchema.inboundlinks_urlstub_sxt.getSolrFieldName(),
                CollectionSchema.outboundlinks_protocol_sxt.getSolrFieldName(),
                CollectionSchema.outboundlinks_urlstub_sxt.getSolrFieldName()
                ));
        solrQueryTask.start();

        SolrDocument doc;
        final Map<String, FailType> errorDocs = new HashMap<String, FailType>();
        final HyperlinkEdges inboundEdges = new HyperlinkEdges();
        final HyperlinkEdges outboundEdges = new HyperlinkEdges();
        final HyperlinkEdges errorEdges = new HyperlinkEdges();
        try {
            // consume documents until the query task signals the end with the poison document
            retrieval: while ((doc = docs.take()) != AbstractSolrConnector.POISON_DOCUMENT) {
                final String u = (String) doc.getFieldValue(CollectionSchema.sku.getSolrFieldName());
                final MultiProtocolURL from;
                try {
                    from = new MultiProtocolURL(u);
                } catch (final MalformedURLException e1) {
                    // a document without a usable url cannot be a source of an edge
                    continue;
                }
                final String errortype = (String) doc.getFieldValue(CollectionSchema.failtype_s.getSolrFieldName());
                final FailType error = errortype == null ? null : FailType.valueOf(errortype);
                if (error != null) {
                    // remember the failed document, its links are not evaluated
                    errorDocs.put(u, error);
                } else {
                    if (addLinks(URIMetadataNode.getLinks(doc, true), from, HyperlinkType.Inbound, inboundEdges, stopURL)) break retrieval;
                    if (addLinks(URIMetadataNode.getLinks(doc, false), from, HyperlinkType.Outbound, outboundEdges, stopURL)) break retrieval;
                }
                if (inboundEdges.size() + outboundEdges.size() > maxnodes) {
                    break retrieval;
                }
            }
        } catch (final InterruptedException e) {
            Thread.currentThread().interrupt(); // preserve interrupted thread state
        } finally {
            /* Ensure termination and proper resources release of the query thread */
            solrQueryTask.interrupt();
        }

        if (!Thread.currentThread().isInterrupted()) {
            // we use the errorDocs to mark all edges with endpoint to error documents
            moveDeadEdges(inboundEdges, errorDocs, errorEdges);
            moveDeadEdges(outboundEdges, errorDocs, errorEdges);

            // we put all edges together in a specific order which is used to create nodes in a svg display:
            // nodes that appear first are possibly painted over by nodes coming later.
            // less important nodes shall appear therefore first
            this.edges.addAll(outboundEdges);
            this.edges.addAll(inboundEdges);
            this.edges.addAll(errorEdges);
        }
    }

    /**
     * Add an edge from the given source to every link of the iterator.
     * Links which cannot be parsed as url are skipped.
     *
     * @param links the link targets as url strings
     * @param from the source of all edges
     * @param type the type which is assigned to the created targets
     * @param sink the edge collection which receives the new edges
     * @param stopURL when not null, processing ends after an edge with this target was added
     * @return true if the stopURL was found as target, false if all links were processed
     */
    private static boolean addLinks(final Iterator<String> links, final MultiProtocolURL from, final HyperlinkType type, final HyperlinkEdges sink, final MultiProtocolURL stopURL) {
        while (links.hasNext()) {
            final String link = links.next();
            try {
                final HyperlinkEdge.Target linkurl = new HyperlinkEdge.Target(link, type);
                sink.addEdge(from, linkurl);
                if (stopURL != null && linkurl.equals(stopURL)) return true;
            } catch (final MalformedURLException e) {
                /* Continue on the next link */
            }
        }
        return false;
    }

    /**
     * Remove all edges whose target is a known error document from the
     * candidates, mark their target as {@link HyperlinkType#Dead} and add
     * them to the error edges.
     *
     * @param candidates the edges to check; matching edges are removed from this collection
     * @param errorDocs the failed documents, looked up with the normal form of the edge target
     * @param errorEdges the collection which receives the dead edges
     */
    private static void moveDeadEdges(final HyperlinkEdges candidates, final Map<String, FailType> errorDocs, final HyperlinkEdges errorEdges) {
        final Iterator<HyperlinkEdge> i = candidates.iterator();
        while (i.hasNext()) {
            final HyperlinkEdge edge = i.next();
            if (errorDocs.containsKey(edge.target.toNormalform(true))) {
                i.remove();
                edge.target.type = HyperlinkType.Dead;
                errorEdges.add(edge);
            }
        }
    }

    /**
     * Find a link path between two documents. If the graph is still empty it
     * is filled first for the host of <code>from</code> (or of <code>to</code>
     * if <code>from</code> is null), using <code>to</code> as stop url.
     * The bottom-up search which should follow is not implemented in this
     * method: it returns without further action in every case.
     *
     * @param segment the index segment which provides the solr connector
     * @param from the start of the path, may be null
     * @param to the end of the path
     * @param maxtime time limit handed over to {@link #fill}
     * @param maxnodes node limit handed over to {@link #fill}
     */
    public void path(final Segment segment, DigestURL from, DigestURL to, final int maxtime, final int maxnodes) {
        // two steps to find the graph: (1) create a HyperlinkGraph (top-down) and (2) backtrack backlinks up to an element of the graph (bottom-up)
        if (this.edges.size() == 0) {
            fill(segment.fulltext().getDefaultConnector(), from == null ? to.getHost() : from.getHost(), to, maxtime, maxnodes);
        }
        // getDepth returns a boxed Integer: compare null-safe, a direct ">= 0" would
        // throw a NullPointerException when no depth is available for the url
        if (hasDepth(to) && (from == null || hasDepth(from))) return; // nothing to do.
        // now find the link bottom-up
    }

    /**
     * Check if a non-negative depth is available for the given url.
     *
     * @param url the url to look up
     * @return true if {@link #getDepth} delivers a value which is not null and not negative
     */
    private boolean hasDepth(final MultiProtocolURL url) {
        final Integer depth = getDepth(url);
        return depth != null && depth.intValue() >= 0;
    }

    /**
     * Assign depth values to the nodes of the graph. Sources with a path from
     * {@link #ROOTFNS} get depth 0; then the edges are followed level by level
     * from the targets of the previous level. Targets get a depth only if
     * their type is {@link HyperlinkType#Inbound}. A warning is logged if no
     * root node exists or if edges remain which were not reached.
     *
     * @return the value of the level counter when the traversal stopped; the
     *         counter starts with 1 and is incremented once for every processed level
     */
    public int findLinkDepth() {
        int remaining = this.edges.size();

        // first find root nodes
        Set<MultiProtocolURL> nodes = new HashSet<MultiProtocolURL>();
        Set<MultiProtocolURL> nextnodes = new HashSet<MultiProtocolURL>();
        for (HyperlinkEdge edge: this.edges) {
            String path = edge.source.getPath();
            if (ROOTFNS.contains(path)) {
                this.edges.updateDepth(edge.source, 0);
                if (edge.target.type == HyperlinkType.Inbound) this.edges.updateDepth(edge.target, 1);
                nodes.add(edge.source);
                nextnodes.add(edge.target);
                remaining--;
            }
        }
        if (nodes.size() == 0 && this.edges.size() > 0) {
            ConcurrentLog.warn("HyperlinkGraph", "could not find a root node for " + hostname + " in " + this.edges.size() + " edges");
        }

        // add virtual nodes
        // without a hostname the url would be built for the literal host "null", so skip that case
        if (this.hostname != null) {
            for (String rootpath: ROOTFNS) {
                try {
                    this.edges.updateDepth(new DigestURL("http://" + hostname + rootpath), 0);
                } catch (MalformedURLException ignored) {
                    // a virtual root which cannot be expressed as url is simply not registered
                }
            }
        }

        // recursively step into depth and find next level
        int depth = 1;
        while (remaining > 0) {
            boolean found = false;
            nodes = nextnodes;
            nextnodes = new HashSet<MultiProtocolURL>();
            for (HyperlinkEdge edge: this.edges) {
                if (nodes.contains(edge.source)) {
                    this.edges.updateDepth(edge.source, depth);
                    if (edge.target.type == HyperlinkType.Inbound) this.edges.updateDepth(edge.target, depth + 1);
                    nextnodes.add(edge.target);
                    remaining--;
                    found = true;
                }
            }
            depth++;
            if (!found) break; // terminating in case that not all edges are linked together
        }
        if (remaining > 0) ConcurrentLog.warn("HyperlinkGraph", "could not find all edges for " + hostname + ", " + remaining + " remaining.");
        return depth;
    }

    /**
     * Get the depth which is stored for a url in the edge collection.
     *
     * @param url the url to look up
     * @return the result of the depth lookup in the edge collection; callers
     *         should be prepared for null because the return type is a boxed Integer
     */
    public Integer getDepth(MultiProtocolURL url) {
        return this.edges.getDepth(url);
    }

    /**
     * @return an iterator over all edges of this graph
     */
    @Override
    public Iterator<HyperlinkEdge> iterator() {
        return this.edges.iterator();
    }
}