Merge origin/master into jetty

reger 2013-12-05 22:53:29 +01:00
commit 92d9c56f9f
32 changed files with 380 additions and 335 deletions

View File

@ -25,7 +25,7 @@
<key>Java</key>
<dict>
<key>VMOptions</key>
<string>-Xmx600m -Xms180m -Xss256k -XX:MaxPermSize=256m -XX:-UseGCOverheadLimit -XX:+UseAdaptiveSizePolicy -Djava.net.preferIPv4Stack=true -Dfile.encoding=UTF-8</string>
<string>-Xmx600m -Xms90m -Dsolr.directoryFactory=solr.MMapDirectoryFactory -Dfile.encoding=UTF-8</string>
<key>WorkingDirectory</key>
<string>$APP_PACKAGE/Contents/Resources/Java</string>
<key>MainClass</key>

View File

@ -3,7 +3,7 @@ javacSource=1.6
javacTarget=1.6
# Release Configuration
releaseVersion=1.65
releaseVersion=1.66
stdReleaseFile=yacy${branch}_v${releaseVersion}_${DSTAMP}_${releaseNr}.tar.gz
sourceReleaseFile=yacy_src_v${releaseVersion}_${DSTAMP}_${releaseNr}.tar.gz
releaseFileParentDir=yacy

View File

@ -437,13 +437,13 @@ host_extent_i
## citation ranking
## the number of documents within a single host
cr_host_count_i
#cr_host_count_i
## the chance to click on this page when randomly clicking on links within on one host
cr_host_chance_d
#cr_host_chance_d
## normalization of chance: 0 for lower halve of cr_host_count_i urls, 1 for 1/2 of the remaining and so on. the maximum number is 10
cr_host_norm_i
#cr_host_norm_i
## custom rating; to be set with external rating information
rating_i

View File

@ -75,7 +75,7 @@ source_id_s
#source_clickdepth_i
## copy of the citation rank norm value from the source link
source_cr_host_norm_i
#source_cr_host_norm_i
## host of the url (source)
@ -176,7 +176,7 @@ target_path_folders_sxt
#target_clickdepth_i
## copy of the citation rank norm value from the target link; this is only filled if the target host is identical to the source host
target_cr_host_norm_i
#target_cr_host_norm_i
## host of the url (target)

View File

@ -79,7 +79,7 @@ public class ConfigHTCache_p {
prop.put("HTCachePath", env.getConfig(SwitchboardConstants.HTCACHE_PATH, SwitchboardConstants.HTCACHE_PATH_DEFAULT));
prop.put("actualCacheSize", Cache.getActualCacheSize() / 1024 / 1024);
prop.put("actualCacheDocCount", Cache.getActualCacheDocCount());
prop.put("docSizeAverage", Cache.getActualCacheSize() / Cache.getActualCacheDocCount() / 1024);
prop.put("docSizeAverage", Cache.getActualCacheDocCount() == 0 ? 0 : Cache.getActualCacheSize() / Cache.getActualCacheDocCount() / 1024);
prop.put("maxCacheSize", env.getConfigLong(SwitchboardConstants.PROXY_CACHE_SIZE, 64));
// return rewrite properties
return prop;
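
The replaced line guards the average document size against an empty cache, where Cache.getActualCacheDocCount() returns 0 and the old expression would divide by zero. A minimal standalone sketch of the same guard, using a hypothetical helper and example values rather than the YaCy Cache API:

// Minimal sketch of the guard shown above (hypothetical method and values,
// not the YaCy Cache API): averaging must not divide by a zero document count.
public class DocSizeAverageSketch {
    static long docSizeAverageKB(long cacheSizeBytes, long docCount) {
        // return 0 instead of throwing ArithmeticException when the cache is empty
        return docCount == 0 ? 0 : cacheSizeBytes / docCount / 1024;
    }
    public static void main(String[] args) {
        System.out.println(docSizeAverageKB(0, 0));          // 0, no exception
        System.out.println(docSizeAverageKB(10_485_760, 5)); // 2048 KB per document
    }
}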

View File

@ -54,7 +54,7 @@
</dd>
<dt>Time-Out</dt>
<dd>
<input type="text" name="timeout" value ="500" size="4"/> ms
<input type="text" name="timeout" value ="2000" size="4"/> ms
</dd>
<dt>Scan Cache</dt>
<dd>

View File

@ -81,8 +81,8 @@ public class Crawler_p {
prop.putNum("urlpublictextSize", fulltext.collectionSize());
prop.putNum("urlpublictextSegmentCount", fulltext.getDefaultConnector().getSegmentCount());
prop.put("webgraphSolrURL", fulltext.connectedLocalSolr() ? localSolr.replace("collection1", "webgraph") : remoteSolr + "webgraph/select?&q=*:*&start=0&rows=3");
prop.putNum("webgraphSize", fulltext.writeToWebgraph() ? fulltext.webgraphSize() : 0);
prop.putNum("webgraphSegmentCount", fulltext.writeToWebgraph() ? fulltext.getWebgraphConnector().getSegmentCount() : 0);
prop.putNum("webgraphSize", fulltext.useWebgraph() ? fulltext.webgraphSize() : 0);
prop.putNum("webgraphSegmentCount", fulltext.useWebgraph() ? fulltext.getWebgraphConnector().getSegmentCount() : 0);
prop.putNum("citationSize", segment.citationCount());
prop.putNum("citationSegmentCount", segment.citationSegmentCount());
prop.putNum("rwipublictextSize", segment.RWICount());

View File

@ -87,7 +87,7 @@ public class IndexControlURLs_p {
prop.put("cleanup", post == null ? 1 : 0);
prop.put("cleanup_solr", segment.fulltext().connectedRemoteSolr() ? 1 : 0);
prop.put("cleanup_rwi", segment.termIndex() != null && !segment.termIndex().isEmpty() ? 1 : 0);
prop.put("cleanup_citation", segment.urlCitation() != null && !segment.urlCitation().isEmpty() ? 1 : 0);
prop.put("cleanup_citation", segment.connectedCitation() && !segment.urlCitation().isEmpty() ? 1 : 0);
// show export messages
final Fulltext.Export export = segment.fulltext().export();
@ -159,7 +159,7 @@ public class IndexControlURLs_p {
if (segment.termIndex() != null) try {segment.termIndex().clear();} catch (final IOException e) {}
}
if ( post.get("deleteCitation", "").equals("on")) {
if (segment.urlCitation() != null) try {segment.urlCitation().clear();} catch (final IOException e) {}
if (segment.connectedCitation()) try {segment.urlCitation().clear();} catch (final IOException e) {}
}
if ( post.get("deleteCrawlQueues", "").equals("on") ) {
sb.crawlQueues.clear();
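
Both changed lines swap the raw null check on segment.urlCitation() for the semantic segment.connectedCitation() test before counting or clearing the citation index. A minimal sketch of that guard pattern, using a hypothetical holder class rather than the YaCy Segment API:

// Hypothetical holder illustrating the connected-check-before-use pattern.
public class CitationIndexHolder {
    private Object urlCitationIndex; // null until the citation service is switched on

    public boolean connectedCitation() {
        return this.urlCitationIndex != null;
    }

    public void clearCitations() {
        // guard with the semantic check instead of a scattered null comparison
        if (connectedCitation()) {
            // ... clear the index here
        }
    }

    public static void main(String[] args) {
        CitationIndexHolder h = new CitationIndexHolder();
        System.out.println(h.connectedCitation()); // false: nothing to clear
        h.clearCitations();                        // safe no-op
    }
}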

View File

@ -70,7 +70,7 @@ public class IndexFederated_p {
sb.index.connectCitation(wordCacheMaxCount, fileSizeMax);
} catch (final IOException e) { ConcurrentLog.logException(e); } // switch on
boolean webgraph = post.getBoolean(SwitchboardConstants.CORE_SERVICE_WEBGRAPH);
sb.index.fulltext().writeWebgraph(webgraph);
sb.index.fulltext().setUseWebgraph(webgraph);
env.setConfig(SwitchboardConstants.CORE_SERVICE_WEBGRAPH, webgraph);
}

View File

@ -44,13 +44,17 @@ public class PerformanceGraph {
final int width = post.getInt("width", 660);
final int height = post.getInt("height", 240);
final boolean showMemory = !post.containsKey("nomem");
final boolean showPeers = !post.containsKey("nopeers");
long t = System.currentTimeMillis();
if (t - indexSizeTime > 10000) {
indeSizeCache = sb.index.fulltext().collectionSize();
indexSizeTime = t;
}
RasterPlotter graph = ProfilingGraph.performanceGraph(width, height, indeSizeCache + " URLS / " + sb.index.RWICount() + " WORDS IN INDEX / " + sb.index.RWIBufferCount() + " WORDS IN CACHE", showMemory);
RasterPlotter graph = ProfilingGraph.performanceGraph(
width, height,
indeSizeCache + " URLS / " + sb.index.RWICount() + " WORDS IN INDEX / " + sb.index.RWIBufferCount() + " WORDS IN CACHE",
showMemory, showPeers);
return graph;
}
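
The servlet now derives a second flag, showPeers, from the absence of a nopeers request parameter and passes it into the widened performanceGraph(...) signature (see the ProfilingGraph hunk further down). A small sketch of such opt-out flags, with a plain Map standing in for YaCy's serverObjects:

import java.util.Map;

// Sketch only: opt-out flags default to true and are switched off by the
// mere presence of a parameter such as "nomem" or "nopeers".
public class GraphFlags {
    public static void main(String[] args) {
        Map<String, String> post = Map.of("nopeers", ""); // e.g. PerformanceGraph.png?nopeers=
        boolean showMemory = !post.containsKey("nomem");   // true, parameter absent
        boolean showPeers  = !post.containsKey("nopeers"); // false, parameter present
        System.out.println(showMemory + " " + showPeers);
    }
}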

View File

@ -9,7 +9,7 @@
<script type="text/javascript"><!--
function reloadGraph() {
if(document.forms["optionreloadGraph"].option.checked)
document.images["graph"].src="PerformanceGraph.png?time="+(new Date()).getTime();
document.images["graph"].src="PerformanceGraph.png?nopeers=&time="+(new Date()).getTime();
window.status="";
}
window.setInterval("reloadGraph()", 1000);
@ -18,8 +18,8 @@
#%env/templates/submenuComputation.template%#
<h2>Performance Settings for Memory</h2>
<p><img src="PerformanceGraph.png" id="graph" alt="PerformanceGraph"/></p>
<form id="optionreloadGraph" action="" method="get"><p><input type="checkbox" name="option" id="autoreload"/> <label for="autoreload">refresh graph</label></p></form>
<p><img src="PerformanceGraph.png?nopeers=" id="graph" alt="PerformanceGraph"/></p>
<form id="optionreloadGraph" action="" method="get"><p><input type="checkbox" name="option" id="autoreload" #(autoreload.checked)#::checked="checked"#(/autoreload.checked)#/> <label for="autoreload">refresh graph</label></p></form>
<form action="PerformanceMemory_p.html" method="post" enctype="multipart/form-data" accept-charset="UTF-8">
<p><input type="checkbox" name="simulatedshortmemory" onclick = 'this.form.submit()' #(simulatedshortmemory.checked)#:: checked="checked"#(/simulatedshortmemory.checked)#/>simulate short memory status</label></p>
<p><input type="checkbox" name="useStandardmemoryStrategy" onclick = 'this.form.submit()' #(useStandardmemoryStrategy.checked)#:: checked="checked"#(/useStandardmemoryStrategy.checked)#/>use Standard Memory Strategy (current: #[memoryStrategy]#)</p>

View File

@ -49,7 +49,7 @@ public class PerformanceMemory_p {
private static final long KB = 1024;
private static final long MB = 1024 * KB;
private static Map<String, String> defaultSettings = null;
public static serverObjects respond(@SuppressWarnings("unused") final RequestHeader header, final serverObjects post, final serverSwitch env) {
// return variable that accumulates replacements
final serverObjects prop = new serverObjects();
@ -58,18 +58,22 @@ public class PerformanceMemory_p {
}
prop.put("gc", "0");
prop.put("autoreload.checked", "0");
if (post != null) {
if (post.containsKey("gc")) {
System.gc();
prop.put("gc", "1");
prop.put("autoreload.checked", "1");
} else {
MemoryControl.setSimulatedShortStatus(post.containsKey("simulatedshortmemory"));
boolean simulatedshortmemory = post.containsKey("simulatedshortmemory");
MemoryControl.setSimulatedShortStatus(simulatedshortmemory);
if (simulatedshortmemory) prop.put("autoreload.checked", "1");
final boolean std = post.containsKey("useStandardmemoryStrategy");
env.setConfig("memory.standardStrategy", std);
MemoryControl.setStandardStrategy(std);
}
}
prop.put("simulatedshortmemory.checked", MemoryControl.getSimulatedShortStatus() ? 1 : 0);
prop.put("useStandardmemoryStrategy.checked", env.getConfigBool("memory.standardStrategy", true) ? 1 : 0);
prop.put("memoryStrategy", MemoryControl.getStrategyName());

View File

@ -9,7 +9,7 @@
<script type="text/javascript"><!--
function reloadGraph() {
if(document.forms["optionreloadGraph"].option.checked)
document.images["graph"].src="PerformanceGraph.png?time="+(new Date()).getTime();
document.images["graph"].src="PerformanceGraph.png?nopeers=&time="+(new Date()).getTime();
window.status="";
}
window.setInterval("reloadGraph()", 1000);
@ -18,7 +18,7 @@
#%env/templates/submenuConfig.template%#
<h2>Performance Settings</h2>
<p><img src="PerformanceGraph.png" id="graph" alt="PerformanceGraph" width="660" height="240"/></p>
<p><img src="PerformanceGraph.png?nopeers=" id="graph" alt="PerformanceGraph" width="660" height="240"/></p>
<form id="optionreloadGraph" action="" method="get"><p><input type="checkbox" name="option" id="autoreload"/> <label for="autoreload">refresh graph</label></p></form>

View File

@ -81,8 +81,8 @@ public class status_p {
// index size
prop.putNum("urlpublictextSize", fulltext.collectionSize());
prop.putNum("urlpublictextSegmentCount", fulltext.getDefaultConnector().getSegmentCount());
prop.putNum("webgraphSize", fulltext.writeToWebgraph() ? fulltext.webgraphSize() : 0);
prop.putNum("webgraphSegmentCount", fulltext.writeToWebgraph() ? fulltext.getWebgraphConnector().getSegmentCount() : 0);
prop.putNum("webgraphSize", fulltext.useWebgraph() ? fulltext.webgraphSize() : 0);
prop.putNum("webgraphSegmentCount", fulltext.useWebgraph() ? fulltext.getWebgraphConnector().getSegmentCount() : 0);
prop.putNum("citationSize", segment.citationCount());
prop.putNum("citationSegmentCount", segment.citationSegmentCount());
prop.putNum("rwipublictextSize", segment.RWICount());
@ -131,8 +131,8 @@ public class status_p {
prop.put("postprocessingRunning", Switchboard.postprocessingRunning ? 1 : 0);
boolean processCollection = sb.index.fulltext().getDefaultConfiguration().contains(CollectionSchema.process_sxt) && (sb.index.connectedCitation() || sb.index.fulltext().writeToWebgraph());
boolean processWebgraph = sb.index.fulltext().getWebgraphConfiguration().contains(WebgraphSchema.process_sxt) && sb.index.fulltext().writeToWebgraph();
boolean processCollection = sb.index.fulltext().getDefaultConfiguration().contains(CollectionSchema.process_sxt) && (sb.index.connectedCitation() || sb.index.fulltext().useWebgraph());
boolean processWebgraph = sb.index.fulltext().getWebgraphConfiguration().contains(WebgraphSchema.process_sxt) && sb.index.fulltext().useWebgraph();
long collectionTimeSinceStart = processCollection && Switchboard.postprocessingRunning ? System.currentTimeMillis() - Switchboard.postprocessingStartTime[0] : 0;
long webgraphTimeSinceStart = processWebgraph && Switchboard.postprocessingRunning ? System.currentTimeMillis() - Switchboard.postprocessingStartTime[1] : 0;

View File

@ -126,7 +126,7 @@ public class yacydoc {
prop.putXML("yacy_referrer_url", (le == null) ? "" : le.url().toNormalform(true));
prop.put("yacy_size", entry.size());
prop.put("yacy_words", entry.wordCount());
prop.put("yacy_citations", sb.index.urlCitation()!= null ? sb.index.urlCitation().count(entry.hash()) : 0);
prop.put("yacy_citations", sb.index.connectedCitation() ? sb.index.urlCitation().count(entry.hash()) : 0);
prop.put("yacy_inbound", entry.llocal());
prop.put("yacy_outbound", entry.lother());

View File

@ -233,7 +233,7 @@ function resultLine(type, item, linenumber) {
if (type == "image") {
html += "<div style=\"float:left\">";
html += "<a href=\"" + item.link + "\" class=\"thumblink\" onclick=\"return hs.expand(this)\">";
html += "<img src=\"/ViewImage.png?maxwidth=96&amp;maxheight=96&amp;code=" + item.guid + "\" alt=\"" + title + "\" />";
html += "<img src=\"/ViewImage.png?maxwidth=96&amp;maxheight=96&amp;code=" + item.guid + " + &amp;url=" + item.link + "\" alt=\"" + title + "\" />";
//html += "<img src=\"" + item.link + "\" width=\"96\" height=\"96\" alt=\"" + title + "\" />";
html += "</a>";
var name = title;

View File

@ -48,15 +48,27 @@ import net.yacy.server.serverObjects;
import net.yacy.server.serverSwitch;
public final class hello {
// example:
// http://localhost:8090/yacy/hello.html?count=1&seed=p|{Hash=sCJ6Tq8T0N9x,IPType=&empty;,Port=8090,IP=,Uptime=8,rI=190,Version=0.10004882,PeerType=junior,UTC=+0200,RCount=0,sI=0,LastSeen=20080605103333,Name=intratest,CCount=5.0,SCount=40,news=,USpeed=0,CRTCnt=0,CRWCnt=0,BDate=20080605081349,rU=190,LCount=187,dct=1212668923654,ICount=2,sU=0,ISpeed=0,RSpeed=0.0,NCount=0,Flags=oooo}
// http://localhost:8090/yacy/hello.html?count=1&seed=p|{Hash=sCJ6Tq8T0N9x,Port=8090,PeerType=junior}
// http://localhost:8090/yacy/hello.html?count=10&seed=z|H4sIAAAAAAAAADWQW2vDMAyF_81eJork3GyGX-YxGigly2WFvZTQijbQJsHx1pWx_z7nMj1J4ug7B_2s6-GsP5q3G-G6vBz2e0iz8t6zfuBr7-5PUNanQfulhqyzTkuUCFXvmitrBJtq4ed3tkPTtRpXhIiRDAmq0uhHFIiQMduJ-NXYU9NCbrrP1vnjIdUqgk09uIK51V6rMBRIilAo2NajwzfhGcx8QUKsEIp5iCJo-eaTVUXPfPQ4k5dm4pp8NzaESsLzS-14QVNIMlA-ka2m1JuZJJWIBRwPo0GIIiYp4zCSkC5GQSLiJIah0p6X_rvlS-MTbWdhkCSBIni9jA_rfP3-Ae1Oye9dAQAA
public static serverObjects respond(final RequestHeader header, final serverObjects post, final serverSwitch env) throws InterruptedException {
final Switchboard sb = (Switchboard) env;
final serverObjects prop = new serverObjects();
final long start = System.currentTimeMillis();
prop.put("message", "none");
final String clientip = header.get(HeaderFramework.CONNECTION_PROP_CLIENTIP, "<unknown>"); // read an artificial header addendum
final InetAddress ias = Domains.dnsResolve(clientip);
long time = System.currentTimeMillis();
final long time_dnsResolve = System.currentTimeMillis() - time;
if (ias == null) {
Network.log.info("hello/server: failed contacting seed; clientip not resolvable (clientip=" + clientip + ", time_dnsResolve=" + time_dnsResolve + ")");
prop.put("message", "cannot resolve your IP from your reported location " + clientip);
return prop;
}
prop.put("yourip", ias.getHostAddress());
prop.put(Seed.YOURTYPE, Seed.PEERTYPE_VIRGIN); // a default value
prop.put("seedlist", "");
if ((post == null) || (env == null)) {
prop.put("message", "no post or no enviroment");
return prop;
@ -73,15 +85,6 @@ public final class hello {
int count = post.getInt("count", 0);
final long magic = post.getLong("magic", 0);
// final Date remoteTime = yacyCore.parseUniversalDate(post.get(MYTIME)); // read remote time
final String clientip = header.get(HeaderFramework.CONNECTION_PROP_CLIENTIP, "<unknown>"); // read an artificial header addendum
long time = System.currentTimeMillis();
final InetAddress ias = Domains.dnsResolve(clientip);
final long time_dnsResolve = System.currentTimeMillis() - time;
if (ias == null) {
Network.log.info("hello/server: failed contacting seed; clientip not resolvable (clientip=" + clientip + ", time_dnsResolve=" + time_dnsResolve + ")");
prop.put("message", "cannot resolve your IP from your reported location " + clientip);
return prop;
}
if (seed.length() > Seed.maxsize) {
Network.log.info("hello/server: rejected contacting seed; too large (" + seed.length() + " > " + Seed.maxsize + ", time_dnsResolve=" + time_dnsResolve + ")");
prop.put("message", "your seed is too long (" + seed.length() + ")");

View File

@ -26,6 +26,7 @@
import java.net.MalformedURLException;
import java.util.Iterator;
import net.yacy.cora.date.GenericFormatter;
import net.yacy.cora.document.analysis.Classification;
import net.yacy.cora.document.analysis.Classification.ContentDomain;
@ -39,6 +40,7 @@ import net.yacy.cora.protocol.HeaderFramework;
import net.yacy.cora.protocol.RequestHeader;
import net.yacy.cora.protocol.RequestHeader.FileType;
import net.yacy.cora.util.ConcurrentLog;
import net.yacy.cora.util.Memory;
import net.yacy.crawler.data.Cache;
import net.yacy.data.URLLicense;
import net.yacy.kelondro.util.Formatter;
@ -243,7 +245,7 @@ public class yacysearchitem {
}
EventTracker.update(EventTracker.EClass.SEARCH, new ProfilingGraph.EventSearch(theSearch.query.id(true), SearchEventType.FINALIZATION, "" + item, 0, 0), false);
final String ext = MultiProtocolURL.getFileExtension(resultFileName).toLowerCase();
if (ext.equals("png") || ext.equals("jpg") || ext.equals("gif")) {
if (MultiProtocolURL.isImage(ext)) {
final String license = URLLicense.aquireLicense(resultURL);
prop.put("content_code", license);
} else {
@ -261,8 +263,8 @@ public class yacysearchitem {
boolean p2pmode = sb.peers != null && sb.peers.sizeConnected() > 0 && indexReceiveGranted;
boolean stealthmode = p2pmode && theSearch.query.isLocal();
if ((sb.getConfigBool(SwitchboardConstants.HEURISTIC_SEARCHRESULTS, false) ||
(sb.getConfigBool(SwitchboardConstants.GREEDYLEARNING_ACTIVE, false) && sb.getConfigBool(SwitchboardConstants.GREEDYLEARNING_ENABLED, false))) &&
!stealthmode) sb.heuristicSearchResults(resultUrlstring);
(sb.getConfigBool(SwitchboardConstants.GREEDYLEARNING_ACTIVE, false) && sb.getConfigBool(SwitchboardConstants.GREEDYLEARNING_ENABLED, false) && Memory.load() < 1.0)) &&
!stealthmode) sb.heuristicSearchResults(resultUrlstring);
theSearch.query.transmitcount = item + 1;
return prop;
}
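
The first change replaces the hard-coded png/jpg/gif test with MultiProtocolURL.isImage(ext); the second adds a Memory.load() < 1.0 condition so the greedy-learning heuristic is skipped under memory pressure. A sketch of the generalized extension test, with a hypothetical helper rather than the MultiProtocolURL implementation:

import java.util.Locale;
import java.util.Set;

// Sketch only: replacing a hard-coded three-extension check with a shared
// image-extension test (hypothetical extension list, not MultiProtocolURL.isImage).
public class ImageExtCheck {
    private static final Set<String> IMAGE_EXT =
            Set.of("png", "jpg", "jpeg", "gif", "bmp", "ico", "tif", "tiff", "svg");

    static boolean isImage(String ext) {
        return IMAGE_EXT.contains(ext.toLowerCase(Locale.ROOT));
    }

    public static void main(String[] args) {
        System.out.println(isImage("PNG"));  // true
        System.out.println(isImage("html")); // false
    }
}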

View File

@ -181,12 +181,12 @@ public class OpenSearchConnector {
if (sb == null) {
return false;
}
final SolrConnector connector = sb.index.fulltext().writeToWebgraph() ? null : sb.index.fulltext().getWebgraphConnector();
// check if needed Solr fields are available (selected)
if (connector == null) {
if (!sb.index.fulltext().useWebgraph()) {
ConcurrentLog.severe("OpenSearchConnector.Discover", "Error on connecting to embedded Solr webgraph index");
return false;
}
final SolrConnector connector = sb.index.fulltext().getWebgraphConnector();
final boolean metafieldavailable = sb.index.fulltext().getWebgraphConfiguration().contains(WebgraphSchema.target_rel_s.name())
&& ( sb.index.fulltext().getWebgraphConfiguration().contains(WebgraphSchema.target_protocol_s.name()) && sb.index.fulltext().getWebgraphConfiguration().contains(WebgraphSchema.target_urlstub_s.name()) )
&& sb.getConfigBool(SwitchboardConstants.CORE_SERVICE_WEBGRAPH, false);

View File

@ -57,7 +57,6 @@ import net.yacy.crawler.retrieval.SMBLoader;
import net.yacy.crawler.robots.RobotsTxt;
import net.yacy.document.TextParser;
import net.yacy.kelondro.data.citation.CitationReference;
import net.yacy.kelondro.rwi.IndexCell;
import net.yacy.kelondro.workflow.WorkflowProcessor;
import net.yacy.peers.SeedDB;
import net.yacy.repository.Blacklist.BlacklistType;
@ -138,11 +137,14 @@ public final class CrawlStacker {
// record the link graph for this request; this can be overwritten, replaced and enhanced by an index writing process in Segment.storeDocument
byte[] anchorhash = entry.url().hash();
IndexCell<CitationReference> urlCitationIndex = this.indexSegment.urlCitation();
if (urlCitationIndex != null && entry.referrerhash() != null) try {
urlCitationIndex.add(anchorhash, new CitationReference(entry.referrerhash(), entry.appdate().getTime()));
} catch (final Exception e) {
ConcurrentLog.logException(e);
if (entry.referrerhash() != null) {
if (this.indexSegment.connectedCitation()) try {
this.indexSegment.urlCitation().add(anchorhash, new CitationReference(entry.referrerhash(), entry.appdate().getTime()));
} catch (final Exception e) {
ConcurrentLog.logException(e);
}
// TODO: write to webgraph??
}
try {

View File

@ -911,9 +911,6 @@ public final class Protocol {
}
String filter = event.query.urlMask.pattern().toString();
if (event.query.tld != null) filter = ".*" + event.query.tld + ".*" + filter;
if (event.query.modifier.protocol != null) filter = ".*" + event.query.modifier.protocol + ".*" + filter;
if (event.query.modifier.filetype != null) filter = filter + ".*" + event.query.modifier.filetype + ".*";
parts.put("myseed", UTF8.StringBody((event.peers.mySeed() == null) ? "" : event.peers.mySeed().genSeedStr(key)));
parts.put("count", UTF8.StringBody(Integer.toString(Math.max(10, count))));
parts.put("time", UTF8.StringBody(Long.toString(Math.max(3000, time))));

View File

@ -55,7 +55,7 @@ public class ProfilingGraph {
return max;
}
public static RasterPlotter performanceGraph(final int width, final int height, final String subline, final boolean showMemory) {
public static RasterPlotter performanceGraph(final int width, final int height, final String subline, final boolean showMemory, final boolean showPeers) {
// find maximum values for automatic graph dimension adoption
final int maxppm = (int) maxPayload(EventTracker.EClass.PPM, 25);
final int maxwords = (int) maxPayload(EventTracker.EClass.WORDCACHE, 12000);
@ -171,22 +171,24 @@ public class ProfilingGraph {
}
// draw peer ping
events = EventTracker.getHistory(EventTracker.EClass.PEERPING);
x0 = 1; y0 = 0;
if (events != null) {
EventTracker.Event event;
EventPing ping;
String pingPeer;
while (events.hasNext()) {
event = events.next();
time = event.time - now;
ping = (EventPing) event.payload;
x1 = (int) (time/1000);
y1 = Math.abs((ping.outgoing ? ping.toPeer : ping.fromPeer).hashCode()) % vspace;
pingPeer = ping.outgoing ? "-> " + ping.toPeer.toUpperCase() : "<- " + ping.fromPeer.toUpperCase();
chart.setColor(Long.parseLong("444444", 16));
chart.chartDot(ChartPlotter.DIMENSION_BOTTOM, ChartPlotter.DIMENSION_ANOT2, x1, y1, 2, pingPeer + (ping.newPeers > 0 ? "(+" + ping.newPeers + ")" : ""), 0);
x0 = x1; y0 = y1;
if (showPeers) {
events = EventTracker.getHistory(EventTracker.EClass.PEERPING);
x0 = 1; y0 = 0;
if (events != null) {
EventTracker.Event event;
EventPing ping;
String pingPeer;
while (events.hasNext()) {
event = events.next();
time = event.time - now;
ping = (EventPing) event.payload;
x1 = (int) (time/1000);
y1 = Math.abs((ping.outgoing ? ping.toPeer : ping.fromPeer).hashCode()) % vspace;
pingPeer = ping.outgoing ? "-> " + ping.toPeer.toUpperCase() : "<- " + ping.fromPeer.toUpperCase();
chart.setColor(Long.parseLong("9999AA", 16));
chart.chartDot(ChartPlotter.DIMENSION_BOTTOM, ChartPlotter.DIMENSION_ANOT2, x1, y1, 2, pingPeer + (ping.newPeers > 0 ? "(+" + ping.newPeers + ")" : ""), 0);
x0 = x1; y0 = y1;
}
}
}
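
The peer-ping plotting is wrapped in the new showPeers flag and the dot color changes from 444444 to the lighter 9999AA. A small sketch of those two details, with the drawing loop reduced to a comment:

// Sketch: peer-ping dots are drawn only when showPeers is set, in the new color.
public class PeerPingColor {
    public static void main(String[] args) {
        boolean showPeers = false; // PerformanceMemory/PerformanceQueues request ?nopeers=
        long oldColor = Long.parseLong("444444", 16);
        long newColor = Long.parseLong("9999AA", 16);
        System.out.printf("old=%06X new=%06X%n", oldColor, newColor);
        if (showPeers) {
            // iterate the PEERPING event history and plot one annotated dot per ping
        }
    }
}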

View File

@ -505,7 +505,7 @@ public final class Switchboard extends serverSwitch {
this.index.connectUrlDb(this.useTailCache, this.exceed134217727);
try {this.index.fulltext().connectLocalSolr();} catch (final IOException e) {ConcurrentLog.logException(e);}
}
this.index.fulltext().writeWebgraph(this.getConfigBool(SwitchboardConstants.CORE_SERVICE_WEBGRAPH, false));
this.index.fulltext().setUseWebgraph(this.getConfigBool(SwitchboardConstants.CORE_SERVICE_WEBGRAPH, false));
// set up the solr interface
final String solrurls = getConfig(SwitchboardConstants.FEDERATED_SERVICE_SOLR_INDEXING_URL, "http://127.0.0.1:8983/solr");
@ -1328,7 +1328,7 @@ public final class Switchboard extends serverSwitch {
this.index.fulltext().connectLocalSolr();
this.index.connectUrlDb(this.useTailCache, this.exceed134217727);
}
this.index.fulltext().writeWebgraph(this.getConfigBool(SwitchboardConstants.CORE_SERVICE_WEBGRAPH, false));
this.index.fulltext().setUseWebgraph(this.getConfigBool(SwitchboardConstants.CORE_SERVICE_WEBGRAPH, false));
// set up the solr interface
final String solrurls = getConfig(SwitchboardConstants.FEDERATED_SERVICE_SOLR_INDEXING_URL, "http://127.0.0.1:8983/solr");
@ -2292,46 +2292,15 @@ public final class Switchboard extends serverSwitch {
// we optimize first because that is useful for postprocessing
int proccount = 0;
boolean allCrawlsFinished = this.crawler.allCrawlsFinished(this.crawlQueues);
if (allCrawlsFinished) {
postprocessingRunning = true;
// flush caches
Domains.clear();
this.crawlQueues.noticeURL.clear();
// do solr optimization
long idleSearch = System.currentTimeMillis() - this.localSearchLastAccess;
long idleAdmin = System.currentTimeMillis() - this.adminAuthenticationLastAccess;
long deltaOptimize = System.currentTimeMillis() - this.optimizeLastRun;
boolean optimizeRequired = deltaOptimize > 60000 * 60 * 3; // 3 hours
int opts = Math.max(1, (int) (fulltext.collectionSize() / 5000000));
log.info("Solr auto-optimization: idleSearch=" + idleSearch + ", idleAdmin=" + idleAdmin + ", deltaOptimize=" + deltaOptimize + ", proccount=" + proccount);
if (idleAdmin > 600000) {
// only run optimization if the admin is idle (10 minutes)
if (proccount > 0) {
opts++; // have postprocessings will force optimazion with one more Segment which is small an quick
optimizeRequired = true;
}
if (optimizeRequired) {
if (idleSearch < 600000) opts++; // < 10 minutes idle time will cause a optimization with one more Segment which is small an quick
log.info("Solr auto-optimization: running solr.optimize(" + opts + ")");
fulltext.optimize(opts);
this.optimizeLastRun = System.currentTimeMillis();
}
}
}
ReferenceReportCache rrCache = index.getReferenceReportCache();
ClickdepthCache clickdepthCache = index.getClickdepthCache(rrCache);
Set<String> deletionCandidates = collection1Configuration.contains(CollectionSchema.harvestkey_s.getSolrFieldName()) ?
this.crawler.getFinishesProfiles(this.crawlQueues) : new HashSet<String>();
int cleanupByHarvestkey = deletionCandidates.size();
boolean processCollection = collection1Configuration.contains(CollectionSchema.process_sxt) && (index.connectedCitation() || fulltext.writeToWebgraph());
boolean processWebgraph = webgraphConfiguration.contains(WebgraphSchema.process_sxt) && fulltext.writeToWebgraph();
boolean processCollection = collection1Configuration.contains(CollectionSchema.process_sxt) && (index.connectedCitation() || fulltext.useWebgraph());
boolean processWebgraph = webgraphConfiguration.contains(WebgraphSchema.process_sxt) && fulltext.useWebgraph();
boolean allCrawlsFinished = this.crawler.allCrawlsFinished(this.crawlQueues);
if ((processCollection || processWebgraph) && (cleanupByHarvestkey > 0 || allCrawlsFinished)) {
//full optimization of webgraph, if exists
if (fulltext.writeToWebgraph()) fulltext.getWebgraphConnector().optimize(1);
if (cleanupByHarvestkey > 0) {
// run postprocessing on these profiles
postprocessingRunning = true;
@ -2371,6 +2340,34 @@ public final class Switchboard extends serverSwitch {
}
}
this.index.fulltext().commit(true); // without a commit the success is not visible in the monitoring
if (allCrawlsFinished) {
postprocessingRunning = true;
// flush caches
Domains.clear();
this.crawlQueues.noticeURL.clear();
// do solr optimization
long idleSearch = System.currentTimeMillis() - this.localSearchLastAccess;
long idleAdmin = System.currentTimeMillis() - this.adminAuthenticationLastAccess;
long deltaOptimize = System.currentTimeMillis() - this.optimizeLastRun;
boolean optimizeRequired = deltaOptimize > 60000 * 60 * 3; // 3 hours
int opts = Math.max(1, (int) (fulltext.collectionSize() / 5000000));
log.info("Solr auto-optimization: idleSearch=" + idleSearch + ", idleAdmin=" + idleAdmin + ", deltaOptimize=" + deltaOptimize + ", proccount=" + proccount);
if (idleAdmin > 600000) {
// only run optimization if the admin is idle (10 minutes)
if (proccount > 0) {
opts++; // have postprocessings will force optimazion with one more Segment which is small an quick
optimizeRequired = true;
}
if (optimizeRequired) {
if (idleSearch < 600000) opts++; // < 10 minutes idle time will cause a optimization with one more Segment which is small an quick
log.info("Solr auto-optimization: running solr.optimize(" + opts + ")");
fulltext.optimize(opts);
this.optimizeLastRun = System.currentTimeMillis();
}
}
}
postprocessingStartTime = new long[]{0,0}; // the start time for the processing; not started = 0
postprocessingRunning = false;
}
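
The Solr auto-optimization block is moved from before postprocessing to after the final commit, so optimization now sees the postprocessing results, and the unconditional webgraph optimize(1) call is dropped. A sketch of the reordered cleanup sequence, using hypothetical method names rather than the Switchboard internals:

// Sketch of the reordered cleanup sequence (hypothetical method names): the
// Solr optimize step now runs after postprocessing and the final commit,
// not before postprocessing as in the removed block above.
public class CleanupOrderSketch {
    static void cleanupJob(boolean allCrawlsFinished, boolean postprocessingNeeded) {
        if (postprocessingNeeded) {
            runPostprocessing();   // citation ranking, clickdepth, unique flags ...
            commitIndex();         // make the changes visible for monitoring
        }
        if (allCrawlsFinished) {
            flushCaches();
            optimizeSolrIfIdle();  // only if the admin is idle, at most every 3 hours
        }
    }
    static void runPostprocessing() {}
    static void commitIndex() {}
    static void flushCaches() {}
    static void optimizeSolrIfIdle() {}
    public static void main(String[] args) { cleanupJob(true, true); }
}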

View File

@ -79,7 +79,7 @@ public class DocumentIndex extends Segment {
false // exceed134217727
);
super.fulltext().connectLocalSolr();
super.fulltext().writeWebgraph(true);
super.fulltext().setUseWebgraph(true);
this.callback = callback;
this.queue = new LinkedBlockingQueue<AnchorURL>(WorkflowProcessor.availableCPU * 300);
this.worker = new Worker[WorkflowProcessor.availableCPU];

View File

@ -110,11 +110,11 @@ public final class Fulltext {
this.writeWebgraph = false;
}
public void writeWebgraph(boolean check) {
public void setUseWebgraph(boolean check) {
this.writeWebgraph = check;
}
public boolean writeToWebgraph() {
public boolean useWebgraph() {
return this.writeWebgraph;
}
@ -403,7 +403,7 @@ public final class Fulltext {
}
public void putEdges(final Collection<SolrInputDocument> edges) throws IOException {
if (!this.writeToWebgraph()) return;
if (!this.useWebgraph()) return;
if (edges == null || edges.size() == 0) return;
try {
this.getWebgraphConnector().add(edges);
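
The accessor pair is renamed from writeWebgraph(boolean)/writeToWebgraph() to setUseWebgraph(boolean)/useWebgraph(); the remaining hunks in this commit follow that rename. A standalone sketch of the pair, not the Fulltext class itself:

// Sketch of the renamed accessor pair: the flag keeps its meaning, only the
// method names change (stand-in class, not net.yacy.search.index.Fulltext).
public class WebgraphFlag {
    private boolean useWebgraph = false;

    public void setUseWebgraph(boolean check) { this.useWebgraph = check; }
    public boolean useWebgraph() { return this.useWebgraph; }

    public static void main(String[] args) {
        WebgraphFlag f = new WebgraphFlag();
        f.setUseWebgraph(true);
        System.out.println(f.useWebgraph()); // true
    }
}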

View File

@ -143,7 +143,7 @@ public class Segment {
new String[] {},
this,
"putDocument",
10,
30,
null,
1);
}
@ -382,9 +382,9 @@ public class Segment {
}
} catch (SpaceExceededException e) {
// the Citation Index got too large, we ignore the problem and hope that a second solr index is attached which will take over now
if (Segment.this.fulltext.writeToWebgraph()) internalIDs.clear();
if (Segment.this.fulltext.useWebgraph()) internalIDs.clear();
}
if ((internalIDs.size() == 0 || !connectedCitation()) && Segment.this.fulltext.writeToWebgraph()) {
if ((internalIDs.size() == 0 || !connectedCitation()) && Segment.this.fulltext.useWebgraph()) {
// reqd the references from the webgraph
SolrConnector webgraph = Segment.this.fulltext.getWebgraphConnector();
BlockingQueue<SolrDocument> docs = webgraph.concurrentDocumentsByQuery("{!raw f=" + WebgraphSchema.target_id_s.getSolrFieldName() + "}" + ASCII.String(id), 0, 10000000, 1000, 100, WebgraphSchema.source_id_s.getSolrFieldName());
@ -663,9 +663,8 @@ public class Segment {
final CollectionConfiguration.SolrVector vector = this.fulltext.getDefaultConfiguration().yacy2solr(collections, responseHeader, document, condenser, referrerURL, language, this.fulltext.getWebgraphConfiguration(), sourceName);
// ENRICH DOCUMENT WITH RANKING INFORMATION
if (this.connectedCitation()) {
this.fulltext.getDefaultConfiguration().postprocessing_references(this.getReferenceReportCache(), vector, url, null);
}
this.fulltext.getDefaultConfiguration().postprocessing_references(this.getReferenceReportCache(), vector, url, null);
// STORE TO SOLR
String error = null;
this.putDocumentInQueue(vector);
@ -673,7 +672,7 @@ public class Segment {
if (webgraph != null && webgraph.size() > 0) {
// write the edges to the webgraph solr index
if (this.fulltext.writeToWebgraph()) {
if (this.fulltext.useWebgraph()) {
tryloop: for (int i = 0; i < 20; i++) {
try {
error = null;

View File

@ -178,16 +178,13 @@ public final class QueryParams {
}
this.urlMask_isCatchall = this.urlMask.toString().equals(catchall_pattern.toString());
if (this.urlMask_isCatchall) {
if (modifier.protocol != null) {
this.urlMask = Pattern.compile(modifier.protocol + ".*");
this.urlMask_isCatchall = false;
}
if (tld != null) {
this.urlMask = Pattern.compile(".*\\." + tld + ".*");
this.urlMask_isCatchall = false;
}
if (modifier.filetype != null) {
this.urlMask = Pattern.compile(".*" + modifier.filetype + ".*");
String protocolfilter = modifier.protocol == null ? ".*" : modifier.protocol;
String defaulthostprefix = modifier.protocol == null ? "www" : modifier.protocol;
String hostfilter = modifier.sitehost == null && tld == null ? ".*" : modifier.sitehost == null ? ".*\\." + tld : modifier.sitehost.startsWith(defaulthostprefix + ".") ? "(" + defaulthostprefix + "\\.)?" + modifier.sitehost.substring(4) : "(" + defaulthostprefix + "\\.)?" + modifier.sitehost;
String filefilter = modifier.filetype == null ? ".*" : ".*" + modifier.filetype + ".*";
String filter = protocolfilter + "://" + hostfilter + "/" + filefilter;
if (!filter.equals(".*://.*/.*")) {
this.urlMask = Pattern.compile(filter);
this.urlMask_isCatchall = false;
}
}
@ -343,7 +340,6 @@ public final class QueryParams {
if (!getFacets) this.cachedQuery.setFacet(false);
return this.cachedQuery;
}
if (this.queryGoal.getIncludeSize() == 0) return null;
// construct query
final SolrQuery params = getBasicParams(getFacets);
@ -368,7 +364,6 @@ public final class QueryParams {
if (!getFacets) this.cachedQuery.setFacet(false);
return this.cachedQuery;
}
if (this.queryGoal.getIncludeSize() == 0) return null;
// construct query
final SolrQuery params = getBasicParams(getFacets);
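
The replaced block builds one combined URL mask of the form <protocol>://<host>/<file> from the protocol, site/tld and filetype modifiers instead of patching the pattern three times, which appears to take over the per-request filter patches removed in the Protocol hunk above; the two early returns for empty include lists are also removed. A sketch of the combined mask with hypothetical modifier values (the sitehost branch is omitted here):

import java.util.regex.Pattern;

// Sketch of the combined URL mask: protocol, tld and filetype modifiers are
// merged into one pattern of the form <protocol>://<host>/<file>.
public class UrlMaskSketch {
    public static void main(String[] args) {
        String protocol = "https";   // modifier.protocol, may be null
        String tld = "org";          // tld modifier, may be null
        String filetype = "pdf";     // modifier.filetype, may be null

        String protocolfilter = protocol == null ? ".*" : protocol;
        String hostfilter = tld == null ? ".*" : ".*\\." + tld;
        String filefilter = filetype == null ? ".*" : ".*" + filetype + ".*";
        String filter = protocolfilter + "://" + hostfilter + "/" + filefilter;

        Pattern urlMask = Pattern.compile(filter);
        System.out.println(filter);
        System.out.println(urlMask.matcher("https://example.org/paper.pdf").matches()); // true
        System.out.println(urlMask.matcher("http://example.com/index.html").matches()); // false
    }
}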

View File

@ -99,8 +99,9 @@ import net.yacy.search.snippet.TextSnippet.ResultClass;
import org.apache.solr.common.SolrDocument;
public final class SearchEvent {
private static final int max_results_rwi = 3000;
private static final int max_results_node = 150;
/*
private static long noRobinsonLocalRWISearch = 0;
@ -219,7 +220,7 @@ public final class SearchEvent {
this.workTables = workTables;
this.query = query;
this.loader = loader;
this.nodeStack = new WeakPriorityBlockingQueue<URIMetadataNode>(100, false);
this.nodeStack = new WeakPriorityBlockingQueue<URIMetadataNode>(max_results_node, false);
this.maxExpectedRemoteReferences = new AtomicInteger(0);
this.expectedRemoteReferences = new AtomicInteger(0);
this.excludeintext_image = Switchboard.getSwitchboard().getConfigBool("search.excludeintext.image", true);
@ -391,7 +392,7 @@ public final class SearchEvent {
this.deleteIfSnippetFail = deleteIfSnippetFail;
this.urlRetrievalAllTime = 0;
this.snippetComputationAllTime = 0;
this.resultList = new WeakPriorityBlockingQueue<ResultEntry>(Math.max(1000, 10 * query.itemsPerPage()), true); // this is the result, enriched with snippets, ranked and ordered by ranking
this.resultList = new WeakPriorityBlockingQueue<ResultEntry>(Math.max(max_results_node, 10 * query.itemsPerPage()), true); // this is the result, enriched with snippets, ranked and ordered by ranking
// snippets do not need to match with the complete query hashes,
// only with the query minus the stopwords which had not been used for the search
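
A new constant max_results_node = 150 replaces the literal 100 for the node stack and now bounds the merged result list, which previously started at 1000 entries. A small sketch of the new sizing rule, with a hypothetical itemsPerPage value:

// Sketch of the new sizing rule: the node stack is bounded by max_results_node
// and the result list by the larger of max_results_node and ten result pages.
public class ResultCapacity {
    static final int MAX_RESULTS_RWI = 3000;
    static final int MAX_RESULTS_NODE = 150;

    public static void main(String[] args) {
        int itemsPerPage = 10; // hypothetical page size
        int nodeStackCapacity = MAX_RESULTS_NODE;                             // was the literal 100
        int resultListCapacity = Math.max(MAX_RESULTS_NODE, 10 * itemsPerPage); // was Math.max(1000, ...)
        System.out.println(nodeStackCapacity + " " + resultListCapacity);
    }
}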

View File

@ -328,6 +328,11 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
if (!text.isEmpty() && text.charAt(text.length() - 1) == '.') sb.append(text); else sb.append(text).append('.');
}
/**
* a SolrVector is a SolrInputDocument with the ability
* to store also the webgraph that is associated with
* the web document in the Solr document.
*/
public static class SolrVector extends SolrInputDocument {
private static final long serialVersionUID = -210901881471714939L;
private List<SolrInputDocument> webgraphDocuments;
@ -891,19 +896,35 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
*/
public int postprocessing(final Segment segment, ReferenceReportCache rrCache, ClickdepthCache clickdepthCache, String harvestkey) {
if (!this.contains(CollectionSchema.process_sxt)) return 0;
if (!segment.connectedCitation() && !segment.fulltext().writeToWebgraph()) return 0;
if (!segment.connectedCitation() && !segment.fulltext().useWebgraph()) return 0;
SolrConnector collectionConnector = segment.fulltext().getDefaultConnector();
SolrConnector webgraphConnector = segment.fulltext().getWebgraphConnector();
collectionConnector.commit(false); // make sure that we have latest information that can be found
if (webgraphConnector != null) webgraphConnector.commit(false);
Map<byte[], CRV> ranking = new TreeMap<byte[], CRV>(Base64Order.enhancedCoder);
ReversibleScoreMap<String> hostscore = null;
if (segment.fulltext().useWebgraph()) segment.fulltext().getWebgraphConnector().commit(false);
CollectionConfiguration collection = segment.fulltext().getDefaultConfiguration();
WebgraphConfiguration webgraph = segment.fulltext().getWebgraphConfiguration();
// collect hosts from index which shall take part in citation computation
String query = (harvestkey == null || !segment.fulltext().getDefaultConfiguration().contains(CollectionSchema.harvestkey_s) ? "" : CollectionSchema.harvestkey_s.getSolrFieldName() + ":\"" + harvestkey + "\" AND ") +
CollectionSchema.process_sxt.getSolrFieldName() + ":" + ProcessType.CITATION.toString();
ReversibleScoreMap<String> hostscore;
try {
// collect hosts from index which shall take part in citation computation
String query = (harvestkey == null || !segment.fulltext().getDefaultConfiguration().contains(CollectionSchema.harvestkey_s) ? "" : CollectionSchema.harvestkey_s.getSolrFieldName() + ":\"" + harvestkey + "\" AND ") +
CollectionSchema.process_sxt.getSolrFieldName() + ":" + ProcessType.CITATION.toString();
hostscore = collectionConnector.getFacets(query, 10000000, CollectionSchema.host_s.getSolrFieldName()).get(CollectionSchema.host_s.getSolrFieldName());
} catch (final IOException e2) {
ConcurrentLog.logException(e2);
hostscore = new ClusteredScoreMap<String>();
}
// create the ranking map
Map<byte[], CRV> ranking = null;
if ((segment.fulltext().useWebgraph() &&
((webgraph.contains(WebgraphSchema.source_id_s) && webgraph.contains(WebgraphSchema.source_cr_host_norm_i)) ||
(webgraph.contains(WebgraphSchema.target_id_s) && webgraph.contains(WebgraphSchema.target_cr_host_norm_i))) ||
(collection.contains(CollectionSchema.cr_host_count_i) &&
collection.contains(CollectionSchema.cr_host_chance_d) &&
collection.contains(CollectionSchema.cr_host_norm_i)))) try {
ConcurrentLog.info("CollectionConfiguration", "collecting " + hostscore.size() + " hosts");
ranking = new TreeMap<byte[], CRV>(Base64Order.enhancedCoder);
int countcheck = 0;
for (String host: hostscore.keyList(true)) {
// Patch the citation index for links with canonical tags.
@ -961,40 +982,42 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
}
if (hostscore.size() != countcheck) ConcurrentLog.warn("CollectionConfiguration", "ambiguous host count: expected=" + hostscore.size() + ", counted=" + countcheck);
} catch (final IOException e2) {
ConcurrentLog.logException(e2);
hostscore = new ClusteredScoreMap<String>();
}
// process all documents at the webgraph for the outgoing links of this document
SolrDocument doc;
if (webgraphConnector != null) {
if (segment.fulltext().useWebgraph()) {
try {
for (String host: hostscore.keyList(true)) {
if (hostscore.get(host) <= 0) continue;
// select all webgraph edges and modify their cr value
String query = "{!raw f=" + WebgraphSchema.source_host_s.getSolrFieldName() + "}" + host;
long count = webgraphConnector.getCountByQuery(query);
query = "{!raw f=" + WebgraphSchema.source_host_s.getSolrFieldName() + "}" + host;
long count = segment.fulltext().getWebgraphConnector().getCountByQuery(query);
ConcurrentLog.info("CollectionConfiguration", "collecting " + count + " documents from the webgraph");
BlockingQueue<SolrDocument> docs = webgraphConnector.concurrentDocumentsByQuery(query, 0, 10000000, 1800000, 100);
BlockingQueue<SolrDocument> docs = segment.fulltext().getWebgraphConnector().concurrentDocumentsByQuery(query, 0, 10000000, 1800000, 100);
int countcheck = 0;
while ((doc = docs.take()) != AbstractSolrConnector.POISON_DOCUMENT) {
boolean changed = false;
SolrInputDocument sid = segment.fulltext().getWebgraphConfiguration().toSolrInputDocument(doc, null);
byte[] id = ASCII.getBytes((String) doc.getFieldValue(WebgraphSchema.source_id_s.getSolrFieldName()));
CRV crv = ranking.get(id);
if (crv != null) {
sid.setField(WebgraphSchema.source_cr_host_norm_i.getSolrFieldName(), crv.crn);
changed = true;
SolrInputDocument sid = webgraph.toSolrInputDocument(doc, null);
if (webgraph.contains(WebgraphSchema.source_id_s) && webgraph.contains(WebgraphSchema.source_cr_host_norm_i)) {
byte[] id = ASCII.getBytes((String) doc.getFieldValue(WebgraphSchema.source_id_s.getSolrFieldName()));
CRV crv = ranking.get(id);
if (crv != null) {
sid.setField(WebgraphSchema.source_cr_host_norm_i.getSolrFieldName(), crv.crn);
}
}
id = ASCII.getBytes((String) doc.getFieldValue(WebgraphSchema.target_id_s.getSolrFieldName()));
crv = ranking.get(id);
if (crv != null) {
sid.setField(WebgraphSchema.target_cr_host_norm_i.getSolrFieldName(), crv.crn);
changed = true;
if (webgraph.contains(WebgraphSchema.target_id_s) && webgraph.contains(WebgraphSchema.target_cr_host_norm_i)) {
byte[] id = ASCII.getBytes((String) doc.getFieldValue(WebgraphSchema.target_id_s.getSolrFieldName()));
CRV crv = ranking.get(id);
if (crv != null) {
sid.setField(WebgraphSchema.target_cr_host_norm_i.getSolrFieldName(), crv.crn);
}
}
if (changed) try {
try {
sid.removeField(WebgraphSchema.process_sxt.getSolrFieldName());
sid.removeField(WebgraphSchema.harvestkey_s.getSolrFieldName());
webgraphConnector.add(sid);
segment.fulltext().getWebgraphConnector().add(sid);
} catch (SolrException e) {
ConcurrentLog.logException(e);
} catch (IOException e) {
@ -1012,7 +1035,7 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
}
// process all documents in collection
String query = (harvestkey == null ? "" : CollectionSchema.harvestkey_s.getSolrFieldName() + ":\"" + harvestkey + "\" AND ") +
query = (harvestkey == null || !segment.fulltext().getDefaultConfiguration().contains(CollectionSchema.harvestkey_s) ? "" : CollectionSchema.harvestkey_s.getSolrFieldName() + ":\"" + harvestkey + "\" AND ") +
CollectionSchema.process_sxt.getSolrFieldName() + ":[* TO *]";
int proccount = 0, proccount_clickdepthchange = 0, proccount_referencechange = 0, proccount_citationchange = 0, proccount_uniquechange = 0;
Map<String, Long> hostExtentCache = new HashMap<String, Long>(); // a mapping from the host id to the number of documents which contain this host-id
@ -1039,7 +1062,10 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
if (postprocessing_clickdepth(clickdepthCache, sid, url, CollectionSchema.clickdepth_i, 100)) proccount_clickdepthchange++;
}
if (tagtype == ProcessType.CITATION) {
if (tagtype == ProcessType.CITATION &&
collection.contains(CollectionSchema.cr_host_count_i) &&
collection.contains(CollectionSchema.cr_host_chance_d) &&
collection.contains(CollectionSchema.cr_host_norm_i)) {
CRV crv = ranking.get(id);
if (crv != null) {
sid.setField(CollectionSchema.cr_host_count_i.getSolrFieldName(), crv.count);
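
Postprocessing now checks that the citation-ranking fields are actually enabled in the collection or webgraph schema before computing the CRV ranking, collects the host facets up front inside a try/catch, and reaches the webgraph only through segment.fulltext().getWebgraphConnector() when useWebgraph() is set. A simplified sketch of the schema-field guard, with field names copied from the hunk but the surrounding API reduced to plain sets:

import java.util.Set;

// Simplified sketch of the schema-field guard: compute and write the citation
// ranking only when the target fields exist in the enabled schema.
public class PostprocessingGuard {
    public static void main(String[] args) {
        Set<String> collectionSchema = Set.of("cr_host_count_i", "cr_host_chance_d", "cr_host_norm_i");
        boolean useWebgraph = false; // webgraph fields would widen the condition

        boolean rankingPossible = useWebgraph
                || (collectionSchema.contains("cr_host_count_i")
                 && collectionSchema.contains("cr_host_chance_d")
                 && collectionSchema.contains("cr_host_norm_i"));

        System.out.println(rankingPossible ? "compute CRV ranking" : "skip citation ranking");
    }
}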

View File

@ -120,185 +120,198 @@ public class WebgraphConfiguration extends SchemaConfiguration implements Serial
final List<ImageEntry> images, final boolean inbound, final Collection<AnchorURL> links,
final String sourceName) {
boolean allAttr = this.isEmpty();
boolean generalNofollow = responseHeader == null ? false : responseHeader.get("X-Robots-Tag", "").indexOf("nofollow") >= 0;
int target_order = 0;
boolean generalNofollow = responseHeader.get("X-Robots-Tag", "").indexOf("nofollow") >= 0;
for (final AnchorURL target_url: links) {
Set<ProcessType> processTypes = new LinkedHashSet<ProcessType>();
final String name = target_url.getNameProperty(); // the name attribute
final String text = target_url.getTextProperty(); // the text between the <a></a> tag
String rel = target_url.getRelProperty(); // the rel-attribute
int ioidx = inbound ? 0 : 1;
if (generalNofollow) {
// patch the rel attribute since the header makes nofollow valid for all links
if (rel.length() == 0) rel = "nofollow"; else if (rel.indexOf("nofollow") < 0) rel += ",nofollow";
}
// index organization
StringBuilder idi = new StringBuilder(8);
idi.append(Integer.toHexString((name + text + rel).hashCode()).toLowerCase());
while (idi.length() < 8) idi.insert(0, '0');
String source_id = ASCII.String(source.hash());
String target_id = ASCII.String(target_url.hash());
StringBuilder id = new StringBuilder(source_id).append(target_id).append(idi);
SolrInputDocument edge = new SolrInputDocument();
add(edge, WebgraphSchema.id, id.toString());
add(edge, WebgraphSchema.target_order_i, target_order++);
if (allAttr || contains(WebgraphSchema.load_date_dt)) {
Date loadDate = new Date();
Date modDate = responseHeader == null ? new Date() : responseHeader.lastModified();
if (modDate.getTime() > loadDate.getTime()) modDate = loadDate;
add(edge, WebgraphSchema.load_date_dt, loadDate);
}
if (allAttr || contains(WebgraphSchema.last_modified)) add(edge, WebgraphSchema.last_modified, responseHeader == null ? new Date() : responseHeader.lastModified());
final String source_url_string = source.toNormalform(false);
if (allAttr || contains(CollectionSchema.collection_sxt) && collections != null && collections.size() > 0) {
List<String> cs = new ArrayList<String>();
for (Map.Entry<String, Pattern> e: collections.entrySet()) {
if (e.getValue().matcher(source_url_string).matches()) cs.add(e.getKey());
}
add(edge, WebgraphSchema.collection_sxt, cs);
}
// add the source attributes
add(edge, WebgraphSchema.source_id_s, source_id);
int pr_source = source_url_string.indexOf("://",0);
if (allAttr || contains(WebgraphSchema.source_protocol_s)) add(edge, WebgraphSchema.source_protocol_s, source_url_string.substring(0, pr_source));
if (allAttr || contains(WebgraphSchema.source_urlstub_s)) add(edge, WebgraphSchema.source_urlstub_s, source_url_string.substring(pr_source + 3));
Map<String, String> source_searchpart = source.getSearchpartMap();
if (source_searchpart == null) {
if (allAttr || contains(WebgraphSchema.source_parameter_count_i)) add(edge, WebgraphSchema.source_parameter_count_i, 0);
} else {
if (allAttr || contains(WebgraphSchema.source_parameter_count_i)) add(edge, WebgraphSchema.source_parameter_count_i, source_searchpart.size());
if (allAttr || contains(WebgraphSchema.source_parameter_key_sxt)) add(edge, WebgraphSchema.source_parameter_key_sxt, source_searchpart.keySet().toArray(new String[source_searchpart.size()]));
if (allAttr || contains(WebgraphSchema.source_parameter_value_sxt)) add(edge, WebgraphSchema.source_parameter_value_sxt, source_searchpart.values().toArray(new String[source_searchpart.size()]));
}
if (allAttr || contains(WebgraphSchema.source_chars_i)) add(edge, WebgraphSchema.source_chars_i, source_url_string.length());
String source_host = null;
if ((source_host = source.getHost()) != null) {
String dnc = Domains.getDNC(source_host);
String subdomOrga = source_host.length() - dnc.length() <= 0 ? "" : source_host.substring(0, source_host.length() - dnc.length() - 1);
int pp = subdomOrga.lastIndexOf('.');
String subdom = (pp < 0) ? "" : subdomOrga.substring(0, pp);
String orga = (pp < 0) ? subdomOrga : subdomOrga.substring(pp + 1);
if (allAttr || contains(WebgraphSchema.source_host_s)) add(edge, WebgraphSchema.source_host_s, source_host);
if (allAttr || contains(WebgraphSchema.source_host_id_s)) add(edge, WebgraphSchema.source_host_id_s, source.hosthash());
if (allAttr || contains(WebgraphSchema.source_host_dnc_s)) add(edge, WebgraphSchema.source_host_dnc_s, dnc);
if (allAttr || contains(WebgraphSchema.source_host_organization_s)) add(edge, WebgraphSchema.source_host_organization_s, orga);
if (allAttr || contains(WebgraphSchema.source_host_organizationdnc_s)) add(edge, WebgraphSchema.source_host_organizationdnc_s, orga + '.' + dnc);
if (allAttr || contains(WebgraphSchema.source_host_subdomain_s)) add(edge, WebgraphSchema.source_host_subdomain_s, subdom);
}
if (allAttr || contains(WebgraphSchema.source_file_ext_s) || contains(WebgraphSchema.source_file_name_s)) {
String source_file_name = source.getFileName();
String source_file_ext = MultiProtocolURL.getFileExtension(source_file_name);
add(edge, WebgraphSchema.source_file_name_s, source_file_name.toLowerCase().endsWith("." + source_file_ext) ? source_file_name.substring(0, source_file_name.length() - source_file_ext.length() - 1) : source_file_name);
add(edge, WebgraphSchema.source_file_ext_s, source_file_ext);
}
if (allAttr || contains(WebgraphSchema.source_path_s)) add(edge, WebgraphSchema.source_path_s, source.getPath());
if (allAttr || contains(WebgraphSchema.source_path_folders_count_i) || contains(WebgraphSchema.source_path_folders_sxt)) {
String[] paths = source.getPaths();
add(edge, WebgraphSchema.source_path_folders_count_i, paths.length);
add(edge, WebgraphSchema.source_path_folders_sxt, paths);
}
if (this.contains(WebgraphSchema.source_protocol_s) && this.contains(WebgraphSchema.source_urlstub_s) && this.contains(WebgraphSchema.source_id_s)) {
add(edge, WebgraphSchema.source_clickdepth_i, clickdepth_source);
if (clickdepth_source < 0 || clickdepth_source > 1) processTypes.add(ProcessType.CLICKDEPTH);
}
// add the source attributes about the target
if (allAttr || contains(WebgraphSchema.target_inbound_b)) add(edge, WebgraphSchema.target_inbound_b, inbound);
if (allAttr || contains(WebgraphSchema.target_name_t)) add(edge, WebgraphSchema.target_name_t, name.length() > 0 ? name : "");
if (allAttr || contains(WebgraphSchema.target_rel_s)) add(edge, WebgraphSchema.target_rel_s, rel.length() > 0 ? rel : "");
if (allAttr || contains(WebgraphSchema.target_relflags_i)) add(edge, WebgraphSchema.target_relflags_i, relEval(rel.length() > 0 ? rel : ""));
if (allAttr || contains(WebgraphSchema.target_linktext_t)) add(edge, WebgraphSchema.target_linktext_t, text.length() > 0 ? text : "");
if (allAttr || contains(WebgraphSchema.target_linktext_charcount_i)) add(edge, WebgraphSchema.target_linktext_charcount_i, text.length());
if (allAttr || contains(WebgraphSchema.target_linktext_wordcount_i)) add(edge, WebgraphSchema.target_linktext_wordcount_i, text.length() > 0 ? CommonPattern.SPACE.split(text).length : 0);
ImageEntry ientry = null;
for (ImageEntry ie: images) {
if (ie.linkurl() != null && ie.linkurl().equals(target_url)) {ientry = ie; break;}
}
String alttext = ientry == null ? "" : ientry.alt();
if (allAttr || contains(WebgraphSchema.target_alt_t)) add(edge, WebgraphSchema.target_alt_t, alttext);
if (allAttr || contains(WebgraphSchema.target_alt_charcount_i)) add(edge, WebgraphSchema.target_alt_charcount_i, alttext.length());
if (allAttr || contains(WebgraphSchema.target_alt_wordcount_i)) add(edge, WebgraphSchema.target_alt_wordcount_i, alttext.length() > 0 ? CommonPattern.SPACE.split(alttext).length : 0);
// add the target attributes
add(edge, WebgraphSchema.target_id_s, target_id);
final String target_url_string = target_url.toNormalform(false);
int pr_target = target_url_string.indexOf("://",0);
subgraph.urlProtocols[ioidx].add(target_url_string.substring(0, pr_target));
subgraph.urlStubs[ioidx].add(target_url_string.substring(pr_target + 3));
subgraph.urlAnchorTexts[ioidx].add(text);
if (allAttr || contains(WebgraphSchema.target_protocol_s)) add(edge, WebgraphSchema.target_protocol_s, target_url_string.substring(0, pr_target));
if (allAttr || contains(WebgraphSchema.target_urlstub_s)) add(edge, WebgraphSchema.target_urlstub_s, target_url_string.substring(pr_target + 3));
Map<String, String> target_searchpart = target_url.getSearchpartMap();
if (target_searchpart == null) {
if (allAttr || contains(WebgraphSchema.target_parameter_count_i)) add(edge, WebgraphSchema.target_parameter_count_i, 0);
} else {
if (allAttr || contains(WebgraphSchema.target_parameter_count_i)) add(edge, WebgraphSchema.target_parameter_count_i, target_searchpart.size());
if (allAttr || contains(WebgraphSchema.target_parameter_key_sxt)) add(edge, WebgraphSchema.target_parameter_key_sxt, target_searchpart.keySet().toArray(new String[target_searchpart.size()]));
if (allAttr || contains(WebgraphSchema.target_parameter_value_sxt)) add(edge, WebgraphSchema.target_parameter_value_sxt, target_searchpart.values().toArray(new String[target_searchpart.size()]));
}
if (allAttr || contains(WebgraphSchema.target_chars_i)) add(edge, WebgraphSchema.target_chars_i, target_url_string.length());
String target_host = null;
if ((target_host = target_url.getHost()) != null) {
String dnc = Domains.getDNC(target_host);
String subdomOrga = target_host.length() - dnc.length() <= 0 ? "" : target_host.substring(0, target_host.length() - dnc.length() - 1);
int pp = subdomOrga.lastIndexOf('.');
String subdom = (pp < 0) ? "" : subdomOrga.substring(0, pp);
String orga = (pp < 0) ? subdomOrga : subdomOrga.substring(pp + 1);
if (allAttr || contains(WebgraphSchema.target_host_s)) add(edge, WebgraphSchema.target_host_s, target_host);
if (allAttr || contains(WebgraphSchema.target_host_id_s)) add(edge, WebgraphSchema.target_host_id_s, target_url.hosthash());
if (allAttr || contains(WebgraphSchema.target_host_dnc_s)) add(edge, WebgraphSchema.target_host_dnc_s, dnc);
if (allAttr || contains(WebgraphSchema.target_host_organization_s)) add(edge, WebgraphSchema.target_host_organization_s, orga);
if (allAttr || contains(WebgraphSchema.target_host_organizationdnc_s)) add(edge, WebgraphSchema.target_host_organizationdnc_s, orga + '.' + dnc);
if (allAttr || contains(WebgraphSchema.target_host_subdomain_s)) add(edge, WebgraphSchema.target_host_subdomain_s, subdom);
}
if (allAttr || contains(WebgraphSchema.target_file_ext_s) || contains(WebgraphSchema.target_file_name_s)) {
String target_file_name = target_url.getFileName();
String target_file_ext = MultiProtocolURL.getFileExtension(target_file_name);
add(edge, WebgraphSchema.target_file_name_s, target_file_name.toLowerCase().endsWith("." + target_file_ext) ? target_file_name.substring(0, target_file_name.length() - target_file_ext.length() - 1) : target_file_name);
add(edge, WebgraphSchema.target_file_ext_s, target_file_ext);
}
if (allAttr || contains(WebgraphSchema.target_path_s)) add(edge, WebgraphSchema.target_path_s, target_url.getPath());
if (allAttr || contains(WebgraphSchema.target_path_folders_count_i) || contains(WebgraphSchema.target_path_folders_sxt)) {
String[] paths = target_url.getPaths();
add(edge, WebgraphSchema.target_path_folders_count_i, paths.length);
add(edge, WebgraphSchema.target_path_folders_sxt, paths);
}
if (this.contains(WebgraphSchema.target_protocol_s) && this.contains(WebgraphSchema.target_urlstub_s) && this.contains(WebgraphSchema.target_id_s)) {
if ((allAttr || contains(WebgraphSchema.target_clickdepth_i))) {
if (target_url.probablyRootURL()) {
boolean lc = this.lazy; this.lazy = false;
add(edge, WebgraphSchema.target_clickdepth_i, 0);
this.lazy = lc;
} else {
add(edge, WebgraphSchema.target_clickdepth_i, 999);
processTypes.add(ProcessType.CLICKDEPTH); // postprocessing needed; this is also needed if the depth is positive; there could be a shortcut
}
}
}
if (allAttr || contains(WebgraphSchema.process_sxt)) {
List<String> pr = new ArrayList<String>();
for (ProcessType t: processTypes) pr.add(t.name());
add(edge, WebgraphSchema.process_sxt, pr);
if (allAttr || contains(CollectionSchema.harvestkey_s)) {
add(edge, CollectionSchema.harvestkey_s, sourceName);
}
}
SolrInputDocument edge = getEdge(
subgraph, source, responseHeader, collections, clickdepth_source, images, inbound,
sourceName, allAttr, generalNofollow, target_order, target_url);
target_order++;
// add the edge to the subgraph
subgraph.edges.add(edge);
}
}
public SolrInputDocument getEdge(
final Subgraph subgraph,
final DigestURL source, final ResponseHeader responseHeader, Map<String, Pattern> collections, int clickdepth_source,
final List<ImageEntry> images, final boolean inbound,
final String sourceName, boolean allAttr, boolean generalNofollow, int target_order, AnchorURL target_url) {
Set<ProcessType> processTypes = new LinkedHashSet<ProcessType>();
final String name = target_url.getNameProperty(); // the name attribute
final String text = target_url.getTextProperty(); // the text between the <a></a> tag
String rel = target_url.getRelProperty(); // the rel-attribute
int ioidx = inbound ? 0 : 1;
if (generalNofollow) {
// patch the rel attribute since the header makes nofollow valid for all links
if (rel.length() == 0) rel = "nofollow"; else if (rel.indexOf("nofollow") < 0) rel += ",nofollow";
}
// index organization
StringBuilder idi = new StringBuilder(8);
idi.append(Integer.toHexString((name + text + rel).hashCode()).toLowerCase());
while (idi.length() < 8) idi.insert(0, '0');
String source_id = ASCII.String(source.hash());
String target_id = ASCII.String(target_url.hash());
StringBuilder id = new StringBuilder(source_id).append(target_id).append(idi);
SolrInputDocument edge = new SolrInputDocument();
add(edge, WebgraphSchema.id, id.toString());
add(edge, WebgraphSchema.target_order_i, target_order);
if (allAttr || contains(WebgraphSchema.load_date_dt)) {
Date loadDate = new Date();
Date modDate = responseHeader == null ? new Date() : responseHeader.lastModified();
if (modDate.getTime() > loadDate.getTime()) modDate = loadDate;
add(edge, WebgraphSchema.load_date_dt, loadDate);
}
if (allAttr || contains(WebgraphSchema.last_modified)) add(edge, WebgraphSchema.last_modified, responseHeader == null ? new Date() : responseHeader.lastModified());
final String source_url_string = source.toNormalform(false);
if ((allAttr || contains(CollectionSchema.collection_sxt)) && collections != null && collections.size() > 0) { // only evaluate the collection patterns when a non-empty collections map is present
List<String> cs = new ArrayList<String>();
for (Map.Entry<String, Pattern> e: collections.entrySet()) {
if (e.getValue().matcher(source_url_string).matches()) cs.add(e.getKey());
}
add(edge, WebgraphSchema.collection_sxt, cs);
}
// add the source attributes
add(edge, WebgraphSchema.source_id_s, source_id);
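// split the source url into its protocol and the stub behind "://" so both can be stored in separate fields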
int pr_source = source_url_string.indexOf("://",0);
if (allAttr || contains(WebgraphSchema.source_protocol_s)) add(edge, WebgraphSchema.source_protocol_s, source_url_string.substring(0, pr_source));
if (allAttr || contains(WebgraphSchema.source_urlstub_s)) add(edge, WebgraphSchema.source_urlstub_s, source_url_string.substring(pr_source + 3));
Map<String, String> source_searchpart = source.getSearchpartMap();
if (source_searchpart == null) {
if (allAttr || contains(WebgraphSchema.source_parameter_count_i)) add(edge, WebgraphSchema.source_parameter_count_i, 0);
} else {
if (allAttr || contains(WebgraphSchema.source_parameter_count_i)) add(edge, WebgraphSchema.source_parameter_count_i, source_searchpart.size());
if (allAttr || contains(WebgraphSchema.source_parameter_key_sxt)) add(edge, WebgraphSchema.source_parameter_key_sxt, source_searchpart.keySet().toArray(new String[source_searchpart.size()]));
if (allAttr || contains(WebgraphSchema.source_parameter_value_sxt)) add(edge, WebgraphSchema.source_parameter_value_sxt, source_searchpart.values().toArray(new String[source_searchpart.size()]));
}
if (allAttr || contains(WebgraphSchema.source_chars_i)) add(edge, WebgraphSchema.source_chars_i, source_url_string.length());
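// decompose the source host into subdomain, organization and dnc parts; as an illustration
// (assuming Domains.getDNC returns the public-suffix part), "www.example.co.uk" would yield
// subdom "www", orga "example", dnc "co.uk"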
String source_host = null;
if ((source_host = source.getHost()) != null) {
String dnc = Domains.getDNC(source_host);
String subdomOrga = source_host.length() - dnc.length() <= 0 ? "" : source_host.substring(0, source_host.length() - dnc.length() - 1);
int pp = subdomOrga.lastIndexOf('.');
String subdom = (pp < 0) ? "" : subdomOrga.substring(0, pp);
String orga = (pp < 0) ? subdomOrga : subdomOrga.substring(pp + 1);
if (allAttr || contains(WebgraphSchema.source_host_s)) add(edge, WebgraphSchema.source_host_s, source_host);
if (allAttr || contains(WebgraphSchema.source_host_id_s)) add(edge, WebgraphSchema.source_host_id_s, source.hosthash());
if (allAttr || contains(WebgraphSchema.source_host_dnc_s)) add(edge, WebgraphSchema.source_host_dnc_s, dnc);
if (allAttr || contains(WebgraphSchema.source_host_organization_s)) add(edge, WebgraphSchema.source_host_organization_s, orga);
if (allAttr || contains(WebgraphSchema.source_host_organizationdnc_s)) add(edge, WebgraphSchema.source_host_organizationdnc_s, orga + '.' + dnc);
if (allAttr || contains(WebgraphSchema.source_host_subdomain_s)) add(edge, WebgraphSchema.source_host_subdomain_s, subdom);
}
if (allAttr || contains(WebgraphSchema.source_file_ext_s) || contains(WebgraphSchema.source_file_name_s)) {
String source_file_name = source.getFileName();
String source_file_ext = MultiProtocolURL.getFileExtension(source_file_name);
add(edge, WebgraphSchema.source_file_name_s, source_file_name.toLowerCase().endsWith("." + source_file_ext) ? source_file_name.substring(0, source_file_name.length() - source_file_ext.length() - 1) : source_file_name);
add(edge, WebgraphSchema.source_file_ext_s, source_file_ext);
}
if (allAttr || contains(WebgraphSchema.source_path_s)) add(edge, WebgraphSchema.source_path_s, source.getPath());
if (allAttr || contains(WebgraphSchema.source_path_folders_count_i) || contains(WebgraphSchema.source_path_folders_sxt)) {
String[] paths = source.getPaths();
add(edge, WebgraphSchema.source_path_folders_count_i, paths.length);
add(edge, WebgraphSchema.source_path_folders_sxt, paths);
}
if (this.contains(WebgraphSchema.source_clickdepth_i) && this.contains(WebgraphSchema.source_protocol_s) && this.contains(WebgraphSchema.source_urlstub_s) && this.contains(WebgraphSchema.source_id_s)) {
add(edge, WebgraphSchema.source_clickdepth_i, clickdepth_source);
if (clickdepth_source < 0 || clickdepth_source > 1) processTypes.add(ProcessType.CLICKDEPTH);
}
// add the source attributes about the target
if (allAttr || contains(WebgraphSchema.target_inbound_b)) add(edge, WebgraphSchema.target_inbound_b, inbound);
if (allAttr || contains(WebgraphSchema.target_name_t)) add(edge, WebgraphSchema.target_name_t, name.length() > 0 ? name : "");
if (allAttr || contains(WebgraphSchema.target_rel_s)) add(edge, WebgraphSchema.target_rel_s, rel.length() > 0 ? rel : "");
if (allAttr || contains(WebgraphSchema.target_relflags_i)) add(edge, WebgraphSchema.target_relflags_i, relEval(rel.length() > 0 ? rel : ""));
if (allAttr || contains(WebgraphSchema.target_linktext_t)) add(edge, WebgraphSchema.target_linktext_t, text.length() > 0 ? text : "");
if (allAttr || contains(WebgraphSchema.target_linktext_charcount_i)) add(edge, WebgraphSchema.target_linktext_charcount_i, text.length());
if (allAttr || contains(WebgraphSchema.target_linktext_wordcount_i)) add(edge, WebgraphSchema.target_linktext_wordcount_i, text.length() > 0 ? CommonPattern.SPACE.split(text).length : 0);
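// if this link is also an image link, pick up the alt text of the matching image entry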
ImageEntry ientry = null;
for (ImageEntry ie: images) {
if (ie.linkurl() != null && ie.linkurl().equals(target_url)) {ientry = ie; break;}
}
String alttext = ientry == null ? "" : ientry.alt();
if (allAttr || contains(WebgraphSchema.target_alt_t)) add(edge, WebgraphSchema.target_alt_t, alttext);
if (allAttr || contains(WebgraphSchema.target_alt_charcount_i)) add(edge, WebgraphSchema.target_alt_charcount_i, alttext.length());
if (allAttr || contains(WebgraphSchema.target_alt_wordcount_i)) add(edge, WebgraphSchema.target_alt_wordcount_i, alttext.length() > 0 ? CommonPattern.SPACE.split(alttext).length : 0);
// add the target attributes
add(edge, WebgraphSchema.target_id_s, target_id);
final String target_url_string = target_url.toNormalform(false);
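// split the target url into protocol and stub and register protocol, stub and anchor text
// in the subgraph accumulators (index 0 collects inbound, index 1 outbound links)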
int pr_target = target_url_string.indexOf("://",0);
subgraph.urlProtocols[ioidx].add(target_url_string.substring(0, pr_target));
subgraph.urlStubs[ioidx].add(target_url_string.substring(pr_target + 3));
subgraph.urlAnchorTexts[ioidx].add(text);
if (allAttr || contains(WebgraphSchema.target_protocol_s)) add(edge, WebgraphSchema.target_protocol_s, target_url_string.substring(0, pr_target));
if (allAttr || contains(WebgraphSchema.target_urlstub_s)) add(edge, WebgraphSchema.target_urlstub_s, target_url_string.substring(pr_target + 3));
Map<String, String> target_searchpart = target_url.getSearchpartMap();
if (target_searchpart == null) {
if (allAttr || contains(WebgraphSchema.target_parameter_count_i)) add(edge, WebgraphSchema.target_parameter_count_i, 0);
} else {
if (allAttr || contains(WebgraphSchema.target_parameter_count_i)) add(edge, WebgraphSchema.target_parameter_count_i, target_searchpart.size());
if (allAttr || contains(WebgraphSchema.target_parameter_key_sxt)) add(edge, WebgraphSchema.target_parameter_key_sxt, target_searchpart.keySet().toArray(new String[target_searchpart.size()]));
if (allAttr || contains(WebgraphSchema.target_parameter_value_sxt)) add(edge, WebgraphSchema.target_parameter_value_sxt, target_searchpart.values().toArray(new String[target_searchpart.size()]));
}
if (allAttr || contains(WebgraphSchema.target_chars_i)) add(edge, WebgraphSchema.target_chars_i, target_url_string.length());
String target_host = null;
if ((target_host = target_url.getHost()) != null) {
String dnc = Domains.getDNC(target_host);
String subdomOrga = target_host.length() - dnc.length() <= 0 ? "" : target_host.substring(0, target_host.length() - dnc.length() - 1);
int pp = subdomOrga.lastIndexOf('.');
String subdom = (pp < 0) ? "" : subdomOrga.substring(0, pp);
String orga = (pp < 0) ? subdomOrga : subdomOrga.substring(pp + 1);
if (allAttr || contains(WebgraphSchema.target_host_s)) add(edge, WebgraphSchema.target_host_s, target_host);
if (allAttr || contains(WebgraphSchema.target_host_id_s)) add(edge, WebgraphSchema.target_host_id_s, target_url.hosthash());
if (allAttr || contains(WebgraphSchema.target_host_dnc_s)) add(edge, WebgraphSchema.target_host_dnc_s, dnc);
if (allAttr || contains(WebgraphSchema.target_host_organization_s)) add(edge, WebgraphSchema.target_host_organization_s, orga);
if (allAttr || contains(WebgraphSchema.target_host_organizationdnc_s)) add(edge, WebgraphSchema.target_host_organizationdnc_s, orga + '.' + dnc);
if (allAttr || contains(WebgraphSchema.target_host_subdomain_s)) add(edge, WebgraphSchema.target_host_subdomain_s, subdom);
}
if (allAttr || contains(WebgraphSchema.target_file_ext_s) || contains(WebgraphSchema.target_file_name_s)) {
String target_file_name = target_url.getFileName();
String target_file_ext = MultiProtocolURL.getFileExtension(target_file_name);
add(edge, WebgraphSchema.target_file_name_s, target_file_name.toLowerCase().endsWith("." + target_file_ext) ? target_file_name.substring(0, target_file_name.length() - target_file_ext.length() - 1) : target_file_name);
add(edge, WebgraphSchema.target_file_ext_s, target_file_ext);
}
if (allAttr || contains(WebgraphSchema.target_path_s)) add(edge, WebgraphSchema.target_path_s, target_url.getPath());
if (allAttr || contains(WebgraphSchema.target_path_folders_count_i) || contains(WebgraphSchema.target_path_folders_sxt)) {
String[] paths = target_url.getPaths();
add(edge, WebgraphSchema.target_path_folders_count_i, paths.length);
add(edge, WebgraphSchema.target_path_folders_sxt, paths);
}
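// clickdepth of the target: root URLs are at depth 0 by definition; all other targets get the
// placeholder 999 and are queued for CLICKDEPTH postprocessing, which computes the real depth later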
if (this.contains(WebgraphSchema.target_protocol_s) && this.contains(WebgraphSchema.target_urlstub_s) && this.contains(WebgraphSchema.target_id_s)) {
if ((allAttr || contains(WebgraphSchema.target_clickdepth_i))) {
if (target_url.probablyRootURL()) {
boolean lc = this.lazy; this.lazy = false;
add(edge, WebgraphSchema.target_clickdepth_i, 0);
this.lazy = lc;
} else {
add(edge, WebgraphSchema.target_clickdepth_i, 999);
processTypes.add(ProcessType.CLICKDEPTH); // postprocessing needed; this is also needed if the depth is positive; there could be a shortcut
}
}
}
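// store the pending postprocessing steps and the harvest key so that a later postprocessing run
// can select exactly the documents produced by this crawl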
if (allAttr || contains(WebgraphSchema.process_sxt)) {
List<String> pr = new ArrayList<String>();
for (ProcessType t: processTypes) pr.add(t.name());
add(edge, WebgraphSchema.process_sxt, pr);
if (allAttr || contains(CollectionSchema.harvestkey_s)) {
add(edge, CollectionSchema.harvestkey_s, sourceName);
}
}
// return the edge
return edge;
}
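// postprocessing scans the webgraph for edges that still carry process_sxt tags (such as CLICKDEPTH)
// and fills in the values that could not be computed at indexing time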
public int postprocessing(final Segment segment, ClickdepthCache clickdepthCache, final String harvestkey) {
if (!this.contains(WebgraphSchema.process_sxt)) return 0;
if (!segment.fulltext().writeToWebgraph()) return 0;
if (!segment.fulltext().useWebgraph()) return 0;
SolrConnector webgraphConnector = segment.fulltext().getWebgraphConnector();
// documents that still carry process_sxt tags need postprocessing; that means we must search for those entries.
webgraphConnector.commit(true); // make sure that we have latest information that can be found
@@ -323,7 +336,7 @@ public class WebgraphConfiguration extends SchemaConfiguration implements Serial
// switch over tag types
ProcessType tagtype = ProcessType.valueOf((String) tag);
if (tagtype == ProcessType.CLICKDEPTH) {
if (this.contains(WebgraphSchema.source_protocol_s) && this.contains(WebgraphSchema.source_urlstub_s) && this.contains(WebgraphSchema.source_id_s)) {
if (this.contains(WebgraphSchema.source_clickdepth_i) && this.contains(WebgraphSchema.source_protocol_s) && this.contains(WebgraphSchema.source_urlstub_s) && this.contains(WebgraphSchema.source_id_s)) {
protocol = (String) doc.getFieldValue(WebgraphSchema.source_protocol_s.getSolrFieldName());
urlstub = (String) doc.getFieldValue(WebgraphSchema.source_urlstub_s.getSolrFieldName());
id = (String) doc.getFieldValue(WebgraphSchema.source_id_s.getSolrFieldName());
@@ -334,7 +347,7 @@ public class WebgraphConfiguration extends SchemaConfiguration implements Serial
}
//ConcurrentLog.info("WebgraphConfiguration", "postprocessing webgraph source id " + id + ", url=" + protocol + "://" + urlstub + ", result: " + (changed ? "changed" : "not changed"));
}
if (this.contains(WebgraphSchema.target_protocol_s) && this.contains(WebgraphSchema.target_urlstub_s) && this.contains(WebgraphSchema.target_id_s)) {
if (this.contains(WebgraphSchema.target_clickdepth_i) && this.contains(WebgraphSchema.target_protocol_s) && this.contains(WebgraphSchema.target_urlstub_s) && this.contains(WebgraphSchema.target_id_s)) {
protocol = (String) doc.getFieldValue(WebgraphSchema.target_protocol_s.getSolrFieldName());
urlstub = (String) doc.getFieldValue(WebgraphSchema.target_urlstub_s.getSolrFieldName());
id = (String) doc.getFieldValue(WebgraphSchema.target_id_s.getSolrFieldName());

View File

@@ -174,7 +174,7 @@ public class ResultEntry implements Comparable<ResultEntry>, Comparator<ResultEn
}
public int referencesCount() {
// urlCitationIndex index might be null (= configuration option)
return this.indexSegment.urlCitation() != null ? this.indexSegment.urlCitation().count(this.urlentry.hash()) : 0;
return this.indexSegment.connectedCitation() ? this.indexSegment.urlCitation().count(this.urlentry.hash()) : 0;
}
public int llocal() {
return this.urlentry.llocal();

View File

@@ -18,7 +18,7 @@ if exist DATA\SETTINGS\httpProxy.conf GoTo :RENAMEINDEX
if exist DATA\SETTINGS\yacy.conf GoTo :GETSTARTOPTS
:STARTJAVA
set javacmd=%javacmd% -XX:-UseGCOverheadLimit -Djava.net.preferIPv4Stack=true -Djava.awt.headless=true -Dfile.encoding=UTF-8
set javacmd=%javacmd% -Djava.awt.headless=true -Dsolr.directoryFactory=solr.MMapDirectoryFactory -Dfile.encoding=UTF-8
Rem Starting YaCy
Echo Generated classpath:%CLASSPATH%
Echo JRE Parameters:%javacmd%