mirror of
https://github.com/yacy/yacy_search_server.git
synced 2024-09-19 00:01:41 +02:00
Merge origin/master into jetty
This commit is contained in:
commit
92d9c56f9f
|
@ -25,7 +25,7 @@
|
|||
<key>Java</key>
|
||||
<dict>
|
||||
<key>VMOptions</key>
|
||||
<string>-Xmx600m -Xms180m -Xss256k -XX:MaxPermSize=256m -XX:-UseGCOverheadLimit -XX:+UseAdaptiveSizePolicy -Djava.net.preferIPv4Stack=true -Dfile.encoding=UTF-8</string>
|
||||
<string>-Xmx600m -Xms90m -Dsolr.directoryFactory=solr.MMapDirectoryFactory -Dfile.encoding=UTF-8</string>
|
||||
<key>WorkingDirectory</key>
|
||||
<string>$APP_PACKAGE/Contents/Resources/Java</string>
|
||||
<key>MainClass</key>
|
||||
|
|
|
@ -3,7 +3,7 @@ javacSource=1.6
|
|||
javacTarget=1.6
|
||||
|
||||
# Release Configuration
|
||||
releaseVersion=1.65
|
||||
releaseVersion=1.66
|
||||
stdReleaseFile=yacy${branch}_v${releaseVersion}_${DSTAMP}_${releaseNr}.tar.gz
|
||||
sourceReleaseFile=yacy_src_v${releaseVersion}_${DSTAMP}_${releaseNr}.tar.gz
|
||||
releaseFileParentDir=yacy
|
||||
|
|
|
@ -437,13 +437,13 @@ host_extent_i
|
|||
## citation ranking
|
||||
|
||||
## the number of documents within a single host
|
||||
cr_host_count_i
|
||||
#cr_host_count_i
|
||||
|
||||
## the chance to click on this page when randomly clicking on links within one host
|
||||
cr_host_chance_d
|
||||
#cr_host_chance_d
|
||||
|
||||
## normalization of chance: 0 for the lower half of cr_host_count_i urls, 1 for 1/2 of the remaining and so on. The maximum number is 10
|
||||
cr_host_norm_i
|
||||
#cr_host_norm_i
|
||||
|
||||
## custom rating; to be set with external rating information
|
||||
rating_i
|
||||
|
|
|
@ -75,7 +75,7 @@ source_id_s
|
|||
#source_clickdepth_i
|
||||
|
||||
## copy of the citation rank norm value from the source link
|
||||
source_cr_host_norm_i
|
||||
#source_cr_host_norm_i
|
||||
|
||||
|
||||
## host of the url (source)
|
||||
|
@ -176,7 +176,7 @@ target_path_folders_sxt
|
|||
#target_clickdepth_i
|
||||
|
||||
## copy of the citation rank norm value from the target link; this is only filled if the target host is identical to the source host
|
||||
target_cr_host_norm_i
|
||||
#target_cr_host_norm_i
|
||||
|
||||
|
||||
## host of the url (target)
|
||||
|
|
|
@ -79,7 +79,7 @@ public class ConfigHTCache_p {
|
|||
prop.put("HTCachePath", env.getConfig(SwitchboardConstants.HTCACHE_PATH, SwitchboardConstants.HTCACHE_PATH_DEFAULT));
|
||||
prop.put("actualCacheSize", Cache.getActualCacheSize() / 1024 / 1024);
|
||||
prop.put("actualCacheDocCount", Cache.getActualCacheDocCount());
|
||||
prop.put("docSizeAverage", Cache.getActualCacheSize() / Cache.getActualCacheDocCount() / 1024);
|
||||
prop.put("docSizeAverage", Cache.getActualCacheDocCount() == 0 ? 0 : Cache.getActualCacheSize() / Cache.getActualCacheDocCount() / 1024);
|
||||
prop.put("maxCacheSize", env.getConfigLong(SwitchboardConstants.PROXY_CACHE_SIZE, 64));
|
||||
// return rewrite properties
|
||||
return prop;
|
||||
|
|
|
@ -54,7 +54,7 @@
|
|||
</dd>
|
||||
<dt>Time-Out</dt>
|
||||
<dd>
|
||||
<input type="text" name="timeout" value ="500" size="4"/> ms
|
||||
<input type="text" name="timeout" value ="2000" size="4"/> ms
|
||||
</dd>
|
||||
<dt>Scan Cache</dt>
|
||||
<dd>
|
||||
|
|
|
@ -81,8 +81,8 @@ public class Crawler_p {
|
|||
prop.putNum("urlpublictextSize", fulltext.collectionSize());
|
||||
prop.putNum("urlpublictextSegmentCount", fulltext.getDefaultConnector().getSegmentCount());
|
||||
prop.put("webgraphSolrURL", fulltext.connectedLocalSolr() ? localSolr.replace("collection1", "webgraph") : remoteSolr + "webgraph/select?&q=*:*&start=0&rows=3");
|
||||
prop.putNum("webgraphSize", fulltext.writeToWebgraph() ? fulltext.webgraphSize() : 0);
|
||||
prop.putNum("webgraphSegmentCount", fulltext.writeToWebgraph() ? fulltext.getWebgraphConnector().getSegmentCount() : 0);
|
||||
prop.putNum("webgraphSize", fulltext.useWebgraph() ? fulltext.webgraphSize() : 0);
|
||||
prop.putNum("webgraphSegmentCount", fulltext.useWebgraph() ? fulltext.getWebgraphConnector().getSegmentCount() : 0);
|
||||
prop.putNum("citationSize", segment.citationCount());
|
||||
prop.putNum("citationSegmentCount", segment.citationSegmentCount());
|
||||
prop.putNum("rwipublictextSize", segment.RWICount());
|
||||
|
|
|
@ -87,7 +87,7 @@ public class IndexControlURLs_p {
|
|||
prop.put("cleanup", post == null ? 1 : 0);
|
||||
prop.put("cleanup_solr", segment.fulltext().connectedRemoteSolr() ? 1 : 0);
|
||||
prop.put("cleanup_rwi", segment.termIndex() != null && !segment.termIndex().isEmpty() ? 1 : 0);
|
||||
prop.put("cleanup_citation", segment.urlCitation() != null && !segment.urlCitation().isEmpty() ? 1 : 0);
|
||||
prop.put("cleanup_citation", segment.connectedCitation() && !segment.urlCitation().isEmpty() ? 1 : 0);
|
||||
|
||||
// show export messages
|
||||
final Fulltext.Export export = segment.fulltext().export();
|
||||
|
@ -159,7 +159,7 @@ public class IndexControlURLs_p {
|
|||
if (segment.termIndex() != null) try {segment.termIndex().clear();} catch (final IOException e) {}
|
||||
}
|
||||
if ( post.get("deleteCitation", "").equals("on")) {
|
||||
if (segment.urlCitation() != null) try {segment.urlCitation().clear();} catch (final IOException e) {}
|
||||
if (segment.connectedCitation()) try {segment.urlCitation().clear();} catch (final IOException e) {}
|
||||
}
|
||||
if ( post.get("deleteCrawlQueues", "").equals("on") ) {
|
||||
sb.crawlQueues.clear();
|
||||
|
|
|
@ -70,7 +70,7 @@ public class IndexFederated_p {
|
|||
sb.index.connectCitation(wordCacheMaxCount, fileSizeMax);
|
||||
} catch (final IOException e) { ConcurrentLog.logException(e); } // switch on
|
||||
boolean webgraph = post.getBoolean(SwitchboardConstants.CORE_SERVICE_WEBGRAPH);
|
||||
sb.index.fulltext().writeWebgraph(webgraph);
|
||||
sb.index.fulltext().setUseWebgraph(webgraph);
|
||||
env.setConfig(SwitchboardConstants.CORE_SERVICE_WEBGRAPH, webgraph);
|
||||
}
|
||||
|
||||
|
|
|
@ -44,13 +44,17 @@ public class PerformanceGraph {
|
|||
final int width = post.getInt("width", 660);
|
||||
final int height = post.getInt("height", 240);
|
||||
final boolean showMemory = !post.containsKey("nomem");
|
||||
final boolean showPeers = !post.containsKey("nopeers");
|
||||
|
||||
long t = System.currentTimeMillis();
|
||||
if (t - indexSizeTime > 10000) {
|
||||
indeSizeCache = sb.index.fulltext().collectionSize();
|
||||
indexSizeTime = t;
|
||||
}
|
||||
RasterPlotter graph = ProfilingGraph.performanceGraph(width, height, indeSizeCache + " URLS / " + sb.index.RWICount() + " WORDS IN INDEX / " + sb.index.RWIBufferCount() + " WORDS IN CACHE", showMemory);
|
||||
RasterPlotter graph = ProfilingGraph.performanceGraph(
|
||||
width, height,
|
||||
indeSizeCache + " URLS / " + sb.index.RWICount() + " WORDS IN INDEX / " + sb.index.RWIBufferCount() + " WORDS IN CACHE",
|
||||
showMemory, showPeers);
|
||||
return graph;
|
||||
}
|
||||
|
||||
|
|
|
@ -9,7 +9,7 @@
|
|||
<script type="text/javascript"><!--
|
||||
function reloadGraph() {
|
||||
if(document.forms["optionreloadGraph"].option.checked)
|
||||
document.images["graph"].src="PerformanceGraph.png?time="+(new Date()).getTime();
|
||||
document.images["graph"].src="PerformanceGraph.png?nopeers=&time="+(new Date()).getTime();
|
||||
window.status="";
|
||||
}
|
||||
window.setInterval("reloadGraph()", 1000);
|
||||
|
@ -18,8 +18,8 @@
|
|||
#%env/templates/submenuComputation.template%#
|
||||
<h2>Performance Settings for Memory</h2>
|
||||
|
||||
<p><img src="PerformanceGraph.png" id="graph" alt="PerformanceGraph"/></p>
|
||||
<form id="optionreloadGraph" action="" method="get"><p><input type="checkbox" name="option" id="autoreload"/> <label for="autoreload">refresh graph</label></p></form>
|
||||
<p><img src="PerformanceGraph.png?nopeers=" id="graph" alt="PerformanceGraph"/></p>
|
||||
<form id="optionreloadGraph" action="" method="get"><p><input type="checkbox" name="option" id="autoreload" #(autoreload.checked)#::checked="checked"#(/autoreload.checked)#/> <label for="autoreload">refresh graph</label></p></form>
|
||||
<form action="PerformanceMemory_p.html" method="post" enctype="multipart/form-data" accept-charset="UTF-8">
|
||||
<p><input type="checkbox" name="simulatedshortmemory" onclick = 'this.form.submit()' #(simulatedshortmemory.checked)#:: checked="checked"#(/simulatedshortmemory.checked)#/>simulate short memory status</label></p>
|
||||
<p><input type="checkbox" name="useStandardmemoryStrategy" onclick = 'this.form.submit()' #(useStandardmemoryStrategy.checked)#:: checked="checked"#(/useStandardmemoryStrategy.checked)#/>use Standard Memory Strategy (current: #[memoryStrategy]#)</p>
|
||||
|
|
|
@ -49,7 +49,7 @@ public class PerformanceMemory_p {
|
|||
private static final long KB = 1024;
|
||||
private static final long MB = 1024 * KB;
|
||||
private static Map<String, String> defaultSettings = null;
|
||||
|
||||
|
||||
public static serverObjects respond(@SuppressWarnings("unused") final RequestHeader header, final serverObjects post, final serverSwitch env) {
|
||||
// return variable that accumulates replacements
|
||||
final serverObjects prop = new serverObjects();
|
||||
|
@ -58,18 +58,22 @@ public class PerformanceMemory_p {
|
|||
}
|
||||
|
||||
prop.put("gc", "0");
|
||||
prop.put("autoreload.checked", "0");
|
||||
if (post != null) {
|
||||
if (post.containsKey("gc")) {
|
||||
System.gc();
|
||||
prop.put("gc", "1");
|
||||
prop.put("autoreload.checked", "1");
|
||||
} else {
|
||||
MemoryControl.setSimulatedShortStatus(post.containsKey("simulatedshortmemory"));
|
||||
boolean simulatedshortmemory = post.containsKey("simulatedshortmemory");
|
||||
MemoryControl.setSimulatedShortStatus(simulatedshortmemory);
|
||||
if (simulatedshortmemory) prop.put("autoreload.checked", "1");
|
||||
final boolean std = post.containsKey("useStandardmemoryStrategy");
|
||||
env.setConfig("memory.standardStrategy", std);
|
||||
MemoryControl.setStandardStrategy(std);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
prop.put("simulatedshortmemory.checked", MemoryControl.getSimulatedShortStatus() ? 1 : 0);
|
||||
prop.put("useStandardmemoryStrategy.checked", env.getConfigBool("memory.standardStrategy", true) ? 1 : 0);
|
||||
prop.put("memoryStrategy", MemoryControl.getStrategyName());
|
||||
|
|
|
@ -9,7 +9,7 @@
|
|||
<script type="text/javascript"><!--
|
||||
function reloadGraph() {
|
||||
if(document.forms["optionreloadGraph"].option.checked)
|
||||
document.images["graph"].src="PerformanceGraph.png?time="+(new Date()).getTime();
|
||||
document.images["graph"].src="PerformanceGraph.png?nopeers=&time="+(new Date()).getTime();
|
||||
window.status="";
|
||||
}
|
||||
window.setInterval("reloadGraph()", 1000);
|
||||
|
@ -18,7 +18,7 @@
|
|||
#%env/templates/submenuConfig.template%#
|
||||
<h2>Performance Settings</h2>
|
||||
|
||||
<p><img src="PerformanceGraph.png" id="graph" alt="PerformanceGraph" width="660" height="240"/></p>
|
||||
<p><img src="PerformanceGraph.png?nopeers=" id="graph" alt="PerformanceGraph" width="660" height="240"/></p>
|
||||
<form id="optionreloadGraph" action="" method="get"><p><input type="checkbox" name="option" id="autoreload"/> <label for="autoreload">refresh graph</label></p></form>
|
||||
|
||||
|
||||
|
|
|
@ -81,8 +81,8 @@ public class status_p {
|
|||
// index size
|
||||
prop.putNum("urlpublictextSize", fulltext.collectionSize());
|
||||
prop.putNum("urlpublictextSegmentCount", fulltext.getDefaultConnector().getSegmentCount());
|
||||
prop.putNum("webgraphSize", fulltext.writeToWebgraph() ? fulltext.webgraphSize() : 0);
|
||||
prop.putNum("webgraphSegmentCount", fulltext.writeToWebgraph() ? fulltext.getWebgraphConnector().getSegmentCount() : 0);
|
||||
prop.putNum("webgraphSize", fulltext.useWebgraph() ? fulltext.webgraphSize() : 0);
|
||||
prop.putNum("webgraphSegmentCount", fulltext.useWebgraph() ? fulltext.getWebgraphConnector().getSegmentCount() : 0);
|
||||
prop.putNum("citationSize", segment.citationCount());
|
||||
prop.putNum("citationSegmentCount", segment.citationSegmentCount());
|
||||
prop.putNum("rwipublictextSize", segment.RWICount());
|
||||
|
@ -131,8 +131,8 @@ public class status_p {
|
|||
|
||||
prop.put("postprocessingRunning", Switchboard.postprocessingRunning ? 1 : 0);
|
||||
|
||||
boolean processCollection = sb.index.fulltext().getDefaultConfiguration().contains(CollectionSchema.process_sxt) && (sb.index.connectedCitation() || sb.index.fulltext().writeToWebgraph());
|
||||
boolean processWebgraph = sb.index.fulltext().getWebgraphConfiguration().contains(WebgraphSchema.process_sxt) && sb.index.fulltext().writeToWebgraph();
|
||||
boolean processCollection = sb.index.fulltext().getDefaultConfiguration().contains(CollectionSchema.process_sxt) && (sb.index.connectedCitation() || sb.index.fulltext().useWebgraph());
|
||||
boolean processWebgraph = sb.index.fulltext().getWebgraphConfiguration().contains(WebgraphSchema.process_sxt) && sb.index.fulltext().useWebgraph();
|
||||
|
||||
long collectionTimeSinceStart = processCollection && Switchboard.postprocessingRunning ? System.currentTimeMillis() - Switchboard.postprocessingStartTime[0] : 0;
|
||||
long webgraphTimeSinceStart = processWebgraph && Switchboard.postprocessingRunning ? System.currentTimeMillis() - Switchboard.postprocessingStartTime[1] : 0;
|
||||
|
|
|
@ -126,7 +126,7 @@ public class yacydoc {
|
|||
prop.putXML("yacy_referrer_url", (le == null) ? "" : le.url().toNormalform(true));
|
||||
prop.put("yacy_size", entry.size());
|
||||
prop.put("yacy_words", entry.wordCount());
|
||||
prop.put("yacy_citations", sb.index.urlCitation()!= null ? sb.index.urlCitation().count(entry.hash()) : 0);
|
||||
prop.put("yacy_citations", sb.index.connectedCitation() ? sb.index.urlCitation().count(entry.hash()) : 0);
|
||||
prop.put("yacy_inbound", entry.llocal());
|
||||
prop.put("yacy_outbound", entry.lother());
|
||||
|
||||
|
|
|
@ -233,7 +233,7 @@ function resultLine(type, item, linenumber) {
|
|||
if (type == "image") {
|
||||
html += "<div style=\"float:left\">";
|
||||
html += "<a href=\"" + item.link + "\" class=\"thumblink\" onclick=\"return hs.expand(this)\">";
|
||||
html += "<img src=\"/ViewImage.png?maxwidth=96&maxheight=96&code=" + item.guid + "\" alt=\"" + title + "\" />";
|
||||
html += "<img src=\"/ViewImage.png?maxwidth=96&maxheight=96&code=" + item.guid + " + &url=" + item.link + "\" alt=\"" + title + "\" />";
|
||||
//html += "<img src=\"" + item.link + "\" width=\"96\" height=\"96\" alt=\"" + title + "\" />";
|
||||
html += "</a>";
|
||||
var name = title;
|
||||
|
|
|
@ -48,15 +48,27 @@ import net.yacy.server.serverObjects;
|
|||
import net.yacy.server.serverSwitch;
|
||||
|
||||
public final class hello {
|
||||
|
||||
|
||||
// example:
|
||||
// http://localhost:8090/yacy/hello.html?count=1&seed=p|{Hash=sCJ6Tq8T0N9x,IPType=∅,Port=8090,IP=,Uptime=8,rI=190,Version=0.10004882,PeerType=junior,UTC=+0200,RCount=0,sI=0,LastSeen=20080605103333,Name=intratest,CCount=5.0,SCount=40,news=,USpeed=0,CRTCnt=0,CRWCnt=0,BDate=20080605081349,rU=190,LCount=187,dct=1212668923654,ICount=2,sU=0,ISpeed=0,RSpeed=0.0,NCount=0,Flags=oooo}
|
||||
// http://localhost:8090/yacy/hello.html?count=1&seed=p|{Hash=sCJ6Tq8T0N9x,Port=8090,PeerType=junior}
|
||||
// http://localhost:8090/yacy/hello.html?count=10&seed=z|H4sIAAAAAAAAADWQW2vDMAyF_81eJork3GyGX-YxGigly2WFvZTQijbQJsHx1pWx_z7nMj1J4ug7B_2s6-GsP5q3G-G6vBz2e0iz8t6zfuBr7-5PUNanQfulhqyzTkuUCFXvmitrBJtq4ed3tkPTtRpXhIiRDAmq0uhHFIiQMduJ-NXYU9NCbrrP1vnjIdUqgk09uIK51V6rMBRIilAo2NajwzfhGcx8QUKsEIp5iCJo-eaTVUXPfPQ4k5dm4pp8NzaESsLzS-14QVNIMlA-ka2m1JuZJJWIBRwPo0GIIiYp4zCSkC5GQSLiJIah0p6X_rvlS-MTbWdhkCSBIni9jA_rfP3-Ae1Oye9dAQAA
|
||||
public static serverObjects respond(final RequestHeader header, final serverObjects post, final serverSwitch env) throws InterruptedException {
|
||||
final Switchboard sb = (Switchboard) env;
|
||||
final serverObjects prop = new serverObjects();
|
||||
final long start = System.currentTimeMillis();
|
||||
prop.put("message", "none");
|
||||
final String clientip = header.get(HeaderFramework.CONNECTION_PROP_CLIENTIP, "<unknown>"); // read an artificial header addendum
|
||||
final InetAddress ias = Domains.dnsResolve(clientip);
|
||||
long time = System.currentTimeMillis();
|
||||
final long time_dnsResolve = System.currentTimeMillis() - time;
|
||||
if (ias == null) {
|
||||
Network.log.info("hello/server: failed contacting seed; clientip not resolvable (clientip=" + clientip + ", time_dnsResolve=" + time_dnsResolve + ")");
|
||||
prop.put("message", "cannot resolve your IP from your reported location " + clientip);
|
||||
return prop;
|
||||
}
|
||||
prop.put("yourip", ias.getHostAddress());
|
||||
prop.put(Seed.YOURTYPE, Seed.PEERTYPE_VIRGIN); // a default value
|
||||
prop.put("seedlist", "");
|
||||
if ((post == null) || (env == null)) {
|
||||
prop.put("message", "no post or no enviroment");
|
||||
return prop;
|
||||
|
@ -73,15 +85,6 @@ public final class hello {
|
|||
int count = post.getInt("count", 0);
|
||||
final long magic = post.getLong("magic", 0);
|
||||
// final Date remoteTime = yacyCore.parseUniversalDate(post.get(MYTIME)); // read remote time
|
||||
final String clientip = header.get(HeaderFramework.CONNECTION_PROP_CLIENTIP, "<unknown>"); // read an artificial header addendum
|
||||
long time = System.currentTimeMillis();
|
||||
final InetAddress ias = Domains.dnsResolve(clientip);
|
||||
final long time_dnsResolve = System.currentTimeMillis() - time;
|
||||
if (ias == null) {
|
||||
Network.log.info("hello/server: failed contacting seed; clientip not resolvable (clientip=" + clientip + ", time_dnsResolve=" + time_dnsResolve + ")");
|
||||
prop.put("message", "cannot resolve your IP from your reported location " + clientip);
|
||||
return prop;
|
||||
}
|
||||
if (seed.length() > Seed.maxsize) {
|
||||
Network.log.info("hello/server: rejected contacting seed; too large (" + seed.length() + " > " + Seed.maxsize + ", time_dnsResolve=" + time_dnsResolve + ")");
|
||||
prop.put("message", "your seed is too long (" + seed.length() + ")");
|
||||
|
|
|
@ -26,6 +26,7 @@
|
|||
|
||||
import java.net.MalformedURLException;
|
||||
import java.util.Iterator;
|
||||
|
||||
import net.yacy.cora.date.GenericFormatter;
|
||||
import net.yacy.cora.document.analysis.Classification;
|
||||
import net.yacy.cora.document.analysis.Classification.ContentDomain;
|
||||
|
@ -39,6 +40,7 @@ import net.yacy.cora.protocol.HeaderFramework;
|
|||
import net.yacy.cora.protocol.RequestHeader;
|
||||
import net.yacy.cora.protocol.RequestHeader.FileType;
|
||||
import net.yacy.cora.util.ConcurrentLog;
|
||||
import net.yacy.cora.util.Memory;
|
||||
import net.yacy.crawler.data.Cache;
|
||||
import net.yacy.data.URLLicense;
|
||||
import net.yacy.kelondro.util.Formatter;
|
||||
|
@ -243,7 +245,7 @@ public class yacysearchitem {
|
|||
}
|
||||
EventTracker.update(EventTracker.EClass.SEARCH, new ProfilingGraph.EventSearch(theSearch.query.id(true), SearchEventType.FINALIZATION, "" + item, 0, 0), false);
|
||||
final String ext = MultiProtocolURL.getFileExtension(resultFileName).toLowerCase();
|
||||
if (ext.equals("png") || ext.equals("jpg") || ext.equals("gif")) {
|
||||
if (MultiProtocolURL.isImage(ext)) {
|
||||
final String license = URLLicense.aquireLicense(resultURL);
|
||||
prop.put("content_code", license);
|
||||
} else {
|
||||
|
@ -261,8 +263,8 @@ public class yacysearchitem {
|
|||
boolean p2pmode = sb.peers != null && sb.peers.sizeConnected() > 0 && indexReceiveGranted;
|
||||
boolean stealthmode = p2pmode && theSearch.query.isLocal();
|
||||
if ((sb.getConfigBool(SwitchboardConstants.HEURISTIC_SEARCHRESULTS, false) ||
|
||||
(sb.getConfigBool(SwitchboardConstants.GREEDYLEARNING_ACTIVE, false) && sb.getConfigBool(SwitchboardConstants.GREEDYLEARNING_ENABLED, false))) &&
|
||||
!stealthmode) sb.heuristicSearchResults(resultUrlstring);
|
||||
(sb.getConfigBool(SwitchboardConstants.GREEDYLEARNING_ACTIVE, false) && sb.getConfigBool(SwitchboardConstants.GREEDYLEARNING_ENABLED, false) && Memory.load() < 1.0)) &&
|
||||
!stealthmode) sb.heuristicSearchResults(resultUrlstring);
|
||||
theSearch.query.transmitcount = item + 1;
|
||||
return prop;
|
||||
}
|
||||
|
|
|
@ -181,12 +181,12 @@ public class OpenSearchConnector {
|
|||
if (sb == null) {
|
||||
return false;
|
||||
}
|
||||
final SolrConnector connector = sb.index.fulltext().writeToWebgraph() ? null : sb.index.fulltext().getWebgraphConnector();
|
||||
// check if needed Solr fields are available (selected)
|
||||
if (connector == null) {
|
||||
if (!sb.index.fulltext().useWebgraph()) {
|
||||
ConcurrentLog.severe("OpenSearchConnector.Discover", "Error on connecting to embedded Solr webgraph index");
|
||||
return false;
|
||||
}
|
||||
final SolrConnector connector = sb.index.fulltext().getWebgraphConnector();
|
||||
final boolean metafieldavailable = sb.index.fulltext().getWebgraphConfiguration().contains(WebgraphSchema.target_rel_s.name())
|
||||
&& ( sb.index.fulltext().getWebgraphConfiguration().contains(WebgraphSchema.target_protocol_s.name()) && sb.index.fulltext().getWebgraphConfiguration().contains(WebgraphSchema.target_urlstub_s.name()) )
|
||||
&& sb.getConfigBool(SwitchboardConstants.CORE_SERVICE_WEBGRAPH, false);
|
||||
|
|
|
@ -57,7 +57,6 @@ import net.yacy.crawler.retrieval.SMBLoader;
|
|||
import net.yacy.crawler.robots.RobotsTxt;
|
||||
import net.yacy.document.TextParser;
|
||||
import net.yacy.kelondro.data.citation.CitationReference;
|
||||
import net.yacy.kelondro.rwi.IndexCell;
|
||||
import net.yacy.kelondro.workflow.WorkflowProcessor;
|
||||
import net.yacy.peers.SeedDB;
|
||||
import net.yacy.repository.Blacklist.BlacklistType;
|
||||
|
@ -138,11 +137,14 @@ public final class CrawlStacker {
|
|||
|
||||
// record the link graph for this request; this can be overwritten, replaced and enhanced by an index writing process in Segment.storeDocument
|
||||
byte[] anchorhash = entry.url().hash();
|
||||
IndexCell<CitationReference> urlCitationIndex = this.indexSegment.urlCitation();
|
||||
if (urlCitationIndex != null && entry.referrerhash() != null) try {
|
||||
urlCitationIndex.add(anchorhash, new CitationReference(entry.referrerhash(), entry.appdate().getTime()));
|
||||
} catch (final Exception e) {
|
||||
ConcurrentLog.logException(e);
|
||||
if (entry.referrerhash() != null) {
|
||||
if (this.indexSegment.connectedCitation()) try {
|
||||
this.indexSegment.urlCitation().add(anchorhash, new CitationReference(entry.referrerhash(), entry.appdate().getTime()));
|
||||
} catch (final Exception e) {
|
||||
ConcurrentLog.logException(e);
|
||||
}
|
||||
|
||||
// TODO: write to webgraph??
|
||||
}
|
||||
|
||||
try {
|
||||
|
|
|
@ -911,9 +911,6 @@ public final class Protocol {
|
|||
}
|
||||
|
||||
String filter = event.query.urlMask.pattern().toString();
|
||||
if (event.query.tld != null) filter = ".*" + event.query.tld + ".*" + filter;
|
||||
if (event.query.modifier.protocol != null) filter = ".*" + event.query.modifier.protocol + ".*" + filter;
|
||||
if (event.query.modifier.filetype != null) filter = filter + ".*" + event.query.modifier.filetype + ".*";
|
||||
parts.put("myseed", UTF8.StringBody((event.peers.mySeed() == null) ? "" : event.peers.mySeed().genSeedStr(key)));
|
||||
parts.put("count", UTF8.StringBody(Integer.toString(Math.max(10, count))));
|
||||
parts.put("time", UTF8.StringBody(Long.toString(Math.max(3000, time))));
|
||||
|
|
|
@ -55,7 +55,7 @@ public class ProfilingGraph {
|
|||
return max;
|
||||
}
|
||||
|
||||
public static RasterPlotter performanceGraph(final int width, final int height, final String subline, final boolean showMemory) {
|
||||
public static RasterPlotter performanceGraph(final int width, final int height, final String subline, final boolean showMemory, final boolean showPeers) {
|
||||
// find maximum values for automatic graph dimension adaptation
|
||||
final int maxppm = (int) maxPayload(EventTracker.EClass.PPM, 25);
|
||||
final int maxwords = (int) maxPayload(EventTracker.EClass.WORDCACHE, 12000);
|
||||
|
@ -171,22 +171,24 @@ public class ProfilingGraph {
|
|||
}
|
||||
|
||||
// draw peer ping
|
||||
events = EventTracker.getHistory(EventTracker.EClass.PEERPING);
|
||||
x0 = 1; y0 = 0;
|
||||
if (events != null) {
|
||||
EventTracker.Event event;
|
||||
EventPing ping;
|
||||
String pingPeer;
|
||||
while (events.hasNext()) {
|
||||
event = events.next();
|
||||
time = event.time - now;
|
||||
ping = (EventPing) event.payload;
|
||||
x1 = (int) (time/1000);
|
||||
y1 = Math.abs((ping.outgoing ? ping.toPeer : ping.fromPeer).hashCode()) % vspace;
|
||||
pingPeer = ping.outgoing ? "-> " + ping.toPeer.toUpperCase() : "<- " + ping.fromPeer.toUpperCase();
|
||||
chart.setColor(Long.parseLong("444444", 16));
|
||||
chart.chartDot(ChartPlotter.DIMENSION_BOTTOM, ChartPlotter.DIMENSION_ANOT2, x1, y1, 2, pingPeer + (ping.newPeers > 0 ? "(+" + ping.newPeers + ")" : ""), 0);
|
||||
x0 = x1; y0 = y1;
|
||||
if (showPeers) {
|
||||
events = EventTracker.getHistory(EventTracker.EClass.PEERPING);
|
||||
x0 = 1; y0 = 0;
|
||||
if (events != null) {
|
||||
EventTracker.Event event;
|
||||
EventPing ping;
|
||||
String pingPeer;
|
||||
while (events.hasNext()) {
|
||||
event = events.next();
|
||||
time = event.time - now;
|
||||
ping = (EventPing) event.payload;
|
||||
x1 = (int) (time/1000);
|
||||
y1 = Math.abs((ping.outgoing ? ping.toPeer : ping.fromPeer).hashCode()) % vspace;
|
||||
pingPeer = ping.outgoing ? "-> " + ping.toPeer.toUpperCase() : "<- " + ping.fromPeer.toUpperCase();
|
||||
chart.setColor(Long.parseLong("9999AA", 16));
|
||||
chart.chartDot(ChartPlotter.DIMENSION_BOTTOM, ChartPlotter.DIMENSION_ANOT2, x1, y1, 2, pingPeer + (ping.newPeers > 0 ? "(+" + ping.newPeers + ")" : ""), 0);
|
||||
x0 = x1; y0 = y1;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -505,7 +505,7 @@ public final class Switchboard extends serverSwitch {
|
|||
this.index.connectUrlDb(this.useTailCache, this.exceed134217727);
|
||||
try {this.index.fulltext().connectLocalSolr();} catch (final IOException e) {ConcurrentLog.logException(e);}
|
||||
}
|
||||
this.index.fulltext().writeWebgraph(this.getConfigBool(SwitchboardConstants.CORE_SERVICE_WEBGRAPH, false));
|
||||
this.index.fulltext().setUseWebgraph(this.getConfigBool(SwitchboardConstants.CORE_SERVICE_WEBGRAPH, false));
|
||||
|
||||
// set up the solr interface
|
||||
final String solrurls = getConfig(SwitchboardConstants.FEDERATED_SERVICE_SOLR_INDEXING_URL, "http://127.0.0.1:8983/solr");
|
||||
|
@ -1328,7 +1328,7 @@ public final class Switchboard extends serverSwitch {
|
|||
this.index.fulltext().connectLocalSolr();
|
||||
this.index.connectUrlDb(this.useTailCache, this.exceed134217727);
|
||||
}
|
||||
this.index.fulltext().writeWebgraph(this.getConfigBool(SwitchboardConstants.CORE_SERVICE_WEBGRAPH, false));
|
||||
this.index.fulltext().setUseWebgraph(this.getConfigBool(SwitchboardConstants.CORE_SERVICE_WEBGRAPH, false));
|
||||
|
||||
// set up the solr interface
|
||||
final String solrurls = getConfig(SwitchboardConstants.FEDERATED_SERVICE_SOLR_INDEXING_URL, "http://127.0.0.1:8983/solr");
|
||||
|
@ -2292,46 +2292,15 @@ public final class Switchboard extends serverSwitch {
|
|||
|
||||
// we optimize first because that is useful for postprocessing
|
||||
int proccount = 0;
|
||||
boolean allCrawlsFinished = this.crawler.allCrawlsFinished(this.crawlQueues);
|
||||
if (allCrawlsFinished) {
|
||||
postprocessingRunning = true;
|
||||
// flush caches
|
||||
Domains.clear();
|
||||
this.crawlQueues.noticeURL.clear();
|
||||
|
||||
// do solr optimization
|
||||
long idleSearch = System.currentTimeMillis() - this.localSearchLastAccess;
|
||||
long idleAdmin = System.currentTimeMillis() - this.adminAuthenticationLastAccess;
|
||||
long deltaOptimize = System.currentTimeMillis() - this.optimizeLastRun;
|
||||
boolean optimizeRequired = deltaOptimize > 60000 * 60 * 3; // 3 hours
|
||||
int opts = Math.max(1, (int) (fulltext.collectionSize() / 5000000));
|
||||
|
||||
log.info("Solr auto-optimization: idleSearch=" + idleSearch + ", idleAdmin=" + idleAdmin + ", deltaOptimize=" + deltaOptimize + ", proccount=" + proccount);
|
||||
if (idleAdmin > 600000) {
|
||||
// only run optimization if the admin is idle (10 minutes)
|
||||
if (proccount > 0) {
|
||||
opts++; // have postprocessings will force optimazion with one more Segment which is small an quick
|
||||
optimizeRequired = true;
|
||||
}
|
||||
if (optimizeRequired) {
|
||||
if (idleSearch < 600000) opts++; // < 10 minutes idle time will cause a optimization with one more Segment which is small an quick
|
||||
log.info("Solr auto-optimization: running solr.optimize(" + opts + ")");
|
||||
fulltext.optimize(opts);
|
||||
this.optimizeLastRun = System.currentTimeMillis();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
ReferenceReportCache rrCache = index.getReferenceReportCache();
|
||||
ClickdepthCache clickdepthCache = index.getClickdepthCache(rrCache);
|
||||
Set<String> deletionCandidates = collection1Configuration.contains(CollectionSchema.harvestkey_s.getSolrFieldName()) ?
|
||||
this.crawler.getFinishesProfiles(this.crawlQueues) : new HashSet<String>();
|
||||
int cleanupByHarvestkey = deletionCandidates.size();
|
||||
boolean processCollection = collection1Configuration.contains(CollectionSchema.process_sxt) && (index.connectedCitation() || fulltext.writeToWebgraph());
|
||||
boolean processWebgraph = webgraphConfiguration.contains(WebgraphSchema.process_sxt) && fulltext.writeToWebgraph();
|
||||
boolean processCollection = collection1Configuration.contains(CollectionSchema.process_sxt) && (index.connectedCitation() || fulltext.useWebgraph());
|
||||
boolean processWebgraph = webgraphConfiguration.contains(WebgraphSchema.process_sxt) && fulltext.useWebgraph();
|
||||
boolean allCrawlsFinished = this.crawler.allCrawlsFinished(this.crawlQueues);
|
||||
if ((processCollection || processWebgraph) && (cleanupByHarvestkey > 0 || allCrawlsFinished)) {
|
||||
//full optimization of webgraph, if exists
|
||||
if (fulltext.writeToWebgraph()) fulltext.getWebgraphConnector().optimize(1);
|
||||
if (cleanupByHarvestkey > 0) {
|
||||
// run postprocessing on these profiles
|
||||
postprocessingRunning = true;
|
||||
|
@ -2371,6 +2340,34 @@ public final class Switchboard extends serverSwitch {
|
|||
}
|
||||
}
|
||||
this.index.fulltext().commit(true); // without a commit the success is not visible in the monitoring
|
||||
if (allCrawlsFinished) {
|
||||
postprocessingRunning = true;
|
||||
// flush caches
|
||||
Domains.clear();
|
||||
this.crawlQueues.noticeURL.clear();
|
||||
|
||||
// do solr optimization
|
||||
long idleSearch = System.currentTimeMillis() - this.localSearchLastAccess;
|
||||
long idleAdmin = System.currentTimeMillis() - this.adminAuthenticationLastAccess;
|
||||
long deltaOptimize = System.currentTimeMillis() - this.optimizeLastRun;
|
||||
boolean optimizeRequired = deltaOptimize > 60000 * 60 * 3; // 3 hours
|
||||
int opts = Math.max(1, (int) (fulltext.collectionSize() / 5000000));
|
||||
|
||||
log.info("Solr auto-optimization: idleSearch=" + idleSearch + ", idleAdmin=" + idleAdmin + ", deltaOptimize=" + deltaOptimize + ", proccount=" + proccount);
|
||||
if (idleAdmin > 600000) {
|
||||
// only run optimization if the admin is idle (10 minutes)
|
||||
if (proccount > 0) {
|
||||
opts++; // have postprocessings will force optimazion with one more Segment which is small an quick
|
||||
optimizeRequired = true;
|
||||
}
|
||||
if (optimizeRequired) {
|
||||
if (idleSearch < 600000) opts++; // < 10 minutes idle time will cause a optimization with one more Segment which is small an quick
|
||||
log.info("Solr auto-optimization: running solr.optimize(" + opts + ")");
|
||||
fulltext.optimize(opts);
|
||||
this.optimizeLastRun = System.currentTimeMillis();
|
||||
}
|
||||
}
|
||||
}
|
||||
postprocessingStartTime = new long[]{0,0}; // the start time for the processing; not started = 0
|
||||
postprocessingRunning = false;
|
||||
}
|
||||
|
|
|
@ -79,7 +79,7 @@ public class DocumentIndex extends Segment {
|
|||
false // exceed134217727
|
||||
);
|
||||
super.fulltext().connectLocalSolr();
|
||||
super.fulltext().writeWebgraph(true);
|
||||
super.fulltext().setUseWebgraph(true);
|
||||
this.callback = callback;
|
||||
this.queue = new LinkedBlockingQueue<AnchorURL>(WorkflowProcessor.availableCPU * 300);
|
||||
this.worker = new Worker[WorkflowProcessor.availableCPU];
|
||||
|
|
|
@ -110,11 +110,11 @@ public final class Fulltext {
|
|||
this.writeWebgraph = false;
|
||||
}
|
||||
|
||||
public void writeWebgraph(boolean check) {
|
||||
public void setUseWebgraph(boolean check) {
|
||||
this.writeWebgraph = check;
|
||||
}
|
||||
|
||||
public boolean writeToWebgraph() {
|
||||
public boolean useWebgraph() {
|
||||
return this.writeWebgraph;
|
||||
}
|
||||
|
||||
|
@ -403,7 +403,7 @@ public final class Fulltext {
|
|||
}
|
||||
|
||||
public void putEdges(final Collection<SolrInputDocument> edges) throws IOException {
|
||||
if (!this.writeToWebgraph()) return;
|
||||
if (!this.useWebgraph()) return;
|
||||
if (edges == null || edges.size() == 0) return;
|
||||
try {
|
||||
this.getWebgraphConnector().add(edges);
|
||||
|
|
|
@ -143,7 +143,7 @@ public class Segment {
|
|||
new String[] {},
|
||||
this,
|
||||
"putDocument",
|
||||
10,
|
||||
30,
|
||||
null,
|
||||
1);
|
||||
}
|
||||
|
@ -382,9 +382,9 @@ public class Segment {
|
|||
}
|
||||
} catch (SpaceExceededException e) {
|
||||
// the Citation Index got too large, we ignore the problem and hope that a second solr index is attached which will take over now
|
||||
if (Segment.this.fulltext.writeToWebgraph()) internalIDs.clear();
|
||||
if (Segment.this.fulltext.useWebgraph()) internalIDs.clear();
|
||||
}
|
||||
if ((internalIDs.size() == 0 || !connectedCitation()) && Segment.this.fulltext.writeToWebgraph()) {
|
||||
if ((internalIDs.size() == 0 || !connectedCitation()) && Segment.this.fulltext.useWebgraph()) {
|
||||
// read the references from the webgraph
|
||||
SolrConnector webgraph = Segment.this.fulltext.getWebgraphConnector();
|
||||
BlockingQueue<SolrDocument> docs = webgraph.concurrentDocumentsByQuery("{!raw f=" + WebgraphSchema.target_id_s.getSolrFieldName() + "}" + ASCII.String(id), 0, 10000000, 1000, 100, WebgraphSchema.source_id_s.getSolrFieldName());
|
||||
|
@ -663,9 +663,8 @@ public class Segment {
|
|||
final CollectionConfiguration.SolrVector vector = this.fulltext.getDefaultConfiguration().yacy2solr(collections, responseHeader, document, condenser, referrerURL, language, this.fulltext.getWebgraphConfiguration(), sourceName);
|
||||
|
||||
// ENRICH DOCUMENT WITH RANKING INFORMATION
|
||||
if (this.connectedCitation()) {
|
||||
this.fulltext.getDefaultConfiguration().postprocessing_references(this.getReferenceReportCache(), vector, url, null);
|
||||
}
|
||||
this.fulltext.getDefaultConfiguration().postprocessing_references(this.getReferenceReportCache(), vector, url, null);
|
||||
|
||||
// STORE TO SOLR
|
||||
String error = null;
|
||||
this.putDocumentInQueue(vector);
|
||||
|
@ -673,7 +672,7 @@ public class Segment {
|
|||
if (webgraph != null && webgraph.size() > 0) {
|
||||
|
||||
// write the edges to the webgraph solr index
|
||||
if (this.fulltext.writeToWebgraph()) {
|
||||
if (this.fulltext.useWebgraph()) {
|
||||
tryloop: for (int i = 0; i < 20; i++) {
|
||||
try {
|
||||
error = null;
|
||||
|
|
|
@ -178,16 +178,13 @@ public final class QueryParams {
|
|||
}
|
||||
this.urlMask_isCatchall = this.urlMask.toString().equals(catchall_pattern.toString());
|
||||
if (this.urlMask_isCatchall) {
|
||||
if (modifier.protocol != null) {
|
||||
this.urlMask = Pattern.compile(modifier.protocol + ".*");
|
||||
this.urlMask_isCatchall = false;
|
||||
}
|
||||
if (tld != null) {
|
||||
this.urlMask = Pattern.compile(".*\\." + tld + ".*");
|
||||
this.urlMask_isCatchall = false;
|
||||
}
|
||||
if (modifier.filetype != null) {
|
||||
this.urlMask = Pattern.compile(".*" + modifier.filetype + ".*");
|
||||
String protocolfilter = modifier.protocol == null ? ".*" : modifier.protocol;
|
||||
String defaulthostprefix = modifier.protocol == null ? "www" : modifier.protocol;
|
||||
String hostfilter = modifier.sitehost == null && tld == null ? ".*" : modifier.sitehost == null ? ".*\\." + tld : modifier.sitehost.startsWith(defaulthostprefix + ".") ? "(" + defaulthostprefix + "\\.)?" + modifier.sitehost.substring(4) : "(" + defaulthostprefix + "\\.)?" + modifier.sitehost;
|
||||
String filefilter = modifier.filetype == null ? ".*" : ".*" + modifier.filetype + ".*";
|
||||
String filter = protocolfilter + "://" + hostfilter + "/" + filefilter;
|
||||
if (!filter.equals(".*://.*/.*")) {
|
||||
this.urlMask = Pattern.compile(filter);
|
||||
this.urlMask_isCatchall = false;
|
||||
}
|
||||
}
|
||||
|
@ -343,7 +340,6 @@ public final class QueryParams {
|
|||
if (!getFacets) this.cachedQuery.setFacet(false);
|
||||
return this.cachedQuery;
|
||||
}
|
||||
if (this.queryGoal.getIncludeSize() == 0) return null;
|
||||
|
||||
// construct query
|
||||
final SolrQuery params = getBasicParams(getFacets);
|
||||
|
@ -368,7 +364,6 @@ public final class QueryParams {
|
|||
if (!getFacets) this.cachedQuery.setFacet(false);
|
||||
return this.cachedQuery;
|
||||
}
|
||||
if (this.queryGoal.getIncludeSize() == 0) return null;
|
||||
|
||||
// construct query
|
||||
final SolrQuery params = getBasicParams(getFacets);
|
||||
|
|
|
@ -99,8 +99,9 @@ import net.yacy.search.snippet.TextSnippet.ResultClass;
|
|||
import org.apache.solr.common.SolrDocument;
|
||||
|
||||
public final class SearchEvent {
|
||||
|
||||
|
||||
private static final int max_results_rwi = 3000;
|
||||
private static final int max_results_node = 150;
|
||||
|
||||
/*
|
||||
private static long noRobinsonLocalRWISearch = 0;
|
||||
|
@ -219,7 +220,7 @@ public final class SearchEvent {
|
|||
this.workTables = workTables;
|
||||
this.query = query;
|
||||
this.loader = loader;
|
||||
this.nodeStack = new WeakPriorityBlockingQueue<URIMetadataNode>(100, false);
|
||||
this.nodeStack = new WeakPriorityBlockingQueue<URIMetadataNode>(max_results_node, false);
|
||||
this.maxExpectedRemoteReferences = new AtomicInteger(0);
|
||||
this.expectedRemoteReferences = new AtomicInteger(0);
|
||||
this.excludeintext_image = Switchboard.getSwitchboard().getConfigBool("search.excludeintext.image", true);
|
||||
|
@ -391,7 +392,7 @@ public final class SearchEvent {
|
|||
this.deleteIfSnippetFail = deleteIfSnippetFail;
|
||||
this.urlRetrievalAllTime = 0;
|
||||
this.snippetComputationAllTime = 0;
|
||||
this.resultList = new WeakPriorityBlockingQueue<ResultEntry>(Math.max(1000, 10 * query.itemsPerPage()), true); // this is the result, enriched with snippets, ranked and ordered by ranking
|
||||
this.resultList = new WeakPriorityBlockingQueue<ResultEntry>(Math.max(max_results_node, 10 * query.itemsPerPage()), true); // this is the result, enriched with snippets, ranked and ordered by ranking
|
||||
|
||||
// snippets do not need to match with the complete query hashes,
|
||||
// only with the query minus the stopwords which had not been used for the search
|
||||
|
|
|
@ -328,6 +328,11 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
|
|||
if (!text.isEmpty() && text.charAt(text.length() - 1) == '.') sb.append(text); else sb.append(text).append('.');
|
||||
}
|
||||
|
||||
/**
|
||||
* a SolrVector is a SolrInputDocument with the ability
|
||||
* to store also the webgraph that is associated with
|
||||
* the web document in the Solr document.
|
||||
*/
|
||||
public static class SolrVector extends SolrInputDocument {
|
||||
private static final long serialVersionUID = -210901881471714939L;
|
||||
private List<SolrInputDocument> webgraphDocuments;
|
||||
|
@ -891,19 +896,35 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
|
|||
*/
|
||||
public int postprocessing(final Segment segment, ReferenceReportCache rrCache, ClickdepthCache clickdepthCache, String harvestkey) {
|
||||
if (!this.contains(CollectionSchema.process_sxt)) return 0;
|
||||
if (!segment.connectedCitation() && !segment.fulltext().writeToWebgraph()) return 0;
|
||||
if (!segment.connectedCitation() && !segment.fulltext().useWebgraph()) return 0;
|
||||
SolrConnector collectionConnector = segment.fulltext().getDefaultConnector();
|
||||
SolrConnector webgraphConnector = segment.fulltext().getWebgraphConnector();
|
||||
collectionConnector.commit(false); // make sure that we have latest information that can be found
|
||||
if (webgraphConnector != null) webgraphConnector.commit(false);
|
||||
Map<byte[], CRV> ranking = new TreeMap<byte[], CRV>(Base64Order.enhancedCoder);
|
||||
ReversibleScoreMap<String> hostscore = null;
|
||||
if (segment.fulltext().useWebgraph()) segment.fulltext().getWebgraphConnector().commit(false);
|
||||
CollectionConfiguration collection = segment.fulltext().getDefaultConfiguration();
|
||||
WebgraphConfiguration webgraph = segment.fulltext().getWebgraphConfiguration();
|
||||
|
||||
|
||||
// collect hosts from index which shall take part in citation computation
|
||||
String query = (harvestkey == null || !segment.fulltext().getDefaultConfiguration().contains(CollectionSchema.harvestkey_s) ? "" : CollectionSchema.harvestkey_s.getSolrFieldName() + ":\"" + harvestkey + "\" AND ") +
|
||||
CollectionSchema.process_sxt.getSolrFieldName() + ":" + ProcessType.CITATION.toString();
|
||||
ReversibleScoreMap<String> hostscore;
|
||||
try {
|
||||
// collect hosts from index which shall take part in citation computation
|
||||
String query = (harvestkey == null || !segment.fulltext().getDefaultConfiguration().contains(CollectionSchema.harvestkey_s) ? "" : CollectionSchema.harvestkey_s.getSolrFieldName() + ":\"" + harvestkey + "\" AND ") +
|
||||
CollectionSchema.process_sxt.getSolrFieldName() + ":" + ProcessType.CITATION.toString();
|
||||
hostscore = collectionConnector.getFacets(query, 10000000, CollectionSchema.host_s.getSolrFieldName()).get(CollectionSchema.host_s.getSolrFieldName());
|
||||
} catch (final IOException e2) {
|
||||
ConcurrentLog.logException(e2);
|
||||
hostscore = new ClusteredScoreMap<String>();
|
||||
}
|
||||
|
||||
// create the ranking map
|
||||
Map<byte[], CRV> ranking = null;
|
||||
if ((segment.fulltext().useWebgraph() &&
|
||||
((webgraph.contains(WebgraphSchema.source_id_s) && webgraph.contains(WebgraphSchema.source_cr_host_norm_i)) ||
|
||||
(webgraph.contains(WebgraphSchema.target_id_s) && webgraph.contains(WebgraphSchema.target_cr_host_norm_i))) ||
|
||||
(collection.contains(CollectionSchema.cr_host_count_i) &&
|
||||
collection.contains(CollectionSchema.cr_host_chance_d) &&
|
||||
collection.contains(CollectionSchema.cr_host_norm_i)))) try {
|
||||
ConcurrentLog.info("CollectionConfiguration", "collecting " + hostscore.size() + " hosts");
|
||||
ranking = new TreeMap<byte[], CRV>(Base64Order.enhancedCoder);
|
||||
int countcheck = 0;
|
||||
for (String host: hostscore.keyList(true)) {
|
||||
// Patch the citation index for links with canonical tags.
|
||||
|
@ -961,40 +982,42 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
|
|||
}
|
||||
if (hostscore.size() != countcheck) ConcurrentLog.warn("CollectionConfiguration", "ambiguous host count: expected=" + hostscore.size() + ", counted=" + countcheck);
|
||||
} catch (final IOException e2) {
|
||||
ConcurrentLog.logException(e2);
|
||||
hostscore = new ClusteredScoreMap<String>();
|
||||
}
|
||||
|
||||
// process all documents at the webgraph for the outgoing links of this document
|
||||
SolrDocument doc;
|
||||
if (webgraphConnector != null) {
|
||||
if (segment.fulltext().useWebgraph()) {
|
||||
try {
|
||||
for (String host: hostscore.keyList(true)) {
|
||||
if (hostscore.get(host) <= 0) continue;
|
||||
// select all webgraph edges and modify their cr value
|
||||
String query = "{!raw f=" + WebgraphSchema.source_host_s.getSolrFieldName() + "}" + host;
|
||||
long count = webgraphConnector.getCountByQuery(query);
|
||||
query = "{!raw f=" + WebgraphSchema.source_host_s.getSolrFieldName() + "}" + host;
|
||||
long count = segment.fulltext().getWebgraphConnector().getCountByQuery(query);
|
||||
ConcurrentLog.info("CollectionConfiguration", "collecting " + count + " documents from the webgraph");
|
||||
BlockingQueue<SolrDocument> docs = webgraphConnector.concurrentDocumentsByQuery(query, 0, 10000000, 1800000, 100);
|
||||
BlockingQueue<SolrDocument> docs = segment.fulltext().getWebgraphConnector().concurrentDocumentsByQuery(query, 0, 10000000, 1800000, 100);
|
||||
int countcheck = 0;
|
||||
while ((doc = docs.take()) != AbstractSolrConnector.POISON_DOCUMENT) {
|
||||
boolean changed = false;
|
||||
SolrInputDocument sid = segment.fulltext().getWebgraphConfiguration().toSolrInputDocument(doc, null);
|
||||
byte[] id = ASCII.getBytes((String) doc.getFieldValue(WebgraphSchema.source_id_s.getSolrFieldName()));
|
||||
CRV crv = ranking.get(id);
|
||||
if (crv != null) {
|
||||
sid.setField(WebgraphSchema.source_cr_host_norm_i.getSolrFieldName(), crv.crn);
|
||||
changed = true;
|
||||
SolrInputDocument sid = webgraph.toSolrInputDocument(doc, null);
|
||||
if (webgraph.contains(WebgraphSchema.source_id_s) && webgraph.contains(WebgraphSchema.source_cr_host_norm_i)) {
|
||||
byte[] id = ASCII.getBytes((String) doc.getFieldValue(WebgraphSchema.source_id_s.getSolrFieldName()));
|
||||
CRV crv = ranking.get(id);
|
||||
if (crv != null) {
|
||||
sid.setField(WebgraphSchema.source_cr_host_norm_i.getSolrFieldName(), crv.crn);
|
||||
}
|
||||
}
|
||||
id = ASCII.getBytes((String) doc.getFieldValue(WebgraphSchema.target_id_s.getSolrFieldName()));
|
||||
crv = ranking.get(id);
|
||||
if (crv != null) {
|
||||
sid.setField(WebgraphSchema.target_cr_host_norm_i.getSolrFieldName(), crv.crn);
|
||||
changed = true;
|
||||
if (webgraph.contains(WebgraphSchema.target_id_s) && webgraph.contains(WebgraphSchema.target_cr_host_norm_i)) {
|
||||
byte[] id = ASCII.getBytes((String) doc.getFieldValue(WebgraphSchema.target_id_s.getSolrFieldName()));
|
||||
CRV crv = ranking.get(id);
|
||||
if (crv != null) {
|
||||
sid.setField(WebgraphSchema.target_cr_host_norm_i.getSolrFieldName(), crv.crn);
|
||||
}
|
||||
}
|
||||
if (changed) try {
|
||||
try {
|
||||
sid.removeField(WebgraphSchema.process_sxt.getSolrFieldName());
|
||||
sid.removeField(WebgraphSchema.harvestkey_s.getSolrFieldName());
|
||||
webgraphConnector.add(sid);
|
||||
segment.fulltext().getWebgraphConnector().add(sid);
|
||||
} catch (SolrException e) {
|
||||
ConcurrentLog.logException(e);
|
||||
} catch (IOException e) {
|
||||
|
@ -1012,7 +1035,7 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
|
|||
}
|
||||
|
||||
// process all documents in collection
|
||||
String query = (harvestkey == null ? "" : CollectionSchema.harvestkey_s.getSolrFieldName() + ":\"" + harvestkey + "\" AND ") +
|
||||
query = (harvestkey == null || !segment.fulltext().getDefaultConfiguration().contains(CollectionSchema.harvestkey_s) ? "" : CollectionSchema.harvestkey_s.getSolrFieldName() + ":\"" + harvestkey + "\" AND ") +
|
||||
CollectionSchema.process_sxt.getSolrFieldName() + ":[* TO *]";
|
||||
int proccount = 0, proccount_clickdepthchange = 0, proccount_referencechange = 0, proccount_citationchange = 0, proccount_uniquechange = 0;
|
||||
Map<String, Long> hostExtentCache = new HashMap<String, Long>(); // a mapping from the host id to the number of documents which contain this host-id
|
||||
|
@ -1039,7 +1062,10 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
|
|||
if (postprocessing_clickdepth(clickdepthCache, sid, url, CollectionSchema.clickdepth_i, 100)) proccount_clickdepthchange++;
|
||||
}
|
||||
|
||||
if (tagtype == ProcessType.CITATION) {
|
||||
if (tagtype == ProcessType.CITATION &&
|
||||
collection.contains(CollectionSchema.cr_host_count_i) &&
|
||||
collection.contains(CollectionSchema.cr_host_chance_d) &&
|
||||
collection.contains(CollectionSchema.cr_host_norm_i)) {
|
||||
CRV crv = ranking.get(id);
|
||||
if (crv != null) {
|
||||
sid.setField(CollectionSchema.cr_host_count_i.getSolrFieldName(), crv.count);
|
||||
|
|
|
@ -120,185 +120,198 @@ public class WebgraphConfiguration extends SchemaConfiguration implements Serial
|
|||
final List<ImageEntry> images, final boolean inbound, final Collection<AnchorURL> links,
|
||||
final String sourceName) {
|
||||
boolean allAttr = this.isEmpty();
|
||||
boolean generalNofollow = responseHeader == null ? false : responseHeader.get("X-Robots-Tag", "").indexOf("nofollow") >= 0;
|
||||
int target_order = 0;
|
||||
boolean generalNofollow = responseHeader.get("X-Robots-Tag", "").indexOf("nofollow") >= 0;
|
||||
for (final AnchorURL target_url: links) {
|
||||
|
||||
Set<ProcessType> processTypes = new LinkedHashSet<ProcessType>();
|
||||
|
||||
final String name = target_url.getNameProperty(); // the name attribute
|
||||
final String text = target_url.getTextProperty(); // the text between the <a></a> tag
|
||||
String rel = target_url.getRelProperty(); // the rel-attribute
|
||||
int ioidx = inbound ? 0 : 1;
|
||||
if (generalNofollow) {
|
||||
// patch the rel attribute since the header makes nofollow valid for all links
|
||||
if (rel.length() == 0) rel = "nofollow"; else if (rel.indexOf("nofollow") < 0) rel += ",nofollow";
|
||||
}
|
||||
|
||||
// index organization
|
||||
StringBuilder idi = new StringBuilder(8);
|
||||
idi.append(Integer.toHexString((name + text + rel).hashCode()).toLowerCase());
|
||||
while (idi.length() < 8) idi.insert(0, '0');
|
||||
String source_id = ASCII.String(source.hash());
|
||||
String target_id = ASCII.String(target_url.hash());
|
||||
StringBuilder id = new StringBuilder(source_id).append(target_id).append(idi);
|
||||
SolrInputDocument edge = new SolrInputDocument();
|
||||
add(edge, WebgraphSchema.id, id.toString());
|
||||
add(edge, WebgraphSchema.target_order_i, target_order++);
|
||||
if (allAttr || contains(WebgraphSchema.load_date_dt)) {
|
||||
Date loadDate = new Date();
|
||||
Date modDate = responseHeader == null ? new Date() : responseHeader.lastModified();
|
||||
if (modDate.getTime() > loadDate.getTime()) modDate = loadDate;
|
||||
add(edge, WebgraphSchema.load_date_dt, loadDate);
|
||||
}
|
||||
if (allAttr || contains(WebgraphSchema.last_modified)) add(edge, WebgraphSchema.last_modified, responseHeader == null ? new Date() : responseHeader.lastModified());
|
||||
final String source_url_string = source.toNormalform(false);
|
||||
if (allAttr || contains(CollectionSchema.collection_sxt) && collections != null && collections.size() > 0) {
|
||||
List<String> cs = new ArrayList<String>();
|
||||
for (Map.Entry<String, Pattern> e: collections.entrySet()) {
|
||||
if (e.getValue().matcher(source_url_string).matches()) cs.add(e.getKey());
|
||||
}
|
||||
add(edge, WebgraphSchema.collection_sxt, cs);
|
||||
}
|
||||
|
||||
// add the source attributes
|
||||
add(edge, WebgraphSchema.source_id_s, source_id);
|
||||
int pr_source = source_url_string.indexOf("://",0);
|
||||
if (allAttr || contains(WebgraphSchema.source_protocol_s)) add(edge, WebgraphSchema.source_protocol_s, source_url_string.substring(0, pr_source));
|
||||
if (allAttr || contains(WebgraphSchema.source_urlstub_s)) add(edge, WebgraphSchema.source_urlstub_s, source_url_string.substring(pr_source + 3));
|
||||
Map<String, String> source_searchpart = source.getSearchpartMap();
|
||||
if (source_searchpart == null) {
|
||||
if (allAttr || contains(WebgraphSchema.source_parameter_count_i)) add(edge, WebgraphSchema.source_parameter_count_i, 0);
|
||||
} else {
|
||||
if (allAttr || contains(WebgraphSchema.source_parameter_count_i)) add(edge, WebgraphSchema.source_parameter_count_i, source_searchpart.size());
|
||||
if (allAttr || contains(WebgraphSchema.source_parameter_key_sxt)) add(edge, WebgraphSchema.source_parameter_key_sxt, source_searchpart.keySet().toArray(new String[source_searchpart.size()]));
|
||||
if (allAttr || contains(WebgraphSchema.source_parameter_value_sxt)) add(edge, WebgraphSchema.source_parameter_value_sxt, source_searchpart.values().toArray(new String[source_searchpart.size()]));
|
||||
}
|
||||
if (allAttr || contains(WebgraphSchema.source_chars_i)) add(edge, WebgraphSchema.source_chars_i, source_url_string.length());
|
||||
String source_host = null;
|
||||
if ((source_host = source.getHost()) != null) {
|
||||
String dnc = Domains.getDNC(source_host);
|
||||
String subdomOrga = source_host.length() - dnc.length() <= 0 ? "" : source_host.substring(0, source_host.length() - dnc.length() - 1);
|
||||
int pp = subdomOrga.lastIndexOf('.');
|
||||
String subdom = (pp < 0) ? "" : subdomOrga.substring(0, pp);
|
||||
String orga = (pp < 0) ? subdomOrga : subdomOrga.substring(pp + 1);
|
||||
if (allAttr || contains(WebgraphSchema.source_host_s)) add(edge, WebgraphSchema.source_host_s, source_host);
|
||||
if (allAttr || contains(WebgraphSchema.source_host_id_s)) add(edge, WebgraphSchema.source_host_id_s, source.hosthash());
|
||||
if (allAttr || contains(WebgraphSchema.source_host_dnc_s)) add(edge, WebgraphSchema.source_host_dnc_s, dnc);
|
||||
if (allAttr || contains(WebgraphSchema.source_host_organization_s)) add(edge, WebgraphSchema.source_host_organization_s, orga);
|
||||
if (allAttr || contains(WebgraphSchema.source_host_organizationdnc_s)) add(edge, WebgraphSchema.source_host_organizationdnc_s, orga + '.' + dnc);
|
||||
if (allAttr || contains(WebgraphSchema.source_host_subdomain_s)) add(edge, WebgraphSchema.source_host_subdomain_s, subdom);
|
||||
}
|
||||
if (allAttr || contains(WebgraphSchema.source_file_ext_s) || contains(WebgraphSchema.source_file_name_s)) {
|
||||
String source_file_name = source.getFileName();
|
||||
String source_file_ext = MultiProtocolURL.getFileExtension(source_file_name);
|
||||
add(edge, WebgraphSchema.source_file_name_s, source_file_name.toLowerCase().endsWith("." + source_file_ext) ? source_file_name.substring(0, source_file_name.length() - source_file_ext.length() - 1) : source_file_name);
|
||||
add(edge, WebgraphSchema.source_file_ext_s, source_file_ext);
|
||||
}
|
||||
if (allAttr || contains(WebgraphSchema.source_path_s)) add(edge, WebgraphSchema.source_path_s, source.getPath());
|
||||
if (allAttr || contains(WebgraphSchema.source_path_folders_count_i) || contains(WebgraphSchema.source_path_folders_sxt)) {
|
||||
String[] paths = source.getPaths();
|
||||
add(edge, WebgraphSchema.source_path_folders_count_i, paths.length);
|
||||
add(edge, WebgraphSchema.source_path_folders_sxt, paths);
|
||||
}
|
||||
if (this.contains(WebgraphSchema.source_protocol_s) && this.contains(WebgraphSchema.source_urlstub_s) && this.contains(WebgraphSchema.source_id_s)) {
|
||||
add(edge, WebgraphSchema.source_clickdepth_i, clickdepth_source);
|
||||
if (clickdepth_source < 0 || clickdepth_source > 1) processTypes.add(ProcessType.CLICKDEPTH);
|
||||
}
|
||||
|
||||
// add the source attributes about the target
|
||||
if (allAttr || contains(WebgraphSchema.target_inbound_b)) add(edge, WebgraphSchema.target_inbound_b, inbound);
|
||||
if (allAttr || contains(WebgraphSchema.target_name_t)) add(edge, WebgraphSchema.target_name_t, name.length() > 0 ? name : "");
|
||||
if (allAttr || contains(WebgraphSchema.target_rel_s)) add(edge, WebgraphSchema.target_rel_s, rel.length() > 0 ? rel : "");
|
||||
if (allAttr || contains(WebgraphSchema.target_relflags_i)) add(edge, WebgraphSchema.target_relflags_i, relEval(rel.length() > 0 ? rel : ""));
|
||||
if (allAttr || contains(WebgraphSchema.target_linktext_t)) add(edge, WebgraphSchema.target_linktext_t, text.length() > 0 ? text : "");
|
||||
if (allAttr || contains(WebgraphSchema.target_linktext_charcount_i)) add(edge, WebgraphSchema.target_linktext_charcount_i, text.length());
|
||||
if (allAttr || contains(WebgraphSchema.target_linktext_wordcount_i)) add(edge, WebgraphSchema.target_linktext_wordcount_i, text.length() > 0 ? CommonPattern.SPACE.split(text).length : 0);
|
||||
|
||||
ImageEntry ientry = null;
|
||||
for (ImageEntry ie: images) {
|
||||
if (ie.linkurl() != null && ie.linkurl().equals(target_url)) {ientry = ie; break;}
|
||||
}
|
||||
String alttext = ientry == null ? "" : ientry.alt();
|
||||
if (allAttr || contains(WebgraphSchema.target_alt_t)) add(edge, WebgraphSchema.target_alt_t, alttext);
|
||||
if (allAttr || contains(WebgraphSchema.target_alt_charcount_i)) add(edge, WebgraphSchema.target_alt_charcount_i, alttext.length());
|
||||
if (allAttr || contains(WebgraphSchema.target_alt_wordcount_i)) add(edge, WebgraphSchema.target_alt_wordcount_i, alttext.length() > 0 ? CommonPattern.SPACE.split(alttext).length : 0);
|
||||
|
||||
// add the target attributes
|
||||
add(edge, WebgraphSchema.target_id_s, target_id);
|
||||
final String target_url_string = target_url.toNormalform(false);
|
||||
int pr_target = target_url_string.indexOf("://",0);
|
||||
subgraph.urlProtocols[ioidx].add(target_url_string.substring(0, pr_target));
|
||||
subgraph.urlStubs[ioidx].add(target_url_string.substring(pr_target + 3));
|
||||
subgraph.urlAnchorTexts[ioidx].add(text);
|
||||
if (allAttr || contains(WebgraphSchema.target_protocol_s)) add(edge, WebgraphSchema.target_protocol_s, target_url_string.substring(0, pr_target));
|
||||
if (allAttr || contains(WebgraphSchema.target_urlstub_s)) add(edge, WebgraphSchema.target_urlstub_s, target_url_string.substring(pr_target + 3));
|
||||
Map<String, String> target_searchpart = target_url.getSearchpartMap();
|
||||
if (target_searchpart == null) {
|
||||
if (allAttr || contains(WebgraphSchema.target_parameter_count_i)) add(edge, WebgraphSchema.target_parameter_count_i, 0);
|
||||
} else {
|
||||
if (allAttr || contains(WebgraphSchema.target_parameter_count_i)) add(edge, WebgraphSchema.target_parameter_count_i, target_searchpart.size());
|
||||
if (allAttr || contains(WebgraphSchema.target_parameter_key_sxt)) add(edge, WebgraphSchema.target_parameter_key_sxt, target_searchpart.keySet().toArray(new String[target_searchpart.size()]));
|
||||
if (allAttr || contains(WebgraphSchema.target_parameter_value_sxt)) add(edge, WebgraphSchema.target_parameter_value_sxt, target_searchpart.values().toArray(new String[target_searchpart.size()]));
|
||||
}
|
||||
if (allAttr || contains(WebgraphSchema.target_chars_i)) add(edge, WebgraphSchema.target_chars_i, target_url_string.length());
|
||||
String target_host = null;
|
||||
if ((target_host = target_url.getHost()) != null) {
|
||||
String dnc = Domains.getDNC(target_host);
|
||||
String subdomOrga = target_host.length() - dnc.length() <= 0 ? "" : target_host.substring(0, target_host.length() - dnc.length() - 1);
|
||||
int pp = subdomOrga.lastIndexOf('.');
|
||||
String subdom = (pp < 0) ? "" : subdomOrga.substring(0, pp);
|
||||
String orga = (pp < 0) ? subdomOrga : subdomOrga.substring(pp + 1);
|
||||
if (allAttr || contains(WebgraphSchema.target_host_s)) add(edge, WebgraphSchema.target_host_s, target_host);
|
||||
if (allAttr || contains(WebgraphSchema.target_host_id_s)) add(edge, WebgraphSchema.target_host_id_s, target_url.hosthash());
|
||||
if (allAttr || contains(WebgraphSchema.target_host_dnc_s)) add(edge, WebgraphSchema.target_host_dnc_s, dnc);
|
||||
if (allAttr || contains(WebgraphSchema.target_host_organization_s)) add(edge, WebgraphSchema.target_host_organization_s, orga);
|
||||
if (allAttr || contains(WebgraphSchema.target_host_organizationdnc_s)) add(edge, WebgraphSchema.target_host_organizationdnc_s, orga + '.' + dnc);
|
||||
if (allAttr || contains(WebgraphSchema.target_host_subdomain_s)) add(edge, WebgraphSchema.target_host_subdomain_s, subdom);
|
||||
}
|
||||
if (allAttr || contains(WebgraphSchema.target_file_ext_s) || contains(WebgraphSchema.target_file_name_s)) {
|
||||
String target_file_name = target_url.getFileName();
|
||||
String target_file_ext = MultiProtocolURL.getFileExtension(target_file_name);
|
||||
add(edge, WebgraphSchema.target_file_name_s, target_file_name.toLowerCase().endsWith("." + target_file_ext) ? target_file_name.substring(0, target_file_name.length() - target_file_ext.length() - 1) : target_file_name);
|
||||
add(edge, WebgraphSchema.target_file_ext_s, target_file_ext);
|
||||
}
|
||||
if (allAttr || contains(WebgraphSchema.target_path_s)) add(edge, WebgraphSchema.target_path_s, target_url.getPath());
|
||||
if (allAttr || contains(WebgraphSchema.target_path_folders_count_i) || contains(WebgraphSchema.target_path_folders_sxt)) {
|
||||
String[] paths = target_url.getPaths();
|
||||
add(edge, WebgraphSchema.target_path_folders_count_i, paths.length);
|
||||
add(edge, WebgraphSchema.target_path_folders_sxt, paths);
|
||||
}
|
||||
|
||||
if (this.contains(WebgraphSchema.target_protocol_s) && this.contains(WebgraphSchema.target_urlstub_s) && this.contains(WebgraphSchema.target_id_s)) {
|
||||
if ((allAttr || contains(WebgraphSchema.target_clickdepth_i))) {
|
||||
if (target_url.probablyRootURL()) {
|
||||
boolean lc = this.lazy; this.lazy = false;
|
||||
add(edge, WebgraphSchema.target_clickdepth_i, 0);
|
||||
this.lazy = lc;
|
||||
} else {
|
||||
add(edge, WebgraphSchema.target_clickdepth_i, 999);
|
||||
processTypes.add(ProcessType.CLICKDEPTH); // postprocessing needed; this is also needed if the depth is positive; there could be a shortcut
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (allAttr || contains(WebgraphSchema.process_sxt)) {
|
||||
List<String> pr = new ArrayList<String>();
|
||||
for (ProcessType t: processTypes) pr.add(t.name());
|
||||
add(edge, WebgraphSchema.process_sxt, pr);
|
||||
if (allAttr || contains(CollectionSchema.harvestkey_s)) {
|
||||
add(edge, CollectionSchema.harvestkey_s, sourceName);
|
||||
}
|
||||
}
|
||||
|
||||
SolrInputDocument edge = getEdge(
|
||||
subgraph, source, responseHeader, collections, clickdepth_source, images, inbound,
|
||||
sourceName, allAttr, generalNofollow, target_order, target_url);
|
||||
target_order++;
|
||||
// add the edge to the subgraph
|
||||
subgraph.edges.add(edge);
|
||||
}
|
||||
}
|
||||
|
||||
public SolrInputDocument getEdge(
|
||||
final Subgraph subgraph,
|
||||
final DigestURL source, final ResponseHeader responseHeader, Map<String, Pattern> collections, int clickdepth_source,
|
||||
final List<ImageEntry> images, final boolean inbound,
|
||||
final String sourceName, boolean allAttr, boolean generalNofollow, int target_order, AnchorURL target_url) {
|
||||
|
||||
Set<ProcessType> processTypes = new LinkedHashSet<ProcessType>();
|
||||
final String name = target_url.getNameProperty(); // the name attribute
|
||||
final String text = target_url.getTextProperty(); // the text between the <a></a> tag
|
||||
String rel = target_url.getRelProperty(); // the rel-attribute
|
||||
int ioidx = inbound ? 0 : 1;
|
||||
if (generalNofollow) {
|
||||
// patch the rel attribute since the header makes nofollow valid for all links
|
||||
if (rel.length() == 0) rel = "nofollow"; else if (rel.indexOf("nofollow") < 0) rel += ",nofollow";
|
||||
}
|
||||
|
||||
// index organization
|
||||
StringBuilder idi = new StringBuilder(8);
|
||||
idi.append(Integer.toHexString((name + text + rel).hashCode()).toLowerCase());
|
||||
while (idi.length() < 8) idi.insert(0, '0');
|
||||
String source_id = ASCII.String(source.hash());
|
||||
String target_id = ASCII.String(target_url.hash());
|
||||
StringBuilder id = new StringBuilder(source_id).append(target_id).append(idi);
|
||||
SolrInputDocument edge = new SolrInputDocument();
|
||||
add(edge, WebgraphSchema.id, id.toString());
|
||||
add(edge, WebgraphSchema.target_order_i, target_order);
|
||||
if (allAttr || contains(WebgraphSchema.load_date_dt)) {
|
||||
Date loadDate = new Date();
|
||||
Date modDate = responseHeader == null ? new Date() : responseHeader.lastModified();
|
||||
if (modDate.getTime() > loadDate.getTime()) modDate = loadDate;
|
||||
add(edge, WebgraphSchema.load_date_dt, loadDate);
|
||||
}
|
||||
if (allAttr || contains(WebgraphSchema.last_modified)) add(edge, WebgraphSchema.last_modified, responseHeader == null ? new Date() : responseHeader.lastModified());
|
||||
final String source_url_string = source.toNormalform(false);
|
||||
if (allAttr || contains(CollectionSchema.collection_sxt) && collections != null && collections.size() > 0) {
|
||||
List<String> cs = new ArrayList<String>();
|
||||
for (Map.Entry<String, Pattern> e: collections.entrySet()) {
|
||||
if (e.getValue().matcher(source_url_string).matches()) cs.add(e.getKey());
|
||||
}
|
||||
add(edge, WebgraphSchema.collection_sxt, cs);
|
||||
}
|
||||
|
||||
// add the source attributes
|
||||
add(edge, WebgraphSchema.source_id_s, source_id);
|
||||
int pr_source = source_url_string.indexOf("://",0);
|
||||
if (allAttr || contains(WebgraphSchema.source_protocol_s)) add(edge, WebgraphSchema.source_protocol_s, source_url_string.substring(0, pr_source));
|
||||
if (allAttr || contains(WebgraphSchema.source_urlstub_s)) add(edge, WebgraphSchema.source_urlstub_s, source_url_string.substring(pr_source + 3));
|
||||
Map<String, String> source_searchpart = source.getSearchpartMap();
|
||||
if (source_searchpart == null) {
|
||||
if (allAttr || contains(WebgraphSchema.source_parameter_count_i)) add(edge, WebgraphSchema.source_parameter_count_i, 0);
|
||||
} else {
|
||||
if (allAttr || contains(WebgraphSchema.source_parameter_count_i)) add(edge, WebgraphSchema.source_parameter_count_i, source_searchpart.size());
|
||||
if (allAttr || contains(WebgraphSchema.source_parameter_key_sxt)) add(edge, WebgraphSchema.source_parameter_key_sxt, source_searchpart.keySet().toArray(new String[source_searchpart.size()]));
|
||||
if (allAttr || contains(WebgraphSchema.source_parameter_value_sxt)) add(edge, WebgraphSchema.source_parameter_value_sxt, source_searchpart.values().toArray(new String[source_searchpart.size()]));
|
||||
}
|
||||
if (allAttr || contains(WebgraphSchema.source_chars_i)) add(edge, WebgraphSchema.source_chars_i, source_url_string.length());
|
||||
String source_host = null;
|
||||
if ((source_host = source.getHost()) != null) {
|
||||
String dnc = Domains.getDNC(source_host);
|
||||
String subdomOrga = source_host.length() - dnc.length() <= 0 ? "" : source_host.substring(0, source_host.length() - dnc.length() - 1);
|
||||
int pp = subdomOrga.lastIndexOf('.');
|
||||
String subdom = (pp < 0) ? "" : subdomOrga.substring(0, pp);
|
||||
String orga = (pp < 0) ? subdomOrga : subdomOrga.substring(pp + 1);
|
||||
if (allAttr || contains(WebgraphSchema.source_host_s)) add(edge, WebgraphSchema.source_host_s, source_host);
|
||||
if (allAttr || contains(WebgraphSchema.source_host_id_s)) add(edge, WebgraphSchema.source_host_id_s, source.hosthash());
|
||||
if (allAttr || contains(WebgraphSchema.source_host_dnc_s)) add(edge, WebgraphSchema.source_host_dnc_s, dnc);
|
||||
if (allAttr || contains(WebgraphSchema.source_host_organization_s)) add(edge, WebgraphSchema.source_host_organization_s, orga);
|
||||
if (allAttr || contains(WebgraphSchema.source_host_organizationdnc_s)) add(edge, WebgraphSchema.source_host_organizationdnc_s, orga + '.' + dnc);
|
||||
if (allAttr || contains(WebgraphSchema.source_host_subdomain_s)) add(edge, WebgraphSchema.source_host_subdomain_s, subdom);
|
||||
}
|
||||
if (allAttr || contains(WebgraphSchema.source_file_ext_s) || contains(WebgraphSchema.source_file_name_s)) {
|
||||
String source_file_name = source.getFileName();
|
||||
String source_file_ext = MultiProtocolURL.getFileExtension(source_file_name);
|
||||
add(edge, WebgraphSchema.source_file_name_s, source_file_name.toLowerCase().endsWith("." + source_file_ext) ? source_file_name.substring(0, source_file_name.length() - source_file_ext.length() - 1) : source_file_name);
|
||||
add(edge, WebgraphSchema.source_file_ext_s, source_file_ext);
|
||||
}
|
||||
if (allAttr || contains(WebgraphSchema.source_path_s)) add(edge, WebgraphSchema.source_path_s, source.getPath());
|
||||
if (allAttr || contains(WebgraphSchema.source_path_folders_count_i) || contains(WebgraphSchema.source_path_folders_sxt)) {
|
||||
String[] paths = source.getPaths();
|
||||
add(edge, WebgraphSchema.source_path_folders_count_i, paths.length);
|
||||
add(edge, WebgraphSchema.source_path_folders_sxt, paths);
|
||||
}
|
||||
if (this.contains(WebgraphSchema.source_clickdepth_i) && this.contains(WebgraphSchema.source_protocol_s) && this.contains(WebgraphSchema.source_urlstub_s) && this.contains(WebgraphSchema.source_id_s)) {
|
||||
add(edge, WebgraphSchema.source_clickdepth_i, clickdepth_source);
|
||||
if (clickdepth_source < 0 || clickdepth_source > 1) processTypes.add(ProcessType.CLICKDEPTH);
|
||||
}
|
||||
|
||||
// add the source attributes about the target
|
||||
if (allAttr || contains(WebgraphSchema.target_inbound_b)) add(edge, WebgraphSchema.target_inbound_b, inbound);
|
||||
if (allAttr || contains(WebgraphSchema.target_name_t)) add(edge, WebgraphSchema.target_name_t, name.length() > 0 ? name : "");
|
||||
if (allAttr || contains(WebgraphSchema.target_rel_s)) add(edge, WebgraphSchema.target_rel_s, rel.length() > 0 ? rel : "");
|
||||
if (allAttr || contains(WebgraphSchema.target_relflags_i)) add(edge, WebgraphSchema.target_relflags_i, relEval(rel.length() > 0 ? rel : ""));
|
||||
if (allAttr || contains(WebgraphSchema.target_linktext_t)) add(edge, WebgraphSchema.target_linktext_t, text.length() > 0 ? text : "");
|
||||
if (allAttr || contains(WebgraphSchema.target_linktext_charcount_i)) add(edge, WebgraphSchema.target_linktext_charcount_i, text.length());
|
||||
if (allAttr || contains(WebgraphSchema.target_linktext_wordcount_i)) add(edge, WebgraphSchema.target_linktext_wordcount_i, text.length() > 0 ? CommonPattern.SPACE.split(text).length : 0);
|
||||
|
||||
ImageEntry ientry = null;
|
||||
for (ImageEntry ie: images) {
|
||||
if (ie.linkurl() != null && ie.linkurl().equals(target_url)) {ientry = ie; break;}
|
||||
}
|
||||
String alttext = ientry == null ? "" : ientry.alt();
|
||||
if (allAttr || contains(WebgraphSchema.target_alt_t)) add(edge, WebgraphSchema.target_alt_t, alttext);
|
||||
if (allAttr || contains(WebgraphSchema.target_alt_charcount_i)) add(edge, WebgraphSchema.target_alt_charcount_i, alttext.length());
|
||||
if (allAttr || contains(WebgraphSchema.target_alt_wordcount_i)) add(edge, WebgraphSchema.target_alt_wordcount_i, alttext.length() > 0 ? CommonPattern.SPACE.split(alttext).length : 0);
|
||||
|
||||
// add the target attributes
|
||||
add(edge, WebgraphSchema.target_id_s, target_id);
|
||||
final String target_url_string = target_url.toNormalform(false);
|
||||
int pr_target = target_url_string.indexOf("://",0);
|
||||
subgraph.urlProtocols[ioidx].add(target_url_string.substring(0, pr_target));
|
||||
subgraph.urlStubs[ioidx].add(target_url_string.substring(pr_target + 3));
|
||||
subgraph.urlAnchorTexts[ioidx].add(text);
|
||||
if (allAttr || contains(WebgraphSchema.target_protocol_s)) add(edge, WebgraphSchema.target_protocol_s, target_url_string.substring(0, pr_target));
|
||||
if (allAttr || contains(WebgraphSchema.target_urlstub_s)) add(edge, WebgraphSchema.target_urlstub_s, target_url_string.substring(pr_target + 3));
|
||||
Map<String, String> target_searchpart = target_url.getSearchpartMap();
|
||||
if (target_searchpart == null) {
|
||||
if (allAttr || contains(WebgraphSchema.target_parameter_count_i)) add(edge, WebgraphSchema.target_parameter_count_i, 0);
|
||||
} else {
|
||||
if (allAttr || contains(WebgraphSchema.target_parameter_count_i)) add(edge, WebgraphSchema.target_parameter_count_i, target_searchpart.size());
|
||||
if (allAttr || contains(WebgraphSchema.target_parameter_key_sxt)) add(edge, WebgraphSchema.target_parameter_key_sxt, target_searchpart.keySet().toArray(new String[target_searchpart.size()]));
|
||||
if (allAttr || contains(WebgraphSchema.target_parameter_value_sxt)) add(edge, WebgraphSchema.target_parameter_value_sxt, target_searchpart.values().toArray(new String[target_searchpart.size()]));
|
||||
}
|
||||
if (allAttr || contains(WebgraphSchema.target_chars_i)) add(edge, WebgraphSchema.target_chars_i, target_url_string.length());
|
||||
String target_host = null;
|
||||
if ((target_host = target_url.getHost()) != null) {
|
||||
String dnc = Domains.getDNC(target_host);
|
||||
String subdomOrga = target_host.length() - dnc.length() <= 0 ? "" : target_host.substring(0, target_host.length() - dnc.length() - 1);
|
||||
int pp = subdomOrga.lastIndexOf('.');
|
||||
String subdom = (pp < 0) ? "" : subdomOrga.substring(0, pp);
|
||||
String orga = (pp < 0) ? subdomOrga : subdomOrga.substring(pp + 1);
|
||||
if (allAttr || contains(WebgraphSchema.target_host_s)) add(edge, WebgraphSchema.target_host_s, target_host);
|
||||
if (allAttr || contains(WebgraphSchema.target_host_id_s)) add(edge, WebgraphSchema.target_host_id_s, target_url.hosthash());
|
||||
if (allAttr || contains(WebgraphSchema.target_host_dnc_s)) add(edge, WebgraphSchema.target_host_dnc_s, dnc);
|
||||
if (allAttr || contains(WebgraphSchema.target_host_organization_s)) add(edge, WebgraphSchema.target_host_organization_s, orga);
|
||||
if (allAttr || contains(WebgraphSchema.target_host_organizationdnc_s)) add(edge, WebgraphSchema.target_host_organizationdnc_s, orga + '.' + dnc);
|
||||
if (allAttr || contains(WebgraphSchema.target_host_subdomain_s)) add(edge, WebgraphSchema.target_host_subdomain_s, subdom);
|
||||
}
|
||||
if (allAttr || contains(WebgraphSchema.target_file_ext_s) || contains(WebgraphSchema.target_file_name_s)) {
|
||||
String target_file_name = target_url.getFileName();
|
||||
String target_file_ext = MultiProtocolURL.getFileExtension(target_file_name);
|
||||
add(edge, WebgraphSchema.target_file_name_s, target_file_name.toLowerCase().endsWith("." + target_file_ext) ? target_file_name.substring(0, target_file_name.length() - target_file_ext.length() - 1) : target_file_name);
|
||||
add(edge, WebgraphSchema.target_file_ext_s, target_file_ext);
|
||||
}
|
||||
if (allAttr || contains(WebgraphSchema.target_path_s)) add(edge, WebgraphSchema.target_path_s, target_url.getPath());
|
||||
if (allAttr || contains(WebgraphSchema.target_path_folders_count_i) || contains(WebgraphSchema.target_path_folders_sxt)) {
|
||||
String[] paths = target_url.getPaths();
|
||||
add(edge, WebgraphSchema.target_path_folders_count_i, paths.length);
|
||||
add(edge, WebgraphSchema.target_path_folders_sxt, paths);
|
||||
}
|
||||
|
||||
if (this.contains(WebgraphSchema.target_protocol_s) && this.contains(WebgraphSchema.target_urlstub_s) && this.contains(WebgraphSchema.target_id_s)) {
|
||||
if ((allAttr || contains(WebgraphSchema.target_clickdepth_i))) {
|
||||
if (target_url.probablyRootURL()) {
|
||||
boolean lc = this.lazy; this.lazy = false;
|
||||
add(edge, WebgraphSchema.target_clickdepth_i, 0);
|
||||
this.lazy = lc;
|
||||
} else {
|
||||
add(edge, WebgraphSchema.target_clickdepth_i, 999);
|
||||
processTypes.add(ProcessType.CLICKDEPTH); // postprocessing needed; this is also needed if the depth is positive; there could be a shortcut
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (allAttr || contains(WebgraphSchema.process_sxt)) {
|
||||
List<String> pr = new ArrayList<String>();
|
||||
for (ProcessType t: processTypes) pr.add(t.name());
|
||||
add(edge, WebgraphSchema.process_sxt, pr);
|
||||
if (allAttr || contains(CollectionSchema.harvestkey_s)) {
|
||||
add(edge, CollectionSchema.harvestkey_s, sourceName);
|
||||
}
|
||||
}
|
||||
|
||||
// return the edge
|
||||
return edge;
|
||||
}
|
||||
|
||||
|
||||
public int postprocessing(final Segment segment, ClickdepthCache clickdepthCache, final String harvestkey) {
|
||||
if (!this.contains(WebgraphSchema.process_sxt)) return 0;
|
||||
if (!segment.fulltext().writeToWebgraph()) return 0;
|
||||
if (!segment.fulltext().useWebgraph()) return 0;
|
||||
SolrConnector webgraphConnector = segment.fulltext().getWebgraphConnector();
|
||||
// that means we must search for those entries.
|
||||
webgraphConnector.commit(true); // make sure that we have latest information that can be found
|
||||
|
@@ -323,7 +336,7 @@ public class WebgraphConfiguration extends SchemaConfiguration implements Serial
|
|||
// switch over tag types
|
||||
ProcessType tagtype = ProcessType.valueOf((String) tag);
|
||||
if (tagtype == ProcessType.CLICKDEPTH) {
|
||||
if (this.contains(WebgraphSchema.source_protocol_s) && this.contains(WebgraphSchema.source_urlstub_s) && this.contains(WebgraphSchema.source_id_s)) {
|
||||
if (this.contains(WebgraphSchema.source_clickdepth_i) && this.contains(WebgraphSchema.source_protocol_s) && this.contains(WebgraphSchema.source_urlstub_s) && this.contains(WebgraphSchema.source_id_s)) {
|
||||
protocol = (String) doc.getFieldValue(WebgraphSchema.source_protocol_s.getSolrFieldName());
|
||||
urlstub = (String) doc.getFieldValue(WebgraphSchema.source_urlstub_s.getSolrFieldName());
|
||||
id = (String) doc.getFieldValue(WebgraphSchema.source_id_s.getSolrFieldName());
|
||||
|
@@ -334,7 +347,7 @@ public class WebgraphConfiguration extends SchemaConfiguration implements Serial
|
|||
}
|
||||
//ConcurrentLog.info("WebgraphConfiguration", "postprocessing webgraph source id " + id + ", url=" + protocol + "://" + urlstub + ", result: " + (changed ? "changed" : "not changed"));
|
||||
}
|
||||
if (this.contains(WebgraphSchema.target_protocol_s) && this.contains(WebgraphSchema.target_urlstub_s) && this.contains(WebgraphSchema.target_id_s)) {
|
||||
if (this.contains(WebgraphSchema.target_clickdepth_i) && this.contains(WebgraphSchema.target_protocol_s) && this.contains(WebgraphSchema.target_urlstub_s) && this.contains(WebgraphSchema.target_id_s)) {
|
||||
protocol = (String) doc.getFieldValue(WebgraphSchema.target_protocol_s.getSolrFieldName());
|
||||
urlstub = (String) doc.getFieldValue(WebgraphSchema.target_urlstub_s.getSolrFieldName());
|
||||
id = (String) doc.getFieldValue(WebgraphSchema.target_id_s.getSolrFieldName());
|
||||
|
|
|
@@ -174,7 +174,7 @@ public class ResultEntry implements Comparable<ResultEntry>, Comparator<ResultEn
|
|||
}
|
||||
/**
 * Reports how many references the segment's citation index holds for this entry.
 * The lookup key is the hash of this result's URL entry.
 *
 * @return the value of {@code urlCitation().count(hash)}, or 0 when the guard
 *         in front of the lookup does not hold
 */
public int referencesCount() {
|
||||
// the citation index is optional (= configuration option) and may be absent, hence the guard before the lookup
|
||||
// NOTE(review): the two return lines below look like the removed and the added side of one diff hunk;
// presumably only the second one (guarded by connectedCitation()) is live -- confirm against the real file
return this.indexSegment.urlCitation() != null ? this.indexSegment.urlCitation().count(this.urlentry.hash()) : 0;
|
||||
return this.indexSegment.connectedCitation() ? this.indexSegment.urlCitation().count(this.urlentry.hash()) : 0;
|
||||
}
|
||||
public int llocal() {
|
||||
return this.urlentry.llocal();
|
||||
|
|
|
@@ -18,7 +18,7 @@ if exist DATA\SETTINGS\httpProxy.conf GoTo :RENAMEINDEX
|
|||
if exist DATA\SETTINGS\yacy.conf GoTo :GETSTARTOPTS
|
||||
|
||||
:STARTJAVA
|
||||
set javacmd=%javacmd% -XX:-UseGCOverheadLimit -Djava.net.preferIPv4Stack=true -Djava.awt.headless=true -Dfile.encoding=UTF-8
|
||||
set javacmd=%javacmd% -Djava.awt.headless=true -Dsolr.directoryFactory=solr.MMapDirectoryFactory -Dfile.encoding=UTF-8
|
||||
Rem Starting YaCy
|
||||
Echo Generated classpath:%CLASSPATH%
|
||||
Echo JRE Parameters:%javacmd%
|
||||
|
|
Loading…
Reference in New Issue
Block a user