ViewFile servlet: update the index if the viewed resource is newer,

so that the viewed text and the stored metadata stay consistent.
- To archive the resource, issue the load request with a profile that allows indexing (defaultglobaltext) and update the index
  (the resource is loaded and parsed anyway, so this is not an expensive operation).

Request: remove 2 unused init parameters
- anchors: number of anchors of the parent
- forkfactor: sum of anchors of all ancestors
reger 2014-12-05 01:13:37 +01:00
parent 226aea5914
commit ff18129def
15 changed files with 11 additions and 42 deletions
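
For context on how the new global=true flag takes effect: the crawl profile is chosen inside LoaderDispatcher.request(url, forText, global). The following is a condensed sketch, not the verbatim source: the media profile fields appear in the LoaderDispatcher hunk further down, and the text profile fields are assumed to follow the same naming pattern.

    public Request request(final DigestURL url, final boolean forText, final boolean global) {
        // condensed sketch of LoaderDispatcher.request(...); with forText = true and
        // global = true (what ViewFile now passes), the request carries the global
        // text snippet profile, whose settings permit index updates
        final String profileHandle = forText
                ? (global ? this.sb.crawler.defaultTextSnippetGlobalProfile.handle()
                          : this.sb.crawler.defaultTextSnippetLocalProfile.handle())
                : (global ? this.sb.crawler.defaultMediaSnippetGlobalProfile.handle()
                          : this.sb.crawler.defaultMediaSnippetLocalProfile.handle());
        // depth 0; the anchors/forkfactor parameters no longer exist
        return new Request(this.sb.peers.mySeed().hash.getBytes(), url, null, "",
                new Date(), profileHandle, 0);
    }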

View File

@@ -161,7 +161,7 @@ public class HostBrowser {
sb.peers.mySeed().hash.getBytes(),
url, null, load, new Date(),
sb.crawler.defaultProxyProfile.handle(),
- 0, 0, 0
+ 0
));
prop.putHTML("result", reasonString == null ? ("added url to indexer: " + load) : ("not indexed url '" + load + "': " + reasonString));
if (wait) waitloop: for (int i = 0; i < 30; i++) {

View File

@@ -174,8 +174,6 @@ public class QuickCrawlLink_p {
(title==null)?"CRAWLING-ROOT":title,
new Date(),
pe.handle(),
- 0,
- 0,
0
));

View File

@@ -178,7 +178,8 @@ public class ViewFile {
Response response = null;
try {
ClientIdentification.Agent agent = ClientIdentification.getAgent(post.get("agentName", ClientIdentification.yacyInternetCrawlerAgentName));
- response = sb.loader.load(sb.loader.request(url, true, false), authorized ? CacheStrategy.IFEXIST : CacheStrategy.CACHEONLY, Integer.MAX_VALUE, null, agent);
+ // use sb.loader.request( , , global=true) to use a crawl profile that allows index updates
+ response = sb.loader.load(sb.loader.request(url, true, true), authorized ? CacheStrategy.IFEXIST : CacheStrategy.CACHEONLY, Integer.MAX_VALUE, null, agent);
} catch (final IOException e) {
prop.put("error", "4");
prop.put("error_errorText", "error loading resource: " + e.getMessage());
@@ -374,6 +375,10 @@ public class ViewFile {
prop.put("showSnippet_teasertext", desc);
prop.put("showSnippet", 1);
}
+ // update index with the parsed resource if the index entry is older
+ if (urlEntry.loaddate().before(response.lastModified())) {
+     Switchboard.getSwitchboard().toIndexer(response);
+ }
if (document != null) document.close();
}
prop.put("error", "0");

View File

@@ -102,8 +102,6 @@ public class push_p {
"", // the name of the document to crawl
new Date(), // current date
profile.handle(), // the name of the prefetch profile. This must not be null!
- 0, // depth the crawling depth of the entry
- 0, // anchors number of anchors of the parent
- 0); // forkfactor sum of anchors of all ancestors
+ 0); // depth the crawling depth of the entry
Response response = new Response(
request,

View File

@@ -78,8 +78,6 @@ public class rct_p {
"REMOTE-CRAWLING",
loaddate,
sb.crawler.defaultRemoteProfile.handle(),
- 0,
- 0,
0));
} else {
env.getLog().warn("crawlOrder: Rejected URL '" + urlToString(url) + "': " + urlRejectReason);

View File

@@ -209,8 +209,6 @@ public final class CrawlStacker {
url.getNameProperty(),
new Date(),
profileHandle,
- 0,
- 0,
0
));
}
@@ -250,8 +248,6 @@ public final class CrawlStacker {
MultiProtocolURL.unescape(entry.name),
entry.date,
profileHandle,
- 0,
- 0,
0));
}
} catch (final IOException e1) {
@@ -276,8 +272,6 @@ public final class CrawlStacker {
"CRAWLING-ROOT",
new Date(),
pe.handle(),
- 0,
- 0,
0));
}

View File

@@ -530,8 +530,6 @@ public class CrawlQueues {
item.getDescriptions().size() > 0 ? item.getDescriptions().get(0) : "",
loaddate,
this.sb.crawler.defaultRemoteProfile.handle(),
- 0,
- 0,
0
));
} else {

View File

@@ -119,7 +119,7 @@ public class Request extends WorkflowJob
* @param referrerhash
*/
public Request(final DigestURL url, final byte[] referrerhash) {
- this(null, url, referrerhash, null, null, null, 0, 0, 0);
+ this(null, url, referrerhash, null, null, null, 0);
}
/**
@@ -132,8 +132,6 @@ public class Request extends WorkflowJob
* @param appdate the time when the url was first time appeared
* @param profileHandle the name of the prefetch profile. This must not be null!
* @param depth the crawling depth of the entry
- * @param anchors number of anchors of the parent
- * @param forkfactor sum of anchors of all ancestors
*/
public Request(
final byte[] initiator,
@@ -142,9 +140,7 @@ public class Request extends WorkflowJob
final String name,
final Date appdate,
final String profileHandle,
- final int depth,
- final int anchors,
- final int forkfactor) {
+ final int depth) {
// create new entry and store it into database
assert url != null;
assert profileHandle == null || profileHandle.length() == Word.commonHashLength : profileHandle
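
Condensed, the trimmed constructor signature now reads as follows (parameter descriptions are taken from the Javadoc above and the push_p.java call site; the initiator comment is an assumption):

    public Request(
            final byte[] initiator,     // hash of the initiating peer (assumed; null for local jobs)
            final DigestURL url,        // the URL to be crawled
            final byte[] referrerhash,  // hash of the referrer URL
            final String name,          // the name of the document to crawl
            final Date appdate,         // the time when the URL first appeared
            final String profileHandle, // the name of the prefetch profile; must not be null
            final int depth)            // the crawling depth of the entry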

View File

@@ -108,8 +108,6 @@ public class SitemapImporter extends Thread {
entry.url(),
entry.lastmod(new Date()),
this.crawlingProfile.handle(),
- 0,
- 0,
0
));
logger.info("New URL '" + entry.url() + "' added for loading.");

View File

@@ -197,7 +197,7 @@ public class YMarkCrawlStart extends HashMap<String,String>{
null,
"CRAWLING-ROOT",
new Date(),
- pe.handle(), 0, 0, 0
+ pe.handle(), 0
));
}
}

View File

@@ -74,8 +74,6 @@ public class ProxyCacheHandler extends AbstractRemoteHandler implements Handler
"",
cachedResponseHeader.lastModified(),
sb.crawler.defaultProxyProfile.handle(),
- 0,
- 0,
0);
final Response cachedResponse = new Response(

View File

@@ -180,8 +180,6 @@ public class ProxyHandler extends AbstractRemoteHandler implements Handler {
"",
responseHeaderLegacy.lastModified(),
sb.crawler.defaultProxyProfile.handle(),
- 0,
- 0,
0); //sizeBeforeDelete < 0 ? 0 : sizeBeforeDelete);
final Response yacyResponse = new Response(
yacyRequest,

View File

@@ -128,8 +128,6 @@ public final class LoaderDispatcher {
((global) ?
this.sb.crawler.defaultMediaSnippetGlobalProfile.handle() :
this.sb.crawler.defaultMediaSnippetLocalProfile.handle()), // crawl profile
- 0,
- 0,
0);
}

View File

@@ -1941,8 +1941,6 @@ public final class Switchboard extends serverSwitch {
"",
surrogate.getDate(),
this.crawler.defaultSurrogateProfile.handle(),
- 0,
- 0,
0);
response = new Response(request, null, null, this.crawler.defaultSurrogateProfile, false, null);
final IndexingQueueEntry queueEntry =
@@ -2673,9 +2671,7 @@ public final class Switchboard extends serverSwitch {
nextEntry.getValue(),
new Date(),
response.profile().handle(),
- nextdepth,
- 0,
- 0));
+ nextdepth));
} catch (final MalformedURLException e ) {
ConcurrentLog.logException(e);
}
@@ -3078,8 +3074,6 @@ public final class Switchboard extends serverSwitch {
"CRAWLING-ROOT",
new Date(),
profile.handle(),
- 0,
- 0,
0
));

View File

@@ -357,8 +357,6 @@ public final class HTTPDProxyHandler {
"",
cachedResponseHeader.lastModified(),
sb.crawler.defaultProxyProfile.handle(),
- 0,
- 0,
0);
final Response response = new Response(
request,
@@ -474,8 +472,6 @@ public final class HTTPDProxyHandler {
"",
responseHeader.lastModified(),
sb.crawler.defaultProxyProfile.handle(),
- 0,
- 0,
0);