mirror of
https://github.com/yacy/yacy_search_server.git
synced 2024-09-19 00:01:41 +02:00
Added a "fromCache" flag to the Response object to omit one cache.has()
check during snippet generation. This should result in less blocking.
This commit is contained in:
parent
81737dcb18
commit
7e0ddbd275
|
@ -134,6 +134,7 @@ public class FTPLoader {
|
|||
responseHeader,
|
||||
"200",
|
||||
profile,
|
||||
false,
|
||||
dirList.toString().getBytes());
|
||||
}
|
||||
} else {
|
||||
|
@ -253,6 +254,7 @@ public class FTPLoader {
|
|||
responseHeader,
|
||||
"200",
|
||||
profile,
|
||||
false,
|
||||
null);
|
||||
return response;
|
||||
}
|
||||
|
@ -268,6 +270,7 @@ public class FTPLoader {
|
|||
responseHeader,
|
||||
"200",
|
||||
profile,
|
||||
false,
|
||||
b);
|
||||
return response;
|
||||
}
|
||||
|
|
|
@ -30,8 +30,6 @@ import java.util.ArrayList;
|
|||
import java.util.Date;
|
||||
import java.util.List;
|
||||
|
||||
import de.anomic.crawler.CrawlProfile;
|
||||
|
||||
import net.yacy.cora.document.Classification;
|
||||
import net.yacy.cora.protocol.HeaderFramework;
|
||||
import net.yacy.cora.protocol.RequestHeader;
|
||||
|
@ -43,6 +41,7 @@ import net.yacy.kelondro.logging.Log;
|
|||
import net.yacy.kelondro.util.FileUtils;
|
||||
import net.yacy.search.Switchboard;
|
||||
import net.yacy.search.index.Segments;
|
||||
import de.anomic.crawler.CrawlProfile;
|
||||
|
||||
public class FileLoader {
|
||||
|
||||
|
@ -53,7 +52,7 @@ public class FileLoader {
|
|||
public FileLoader(final Switchboard sb, final Log log) {
|
||||
this.sb = sb;
|
||||
this.log = log;
|
||||
maxFileSize = (int) sb.getConfigLong("crawler.file.maxFileSize", -1l);
|
||||
this.maxFileSize = (int) sb.getConfigLong("crawler.file.maxFileSize", -1l);
|
||||
}
|
||||
|
||||
public Response load(final Request request, boolean acceptOnlyParseable) throws IOException {
|
||||
|
@ -62,7 +61,7 @@ public class FileLoader {
|
|||
|
||||
RequestHeader requestHeader = new RequestHeader();
|
||||
if (request.referrerhash() != null) {
|
||||
DigestURI ur = sb.getURL(Segments.Process.LOCALCRAWLING, request.referrerhash());
|
||||
DigestURI ur = this.sb.getURL(Segments.Process.LOCALCRAWLING, request.referrerhash());
|
||||
if (ur != null) requestHeader.put(RequestHeader.REFERER, ur.toNormalform(true, false));
|
||||
}
|
||||
|
||||
|
@ -89,13 +88,14 @@ public class FileLoader {
|
|||
ResponseHeader responseHeader = new ResponseHeader();
|
||||
responseHeader.put(HeaderFramework.LAST_MODIFIED, HeaderFramework.formatRFC1123(new Date()));
|
||||
responseHeader.put(HeaderFramework.CONTENT_TYPE, "text/html");
|
||||
final CrawlProfile profile = sb.crawler.getActive(request.profileHandle().getBytes());
|
||||
final CrawlProfile profile = this.sb.crawler.getActive(request.profileHandle().getBytes());
|
||||
Response response = new Response(
|
||||
request,
|
||||
requestHeader,
|
||||
responseHeader,
|
||||
"200",
|
||||
profile,
|
||||
false,
|
||||
content.toString().getBytes());
|
||||
|
||||
return response;
|
||||
|
@ -117,25 +117,26 @@ public class FileLoader {
|
|||
}
|
||||
String parserError = null;
|
||||
if ((acceptOnlyParseable && (parserError = TextParser.supports(url, mime)) != null) ||
|
||||
(size > maxFileSize && maxFileSize >= 0)) {
|
||||
(size > this.maxFileSize && this.maxFileSize >= 0)) {
|
||||
// we know that we cannot process that file before loading
|
||||
// only the metadata is returned
|
||||
|
||||
if (parserError != null) {
|
||||
log.logInfo("No parser available in File crawler: '" + parserError + "' for URL " + request.url().toString() + ": parsing only metadata");
|
||||
this.log.logInfo("No parser available in File crawler: '" + parserError + "' for URL " + request.url().toString() + ": parsing only metadata");
|
||||
} else {
|
||||
log.logInfo("Too big file in File crawler with size = " + size + " Bytes for URL " + request.url().toString() + ": parsing only metadata");
|
||||
this.log.logInfo("Too big file in File crawler with size = " + size + " Bytes for URL " + request.url().toString() + ": parsing only metadata");
|
||||
}
|
||||
|
||||
// create response with metadata only
|
||||
responseHeader.put(HeaderFramework.CONTENT_TYPE, "text/plain");
|
||||
final CrawlProfile profile = sb.crawler.getActive(request.profileHandle().getBytes());
|
||||
final CrawlProfile profile = this.sb.crawler.getActive(request.profileHandle().getBytes());
|
||||
Response response = new Response(
|
||||
request,
|
||||
requestHeader,
|
||||
responseHeader,
|
||||
"200",
|
||||
profile,
|
||||
false,
|
||||
url.toTokens().getBytes());
|
||||
return response;
|
||||
}
|
||||
|
@ -146,13 +147,14 @@ public class FileLoader {
|
|||
is.close();
|
||||
|
||||
// create response with loaded content
|
||||
final CrawlProfile profile = sb.crawler.getActive(request.profileHandle().getBytes());
|
||||
final CrawlProfile profile = this.sb.crawler.getActive(request.profileHandle().getBytes());
|
||||
Response response = new Response(
|
||||
request,
|
||||
requestHeader,
|
||||
responseHeader,
|
||||
"200",
|
||||
profile,
|
||||
false,
|
||||
b);
|
||||
return response;
|
||||
}
|
||||
|
|
|
@ -205,6 +205,7 @@ public final class HTTPLoader {
|
|||
header,
|
||||
Integer.toString(code),
|
||||
profile,
|
||||
false,
|
||||
responseBody
|
||||
);
|
||||
|
||||
|
@ -273,6 +274,7 @@ public final class HTTPLoader {
|
|||
header,
|
||||
Integer.toString(code),
|
||||
null,
|
||||
false,
|
||||
responseBody
|
||||
);
|
||||
|
||||
|
|
|
@ -66,6 +66,7 @@ public class Response {
|
|||
private final CrawlProfile profile;
|
||||
private byte[] content;
|
||||
private int status; // tracker indexing status, see status defs below
|
||||
private final boolean fromCache;
|
||||
|
||||
// doctype calculation
|
||||
public static char docType(final MultiProtocolURI url) {
|
||||
|
@ -151,6 +152,7 @@ public class Response {
|
|||
final ResponseHeader responseHeader,
|
||||
final String responseStatus,
|
||||
final CrawlProfile profile,
|
||||
final boolean fromCache,
|
||||
final byte[] content) {
|
||||
this.request = request;
|
||||
// request and response headers may be zero in case that we process surrogates
|
||||
|
@ -160,6 +162,7 @@ public class Response {
|
|||
this.profile = profile;
|
||||
this.status = QUEUE_STATE_FRESH;
|
||||
this.content = content;
|
||||
this.fromCache = fromCache;
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -179,6 +182,7 @@ public class Response {
|
|||
this.profile = profile;
|
||||
this.status = QUEUE_STATE_FRESH;
|
||||
this.content = request.name().length() > 0 ? request.name().getBytes() : request.url().toTokens().getBytes();
|
||||
this.fromCache = true;
|
||||
}
|
||||
|
||||
public Response(
|
||||
|
@ -186,8 +190,9 @@ public class Response {
|
|||
final RequestHeader requestHeader,
|
||||
final ResponseHeader responseHeader,
|
||||
final String responseStatus,
|
||||
final CrawlProfile profile) {
|
||||
this(request, requestHeader, responseHeader, responseStatus, profile, null);
|
||||
final CrawlProfile profile,
|
||||
final boolean fromCache) {
|
||||
this(request, requestHeader, responseHeader, responseStatus, profile, fromCache, null);
|
||||
}
|
||||
|
||||
public void updateStatus(final int newStatus) {
|
||||
|
@ -198,6 +203,10 @@ public class Response {
|
|||
return this.responseHeader;
|
||||
}
|
||||
|
||||
public boolean fromCache() {
|
||||
return this.fromCache;
|
||||
}
|
||||
|
||||
public int getStatus() {
|
||||
return this.status;
|
||||
}
|
||||
|
|
|
@ -38,9 +38,6 @@ import java.util.List;
|
|||
import jcifs.smb.SmbException;
|
||||
import jcifs.smb.SmbFile;
|
||||
import jcifs.smb.SmbFileInputStream;
|
||||
|
||||
import de.anomic.crawler.CrawlProfile;
|
||||
|
||||
import net.yacy.cora.document.Classification;
|
||||
import net.yacy.cora.document.MultiProtocolURI;
|
||||
import net.yacy.cora.protocol.HeaderFramework;
|
||||
|
@ -53,6 +50,7 @@ import net.yacy.kelondro.logging.Log;
|
|||
import net.yacy.kelondro.util.FileUtils;
|
||||
import net.yacy.search.Switchboard;
|
||||
import net.yacy.search.index.Segments;
|
||||
import de.anomic.crawler.CrawlProfile;
|
||||
|
||||
public class SMBLoader {
|
||||
|
||||
|
@ -65,7 +63,7 @@ public class SMBLoader {
|
|||
public SMBLoader(final Switchboard sb, final Log log) {
|
||||
this.sb = sb;
|
||||
this.log = log;
|
||||
maxFileSize = sb.getConfigLong("crawler.smb.maxFileSize", -1l);
|
||||
this.maxFileSize = sb.getConfigLong("crawler.smb.maxFileSize", -1l);
|
||||
}
|
||||
|
||||
|
||||
|
@ -75,7 +73,7 @@ public class SMBLoader {
|
|||
|
||||
RequestHeader requestHeader = new RequestHeader();
|
||||
if (request.referrerhash() != null) {
|
||||
DigestURI ur = sb.getURL(Segments.Process.LOCALCRAWLING, request.referrerhash());
|
||||
DigestURI ur = this.sb.getURL(Segments.Process.LOCALCRAWLING, request.referrerhash());
|
||||
if (ur != null) requestHeader.put(RequestHeader.REFERER, ur.toNormalform(true, false));
|
||||
}
|
||||
|
||||
|
@ -109,13 +107,14 @@ public class SMBLoader {
|
|||
ResponseHeader responseHeader = new ResponseHeader();
|
||||
responseHeader.put(HeaderFramework.LAST_MODIFIED, HeaderFramework.formatRFC1123(new Date()));
|
||||
responseHeader.put(HeaderFramework.CONTENT_TYPE, "text/html");
|
||||
final CrawlProfile profile = sb.crawler.getActive(request.profileHandle().getBytes());
|
||||
final CrawlProfile profile = this.sb.crawler.getActive(request.profileHandle().getBytes());
|
||||
Response response = new Response(
|
||||
request,
|
||||
requestHeader,
|
||||
responseHeader,
|
||||
"200",
|
||||
profile,
|
||||
false,
|
||||
content.toString().getBytes());
|
||||
|
||||
return response;
|
||||
|
@ -137,25 +136,26 @@ public class SMBLoader {
|
|||
}
|
||||
String parserError = null;
|
||||
if ((acceptOnlyParseable && (parserError = TextParser.supports(url, mime)) != null) ||
|
||||
(size > maxFileSize && maxFileSize >= 0)) {
|
||||
(size > this.maxFileSize && this.maxFileSize >= 0)) {
|
||||
// we know that we cannot process that file before loading
|
||||
// only the metadata is returned
|
||||
|
||||
if (parserError != null) {
|
||||
log.logInfo("No parser available in SMB crawler: '" + parserError + "' for URL " + request.url().toString() + ": parsing only metadata");
|
||||
this.log.logInfo("No parser available in SMB crawler: '" + parserError + "' for URL " + request.url().toString() + ": parsing only metadata");
|
||||
} else {
|
||||
log.logInfo("Too big file in SMB crawler with size = " + size + " Bytes for URL " + request.url().toString() + ": parsing only metadata");
|
||||
this.log.logInfo("Too big file in SMB crawler with size = " + size + " Bytes for URL " + request.url().toString() + ": parsing only metadata");
|
||||
}
|
||||
|
||||
// create response with metadata only
|
||||
responseHeader.put(HeaderFramework.CONTENT_TYPE, "text/plain");
|
||||
final CrawlProfile profile = sb.crawler.getActive(request.profileHandle().getBytes());
|
||||
final CrawlProfile profile = this.sb.crawler.getActive(request.profileHandle().getBytes());
|
||||
Response response = new Response(
|
||||
request,
|
||||
requestHeader,
|
||||
responseHeader,
|
||||
"200",
|
||||
profile,
|
||||
false,
|
||||
url.toTokens().getBytes());
|
||||
return response;
|
||||
}
|
||||
|
@ -166,13 +166,14 @@ public class SMBLoader {
|
|||
is.close();
|
||||
|
||||
// create response with loaded content
|
||||
final CrawlProfile profile = sb.crawler.getActive(request.profileHandle().getBytes());
|
||||
final CrawlProfile profile = this.sb.crawler.getActive(request.profileHandle().getBytes());
|
||||
Response response = new Response(
|
||||
request,
|
||||
requestHeader,
|
||||
responseHeader,
|
||||
"200",
|
||||
profile,
|
||||
false,
|
||||
b);
|
||||
return response;
|
||||
}
|
||||
|
|
|
@ -404,7 +404,8 @@ public final class HTTPDProxyHandler {
|
|||
requestHeader,
|
||||
cachedResponseHeader,
|
||||
"200 OK",
|
||||
sb.crawler.defaultProxyProfile
|
||||
sb.crawler.defaultProxyProfile,
|
||||
false
|
||||
);
|
||||
final byte[] cacheContent = Cache.getContent(url.hash());
|
||||
if (cacheContent != null && response.isFreshForProxy()) {
|
||||
|
@ -548,7 +549,8 @@ public final class HTTPDProxyHandler {
|
|||
requestHeader,
|
||||
responseHeader,
|
||||
Integer.toString(client.getHttpResponse().getStatusLine().getStatusCode()),
|
||||
sb.crawler.defaultProxyProfile
|
||||
sb.crawler.defaultProxyProfile,
|
||||
false
|
||||
);
|
||||
final String storeError = response.shallStoreCacheForProxy();
|
||||
final boolean storeHTCache = response.profile().storeHTCache();
|
||||
|
|
|
@ -186,12 +186,7 @@ public class DigestURI extends MultiProtocolURI implements Serializable {
|
|||
*/
|
||||
public final byte[] hash() {
|
||||
// in case that the object was initialized without a known url hash, compute it now
|
||||
if (this.hash == null) {
|
||||
// we check the this.hash value twice to avoid synchronization where possible
|
||||
synchronized (this.protocol) {
|
||||
if (this.hash == null) this.hash = urlHashComputation();
|
||||
}
|
||||
}
|
||||
if (this.hash == null) this.hash = urlHashComputation();
|
||||
return this.hash;
|
||||
}
|
||||
|
||||
|
@ -376,11 +371,7 @@ public class DigestURI extends MultiProtocolURI implements Serializable {
|
|||
@Override
|
||||
public final boolean isLocal() {
|
||||
if (this.isFile()) return true;
|
||||
if (this.hash == null) synchronized (this.protocol) {
|
||||
// this is synchronized because another thread may also call the same method in between
|
||||
// that is the reason that this.hash is checked again
|
||||
if (this.hash == null) this.hash = urlHashComputation();
|
||||
}
|
||||
if (this.hash == null) this.hash = urlHashComputation();
|
||||
return domDomain(this.hash) == 7;
|
||||
}
|
||||
|
||||
|
|
|
@ -218,6 +218,7 @@ public final class LoaderDispatcher {
|
|||
cachedResponse,
|
||||
"200",
|
||||
crawlProfile,
|
||||
true,
|
||||
content);
|
||||
|
||||
// check which caching strategy shall be used
|
||||
|
|
|
@ -1794,7 +1794,7 @@ public final class Switchboard extends serverSwitch
|
|||
0,
|
||||
0,
|
||||
0);
|
||||
response = new Response(request, null, null, "200", this.crawler.defaultSurrogateProfile);
|
||||
response = new Response(request, null, null, "200", this.crawler.defaultSurrogateProfile, false);
|
||||
final indexingQueueEntry queueEntry =
|
||||
new indexingQueueEntry(Segments.Process.SURROGATES, response, new Document[] {
|
||||
document
|
||||
|
|
|
@ -201,12 +201,10 @@ public class TextSnippet implements Comparable<TextSnippet>, Comparator<TextSnip
|
|||
removeMatchingHashes(row.dc_subject(), remainingHashes);
|
||||
removeMatchingHashes(row.url().toNormalform(true, true).replace('-', ' '), remainingHashes);
|
||||
|
||||
boolean isInCache = de.anomic.crawler.Cache.has(url.hash());
|
||||
|
||||
if (remainingHashes.size() == 0) {
|
||||
// the snippet is fully inside the metadata!
|
||||
|
||||
if (isInCache) {
|
||||
if (de.anomic.crawler.Cache.has(url.hash())) {
|
||||
// get the sentences from the cache
|
||||
final Request request = loader.request(url, true, reindexing);
|
||||
Response response;
|
||||
|
@ -261,7 +259,7 @@ public class TextSnippet implements Comparable<TextSnippet>, Comparator<TextSnip
|
|||
return;
|
||||
}
|
||||
|
||||
if (!isInCache && response != null) {
|
||||
if (!response.fromCache()) {
|
||||
// place entry on indexing queue
|
||||
Switchboard.getSwitchboard().toIndexer(response);
|
||||
this.resultStatus = ResultClass.SOURCE_WEB;
|
||||
|
|
Loading…
Reference in New Issue
Block a user