Mirror of https://github.com/yacy/yacy_search_server.git (synced 2024-09-19 00:01:41 +02:00)
added a "fromCache" flag in the Response object to omit one cache.has()
check during snippet generation. This should cause less blocking.
This commit is contained in:
parent 81737dcb18
commit 7e0ddbd275
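The change in one picture: a Response now remembers whether its content was served from the local HTCache, so downstream code can ask the object instead of probing the cache index again. A minimal sketch of the pattern (simplified; as the hunks below show, the real constructor also takes request/response headers and a CrawlProfile):

    // sketch only: models the flag this commit threads through Response
    public class Response {
        private final byte[] content;
        private final boolean fromCache; // true when content came from the HTCache

        public Response(final byte[] content, final boolean fromCache) {
            this.content = content;
            this.fromCache = fromCache;
        }

        // consumers ask the Response instead of re-querying the cache index
        public boolean fromCache() {
            return this.fromCache;
        }
    }

The LoaderDispatcher passes true when it answers a request from the cache; the FTP, file, SMB, HTTP and proxy loaders all pass false. TextSnippet can then replace its own de.anomic.crawler.Cache.has(url.hash()) probe, which may block on the cache index, with a plain field read.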
@@ -134,6 +134,7 @@ public class FTPLoader {
                     responseHeader,
                     "200",
                     profile,
+                    false,
                     dirList.toString().getBytes());
             }
         } else {
@@ -253,6 +254,7 @@ public class FTPLoader {
                 responseHeader,
                 "200",
                 profile,
+                false,
                 null);
         return response;
     }
@@ -268,6 +270,7 @@ public class FTPLoader {
                 responseHeader,
                 "200",
                 profile,
+                false,
                 b);
         return response;
     }

@@ -11,12 +11,12 @@
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
- *
+ *
  * This library is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  * Lesser General Public License for more details.
- *
+ *
  * You should have received a copy of the GNU Lesser General Public License
  * along with this program in the file lgpl21.txt
  * If not, see <http://www.gnu.org/licenses/>.
@@ -30,8 +30,6 @@ import java.util.ArrayList;
 import java.util.Date;
 import java.util.List;
 
-import de.anomic.crawler.CrawlProfile;
-
 import net.yacy.cora.document.Classification;
 import net.yacy.cora.protocol.HeaderFramework;
 import net.yacy.cora.protocol.RequestHeader;
@@ -43,6 +41,7 @@ import net.yacy.kelondro.logging.Log;
 import net.yacy.kelondro.util.FileUtils;
 import net.yacy.search.Switchboard;
 import net.yacy.search.index.Segments;
+import de.anomic.crawler.CrawlProfile;
 
 public class FileLoader {
 
@@ -53,19 +52,19 @@ public class FileLoader {
     public FileLoader(final Switchboard sb, final Log log) {
         this.sb = sb;
         this.log = log;
-        maxFileSize = (int) sb.getConfigLong("crawler.file.maxFileSize", -1l);
+        this.maxFileSize = (int) sb.getConfigLong("crawler.file.maxFileSize", -1l);
     }
 
     public Response load(final Request request, boolean acceptOnlyParseable) throws IOException {
         DigestURI url = request.url();
         if (!url.getProtocol().equals("file")) throw new IOException("wrong loader for FileLoader: " + url.getProtocol());
 
         RequestHeader requestHeader = new RequestHeader();
         if (request.referrerhash() != null) {
-            DigestURI ur = sb.getURL(Segments.Process.LOCALCRAWLING, request.referrerhash());
+            DigestURI ur = this.sb.getURL(Segments.Process.LOCALCRAWLING, request.referrerhash());
             if (ur != null) requestHeader.put(RequestHeader.REFERER, ur.toNormalform(true, false));
         }
 
         // process directories: transform them to html with meta robots=noindex (using the ftpc lib)
         String[] l = null;
         try {l = url.list();} catch (IOException e) {}
@@ -83,30 +82,31 @@ public class FileLoader {
             for (String s: l) {
                 list.add(u + ((u.endsWith("/") || u.endsWith("\\")) ? "" : "/") + s);
             }
 
             StringBuilder content = FTPClient.dirhtml(u, null, null, null, list, true);
 
             ResponseHeader responseHeader = new ResponseHeader();
             responseHeader.put(HeaderFramework.LAST_MODIFIED, HeaderFramework.formatRFC1123(new Date()));
             responseHeader.put(HeaderFramework.CONTENT_TYPE, "text/html");
-            final CrawlProfile profile = sb.crawler.getActive(request.profileHandle().getBytes());
+            final CrawlProfile profile = this.sb.crawler.getActive(request.profileHandle().getBytes());
             Response response = new Response(
                     request,
                     requestHeader,
                     responseHeader,
                     "200",
                     profile,
+                    false,
                     content.toString().getBytes());
 
             return response;
         }
 
         // create response header
         String mime = Classification.ext2mime(url.getFileExtension());
         ResponseHeader responseHeader = new ResponseHeader();
         responseHeader.put(HeaderFramework.LAST_MODIFIED, HeaderFramework.formatRFC1123(new Date(url.lastModified())));
         responseHeader.put(HeaderFramework.CONTENT_TYPE, mime);
 
         // check mime type and availability of parsers
         // and also check resource size and limitation of the size
         long size;
@@ -117,42 +117,44 @@ public class FileLoader {
         }
         String parserError = null;
         if ((acceptOnlyParseable && (parserError = TextParser.supports(url, mime)) != null) ||
-            (size > maxFileSize && maxFileSize >= 0)) {
+            (size > this.maxFileSize && this.maxFileSize >= 0)) {
             // we know that we cannot process that file before loading
             // only the metadata is returned
 
             if (parserError != null) {
-                log.logInfo("No parser available in File crawler: '" + parserError + "' for URL " + request.url().toString() + ": parsing only metadata");
+                this.log.logInfo("No parser available in File crawler: '" + parserError + "' for URL " + request.url().toString() + ": parsing only metadata");
             } else {
-                log.logInfo("Too big file in File crawler with size = " + size + " Bytes for URL " + request.url().toString() + ": parsing only metadata");
+                this.log.logInfo("Too big file in File crawler with size = " + size + " Bytes for URL " + request.url().toString() + ": parsing only metadata");
             }
 
             // create response with metadata only
             responseHeader.put(HeaderFramework.CONTENT_TYPE, "text/plain");
-            final CrawlProfile profile = sb.crawler.getActive(request.profileHandle().getBytes());
+            final CrawlProfile profile = this.sb.crawler.getActive(request.profileHandle().getBytes());
             Response response = new Response(
                     request,
                     requestHeader,
                     responseHeader,
                     "200",
                     profile,
+                    false,
                     url.toTokens().getBytes());
             return response;
         }
 
         // load the resource
         InputStream is = url.getInputStream(null, -1);
         byte[] b = FileUtils.read(is);
         is.close();
 
         // create response with loaded content
-        final CrawlProfile profile = sb.crawler.getActive(request.profileHandle().getBytes());
+        final CrawlProfile profile = this.sb.crawler.getActive(request.profileHandle().getBytes());
         Response response = new Response(
                 request,
                 requestHeader,
                 responseHeader,
                 "200",
                 profile,
+                false,
                 b);
         return response;
     }

@@ -205,6 +205,7 @@ public final class HTTPLoader {
                 header,
                 Integer.toString(code),
                 profile,
+                false,
                 responseBody
         );
 
@@ -273,6 +274,7 @@
                 header,
                 Integer.toString(code),
                 null,
+                false,
                 responseBody
         );
 

@@ -66,6 +66,7 @@ public class Response {
     private final CrawlProfile profile;
     private byte[] content;
     private int status; // tracker indexing status, see status defs below
+    private final boolean fromCache;
 
     // doctype calculation
     public static char docType(final MultiProtocolURI url) {
@@ -151,6 +152,7 @@
             final ResponseHeader responseHeader,
             final String responseStatus,
             final CrawlProfile profile,
+            final boolean fromCache,
             final byte[] content) {
         this.request = request;
         // request and response headers may be zero in case that we process surrogates
@@ -160,6 +162,7 @@
         this.profile = profile;
         this.status = QUEUE_STATE_FRESH;
         this.content = content;
+        this.fromCache = fromCache;
     }
 
     /**
@@ -179,6 +182,7 @@
         this.profile = profile;
         this.status = QUEUE_STATE_FRESH;
         this.content = request.name().length() > 0 ? request.name().getBytes() : request.url().toTokens().getBytes();
+        this.fromCache = true;
     }
 
     public Response(
@@ -186,8 +190,9 @@
             final RequestHeader requestHeader,
             final ResponseHeader responseHeader,
             final String responseStatus,
-            final CrawlProfile profile) {
-        this(request, requestHeader, responseHeader, responseStatus, profile, null);
+            final CrawlProfile profile,
+            final boolean fromCache) {
+        this(request, requestHeader, responseHeader, responseStatus, profile, fromCache, null);
     }
 
     public void updateStatus(final int newStatus) {
@@ -198,6 +203,10 @@
         return this.responseHeader;
     }
 
+    public boolean fromCache() {
+        return this.fromCache;
+    }
+
     public int getStatus() {
         return this.status;
     }

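Caller-side, the extended constructor looks like this (a hedged sketch; argument names are taken from the hunks above, the surrounding setup is assumed):

    // freshly loaded content, e.g. in FileLoader or SMBLoader
    Response response = new Response(
            request,
            requestHeader,
            responseHeader,
            "200",
            profile,
            false,   // not from cache
            b);

Note that the constructor in the -179 hunk, which derives its content from the request metadata rather than a loaded body, hard-codes fromCache = true, and that the shorter convenience constructor now forces its callers to state the flag explicitly.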
@@ -9,7 +9,7 @@
 // $LastChangedBy$
 //
 // LICENSE
-//
+//
 // This program is free software; you can redistribute it and/or modify
 // it under the terms of the GNU General Public License as published by
 // the Free Software Foundation; either version 2 of the License, or
@@ -38,9 +38,6 @@ import java.util.List;
 import jcifs.smb.SmbException;
 import jcifs.smb.SmbFile;
 import jcifs.smb.SmbFileInputStream;
-
-import de.anomic.crawler.CrawlProfile;
-
 import net.yacy.cora.document.Classification;
 import net.yacy.cora.document.MultiProtocolURI;
 import net.yacy.cora.protocol.HeaderFramework;
@@ -53,11 +50,12 @@ import net.yacy.kelondro.logging.Log;
 import net.yacy.kelondro.util.FileUtils;
 import net.yacy.search.Switchboard;
 import net.yacy.search.index.Segments;
+import de.anomic.crawler.CrawlProfile;
 
 public class SMBLoader {
 
     public static final long DEFAULT_MAXFILESIZE = 1024 * 1024 * 10;
 
     private final Switchboard sb;
     private final Log log;
     private final long maxFileSize;
@@ -65,20 +63,20 @@ public class SMBLoader {
     public SMBLoader(final Switchboard sb, final Log log) {
         this.sb = sb;
         this.log = log;
-        maxFileSize = sb.getConfigLong("crawler.smb.maxFileSize", -1l);
+        this.maxFileSize = sb.getConfigLong("crawler.smb.maxFileSize", -1l);
     }
 
 
     public Response load(final Request request, boolean acceptOnlyParseable) throws IOException {
         DigestURI url = request.url();
         if (!url.getProtocol().equals("smb")) throw new IOException("wrong loader for SMBLoader: " + url.getProtocol());
 
         RequestHeader requestHeader = new RequestHeader();
         if (request.referrerhash() != null) {
-            DigestURI ur = sb.getURL(Segments.Process.LOCALCRAWLING, request.referrerhash());
+            DigestURI ur = this.sb.getURL(Segments.Process.LOCALCRAWLING, request.referrerhash());
             if (ur != null) requestHeader.put(RequestHeader.REFERER, ur.toNormalform(true, false));
         }
 
         // process directories: transform them to html with meta robots=noindex (using the ftpc lib)
         String[] l = null;
         try {l = url.list();} catch (IOException e) {}
@@ -103,30 +101,31 @@ public class SMBLoader {
                 }
                 list.add(u + s);
             }
 
             StringBuilder content = FTPClient.dirhtml(u, null, null, null, list, true);
 
             ResponseHeader responseHeader = new ResponseHeader();
             responseHeader.put(HeaderFramework.LAST_MODIFIED, HeaderFramework.formatRFC1123(new Date()));
             responseHeader.put(HeaderFramework.CONTENT_TYPE, "text/html");
-            final CrawlProfile profile = sb.crawler.getActive(request.profileHandle().getBytes());
+            final CrawlProfile profile = this.sb.crawler.getActive(request.profileHandle().getBytes());
             Response response = new Response(
                     request,
                     requestHeader,
                     responseHeader,
                     "200",
                     profile,
+                    false,
                     content.toString().getBytes());
 
             return response;
         }
 
         // create response header
         String mime = Classification.ext2mime(url.getFileExtension());
         ResponseHeader responseHeader = new ResponseHeader();
         responseHeader.put(HeaderFramework.LAST_MODIFIED, HeaderFramework.formatRFC1123(new Date(url.lastModified())));
         responseHeader.put(HeaderFramework.CONTENT_TYPE, mime);
 
         // check mime type and availability of parsers
         // and also check resource size and limitation of the size
         long size;
@@ -137,46 +136,48 @@ public class SMBLoader {
         }
         String parserError = null;
         if ((acceptOnlyParseable && (parserError = TextParser.supports(url, mime)) != null) ||
-            (size > maxFileSize && maxFileSize >= 0)) {
+            (size > this.maxFileSize && this.maxFileSize >= 0)) {
             // we know that we cannot process that file before loading
             // only the metadata is returned
 
             if (parserError != null) {
-                log.logInfo("No parser available in SMB crawler: '" + parserError + "' for URL " + request.url().toString() + ": parsing only metadata");
+                this.log.logInfo("No parser available in SMB crawler: '" + parserError + "' for URL " + request.url().toString() + ": parsing only metadata");
             } else {
-                log.logInfo("Too big file in SMB crawler with size = " + size + " Bytes for URL " + request.url().toString() + ": parsing only metadata");
+                this.log.logInfo("Too big file in SMB crawler with size = " + size + " Bytes for URL " + request.url().toString() + ": parsing only metadata");
             }
 
             // create response with metadata only
             responseHeader.put(HeaderFramework.CONTENT_TYPE, "text/plain");
-            final CrawlProfile profile = sb.crawler.getActive(request.profileHandle().getBytes());
+            final CrawlProfile profile = this.sb.crawler.getActive(request.profileHandle().getBytes());
             Response response = new Response(
                     request,
                     requestHeader,
                     responseHeader,
                     "200",
                     profile,
+                    false,
                     url.toTokens().getBytes());
             return response;
         }
 
         // load the resource
         InputStream is = url.getInputStream(null, -1);
         byte[] b = FileUtils.read(is);
         is.close();
 
         // create response with loaded content
-        final CrawlProfile profile = sb.crawler.getActive(request.profileHandle().getBytes());
+        final CrawlProfile profile = this.sb.crawler.getActive(request.profileHandle().getBytes());
         Response response = new Response(
                 request,
                 requestHeader,
                 responseHeader,
                 "200",
                 profile,
+                false,
                 b);
         return response;
     }
 
     public static void main(String[] args) {
         //jcifs.Config.setProperty( "jcifs.netbios.wins", "192.168.1.220" );
         //NtlmPasswordAuthentication auth = new NtlmPasswordAuthentication("domain", "username", "password");

@@ -404,7 +404,8 @@ public final class HTTPDProxyHandler {
                         requestHeader,
                         cachedResponseHeader,
                         "200 OK",
-                        sb.crawler.defaultProxyProfile
+                        sb.crawler.defaultProxyProfile,
+                        false
                 );
                 final byte[] cacheContent = Cache.getContent(url.hash());
                 if (cacheContent != null && response.isFreshForProxy()) {
@@ -548,7 +549,8 @@
                         requestHeader,
                         responseHeader,
                         Integer.toString(client.getHttpResponse().getStatusLine().getStatusCode()),
-                        sb.crawler.defaultProxyProfile
+                        sb.crawler.defaultProxyProfile,
+                        false
                 );
                 final String storeError = response.shallStoreCacheForProxy();
                 final boolean storeHTCache = response.profile().storeHTCache();

@@ -186,12 +186,7 @@ public class DigestURI extends MultiProtocolURI implements Serializable {
      */
     public final byte[] hash() {
         // in case that the object was initialized without a known url hash, compute it now
-        if (this.hash == null) {
-            // we check the this.hash value twice to avoid synchronization where possible
-            synchronized (this.protocol) {
-                if (this.hash == null) this.hash = urlHashComputation();
-            }
-        }
+        if (this.hash == null) this.hash = urlHashComputation();
         return this.hash;
     }
 
@@ -376,11 +371,7 @@ public class DigestURI extends MultiProtocolURI implements Serializable {
     @Override
     public final boolean isLocal() {
         if (this.isFile()) return true;
-        if (this.hash == null) synchronized (this.protocol) {
-            // this is synchronized because another thread may also call the same method in between
-            // that is the reason that this.hash is checked again
-            if (this.hash == null) this.hash = urlHashComputation();
-        }
+        if (this.hash == null) this.hash = urlHashComputation();
         return domDomain(this.hash) == 7;
     }
 

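The DigestURI hunks are a separate simplification bundled into the same commit: the double-checked locking around the lazy hash computation is dropped in favor of a plain unsynchronized check. A self-contained sketch of the resulting pattern (assuming, as the removed comments suggested, that the computation is deterministic, so a race at worst repeats the same work):

    // lock-free lazy initialization: concurrent callers may each compute the
    // value once, but they compute identical bytes, so the race is tolerated
    public class LazyHash {
        private byte[] hash; // computed on first use

        private byte[] compute() {
            return new byte[] { 42 }; // stand-in for urlHashComputation()
        }

        public byte[] hash() {
            if (this.hash == null) this.hash = compute();
            return this.hash;
        }
    }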
@@ -218,6 +218,7 @@ public final class LoaderDispatcher {
                     cachedResponse,
                     "200",
                     crawlProfile,
+                    true,
                     content);
 
             // check which caching strategy shall be used

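This is the one call site that sets the new flag to true: when the dispatcher can answer a load request from the HTCache, the resulting Response is marked as cached. In outline (a sketch; cachedResponse, crawlProfile and content are the names visible in the hunk, the leading arguments are assumed from the constructor signature):

    // cache hit inside LoaderDispatcher: mark the Response as cached
    Response response = new Response(
            request,
            requestHeader,
            cachedResponse,   // the ResponseHeader stored with the cache entry
            "200",
            crawlProfile,
            true,             // from cache: snippet code can now skip Cache.has()
            content);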
@@ -1794,7 +1794,7 @@ public final class Switchboard extends serverSwitch
                 0,
                 0,
                 0);
-        response = new Response(request, null, null, "200", this.crawler.defaultSurrogateProfile);
+        response = new Response(request, null, null, "200", this.crawler.defaultSurrogateProfile, false);
         final indexingQueueEntry queueEntry =
             new indexingQueueEntry(Segments.Process.SURROGATES, response, new Document[] {
                 document

@@ -201,12 +201,10 @@ public class TextSnippet implements Comparable<TextSnippet>, Comparator<TextSnippet>
         removeMatchingHashes(row.dc_subject(), remainingHashes);
         removeMatchingHashes(row.url().toNormalform(true, true).replace('-', ' '), remainingHashes);
 
-        boolean isInCache = de.anomic.crawler.Cache.has(url.hash());
-
         if (remainingHashes.size() == 0) {
             // the snippet is fully inside the metadata!
 
-            if (isInCache) {
+            if (de.anomic.crawler.Cache.has(url.hash())) {
                 // get the sentences from the cache
                 final Request request = loader.request(url, true, reindexing);
                 Response response;
@@ -261,7 +259,7 @@ public class TextSnippet implements Comparable<TextSnippet>, Comparator<TextSnippet>
             return;
         }
 
-        if (!isInCache && response != null) {
+        if (!response.fromCache()) {
             // place entry on indexing queue
             Switchboard.getSwitchboard().toIndexer(response);
             this.resultStatus = ResultClass.SOURCE_WEB;
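These TextSnippet hunks are where the flag pays off: the eagerly computed isInCache local is gone, the one remaining Cache.has() probe guards only the metadata-only fast path (this is the one cache.has() check the commit message says it omits), and the indexing decision reduces to a field read on the Response. A tiny model of that decision (YaCy types stubbed out):

    // content that is already in the cache is not queued for indexing again;
    // only freshly loaded responses go to the indexer
    static boolean shouldIndex(final boolean fromCache) {
        return !fromCache;
    }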