diff --git a/source/de/anomic/crawler/retrieval/FTPLoader.java b/source/de/anomic/crawler/retrieval/FTPLoader.java
index acb5e5344..202c87034 100644
--- a/source/de/anomic/crawler/retrieval/FTPLoader.java
+++ b/source/de/anomic/crawler/retrieval/FTPLoader.java
@@ -134,6 +134,7 @@ public class FTPLoader {
                         responseHeader,
                         "200",
                         profile,
+                        false,
                         dirList.toString().getBytes());
                 }
             } else {
@@ -253,6 +254,7 @@ public class FTPLoader {
                     responseHeader,
                     "200",
                     profile,
+                    false,
                     null);
             return response;
         }
@@ -268,6 +270,7 @@ public class FTPLoader {
                     responseHeader,
                     "200",
                     profile,
+                    false,
                     b);
             return response;
         }
diff --git a/source/de/anomic/crawler/retrieval/FileLoader.java b/source/de/anomic/crawler/retrieval/FileLoader.java
index a2b9e6687..87451c169 100644
--- a/source/de/anomic/crawler/retrieval/FileLoader.java
+++ b/source/de/anomic/crawler/retrieval/FileLoader.java
@@ -11,12 +11,12 @@
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
- * 
+ *
  * This library is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  * Lesser General Public License for more details.
- * 
+ *
  * You should have received a copy of the GNU Lesser General Public License
  * along with this program in the file lgpl21.txt
  * If not, see <http://www.gnu.org/licenses/>.
@@ -30,8 +30,6 @@ import java.util.ArrayList;
 import java.util.Date;
 import java.util.List;
 
-import de.anomic.crawler.CrawlProfile;
-
 import net.yacy.cora.document.Classification;
 import net.yacy.cora.protocol.HeaderFramework;
 import net.yacy.cora.protocol.RequestHeader;
@@ -43,6 +41,7 @@ import net.yacy.kelondro.logging.Log;
 import net.yacy.kelondro.util.FileUtils;
 import net.yacy.search.Switchboard;
 import net.yacy.search.index.Segments;
+import de.anomic.crawler.CrawlProfile;
 
 public class FileLoader {
 
@@ -53,19 +52,19 @@ public class FileLoader {
     public FileLoader(final Switchboard sb, final Log log) {
         this.sb = sb;
         this.log = log;
-        maxFileSize = (int) sb.getConfigLong("crawler.file.maxFileSize", -1l);
+        this.maxFileSize = (int) sb.getConfigLong("crawler.file.maxFileSize", -1l);
     }
-    
+
     public Response load(final Request request, boolean acceptOnlyParseable) throws IOException {
         DigestURI url = request.url();
         if (!url.getProtocol().equals("file")) throw new IOException("wrong loader for FileLoader: " + url.getProtocol());
 
         RequestHeader requestHeader = new RequestHeader();
         if (request.referrerhash() != null) {
-            DigestURI ur = sb.getURL(Segments.Process.LOCALCRAWLING, request.referrerhash());
+            DigestURI ur = this.sb.getURL(Segments.Process.LOCALCRAWLING, request.referrerhash());
             if (ur != null) requestHeader.put(RequestHeader.REFERER, ur.toNormalform(true, false));
         }
-        
+
         // process directories: transform them to html with meta robots=noindex (using the ftpc lib)
         String[] l = null;
         try {l = url.list();} catch (IOException e) {}
@@ -83,30 +82,31 @@ public class FileLoader {
             for (String s: l) {
                 list.add(u + ((u.endsWith("/") || u.endsWith("\\")) ? "" : "/") + s);
             }
-            
+
             StringBuilder content = FTPClient.dirhtml(u, null, null, null, list, true);
-            
+
             ResponseHeader responseHeader = new ResponseHeader();
             responseHeader.put(HeaderFramework.LAST_MODIFIED, HeaderFramework.formatRFC1123(new Date()));
             responseHeader.put(HeaderFramework.CONTENT_TYPE, "text/html");
-            final CrawlProfile profile = sb.crawler.getActive(request.profileHandle().getBytes());
+            final CrawlProfile profile = this.sb.crawler.getActive(request.profileHandle().getBytes());
             Response response = new Response(
-                    request, 
+                    request,
                     requestHeader,
                     responseHeader,
                     "200",
                     profile,
+                    false,
                     content.toString().getBytes());
-            
+
             return response;
         }
-        
+
         // create response header
         String mime = Classification.ext2mime(url.getFileExtension());
         ResponseHeader responseHeader = new ResponseHeader();
         responseHeader.put(HeaderFramework.LAST_MODIFIED, HeaderFramework.formatRFC1123(new Date(url.lastModified())));
         responseHeader.put(HeaderFramework.CONTENT_TYPE, mime);
-        
+
         // check mime type and availability of parsers
         // and also check resource size and limitation of the size
         long size;
@@ -117,42 +117,44 @@ public class FileLoader {
         }
         String parserError = null;
         if ((acceptOnlyParseable && (parserError = TextParser.supports(url, mime)) != null) ||
-            (size > maxFileSize && maxFileSize >= 0)) {
+            (size > this.maxFileSize && this.maxFileSize >= 0)) {
             // we know that we cannot process that file before loading
             // only the metadata is returned
-            
+
             if (parserError != null) {
-                log.logInfo("No parser available in File crawler: '" + parserError + "' for URL " + request.url().toString() + ": parsing only metadata");
+                this.log.logInfo("No parser available in File crawler: '" + parserError + "' for URL " + request.url().toString() + ": parsing only metadata");
             } else {
-                log.logInfo("Too big file in File crawler with size = " + size + " Bytes for URL " + request.url().toString() + ": parsing only metadata");
+                this.log.logInfo("Too big file in File crawler with size = " + size + " Bytes for URL " + request.url().toString() + ": parsing only metadata");
             }
-            
+
             // create response with metadata only
             responseHeader.put(HeaderFramework.CONTENT_TYPE, "text/plain");
-            final CrawlProfile profile = sb.crawler.getActive(request.profileHandle().getBytes());
+            final CrawlProfile profile = this.sb.crawler.getActive(request.profileHandle().getBytes());
             Response response = new Response(
-                    request, 
+                    request,
                     requestHeader,
                     responseHeader,
                     "200",
                     profile,
+                    false,
                     url.toTokens().getBytes());
             return response;
         }
-        
+
         // load the resource
         InputStream is = url.getInputStream(null, -1);
         byte[] b = FileUtils.read(is);
         is.close();
-        
+
         // create response with loaded content
-        final CrawlProfile profile = sb.crawler.getActive(request.profileHandle().getBytes());
+        final CrawlProfile profile = this.sb.crawler.getActive(request.profileHandle().getBytes());
         Response response = new Response(
-                request, 
+                request,
                 requestHeader,
                 responseHeader,
                 "200",
                 profile,
+                false,
                 b);
         return response;
     }
diff --git a/source/de/anomic/crawler/retrieval/HTTPLoader.java b/source/de/anomic/crawler/retrieval/HTTPLoader.java
index d68ccc743..a3547feda 100644
--- a/source/de/anomic/crawler/retrieval/HTTPLoader.java
+++ b/source/de/anomic/crawler/retrieval/HTTPLoader.java
@@ -205,6 +205,7 @@ public final class HTTPLoader {
                     header,
                     Integer.toString(code),
                     profile,
+                    false,
                     responseBody
             );
 
@@ -273,6 +274,7 @@ public final class HTTPLoader {
                     header,
                     Integer.toString(code),
                     null,
+                    false,
                     responseBody
             );
 
diff --git a/source/de/anomic/crawler/retrieval/Response.java b/source/de/anomic/crawler/retrieval/Response.java
index a3d0eaa8b..c6a854d20 100644
--- a/source/de/anomic/crawler/retrieval/Response.java
+++ b/source/de/anomic/crawler/retrieval/Response.java
@@ -66,6 +66,7 @@ public class Response {
     private final CrawlProfile profile;
     private byte[] content;
     private int status; // tracker indexing status, see status defs below
+    private final boolean fromCache;
 
     // doctype calculation
     public static char docType(final MultiProtocolURI url) {
@@ -151,6 +152,7 @@ public class Response {
             final ResponseHeader responseHeader,
             final String responseStatus,
             final CrawlProfile profile,
+            final boolean fromCache,
             final byte[] content) {
         this.request = request;
         // request and response headers may be zero in case that we process surrogates
@@ -160,6 +162,7 @@ public class Response {
         this.profile = profile;
         this.status = QUEUE_STATE_FRESH;
         this.content = content;
+        this.fromCache = fromCache;
     }
 
     /**
@@ -179,6 +182,7 @@ public class Response {
         this.profile = profile;
         this.status = QUEUE_STATE_FRESH;
         this.content = request.name().length() > 0 ? request.name().getBytes() : request.url().toTokens().getBytes();
+        this.fromCache = true;
     }
 
     public Response(
@@ -186,8 +190,9 @@ public class Response {
             final RequestHeader requestHeader,
             final ResponseHeader responseHeader,
             final String responseStatus,
-            final CrawlProfile profile) {
-        this(request, requestHeader, responseHeader, responseStatus, profile, null);
+            final CrawlProfile profile,
+            final boolean fromCache) {
+        this(request, requestHeader, responseHeader, responseStatus, profile, fromCache, null);
     }
 
     public void updateStatus(final int newStatus) {
@@ -198,6 +203,10 @@ public class Response {
         return this.responseHeader;
     }
 
+    public boolean fromCache() {
+        return this.fromCache;
+    }
+
     public int getStatus() {
         return this.status;
     }
diff --git a/source/de/anomic/crawler/retrieval/SMBLoader.java b/source/de/anomic/crawler/retrieval/SMBLoader.java
index d3e516b00..e968263be 100644
--- a/source/de/anomic/crawler/retrieval/SMBLoader.java
+++ b/source/de/anomic/crawler/retrieval/SMBLoader.java
@@ -9,7 +9,7 @@
 // $LastChangedBy$
 //
 // LICENSE
-// 
+//
 // This program is free software; you can redistribute it and/or modify
 // it under the terms of the GNU General Public License as published by
 // the Free Software Foundation; either version 2 of the License, or
@@ -38,9 +38,6 @@ import java.util.List;
 import jcifs.smb.SmbException;
 import jcifs.smb.SmbFile;
 import jcifs.smb.SmbFileInputStream;
-
-import de.anomic.crawler.CrawlProfile;
-
 import net.yacy.cora.document.Classification;
 import net.yacy.cora.document.MultiProtocolURI;
 import net.yacy.cora.protocol.HeaderFramework;
@@ -53,11 +50,12 @@ import net.yacy.kelondro.logging.Log;
 import net.yacy.kelondro.util.FileUtils;
 import net.yacy.search.Switchboard;
 import net.yacy.search.index.Segments;
+import de.anomic.crawler.CrawlProfile;
 
 public class SMBLoader {
 
     public static final long DEFAULT_MAXFILESIZE = 1024 * 1024 * 10;
-    
+
     private final Switchboard sb;
     private final Log log;
     private final long maxFileSize;
@@ -65,20 +63,20 @@ public class SMBLoader {
     public SMBLoader(final Switchboard sb, final Log log) {
         this.sb = sb;
         this.log = log;
-        maxFileSize = sb.getConfigLong("crawler.smb.maxFileSize", -1l);
+        this.maxFileSize = sb.getConfigLong("crawler.smb.maxFileSize", -1l);
     }
-    
-    
+
+
     public Response load(final Request request, boolean acceptOnlyParseable) throws IOException {
         DigestURI url = request.url();
         if (!url.getProtocol().equals("smb")) throw new IOException("wrong loader for SMBLoader: " + url.getProtocol());
 
         RequestHeader requestHeader = new RequestHeader();
         if (request.referrerhash() != null) {
-            DigestURI ur = sb.getURL(Segments.Process.LOCALCRAWLING, request.referrerhash());
+            DigestURI ur = this.sb.getURL(Segments.Process.LOCALCRAWLING, request.referrerhash());
             if (ur != null) requestHeader.put(RequestHeader.REFERER, ur.toNormalform(true, false));
         }
-        
+
         // process directories: transform them to html with meta robots=noindex (using the ftpc lib)
         String[] l = null;
         try {l = url.list();} catch (IOException e) {}
@@ -103,30 +101,31 @@ public class SMBLoader {
                 }
                 list.add(u + s);
             }
-            
+
             StringBuilder content = FTPClient.dirhtml(u, null, null, null, list, true);
-            
+
             ResponseHeader responseHeader = new ResponseHeader();
             responseHeader.put(HeaderFramework.LAST_MODIFIED, HeaderFramework.formatRFC1123(new Date()));
             responseHeader.put(HeaderFramework.CONTENT_TYPE, "text/html");
-            final CrawlProfile profile = sb.crawler.getActive(request.profileHandle().getBytes());
+            final CrawlProfile profile = this.sb.crawler.getActive(request.profileHandle().getBytes());
             Response response = new Response(
-                    request, 
+                    request,
                     requestHeader,
                     responseHeader,
                     "200",
                     profile,
+                    false,
                     content.toString().getBytes());
-            
+
             return response;
         }
-        
+
         // create response header
         String mime = Classification.ext2mime(url.getFileExtension());
         ResponseHeader responseHeader = new ResponseHeader();
         responseHeader.put(HeaderFramework.LAST_MODIFIED, HeaderFramework.formatRFC1123(new Date(url.lastModified())));
         responseHeader.put(HeaderFramework.CONTENT_TYPE, mime);
-        
+
         // check mime type and availability of parsers
         // and also check resource size and limitation of the size
         long size;
@@ -137,46 +136,48 @@ public class SMBLoader {
         }
         String parserError = null;
         if ((acceptOnlyParseable && (parserError = TextParser.supports(url, mime)) != null) ||
-            (size > maxFileSize && maxFileSize >= 0)) {
+            (size > this.maxFileSize && this.maxFileSize >= 0)) {
            // we know that we cannot process that file before loading
            // only the metadata is returned
-            
+
            if (parserError != null) {
-                log.logInfo("No parser available in SMB crawler: '" + parserError + "' for URL " + request.url().toString() + ": parsing only metadata");
+                this.log.logInfo("No parser available in SMB crawler: '" + parserError + "' for URL " + request.url().toString() + ": parsing only metadata");
            } else {
-                log.logInfo("Too big file in SMB crawler with size = " + size + " Bytes for URL " + request.url().toString() + ": parsing only metadata");
+                this.log.logInfo("Too big file in SMB crawler with size = " + size + " Bytes for URL " + request.url().toString() + ": parsing only metadata");
            }
-            
+
            // create response with metadata only
            responseHeader.put(HeaderFramework.CONTENT_TYPE, "text/plain");
-            final CrawlProfile profile = sb.crawler.getActive(request.profileHandle().getBytes());
+            final CrawlProfile profile = this.sb.crawler.getActive(request.profileHandle().getBytes());
            Response response = new Response(
-                    request, 
+                    request,
                     requestHeader,
                     responseHeader,
                     "200",
                     profile,
+                    false,
                     url.toTokens().getBytes());
            return response;
        }
-        
+
        // load the resource
        InputStream is = url.getInputStream(null, -1);
        byte[] b = FileUtils.read(is);
        is.close();
-        
+
        // create response with loaded content
-        final CrawlProfile profile = sb.crawler.getActive(request.profileHandle().getBytes());
+        final CrawlProfile profile = this.sb.crawler.getActive(request.profileHandle().getBytes());
        Response response = new Response(
-                request, 
+                request,
                requestHeader,
                responseHeader,
                "200",
                profile,
+                false,
                b);
        return response;
    }
-    
+
    public static void main(String[] args) {
        //jcifs.Config.setProperty( "jcifs.netbios.wins", "192.168.1.220" );
        //NtlmPasswordAuthentication auth = new NtlmPasswordAuthentication("domain", "username", "password");
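The Response.java changes above are the core of the patch: a new final fromCache field, threaded through the existing telescoping constructors so that the shorter constructor keeps delegating to the full one and the field is assigned in exactly one place. A minimal standalone sketch of that pattern, for orientation only (field and method names mirror the patch; the class itself is a simplified stand-in, not YaCy code):

public class ResponseSketch {
    private final String responseStatus;
    private final byte[] content;     // may be null when only metadata is known
    private final boolean fromCache;  // true only when served from the local cache

    // full constructor: the single point where every field is assigned
    public ResponseSketch(final String responseStatus, final boolean fromCache, final byte[] content) {
        this.responseStatus = responseStatus;
        this.fromCache = fromCache;
        this.content = content;
    }

    // convenience constructor without content, delegating like the patch does
    public ResponseSketch(final String responseStatus, final boolean fromCache) {
        this(responseStatus, fromCache, null);
    }

    public boolean fromCache() {
        return this.fromCache;
    }

    @Override
    public String toString() {
        return this.responseStatus + " fromCache=" + this.fromCache
                + " contentBytes=" + (this.content == null ? 0 : this.content.length);
    }

    public static void main(String[] args) {
        System.out.println(new ResponseSketch("200", false, new byte[16])); // network fetch
        System.out.println(new ResponseSketch("200", true));                // cache hit, metadata only
    }
}

Because the field is final and set by every constructor, adding the parameter forces each call site in the loaders below to state explicitly whether its bytes came from the cache.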
diff --git a/source/de/anomic/http/server/HTTPDProxyHandler.java b/source/de/anomic/http/server/HTTPDProxyHandler.java
index 54f83ce31..7fcc86ac9 100644
--- a/source/de/anomic/http/server/HTTPDProxyHandler.java
+++ b/source/de/anomic/http/server/HTTPDProxyHandler.java
@@ -404,7 +404,8 @@ public final class HTTPDProxyHandler {
                     requestHeader,
                     cachedResponseHeader,
                     "200 OK",
-                    sb.crawler.defaultProxyProfile
+                    sb.crawler.defaultProxyProfile,
+                    false
             );
             final byte[] cacheContent = Cache.getContent(url.hash());
             if (cacheContent != null && response.isFreshForProxy()) {
@@ -548,7 +549,8 @@ public final class HTTPDProxyHandler {
                         requestHeader,
                         responseHeader,
                         Integer.toString(client.getHttpResponse().getStatusLine().getStatusCode()),
-                        sb.crawler.defaultProxyProfile
+                        sb.crawler.defaultProxyProfile,
+                        false
                 );
                 final String storeError = response.shallStoreCacheForProxy();
                 final boolean storeHTCache = response.profile().storeHTCache();
diff --git a/source/net/yacy/kelondro/data/meta/DigestURI.java b/source/net/yacy/kelondro/data/meta/DigestURI.java
index 6e2cd9491..5b2b95aab 100644
--- a/source/net/yacy/kelondro/data/meta/DigestURI.java
+++ b/source/net/yacy/kelondro/data/meta/DigestURI.java
@@ -186,12 +186,7 @@ public class DigestURI extends MultiProtocolURI implements Serializable {
      */
     public final byte[] hash() {
         // in case that the object was initialized without a known url hash, compute it now
-        if (this.hash == null) {
-            // we check the this.hash value twice to avoid synchronization where possible
-            synchronized (this.protocol) {
-                if (this.hash == null) this.hash = urlHashComputation();
-            }
-        }
+        if (this.hash == null) this.hash = urlHashComputation();
         return this.hash;
     }
 
@@ -376,11 +371,7 @@ public class DigestURI extends MultiProtocolURI implements Serializable {
     @Override
     public final boolean isLocal() {
         if (this.isFile()) return true;
-        if (this.hash == null) synchronized (this.protocol) {
-            // this is synchronized because another thread may also call the same method in between
-            // that is the reason that this.hash is checked again
-            if (this.hash == null) this.hash = urlHashComputation();
-        }
+        if (this.hash == null) this.hash = urlHashComputation();
         return domDomain(this.hash) == 7;
     }
 
diff --git a/source/net/yacy/repository/LoaderDispatcher.java b/source/net/yacy/repository/LoaderDispatcher.java
index 254f0c66a..82cef5fd4 100644
--- a/source/net/yacy/repository/LoaderDispatcher.java
+++ b/source/net/yacy/repository/LoaderDispatcher.java
@@ -218,6 +218,7 @@ public final class LoaderDispatcher {
                     cachedResponse,
                     "200",
                     crawlProfile,
+                    true,
                     content);
 
             // check which caching strategy shall be used
diff --git a/source/net/yacy/search/Switchboard.java b/source/net/yacy/search/Switchboard.java
index de1ac8465..171a13905 100644
--- a/source/net/yacy/search/Switchboard.java
+++ b/source/net/yacy/search/Switchboard.java
@@ -1794,7 +1794,7 @@ public final class Switchboard extends serverSwitch
                     0,
                     0,
                     0);
-            response = new Response(request, null, null, "200", this.crawler.defaultSurrogateProfile);
+            response = new Response(request, null, null, "200", this.crawler.defaultSurrogateProfile, false);
             final indexingQueueEntry queueEntry =
                 new indexingQueueEntry(Segments.Process.SURROGATES, response, new Document[] {
                     document
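The DigestURI hunks above drop the double-checked locking around the lazy hash computation in favor of a plain null check. The trade-off, assuming urlHashComputation() is deterministic and side-effect free (as the surrounding code suggests), is a benign race: two threads may both see null and both compute the hash, but they assign equal values, so at worst a computation is repeated while lock contention on this.protocol disappears. A standalone sketch of the same pattern (MD5 here is an arbitrary stand-in for the real hash function):

import java.nio.charset.StandardCharsets;
import java.security.MessageDigest;
import java.security.NoSuchAlgorithmException;

public class LazyHash {
    private final String url;
    private byte[] hash; // lazily computed; deliberately neither synchronized nor volatile, as in the patch

    public LazyHash(final String url) {
        this.url = url;
    }

    public byte[] hash() {
        // Benign race: concurrent callers may each compute the digest once,
        // but compute() is deterministic, so they assign equal values.
        // Worst case is a redundant computation, not a wrong result.
        if (this.hash == null) this.hash = compute();
        return this.hash;
    }

    private byte[] compute() {
        try {
            // stand-in for the patch's urlHashComputation()
            return MessageDigest.getInstance("MD5").digest(this.url.getBytes(StandardCharsets.UTF_8));
        } catch (final NoSuchAlgorithmException e) {
            throw new IllegalStateException(e);
        }
    }
}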
diff --git a/source/net/yacy/search/snippet/TextSnippet.java b/source/net/yacy/search/snippet/TextSnippet.java
index 2e00cf792..c0d0c5fa9 100644
--- a/source/net/yacy/search/snippet/TextSnippet.java
+++ b/source/net/yacy/search/snippet/TextSnippet.java
@@ -201,12 +201,10 @@ public class TextSnippet implements Comparable<TextSnippet>, Comparator<TextSnippet>
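Nothing in the hunks shown here branches on the new flag yet, but the call sites indicate the intent: every network loader and the proxy pass false, while LoaderDispatcher passes true for a Response reconstructed from the cache. A hypothetical consumer, purely illustrative and not part of this patch or the YaCy API, could use Response.fromCache() to avoid writing a cached document back into the cache:

public final class CacheWritePolicy {

    // A response answered from the cache is by definition already stored;
    // writing it back would be redundant I/O.
    public static boolean shouldWriteToCache(final boolean fromCache, final byte[] content) {
        return !fromCache && content != null && content.length > 0;
    }

    public static void main(String[] args) {
        System.out.println(shouldWriteToCache(false, new byte[]{1})); // network fetch -> true
        System.out.println(shouldWriteToCache(true, new byte[]{1}));  // cache hit -> false
    }
}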