Mirror of https://github.com/yacy/yacy_search_server.git, synced 2024-09-19 00:01:41 +02:00
replaced wget requests with caching requests
git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@6242 6c8d7289-2bf4-0310-a012-ef5d649a1542
This commit is contained in: parent c6c97f23ad, commit 634a01a9a4
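
In short: page fetches that previously went straight to the network through Client.wget()/Client.wgetReader() are now routed through the LoaderDispatcher, which may answer them from the local response cache according to an explicit cache strategy. A minimal before/after sketch of the call pattern, built only from signatures that appear in the diff below (the yacyURL u, the Switchboard field sb, and the CrawlProfile constants are YaCy-internal names taken from the changed code):

    // before: direct HTTP fetch, bypassing the cache
    final RequestHeader reqHeader = new RequestHeader();
    reqHeader.put(HeaderFramework.USER_AGENT, HTTPLoader.yacyUserAgent);
    ContentScraper scraper = ContentScraper.parseResource(u, reqHeader);

    // after: load through the LoaderDispatcher; CACHE_STRATEGY_IFFRESH
    // reuses a cached copy as long as it is still fresh
    scraper = ContentScraper.parseResource(sb.loader, u, CrawlProfile.CACHE_STRATEGY_IFFRESH);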
getpageinfo_p.java

@@ -3,9 +3,8 @@ import java.io.IOException;
 import java.net.MalformedURLException;
 import java.util.Set;
 
-import de.anomic.crawler.retrieval.HTTPLoader;
+import de.anomic.crawler.CrawlProfile;
 import de.anomic.document.parser.html.ContentScraper;
-import de.anomic.http.metadata.HeaderFramework;
-import de.anomic.http.metadata.RequestHeader;
 import de.anomic.search.Switchboard;
 import de.anomic.server.serverObjects;

@@ -43,9 +42,7 @@ public class getpageinfo_p {
         if (actions.indexOf("title")>=0) {
             try {
                 final yacyURL u = new yacyURL(url, null);
-                final RequestHeader reqHeader = new RequestHeader();
-                reqHeader.put(HeaderFramework.USER_AGENT, HTTPLoader.yacyUserAgent); // do not set the crawler user agent, because this page was loaded by manual entering of the url
-                final ContentScraper scraper = ContentScraper.parseResource(u, reqHeader);
+                final ContentScraper scraper = ContentScraper.parseResource(sb.loader, u, CrawlProfile.CACHE_STRATEGY_IFFRESH);
 
                 // put the document title
                 prop.putXML("title", scraper.getTitle());
LoaderDispatcher.java

@@ -75,13 +75,42 @@ public final class LoaderDispatcher {
         return (HashSet<String>) this.supportedProtocols.clone();
     }
 
+    public static byte[] toBytes(Response response) {
+        if (response == null) return null;
+        return response.getContent();
+    }
+
     public Response load(final yacyURL url) throws IOException {
         return load(url, true, false);
     }
 
+    public Response load(final yacyURL url, int cachePolicy) throws IOException {
+        return load(url, true, false, cachePolicy);
+    }
+
     public Response load(
             final yacyURL url,
             final boolean forText,
             final boolean global
             ) throws IOException {
-        final Request centry = new Request(
+        return load(request(url, forText, global));
+    }
+
+    public Response load(
+            final yacyURL url,
+            final boolean forText,
+            final boolean global,
+            int cacheStrategy
+            ) throws IOException {
+        return load(request(url, forText, global), cacheStrategy);
+    }
+
+    public Request request(
+            final yacyURL url,
+            final boolean forText,
+            final boolean global
+            ) throws IOException {
+        return new Request(
             sb.peers.mySeed().hash,
             url,
             "",

@@ -99,11 +128,16 @@ public final class LoaderDispatcher {
             0,
             0,
             0);
-
-        return load(centry);
     }
 
     public Response load(final Request request) throws IOException {
+        CrawlProfile.entry crawlProfile = sb.crawler.profilesActiveCrawls.getEntry(request.profileHandle());
+        int cacheStrategy = CrawlProfile.CACHE_STRATEGY_IFFRESH;
+        if (crawlProfile != null) cacheStrategy = crawlProfile.cacheStrategy();
+        return load(request, cacheStrategy);
+    }
+
+    public Response load(final Request request, int cacheStrategy) throws IOException {
         // get the protocol of the next URL
         final String protocol = request.url().getProtocol();
         final String host = request.url().getHost();

@@ -115,8 +149,7 @@ public final class LoaderDispatcher {
         // check if we have the page in the cache
 
         CrawlProfile.entry crawlProfile = sb.crawler.profilesActiveCrawls.getEntry(request.profileHandle());
-        int cacheStrategy = CrawlProfile.CACHE_STRATEGY_NOCACHE;
-        if (crawlProfile != null && (cacheStrategy = crawlProfile.cacheStrategy()) != CrawlProfile.CACHE_STRATEGY_NOCACHE) {
+        if (crawlProfile != null && cacheStrategy != CrawlProfile.CACHE_STRATEGY_NOCACHE) {
             // we have passed a first test if caching is allowed
             // now see if there is a cache entry
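The overloads above make the cache policy an explicit parameter, while the unchanged load(Request) entry point now derives it from the request's crawl profile, falling back to CACHE_STRATEGY_IFFRESH when no profile is found. A short usage sketch against the methods added in this hunk; loader (a LoaderDispatcher instance) and url (a yacyURL) are placeholders assumed for the example:

    // load with an explicit cache policy: answer from the cache if the
    // stored copy is still fresh, otherwise fetch over the network
    final Response response = loader.load(url, CrawlProfile.CACHE_STRATEGY_IFFRESH);

    // toBytes() tolerates a failed load: a null Response maps to null bytes
    // instead of a NullPointerException at the call site
    final byte[] content = LoaderDispatcher.toBytes(response);
    if (content == null) throw new IOException("no response from url " + url);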
ContentScraper.java

@@ -29,7 +29,6 @@ package de.anomic.document.parser.html;
 import java.io.ByteArrayInputStream;
 import java.io.File;
 import java.io.IOException;
-import java.io.Reader;
 import java.io.UnsupportedEncodingException;
 import java.io.Writer;
 import java.net.MalformedURLException;

@@ -44,11 +43,8 @@ import java.util.Properties;
 
 import javax.swing.event.EventListenerList;
 
-import de.anomic.crawler.retrieval.HTTPLoader;
+import de.anomic.crawler.retrieval.LoaderDispatcher;
 import de.anomic.document.parser.htmlParser;
-import de.anomic.http.client.Client;
-import de.anomic.http.metadata.HeaderFramework;
-import de.anomic.http.metadata.RequestHeader;
 import de.anomic.kelondro.util.FileUtils;
 import de.anomic.server.serverCharBuffer;
 import de.anomic.yacy.yacyURL;

@@ -511,25 +507,15 @@ public class ContentScraper extends AbstractScraper implements Scraper {
         return scraper;
     }
 
-    public static ContentScraper parseResource(final yacyURL location) throws IOException {
+    public static ContentScraper parseResource(final LoaderDispatcher loader, final yacyURL location, int cachePolicy) throws IOException {
         // load page
-        final RequestHeader reqHeader = new RequestHeader();
-        reqHeader.put(HeaderFramework.USER_AGENT, HTTPLoader.crawlerUserAgent);
-        return parseResource(location, reqHeader);
-    }
-
-    public static ContentScraper parseResource(final yacyURL location, final RequestHeader reqHeader) throws IOException {
-        final Reader pageReader = Client.wgetReader(location.toString(), reqHeader, 10000);
-        if (pageReader == null) throw new IOException("no response from url " + location.toString());
+        byte[] page = LoaderDispatcher.toBytes(loader.load(location, cachePolicy));
+        if (page == null) throw new IOException("no response from url " + location.toString());
 
         // scrape content
         final ContentScraper scraper = new ContentScraper(location);
         final Writer writer = new TransformerWriter(null, null, scraper, null, false);
-        try {
-            FileUtils.copy(pageReader, writer);
-        } finally {
-            pageReader.close();
-        }
+        writer.write(new String(page, "UTF-8"));
 
         return scraper;
     }
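One behavioral detail: the removed path decoded the page with the charset advertised by the server (wgetReader built its InputStreamReader from response.getResponseHeader().getCharSet()), while the new path decodes the cached bytes as UTF-8 unconditionally. A hedged sketch of a charset-aware variant, not part of this commit, assuming the Response object is kept instead of being collapsed to bytes, and that it exposes the same header accessor seen in the removed wgetReader:

    final Response response = loader.load(location, cachePolicy);
    if (response == null) throw new IOException("no response from url " + location.toString());
    final byte[] page = response.getContent();
    // getCharSet() as used in the removed wgetReader; the null check is an added precaution
    final Charset charset = response.getResponseHeader().getCharSet();
    writer.write(new String(page, charset == null ? "UTF-8" : charset.name()));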
Client.java

@@ -29,9 +29,6 @@ package de.anomic.http.client;
 import java.io.ByteArrayOutputStream;
 import java.io.IOException;
 import java.io.InputStream;
-import java.io.InputStreamReader;
-import java.io.Reader;
-import java.nio.charset.Charset;
 import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.List;

@@ -77,7 +74,6 @@ import de.anomic.yacy.logging.Log;
 *
 */
 public class Client {
-
     /**
     * "the HttpClient instance and connection manager should be shared among all threads for maximum efficiency."
    * (Concurrent execution of HTTP methods, http://hc.apache.org/httpclient-3.x/performance.html)

@@ -746,9 +742,11 @@ public class Client {
     public static byte[] wget(final String uri) {
         return wget(uri, new RequestHeader(), 10000, null);
     }
 
+    public static byte[] wget(final String uri, final RequestHeader header, final int timeout) {
+        return wget(uri, header, timeout, null);
+    }
+
     public static byte[] wget(final String uri, final RequestHeader header, final int timeout, final String vhost) {
         assert uri != null : "precondition violated: uri != null";
         addHostHeader(header, vhost);

@@ -769,28 +767,6 @@ public class Client {
         }
         return null;
     }
-    public static Reader wgetReader(final String uri) {
-        return wgetReader(uri, new RequestHeader(), 10000, null);
-    }
-    public static Reader wgetReader(final String uri, final RequestHeader header, final int timeout) {
-        return wgetReader(uri, header, timeout, null);
-    }
-    public static Reader wgetReader(final String uri, final RequestHeader header, final int timeout, final String vhost) {
-        assert uri != null : "precondition violated: uri != null";
-        addHostHeader(header, vhost);
-        final Client client = new Client(timeout, header);
-
-        // do the request
-        ResponseContainer response = null;
-        try {
-            response = client.GET(uri);
-            Charset charset = response.getResponseHeader().getCharSet();
-            return new InputStreamReader(response.getDataAsStream(), charset);
-        } catch (final IOException e) {
-            Log.logWarning("HTTPC", "wgetReader(" + uri + ") failed: " + e.getMessage());
-        }
-        return null;
-    }
 
     /**
     * adds a Host-header to the header if vhost is not null
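With the Reader variants gone, Client keeps only the byte[]-returning wget() family, and all overloads funnel into the four-argument version. Usage as implied by the signatures above; uri, header and timeout are placeholders assumed for the example:

    final String uri = "http://example.net/";  // hypothetical URL
    final RequestHeader header = new RequestHeader();
    final int timeout = 10000;                 // milliseconds, matching the default above

    byte[] a = Client.wget(uri);                        // new header, 10000 ms, no vhost
    byte[] b = Client.wget(uri, header, timeout);       // vhost defaults to null
    byte[] c = Client.wget(uri, header, timeout, null); // a non-null vhost adds a Host header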
yacyRelease.java

@@ -44,6 +44,7 @@ import java.util.Map;
 import java.util.SortedSet;
 import java.util.TreeSet;
 
+import de.anomic.crawler.CrawlProfile;
 import de.anomic.crawler.retrieval.HTTPLoader;
 import de.anomic.document.parser.html.ContentScraper;
 import de.anomic.http.client.Client;

@@ -231,7 +232,7 @@ public final class yacyRelease extends yacyVersion {
         // returns the version info if successful, null otherwise
         ContentScraper scraper;
         try {
-            scraper = ContentScraper.parseResource(location.getLocationURL());
+            scraper = ContentScraper.parseResource(Switchboard.getSwitchboard().loader, location.getLocationURL(), CrawlProfile.CACHE_STRATEGY_NOCACHE);
         } catch (final IOException e) {
             return null;
         }
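Note the contrast in cache policy between the call sites: getpageinfo_p accepts a fresh cached copy (CACHE_STRATEGY_IFFRESH), while the release check above passes CACHE_STRATEGY_NOCACHE, presumably so that version information is always fetched live rather than answered from a stale cache. Read as a whole, the updated call site is:

    ContentScraper scraper;
    try {
        // bypass the cache: a stale page would defeat the update check
        scraper = ContentScraper.parseResource(Switchboard.getSwitchboard().loader,
                location.getLocationURL(), CrawlProfile.CACHE_STRATEGY_NOCACHE);
    } catch (final IOException e) {
        return null; // returns the version info if successful, null otherwise
    }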