Mirror of https://github.com/yacy/yacy_search_server.git, synced 2024-09-19 00:01:41 +02:00
replaced wget requests with caching requests
git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@6242 6c8d7289-2bf4-0310-a012-ef5d649a1542
This commit is contained in: parent c6c97f23ad, commit 634a01a9a4
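
In short: page fetches that previously went straight to the network through Client.wget()/Client.wgetReader() are now routed through the LoaderDispatcher, which may answer them from the local response cache according to an explicit cache strategy. A minimal before/after sketch of the call pattern, built only from signatures that appear in the diff below (the yacyURL u, the Switchboard field sb, and the CrawlProfile constants are YaCy-internal names taken from the changed code):

    // before: direct HTTP fetch, bypassing the cache
    final RequestHeader reqHeader = new RequestHeader();
    reqHeader.put(HeaderFramework.USER_AGENT, HTTPLoader.yacyUserAgent);
    ContentScraper scraper = ContentScraper.parseResource(u, reqHeader);

    // after: load through the LoaderDispatcher; CACHE_STRATEGY_IFFRESH
    // reuses a cached copy as long as it is still fresh
    scraper = ContentScraper.parseResource(sb.loader, u, CrawlProfile.CACHE_STRATEGY_IFFRESH);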
getpageinfo_p.java

@@ -3,9 +3,8 @@ import java.io.IOException;
 import java.net.MalformedURLException;
 import java.util.Set;
 
-import de.anomic.crawler.retrieval.HTTPLoader;
+import de.anomic.crawler.CrawlProfile;
 import de.anomic.document.parser.html.ContentScraper;
-import de.anomic.http.metadata.HeaderFramework;
-import de.anomic.http.metadata.RequestHeader;
 import de.anomic.search.Switchboard;
 import de.anomic.server.serverObjects;

@@ -43,9 +42,7 @@ public class getpageinfo_p {
         if (actions.indexOf("title")>=0) {
             try {
                 final yacyURL u = new yacyURL(url, null);
-                final RequestHeader reqHeader = new RequestHeader();
-                reqHeader.put(HeaderFramework.USER_AGENT, HTTPLoader.yacyUserAgent); // do not set the crawler user agent, because this page was loaded by manual entering of the url
-                final ContentScraper scraper = ContentScraper.parseResource(u, reqHeader);
+                final ContentScraper scraper = ContentScraper.parseResource(sb.loader, u, CrawlProfile.CACHE_STRATEGY_IFFRESH);
 
                 // put the document title
                 prop.putXML("title", scraper.getTitle());
LoaderDispatcher.java

@@ -75,13 +75,42 @@ public final class LoaderDispatcher {
         return (HashSet<String>) this.supportedProtocols.clone();
     }
 
+    public static byte[] toBytes(Response response) {
+        if (response == null) return null;
+        return response.getContent();
+    }
+
     public Response load(final yacyURL url) throws IOException {
         return load(url, true, false);
     }
 
+    public Response load(final yacyURL url, int cachePolicy) throws IOException {
+        return load(url, true, false, cachePolicy);
+    }
+
     public Response load(
             final yacyURL url,
             final boolean forText,
             final boolean global
             ) throws IOException {
-        final Request centry = new Request(
+        return load(request(url, forText, global));
+    }
+
+    public Response load(
+            final yacyURL url,
+            final boolean forText,
+            final boolean global,
+            int cacheStrategy
+            ) throws IOException {
+        return load(request(url, forText, global), cacheStrategy);
+    }
+
+    public Request request(
+            final yacyURL url,
+            final boolean forText,
+            final boolean global
+            ) throws IOException {
+        return new Request(
             sb.peers.mySeed().hash,
             url,
             "",

@@ -99,11 +128,16 @@ public final class LoaderDispatcher {
             0,
             0,
             0);
-
-        return load(centry);
     }
 
     public Response load(final Request request) throws IOException {
+        CrawlProfile.entry crawlProfile = sb.crawler.profilesActiveCrawls.getEntry(request.profileHandle());
+        int cacheStrategy = CrawlProfile.CACHE_STRATEGY_IFFRESH;
+        if (crawlProfile != null) cacheStrategy = crawlProfile.cacheStrategy();
+        return load(request, cacheStrategy);
+    }
+
+    public Response load(final Request request, int cacheStrategy) throws IOException {
         // get the protocol of the next URL
         final String protocol = request.url().getProtocol();
         final String host = request.url().getHost();

@@ -115,8 +149,7 @@ public final class LoaderDispatcher {
         // check if we have the page in the cache
 
         CrawlProfile.entry crawlProfile = sb.crawler.profilesActiveCrawls.getEntry(request.profileHandle());
-        int cacheStrategy = CrawlProfile.CACHE_STRATEGY_NOCACHE;
-        if (crawlProfile != null && (cacheStrategy = crawlProfile.cacheStrategy()) != CrawlProfile.CACHE_STRATEGY_NOCACHE) {
+        if (crawlProfile != null && cacheStrategy != CrawlProfile.CACHE_STRATEGY_NOCACHE) {
             // we have passed a first test if caching is allowed
             // now see if there is a cache entry
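The overloads above make the cache policy an explicit parameter, while the unchanged load(Request) entry point now derives it from the request's crawl profile, falling back to CACHE_STRATEGY_IFFRESH when no profile is found. A short usage sketch against the methods added in this hunk; loader (a LoaderDispatcher instance) and url (a yacyURL) are placeholders assumed for the example:

    // load with an explicit cache policy: answer from the cache if the
    // stored copy is still fresh, otherwise fetch over the network
    final Response response = loader.load(url, CrawlProfile.CACHE_STRATEGY_IFFRESH);

    // toBytes() tolerates a failed load: a null Response maps to null bytes
    // instead of a NullPointerException at the call site
    final byte[] content = LoaderDispatcher.toBytes(response);
    if (content == null) throw new IOException("no response from url " + url);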
ContentScraper.java

@@ -29,7 +29,6 @@ package de.anomic.document.parser.html;
 import java.io.ByteArrayInputStream;
 import java.io.File;
 import java.io.IOException;
-import java.io.Reader;
 import java.io.UnsupportedEncodingException;
 import java.io.Writer;
 import java.net.MalformedURLException;

@@ -44,11 +43,8 @@ import java.util.Properties;
 
 import javax.swing.event.EventListenerList;
 
-import de.anomic.crawler.retrieval.HTTPLoader;
+import de.anomic.crawler.retrieval.LoaderDispatcher;
 import de.anomic.document.parser.htmlParser;
-import de.anomic.http.client.Client;
-import de.anomic.http.metadata.HeaderFramework;
-import de.anomic.http.metadata.RequestHeader;
 import de.anomic.kelondro.util.FileUtils;
 import de.anomic.server.serverCharBuffer;
 import de.anomic.yacy.yacyURL;

@@ -511,25 +507,15 @@ public class ContentScraper extends AbstractScraper implements Scraper {
         return scraper;
     }
 
-    public static ContentScraper parseResource(final yacyURL location) throws IOException {
+    public static ContentScraper parseResource(final LoaderDispatcher loader, final yacyURL location, int cachePolicy) throws IOException {
         // load page
-        final RequestHeader reqHeader = new RequestHeader();
-        reqHeader.put(HeaderFramework.USER_AGENT, HTTPLoader.crawlerUserAgent);
-        return parseResource(location, reqHeader);
-    }
-
-    public static ContentScraper parseResource(final yacyURL location, final RequestHeader reqHeader) throws IOException {
-        final Reader pageReader = Client.wgetReader(location.toString(), reqHeader, 10000);
-        if (pageReader == null) throw new IOException("no response from url " + location.toString());
+        byte[] page = LoaderDispatcher.toBytes(loader.load(location, cachePolicy));
+        if (page == null) throw new IOException("no response from url " + location.toString());
 
         // scrape content
         final ContentScraper scraper = new ContentScraper(location);
         final Writer writer = new TransformerWriter(null, null, scraper, null, false);
-        try {
-            FileUtils.copy(pageReader, writer);
-        } finally {
-            pageReader.close();
-        }
+        writer.write(new String(page, "UTF-8"));
 
         return scraper;
     }
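One behavioral detail: the removed path decoded the page with the charset advertised by the server (wgetReader built its InputStreamReader from response.getResponseHeader().getCharSet()), while the new path decodes the cached bytes as UTF-8 unconditionally. A hedged sketch of a charset-aware variant, not part of this commit, assuming the Response object is kept instead of being collapsed to bytes, and that it exposes the same header accessor seen in the removed wgetReader:

    final Response response = loader.load(location, cachePolicy);
    if (response == null) throw new IOException("no response from url " + location.toString());
    final byte[] page = response.getContent();
    // getCharSet() as used in the removed wgetReader; the null check is an added precaution
    final Charset charset = response.getResponseHeader().getCharSet();
    writer.write(new String(page, charset == null ? "UTF-8" : charset.name()));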
Client.java

@@ -29,9 +29,6 @@ package de.anomic.http.client;
 import java.io.ByteArrayOutputStream;
 import java.io.IOException;
 import java.io.InputStream;
-import java.io.InputStreamReader;
-import java.io.Reader;
-import java.nio.charset.Charset;
 import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.List;

@@ -77,7 +74,6 @@ import de.anomic.yacy.logging.Log;
 *
 */
 public class Client {
-
     /**
     * "the HttpClient instance and connection manager should be shared among all threads for maximum efficiency."
    * (Concurrent execution of HTTP methods, http://hc.apache.org/httpclient-3.x/performance.html)

@@ -746,9 +742,11 @@ public class Client {
     public static byte[] wget(final String uri) {
         return wget(uri, new RequestHeader(), 10000, null);
     }
 
+    public static byte[] wget(final String uri, final RequestHeader header, final int timeout) {
+        return wget(uri, header, timeout, null);
+    }
+
     public static byte[] wget(final String uri, final RequestHeader header, final int timeout, final String vhost) {
         assert uri != null : "precondition violated: uri != null";
         addHostHeader(header, vhost);

@@ -769,28 +767,6 @@ public class Client {
         }
         return null;
     }
-    public static Reader wgetReader(final String uri) {
-        return wgetReader(uri, new RequestHeader(), 10000, null);
-    }
-    public static Reader wgetReader(final String uri, final RequestHeader header, final int timeout) {
-        return wgetReader(uri, header, timeout, null);
-    }
-    public static Reader wgetReader(final String uri, final RequestHeader header, final int timeout, final String vhost) {
-        assert uri != null : "precondition violated: uri != null";
-        addHostHeader(header, vhost);
-        final Client client = new Client(timeout, header);
-
-        // do the request
-        ResponseContainer response = null;
-        try {
-            response = client.GET(uri);
-            Charset charset = response.getResponseHeader().getCharSet();
-            return new InputStreamReader(response.getDataAsStream(), charset);
-        } catch (final IOException e) {
-            Log.logWarning("HTTPC", "wgetReader(" + uri + ") failed: " + e.getMessage());
-        }
-        return null;
-    }
 
     /**
     * adds a Host-header to the header if vhost is not null
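With the Reader variants gone, Client keeps only the byte[]-returning wget() family, and all overloads funnel into the four-argument version. Usage as implied by the signatures above; uri, header and timeout are placeholders assumed for the example:

    final String uri = "http://example.net/";  // hypothetical URL
    final RequestHeader header = new RequestHeader();
    final int timeout = 10000;                 // milliseconds, matching the default above

    byte[] a = Client.wget(uri);                        // new header, 10000 ms, no vhost
    byte[] b = Client.wget(uri, header, timeout);       // vhost defaults to null
    byte[] c = Client.wget(uri, header, timeout, null); // a non-null vhost adds a Host header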
yacyRelease.java

@@ -44,6 +44,7 @@ import java.util.Map;
 import java.util.SortedSet;
 import java.util.TreeSet;
 
+import de.anomic.crawler.CrawlProfile;
 import de.anomic.crawler.retrieval.HTTPLoader;
 import de.anomic.document.parser.html.ContentScraper;
 import de.anomic.http.client.Client;

@@ -231,7 +232,7 @@ public final class yacyRelease extends yacyVersion {
         // returns the version info if successful, null otherwise
         ContentScraper scraper;
         try {
-            scraper = ContentScraper.parseResource(location.getLocationURL());
+            scraper = ContentScraper.parseResource(Switchboard.getSwitchboard().loader, location.getLocationURL(), CrawlProfile.CACHE_STRATEGY_NOCACHE);
         } catch (final IOException e) {
             return null;
         }
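Note the contrast in cache policy between the call sites: getpageinfo_p accepts a fresh cached copy (CACHE_STRATEGY_IFFRESH), while the release check above passes CACHE_STRATEGY_NOCACHE, presumably so that version information is always fetched live rather than answered from a stale cache. Read as a whole, the updated call site is:

    ContentScraper scraper;
    try {
        // bypass the cache: a stale page would defeat the update check
        scraper = ContentScraper.parseResource(Switchboard.getSwitchboard().loader,
                location.getLocationURL(), CrawlProfile.CACHE_STRATEGY_NOCACHE);
    } catch (final IOException e) {
        return null; // returns the version info if successful, null otherwise
    }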