mirror of
https://github.com/yacy/yacy_search_server.git
synced 2024-09-19 00:01:41 +02:00
better integration of blacklist according to use case
This commit is contained in:
parent
c18fa9fa75
commit
1825f165b8
|
@ -196,7 +196,7 @@ public class Bookmarks {
|
||||||
// try to get the bookmark from the LURL database
|
// try to get the bookmark from the LURL database
|
||||||
final URIMetadataRow urlentry = sb.index.urlMetadata().load(ASCII.getBytes(urlHash));
|
final URIMetadataRow urlentry = sb.index.urlMetadata().load(ASCII.getBytes(urlHash));
|
||||||
if (urlentry != null) try {
|
if (urlentry != null) try {
|
||||||
final Document document = Document.mergeDocuments(urlentry.url(), null, sb.loader.loadDocuments(sb.loader.request(urlentry.url(), true, false), CacheStrategy.IFEXIST, 5000, Integer.MAX_VALUE));
|
final Document document = Document.mergeDocuments(urlentry.url(), null, sb.loader.loadDocuments(sb.loader.request(urlentry.url(), true, false), CacheStrategy.IFEXIST, 5000, Integer.MAX_VALUE, null));
|
||||||
prop.put("mode_edit", "0"); // create mode
|
prop.put("mode_edit", "0"); // create mode
|
||||||
prop.put("mode_url", urlentry.url().toNormalform(false, true));
|
prop.put("mode_url", urlentry.url().toNormalform(false, true));
|
||||||
prop.putHTML("mode_title", urlentry.dc_title());
|
prop.putHTML("mode_title", urlentry.dc_title());
|
||||||
|
|
|
@ -50,6 +50,7 @@ import net.yacy.kelondro.index.RowSpaceExceededException;
|
||||||
import net.yacy.kelondro.logging.Log;
|
import net.yacy.kelondro.logging.Log;
|
||||||
import net.yacy.kelondro.util.FileUtils;
|
import net.yacy.kelondro.util.FileUtils;
|
||||||
import net.yacy.peers.NewsPool;
|
import net.yacy.peers.NewsPool;
|
||||||
|
import net.yacy.repository.Blacklist.BlacklistType;
|
||||||
import net.yacy.search.Switchboard;
|
import net.yacy.search.Switchboard;
|
||||||
import net.yacy.search.SwitchboardConstants;
|
import net.yacy.search.SwitchboardConstants;
|
||||||
import net.yacy.search.index.Segment;
|
import net.yacy.search.index.Segment;
|
||||||
|
@ -322,7 +323,7 @@ public class Crawler_p {
|
||||||
sb.crawlQueues.errorURL.remove(urlhash);
|
sb.crawlQueues.errorURL.remove(urlhash);
|
||||||
|
|
||||||
// get a scraper to get the title
|
// get a scraper to get the title
|
||||||
final Document scraper = sb.loader.loadDocument(url, CacheStrategy.IFFRESH);
|
final Document scraper = sb.loader.loadDocument(url, CacheStrategy.IFFRESH, BlacklistType.CRAWLER);
|
||||||
final String title = scraper == null ? url.toNormalform(true, true) : scraper.dc_title();
|
final String title = scraper == null ? url.toNormalform(true, true) : scraper.dc_title();
|
||||||
final String description = scraper.dc_description();
|
final String description = scraper.dc_description();
|
||||||
|
|
||||||
|
@ -544,7 +545,7 @@ public class Crawler_p {
|
||||||
try {
|
try {
|
||||||
final DigestURI sitelistURL = new DigestURI(crawlingStart);
|
final DigestURI sitelistURL = new DigestURI(crawlingStart);
|
||||||
// download document
|
// download document
|
||||||
Document scraper = sb.loader.loadDocument(sitelistURL, CacheStrategy.IFFRESH);
|
Document scraper = sb.loader.loadDocument(sitelistURL, CacheStrategy.IFFRESH, BlacklistType.CRAWLER);
|
||||||
// String title = scraper.getTitle();
|
// String title = scraper.getTitle();
|
||||||
// String description = scraper.getDescription();
|
// String description = scraper.getDescription();
|
||||||
|
|
||||||
|
@ -647,11 +648,11 @@ public class Crawler_p {
|
||||||
|
|
||||||
private static long recrawlIfOlderC(final boolean recrawlIfOlderCheck, final int recrawlIfOlderNumber, final String crawlingIfOlderUnit) {
|
private static long recrawlIfOlderC(final boolean recrawlIfOlderCheck, final int recrawlIfOlderNumber, final String crawlingIfOlderUnit) {
|
||||||
if (!recrawlIfOlderCheck) return 0L;
|
if (!recrawlIfOlderCheck) return 0L;
|
||||||
if ("year".equals(crawlingIfOlderUnit)) return System.currentTimeMillis() - (long) recrawlIfOlderNumber * 1000L * 60L * 60L * 24L * 365L;
|
if ("year".equals(crawlingIfOlderUnit)) return System.currentTimeMillis() - recrawlIfOlderNumber * 1000L * 60L * 60L * 24L * 365L;
|
||||||
if ("month".equals(crawlingIfOlderUnit)) return System.currentTimeMillis() - (long) recrawlIfOlderNumber * 1000L * 60L * 60L * 24L * 30L;
|
if ("month".equals(crawlingIfOlderUnit)) return System.currentTimeMillis() - recrawlIfOlderNumber * 1000L * 60L * 60L * 24L * 30L;
|
||||||
if ("day".equals(crawlingIfOlderUnit)) return System.currentTimeMillis() - (long) recrawlIfOlderNumber * 1000L * 60L * 60L * 24L;
|
if ("day".equals(crawlingIfOlderUnit)) return System.currentTimeMillis() - recrawlIfOlderNumber * 1000L * 60L * 60L * 24L;
|
||||||
if ("hour".equals(crawlingIfOlderUnit)) return System.currentTimeMillis() - (long) recrawlIfOlderNumber * 1000L * 60L * 60L;
|
if ("hour".equals(crawlingIfOlderUnit)) return System.currentTimeMillis() - recrawlIfOlderNumber * 1000L * 60L * 60L;
|
||||||
return System.currentTimeMillis() - (long) recrawlIfOlderNumber;
|
return System.currentTimeMillis() - recrawlIfOlderNumber;
|
||||||
}
|
}
|
||||||
|
|
||||||
private static void setPerformance(final Switchboard sb, final serverObjects post) {
|
private static void setPerformance(final Switchboard sb, final serverObjects post) {
|
||||||
|
|
|
@ -65,7 +65,7 @@ public class DictionaryLoader_p {
|
||||||
if (post.containsKey("geon0Load")) {
|
if (post.containsKey("geon0Load")) {
|
||||||
// load from the net
|
// load from the net
|
||||||
try {
|
try {
|
||||||
final Response response = sb.loader.load(sb.loader.request(new DigestURI(LibraryProvider.Dictionary.GEON0.url), false, true), CacheStrategy.NOCACHE, Integer.MAX_VALUE, false);
|
final Response response = sb.loader.load(sb.loader.request(new DigestURI(LibraryProvider.Dictionary.GEON0.url), false, true), CacheStrategy.NOCACHE, Integer.MAX_VALUE, null);
|
||||||
final byte[] b = response.getContent();
|
final byte[] b = response.getContent();
|
||||||
FileUtils.copy(b, LibraryProvider.Dictionary.GEON0.file());
|
FileUtils.copy(b, LibraryProvider.Dictionary.GEON0.file());
|
||||||
LibraryProvider.geoLoc.activateLocation(LibraryProvider.Dictionary.GEON0.nickname, new GeonamesLocation(LibraryProvider.Dictionary.GEON0.file(), null, -1));
|
LibraryProvider.geoLoc.activateLocation(LibraryProvider.Dictionary.GEON0.nickname, new GeonamesLocation(LibraryProvider.Dictionary.GEON0.file(), null, -1));
|
||||||
|
@ -107,7 +107,7 @@ public class DictionaryLoader_p {
|
||||||
if (post.containsKey("geon1Load")) {
|
if (post.containsKey("geon1Load")) {
|
||||||
// load from the net
|
// load from the net
|
||||||
try {
|
try {
|
||||||
final Response response = sb.loader.load(sb.loader.request(new DigestURI(LibraryProvider.Dictionary.GEON1.url), false, true), CacheStrategy.NOCACHE, Integer.MAX_VALUE, false);
|
final Response response = sb.loader.load(sb.loader.request(new DigestURI(LibraryProvider.Dictionary.GEON1.url), false, true), CacheStrategy.NOCACHE, Integer.MAX_VALUE, null);
|
||||||
final byte[] b = response.getContent();
|
final byte[] b = response.getContent();
|
||||||
FileUtils.copy(b, LibraryProvider.Dictionary.GEON1.file());
|
FileUtils.copy(b, LibraryProvider.Dictionary.GEON1.file());
|
||||||
LibraryProvider.geoLoc.activateLocation(LibraryProvider.Dictionary.GEON1.nickname, new GeonamesLocation(LibraryProvider.Dictionary.GEON1.file(), null, -1));
|
LibraryProvider.geoLoc.activateLocation(LibraryProvider.Dictionary.GEON1.nickname, new GeonamesLocation(LibraryProvider.Dictionary.GEON1.file(), null, -1));
|
||||||
|
@ -149,7 +149,7 @@ public class DictionaryLoader_p {
|
||||||
if (post.containsKey("geon2Load")) {
|
if (post.containsKey("geon2Load")) {
|
||||||
// load from the net
|
// load from the net
|
||||||
try {
|
try {
|
||||||
final Response response = sb.loader.load(sb.loader.request(new DigestURI(LibraryProvider.Dictionary.GEON2.url), false, true), CacheStrategy.NOCACHE, Integer.MAX_VALUE, false);
|
final Response response = sb.loader.load(sb.loader.request(new DigestURI(LibraryProvider.Dictionary.GEON2.url), false, true), CacheStrategy.NOCACHE, Integer.MAX_VALUE, null);
|
||||||
final byte[] b = response.getContent();
|
final byte[] b = response.getContent();
|
||||||
FileUtils.copy(b, LibraryProvider.Dictionary.GEON2.file());
|
FileUtils.copy(b, LibraryProvider.Dictionary.GEON2.file());
|
||||||
LibraryProvider.geoLoc.activateLocation(LibraryProvider.Dictionary.GEON2.nickname, new GeonamesLocation(LibraryProvider.Dictionary.GEON2.file(), null, 100000));
|
LibraryProvider.geoLoc.activateLocation(LibraryProvider.Dictionary.GEON2.nickname, new GeonamesLocation(LibraryProvider.Dictionary.GEON2.file(), null, 100000));
|
||||||
|
@ -191,7 +191,7 @@ public class DictionaryLoader_p {
|
||||||
if (post.containsKey("geo1Load")) {
|
if (post.containsKey("geo1Load")) {
|
||||||
// load from the net
|
// load from the net
|
||||||
try {
|
try {
|
||||||
final Response response = sb.loader.load(sb.loader.request(new DigestURI(LibraryProvider.Dictionary.GEODB1.url), false, true), CacheStrategy.NOCACHE, Integer.MAX_VALUE, false);
|
final Response response = sb.loader.load(sb.loader.request(new DigestURI(LibraryProvider.Dictionary.GEODB1.url), false, true), CacheStrategy.NOCACHE, Integer.MAX_VALUE, null);
|
||||||
final byte[] b = response.getContent();
|
final byte[] b = response.getContent();
|
||||||
FileUtils.copy(b, LibraryProvider.Dictionary.GEODB1.file());
|
FileUtils.copy(b, LibraryProvider.Dictionary.GEODB1.file());
|
||||||
LibraryProvider.geoLoc.deactivateLocalization(LibraryProvider.Dictionary.GEODB1.nickname);
|
LibraryProvider.geoLoc.deactivateLocalization(LibraryProvider.Dictionary.GEODB1.nickname);
|
||||||
|
@ -234,7 +234,7 @@ public class DictionaryLoader_p {
|
||||||
if (post.containsKey("drw0Load")) {
|
if (post.containsKey("drw0Load")) {
|
||||||
// load from the net
|
// load from the net
|
||||||
try {
|
try {
|
||||||
final Response response = sb.loader.load(sb.loader.request(new DigestURI(LibraryProvider.Dictionary.DRW0.url), false, true), CacheStrategy.NOCACHE, Integer.MAX_VALUE, false);
|
final Response response = sb.loader.load(sb.loader.request(new DigestURI(LibraryProvider.Dictionary.DRW0.url), false, true), CacheStrategy.NOCACHE, Integer.MAX_VALUE, null);
|
||||||
final byte[] b = response.getContent();
|
final byte[] b = response.getContent();
|
||||||
FileUtils.copy(b, LibraryProvider.Dictionary.DRW0.file());
|
FileUtils.copy(b, LibraryProvider.Dictionary.DRW0.file());
|
||||||
LibraryProvider.activateDeReWo();
|
LibraryProvider.activateDeReWo();
|
||||||
|
@ -278,7 +278,7 @@ public class DictionaryLoader_p {
|
||||||
if (post.containsKey("pnd0Load")) {
|
if (post.containsKey("pnd0Load")) {
|
||||||
// load from the net
|
// load from the net
|
||||||
try {
|
try {
|
||||||
final Response response = sb.loader.load(sb.loader.request(new DigestURI(LibraryProvider.Dictionary.PND0.url), false, true), CacheStrategy.NOCACHE, Integer.MAX_VALUE, false);
|
final Response response = sb.loader.load(sb.loader.request(new DigestURI(LibraryProvider.Dictionary.PND0.url), false, true), CacheStrategy.NOCACHE, Integer.MAX_VALUE, null);
|
||||||
final byte[] b = response.getContent();
|
final byte[] b = response.getContent();
|
||||||
FileUtils.copy(b, LibraryProvider.Dictionary.PND0.file());
|
FileUtils.copy(b, LibraryProvider.Dictionary.PND0.file());
|
||||||
LibraryProvider.activatePND();
|
LibraryProvider.activatePND();
|
||||||
|
|
|
@ -41,6 +41,7 @@ import net.yacy.kelondro.blob.Tables.Row;
|
||||||
import net.yacy.kelondro.data.meta.DigestURI;
|
import net.yacy.kelondro.data.meta.DigestURI;
|
||||||
import net.yacy.kelondro.index.RowSpaceExceededException;
|
import net.yacy.kelondro.index.RowSpaceExceededException;
|
||||||
import net.yacy.kelondro.logging.Log;
|
import net.yacy.kelondro.logging.Log;
|
||||||
|
import net.yacy.repository.Blacklist.BlacklistType;
|
||||||
import net.yacy.search.Switchboard;
|
import net.yacy.search.Switchboard;
|
||||||
import de.anomic.crawler.RSSLoader;
|
import de.anomic.crawler.RSSLoader;
|
||||||
import de.anomic.crawler.retrieval.Response;
|
import de.anomic.crawler.retrieval.Response;
|
||||||
|
@ -255,7 +256,7 @@ public class Load_RSS_p {
|
||||||
RSSReader rss = null;
|
RSSReader rss = null;
|
||||||
if (url != null) try {
|
if (url != null) try {
|
||||||
prop.put("url", url.toNormalform(true, false));
|
prop.put("url", url.toNormalform(true, false));
|
||||||
final Response response = sb.loader.load(sb.loader.request(url, true, false), CacheStrategy.NOCACHE, Integer.MAX_VALUE, true);
|
final Response response = sb.loader.load(sb.loader.request(url, true, false), CacheStrategy.NOCACHE, Integer.MAX_VALUE, BlacklistType.CRAWLER);
|
||||||
final byte[] resource = response == null ? null : response.getContent();
|
final byte[] resource = response == null ? null : response.getContent();
|
||||||
rss = resource == null ? null : RSSReader.parse(RSSFeed.DEFAULT_MAXSIZE, resource);
|
rss = resource == null ? null : RSSReader.parse(RSSFeed.DEFAULT_MAXSIZE, resource);
|
||||||
} catch (final IOException e) {
|
} catch (final IOException e) {
|
||||||
|
|
|
@ -163,7 +163,7 @@ public class ViewFile {
|
||||||
|
|
||||||
Response response = null;
|
Response response = null;
|
||||||
try {
|
try {
|
||||||
response = sb.loader.load(sb.loader.request(url, true, false), authorized ? CacheStrategy.IFEXIST : CacheStrategy.CACHEONLY, Integer.MAX_VALUE, true);
|
response = sb.loader.load(sb.loader.request(url, true, false), authorized ? CacheStrategy.IFEXIST : CacheStrategy.CACHEONLY, Integer.MAX_VALUE, null);
|
||||||
} catch (final IOException e) {
|
} catch (final IOException e) {
|
||||||
prop.put("error", "4");
|
prop.put("error", "4");
|
||||||
prop.put("error_errorText", "error loading resource: " + e.getMessage());
|
prop.put("error_errorText", "error loading resource: " + e.getMessage());
|
||||||
|
|
|
@ -42,6 +42,7 @@ import net.yacy.kelondro.data.meta.DigestURI;
|
||||||
import net.yacy.kelondro.logging.Log;
|
import net.yacy.kelondro.logging.Log;
|
||||||
import net.yacy.kelondro.util.FileUtils;
|
import net.yacy.kelondro.util.FileUtils;
|
||||||
import net.yacy.kelondro.util.MemoryControl;
|
import net.yacy.kelondro.util.MemoryControl;
|
||||||
|
import net.yacy.repository.Blacklist.BlacklistType;
|
||||||
import net.yacy.search.Switchboard;
|
import net.yacy.search.Switchboard;
|
||||||
import de.anomic.server.serverObjects;
|
import de.anomic.server.serverObjects;
|
||||||
import de.anomic.server.serverSwitch;
|
import de.anomic.server.serverSwitch;
|
||||||
|
@ -95,7 +96,7 @@ public class ViewImage {
|
||||||
if (image == null) {
|
if (image == null) {
|
||||||
byte[] resourceb = null;
|
byte[] resourceb = null;
|
||||||
if (url != null) try {
|
if (url != null) try {
|
||||||
resourceb = sb.loader.loadContent(sb.loader.request(url, false, true), CacheStrategy.IFEXIST);
|
resourceb = sb.loader.loadContent(sb.loader.request(url, false, true), CacheStrategy.IFEXIST, BlacklistType.SEARCH);
|
||||||
} catch (final IOException e) {
|
} catch (final IOException e) {
|
||||||
Log.logFine("ViewImage", "cannot load: " + e.getMessage());
|
Log.logFine("ViewImage", "cannot load: " + e.getMessage());
|
||||||
}
|
}
|
||||||
|
|
|
@ -37,6 +37,7 @@ import net.yacy.cora.protocol.RequestHeader;
|
||||||
import net.yacy.cora.services.federated.yacy.CacheStrategy;
|
import net.yacy.cora.services.federated.yacy.CacheStrategy;
|
||||||
import net.yacy.kelondro.data.meta.DigestURI;
|
import net.yacy.kelondro.data.meta.DigestURI;
|
||||||
import net.yacy.kelondro.logging.Log;
|
import net.yacy.kelondro.logging.Log;
|
||||||
|
import net.yacy.repository.Blacklist.BlacklistType;
|
||||||
import net.yacy.search.Switchboard;
|
import net.yacy.search.Switchboard;
|
||||||
|
|
||||||
import org.w3c.dom.Document;
|
import org.w3c.dom.Document;
|
||||||
|
@ -94,7 +95,7 @@ public class getpageinfo {
|
||||||
}
|
}
|
||||||
net.yacy.document.Document scraper = null;
|
net.yacy.document.Document scraper = null;
|
||||||
if (u != null) try {
|
if (u != null) try {
|
||||||
scraper = sb.loader.loadDocument(u, CacheStrategy.IFEXIST);
|
scraper = sb.loader.loadDocument(u, CacheStrategy.IFEXIST, BlacklistType.CRAWLER);
|
||||||
} catch (final IOException e) {
|
} catch (final IOException e) {
|
||||||
Log.logException(e);
|
Log.logException(e);
|
||||||
// bad things are possible, e.g. that the Server responds with "403 Bad Behavior"
|
// bad things are possible, e.g. that the Server responds with "403 Bad Behavior"
|
||||||
|
|
|
@ -37,6 +37,7 @@ import net.yacy.cora.protocol.RequestHeader;
|
||||||
import net.yacy.cora.services.federated.yacy.CacheStrategy;
|
import net.yacy.cora.services.federated.yacy.CacheStrategy;
|
||||||
import net.yacy.kelondro.data.meta.DigestURI;
|
import net.yacy.kelondro.data.meta.DigestURI;
|
||||||
import net.yacy.kelondro.logging.Log;
|
import net.yacy.kelondro.logging.Log;
|
||||||
|
import net.yacy.repository.Blacklist.BlacklistType;
|
||||||
import net.yacy.search.Switchboard;
|
import net.yacy.search.Switchboard;
|
||||||
|
|
||||||
import org.w3c.dom.Document;
|
import org.w3c.dom.Document;
|
||||||
|
@ -94,7 +95,7 @@ public class getpageinfo_p {
|
||||||
}
|
}
|
||||||
net.yacy.document.Document scraper = null;
|
net.yacy.document.Document scraper = null;
|
||||||
if (u != null) try {
|
if (u != null) try {
|
||||||
scraper = sb.loader.loadDocument(u, CacheStrategy.IFEXIST);
|
scraper = sb.loader.loadDocument(u, CacheStrategy.IFEXIST, BlacklistType.CRAWLER);
|
||||||
} catch (final IOException e) {
|
} catch (final IOException e) {
|
||||||
Log.logException(e);
|
Log.logException(e);
|
||||||
// bad things are possible, e.g. that the Server responds with "403 Bad Behavior"
|
// bad things are possible, e.g. that the Server responds with "403 Bad Behavior"
|
||||||
|
|
|
@ -97,7 +97,7 @@ public class webstructure {
|
||||||
prop.put("references", 1);
|
prop.put("references", 1);
|
||||||
net.yacy.document.Document scraper = null;
|
net.yacy.document.Document scraper = null;
|
||||||
if (url != null) try {
|
if (url != null) try {
|
||||||
scraper = sb.loader.loadDocument(url, CacheStrategy.IFEXIST);
|
scraper = sb.loader.loadDocument(url, CacheStrategy.IFEXIST, null);
|
||||||
} catch (final IOException e) {
|
} catch (final IOException e) {
|
||||||
Log.logException(e);
|
Log.logException(e);
|
||||||
}
|
}
|
||||||
|
|
|
@ -69,6 +69,7 @@ import net.yacy.kelondro.util.SetTools;
|
||||||
import net.yacy.peers.EventChannel;
|
import net.yacy.peers.EventChannel;
|
||||||
import net.yacy.peers.NewsPool;
|
import net.yacy.peers.NewsPool;
|
||||||
import net.yacy.peers.graphics.ProfilingGraph;
|
import net.yacy.peers.graphics.ProfilingGraph;
|
||||||
|
import net.yacy.repository.Blacklist.BlacklistType;
|
||||||
import net.yacy.search.EventTracker;
|
import net.yacy.search.EventTracker;
|
||||||
import net.yacy.search.Switchboard;
|
import net.yacy.search.Switchboard;
|
||||||
import net.yacy.search.SwitchboardConstants;
|
import net.yacy.search.SwitchboardConstants;
|
||||||
|
@ -667,7 +668,7 @@ public class yacysearch {
|
||||||
sb.loader.request(urlentry.url(), true, false),
|
sb.loader.request(urlentry.url(), true, false),
|
||||||
CacheStrategy.IFEXIST,
|
CacheStrategy.IFEXIST,
|
||||||
5000,
|
5000,
|
||||||
Integer.MAX_VALUE);
|
Integer.MAX_VALUE, BlacklistType.SEARCH);
|
||||||
} catch ( final IOException e ) {
|
} catch ( final IOException e ) {
|
||||||
} catch ( final Parser.Failure e ) {
|
} catch ( final Parser.Failure e ) {
|
||||||
}
|
}
|
||||||
|
|
|
@ -183,7 +183,7 @@ public class yacysearchitem {
|
||||||
// END interaction
|
// END interaction
|
||||||
|
|
||||||
prop.putHTML("content_target", target);
|
prop.putHTML("content_target", target);
|
||||||
if (faviconURL != null && fileType == FileType.HTML) sb.loader.loadIfNotExistBackground(faviconURL, 1024 * 1024 * 10);
|
if (faviconURL != null && fileType == FileType.HTML) sb.loader.loadIfNotExistBackground(faviconURL, 1024 * 1024 * 10, null);
|
||||||
prop.putHTML("content_faviconCode", sb.licensedURLs.aquireLicense(faviconURL)); // acquire license for favicon url loading
|
prop.putHTML("content_faviconCode", sb.licensedURLs.aquireLicense(faviconURL)); // acquire license for favicon url loading
|
||||||
prop.put("content_urlhash", resulthashString);
|
prop.put("content_urlhash", resulthashString);
|
||||||
prop.put("content_ranking", result.ranking);
|
prop.put("content_ranking", result.ranking);
|
||||||
|
@ -266,7 +266,7 @@ public class yacysearchitem {
|
||||||
final String target = sb.getConfig(resultUrlstring.matches(target_special_pattern) ? SwitchboardConstants.SEARCH_TARGET_SPECIAL : SwitchboardConstants.SEARCH_TARGET_DEFAULT, "_self");
|
final String target = sb.getConfig(resultUrlstring.matches(target_special_pattern) ? SwitchboardConstants.SEARCH_TARGET_SPECIAL : SwitchboardConstants.SEARCH_TARGET_DEFAULT, "_self");
|
||||||
|
|
||||||
final String license = sb.licensedURLs.aquireLicense(ms.url());
|
final String license = sb.licensedURLs.aquireLicense(ms.url());
|
||||||
sb.loader.loadIfNotExistBackground(ms.url(), 1024 * 1024 * 10);
|
sb.loader.loadIfNotExistBackground(ms.url(), 1024 * 1024 * 10, null);
|
||||||
prop.putHTML("content_item_hrefCache", (auth) ? "/ViewImage.png?url=" + resultUrlstring : resultUrlstring);
|
prop.putHTML("content_item_hrefCache", (auth) ? "/ViewImage.png?url=" + resultUrlstring : resultUrlstring);
|
||||||
prop.putHTML("content_item_href", resultUrlstring);
|
prop.putHTML("content_item_href", resultUrlstring);
|
||||||
prop.putHTML("content_item_target", target);
|
prop.putHTML("content_item_target", target);
|
||||||
|
|
|
@ -49,6 +49,7 @@ import net.yacy.kelondro.workflow.WorkflowJob;
|
||||||
import net.yacy.peers.Protocol;
|
import net.yacy.peers.Protocol;
|
||||||
import net.yacy.peers.Seed;
|
import net.yacy.peers.Seed;
|
||||||
import net.yacy.peers.dht.PeerSelection;
|
import net.yacy.peers.dht.PeerSelection;
|
||||||
|
import net.yacy.repository.Blacklist.BlacklistType;
|
||||||
import net.yacy.search.Switchboard;
|
import net.yacy.search.Switchboard;
|
||||||
import net.yacy.search.Switchboard.indexingQueueEntry;
|
import net.yacy.search.Switchboard.indexingQueueEntry;
|
||||||
import net.yacy.search.SwitchboardConstants;
|
import net.yacy.search.SwitchboardConstants;
|
||||||
|
@ -655,7 +656,7 @@ public class CrawlQueues {
|
||||||
try {
|
try {
|
||||||
this.request.setStatus("loading", WorkflowJob.STATUS_RUNNING);
|
this.request.setStatus("loading", WorkflowJob.STATUS_RUNNING);
|
||||||
final CrawlProfile e = CrawlQueues.this.sb.crawler.getActive(UTF8.getBytes(this.request.profileHandle()));
|
final CrawlProfile e = CrawlQueues.this.sb.crawler.getActive(UTF8.getBytes(this.request.profileHandle()));
|
||||||
final Response response = CrawlQueues.this.sb.loader.load(this.request, e == null ? CacheStrategy.IFEXIST : e.cacheStrategy(), true);
|
final Response response = CrawlQueues.this.sb.loader.load(this.request, e == null ? CacheStrategy.IFEXIST : e.cacheStrategy(), BlacklistType.CRAWLER);
|
||||||
if (response == null) {
|
if (response == null) {
|
||||||
this.request.setStatus("error", WorkflowJob.STATUS_FINISHED);
|
this.request.setStatus("error", WorkflowJob.STATUS_FINISHED);
|
||||||
if (CrawlQueues.this.log.isFine()) {
|
if (CrawlQueues.this.log.isFine()) {
|
||||||
|
|
|
@ -41,6 +41,7 @@ import net.yacy.kelondro.data.meta.DigestURI;
|
||||||
import net.yacy.kelondro.index.RowSpaceExceededException;
|
import net.yacy.kelondro.index.RowSpaceExceededException;
|
||||||
import net.yacy.kelondro.logging.Log;
|
import net.yacy.kelondro.logging.Log;
|
||||||
import net.yacy.kelondro.order.Base64Order;
|
import net.yacy.kelondro.order.Base64Order;
|
||||||
|
import net.yacy.repository.Blacklist.BlacklistType;
|
||||||
import net.yacy.search.Switchboard;
|
import net.yacy.search.Switchboard;
|
||||||
import de.anomic.crawler.retrieval.Response;
|
import de.anomic.crawler.retrieval.Response;
|
||||||
import de.anomic.data.WorkTables;
|
import de.anomic.data.WorkTables;
|
||||||
|
@ -62,7 +63,7 @@ public class RSSLoader extends Thread {
|
||||||
public void run() {
|
public void run() {
|
||||||
RSSReader rss = null;
|
RSSReader rss = null;
|
||||||
try {
|
try {
|
||||||
final Response response = this.sb.loader.load(this.sb.loader.request(this.urlf, true, false), CacheStrategy.NOCACHE, Integer.MAX_VALUE, true);
|
final Response response = this.sb.loader.load(this.sb.loader.request(this.urlf, true, false), CacheStrategy.NOCACHE, Integer.MAX_VALUE, BlacklistType.CRAWLER);
|
||||||
final byte[] resource = response == null ? null : response.getContent();
|
final byte[] resource = response == null ? null : response.getContent();
|
||||||
rss = resource == null ? null : RSSReader.parse(RSSFeed.DEFAULT_MAXSIZE, resource);
|
rss = resource == null ? null : RSSReader.parse(RSSFeed.DEFAULT_MAXSIZE, resource);
|
||||||
} catch (final MalformedURLException e) {
|
} catch (final MalformedURLException e) {
|
||||||
|
|
|
@ -69,14 +69,14 @@ public final class HTTPLoader {
|
||||||
this.socketTimeout = (int) sb.getConfigLong("crawler.clientTimeout", 30000);
|
this.socketTimeout = (int) sb.getConfigLong("crawler.clientTimeout", 30000);
|
||||||
}
|
}
|
||||||
|
|
||||||
public Response load(final Request entry, final int maxFileSize, final boolean checkBlacklist) throws IOException {
|
public Response load(final Request entry, final int maxFileSize, final BlacklistType blacklistType) throws IOException {
|
||||||
final long start = System.currentTimeMillis();
|
final long start = System.currentTimeMillis();
|
||||||
final Response doc = load(entry, DEFAULT_CRAWLING_RETRY_COUNT, maxFileSize, checkBlacklist);
|
final Response doc = load(entry, DEFAULT_CRAWLING_RETRY_COUNT, maxFileSize, blacklistType);
|
||||||
Latency.update(entry.url(), System.currentTimeMillis() - start);
|
Latency.update(entry.url(), System.currentTimeMillis() - start);
|
||||||
return doc;
|
return doc;
|
||||||
}
|
}
|
||||||
|
|
||||||
private Response load(final Request request, final int retryCount, final int maxFileSize, final boolean checkBlacklist) throws IOException {
|
private Response load(final Request request, final int retryCount, final int maxFileSize, final BlacklistType blacklistType) throws IOException {
|
||||||
|
|
||||||
byte[] myHash = this.sb.peers.mySeed().hash.getBytes();
|
byte[] myHash = this.sb.peers.mySeed().hash.getBytes();
|
||||||
|
|
||||||
|
@ -96,7 +96,7 @@ public final class HTTPLoader {
|
||||||
|
|
||||||
// check if url is in blacklist
|
// check if url is in blacklist
|
||||||
final String hostlow = host.toLowerCase();
|
final String hostlow = host.toLowerCase();
|
||||||
if (checkBlacklist && Switchboard.urlBlacklist.isListed(BlacklistType.CRAWLER, hostlow, path)) {
|
if (blacklistType != null && Switchboard.urlBlacklist.isListed(blacklistType, hostlow, path)) {
|
||||||
this.sb.crawlQueues.errorURL.push(request, myHash, new Date(), 1, FailCategory.FINAL_LOAD_CONTEXT, "url in blacklist", -1);
|
this.sb.crawlQueues.errorURL.push(request, myHash, new Date(), 1, FailCategory.FINAL_LOAD_CONTEXT, "url in blacklist", -1);
|
||||||
throw new IOException("CRAWLER Rejecting URL '" + request.url().toString() + "'. URL is in blacklist.");
|
throw new IOException("CRAWLER Rejecting URL '" + request.url().toString() + "'. URL is in blacklist.");
|
||||||
}
|
}
|
||||||
|
@ -175,7 +175,7 @@ public final class HTTPLoader {
|
||||||
|
|
||||||
// retry crawling with new url
|
// retry crawling with new url
|
||||||
request.redirectURL(redirectionUrl);
|
request.redirectURL(redirectionUrl);
|
||||||
return load(request, retryCount - 1, maxFileSize, checkBlacklist);
|
return load(request, retryCount - 1, maxFileSize, blacklistType);
|
||||||
} else {
|
} else {
|
||||||
// we don't want to follow redirects
|
// we don't want to follow redirects
|
||||||
this.sb.crawlQueues.errorURL.push(request, myHash, new Date(), 1, FailCategory.FINAL_PROCESS_CONTEXT, "redirection not wanted", statusCode);
|
this.sb.crawlQueues.errorURL.push(request, myHash, new Date(), 1, FailCategory.FINAL_PROCESS_CONTEXT, "redirection not wanted", statusCode);
|
||||||
|
|
|
@ -69,7 +69,7 @@ public class YMarkAutoTagger implements Runnable, Thread.UncaughtExceptionHandle
|
||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
try {
|
try {
|
||||||
response = loader.load(loader.request(uri, true, false), CacheStrategy.IFEXIST, Integer.MAX_VALUE, true);
|
response = loader.load(loader.request(uri, true, false), CacheStrategy.IFEXIST, Integer.MAX_VALUE, null);
|
||||||
} catch (final IOException e) {
|
} catch (final IOException e) {
|
||||||
Log.logWarning(YMarkTables.BOOKMARKS_LOG, "loadDocument failed due to IOException for url: "+url);
|
Log.logWarning(YMarkTables.BOOKMARKS_LOG, "loadDocument failed due to IOException for url: "+url);
|
||||||
return null;
|
return null;
|
||||||
|
|
|
@ -97,7 +97,7 @@ public class YMarkMetadata {
|
||||||
public Document loadDocument(final LoaderDispatcher loader) throws IOException, Failure {
|
public Document loadDocument(final LoaderDispatcher loader) throws IOException, Failure {
|
||||||
if(this.document == null) {
|
if(this.document == null) {
|
||||||
Response response = null;
|
Response response = null;
|
||||||
response = loader.load(loader.request(this.uri, true, false), CacheStrategy.IFEXIST, Integer.MAX_VALUE, true);
|
response = loader.load(loader.request(this.uri, true, false), CacheStrategy.IFEXIST, Integer.MAX_VALUE, null);
|
||||||
this.document = Document.mergeDocuments(response.url(), response.getMimeType(), response.parse());
|
this.document = Document.mergeDocuments(response.url(), response.getMimeType(), response.parse());
|
||||||
}
|
}
|
||||||
return this.document;
|
return this.document;
|
||||||
|
|
|
@ -62,7 +62,7 @@ public class OAIListFriendsLoader implements Serializable {
|
||||||
listFriends.putAll(moreFriends);
|
listFriends.putAll(moreFriends);
|
||||||
if (loader != null) for (final Map.Entry<String, File> oaiFriend: listFriends.entrySet()) {
|
if (loader != null) for (final Map.Entry<String, File> oaiFriend: listFriends.entrySet()) {
|
||||||
try {
|
try {
|
||||||
loader.loadIfNotExistBackground(new DigestURI(oaiFriend.getKey()), oaiFriend.getValue(), Integer.MAX_VALUE);
|
loader.loadIfNotExistBackground(new DigestURI(oaiFriend.getKey()), oaiFriend.getValue(), Integer.MAX_VALUE, null);
|
||||||
} catch (final MalformedURLException e) {
|
} catch (final MalformedURLException e) {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -87,7 +87,7 @@ public class OAIListFriendsLoader implements Serializable {
|
||||||
Map<String, String> m;
|
Map<String, String> m;
|
||||||
for (final Map.Entry<String, File> oaiFriend: listFriends.entrySet()) try {
|
for (final Map.Entry<String, File> oaiFriend: listFriends.entrySet()) try {
|
||||||
if (!oaiFriend.getValue().exists()) {
|
if (!oaiFriend.getValue().exists()) {
|
||||||
final Response response = loader == null ? null : loader.load(loader.request(new DigestURI(oaiFriend.getKey()), false, true), CacheStrategy.NOCACHE, Integer.MAX_VALUE, true);
|
final Response response = loader == null ? null : loader.load(loader.request(new DigestURI(oaiFriend.getKey()), false, true), CacheStrategy.NOCACHE, Integer.MAX_VALUE, null);
|
||||||
if (response != null) FileUtils.copy(response.getContent(), oaiFriend.getValue());
|
if (response != null) FileUtils.copy(response.getContent(), oaiFriend.getValue());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -116,7 +116,7 @@ public class OAIListFriendsLoader implements Serializable {
|
||||||
}
|
}
|
||||||
return parser;
|
return parser;
|
||||||
}
|
}
|
||||||
|
|
||||||
// get a resumption token using a SAX xml parser from an input stream
|
// get a resumption token using a SAX xml parser from an input stream
|
||||||
public static class Parser extends DefaultHandler {
|
public static class Parser extends DefaultHandler {
|
||||||
|
|
||||||
|
@ -162,11 +162,12 @@ public class OAIListFriendsLoader implements Serializable {
|
||||||
<baseURL id="http://roar.eprints.org/id/eprint/1064">http://oai.repec.openlib.org/</baseURL>
|
<baseURL id="http://roar.eprints.org/id/eprint/1064">http://oai.repec.openlib.org/</baseURL>
|
||||||
</BaseURLs>
|
</BaseURLs>
|
||||||
*/
|
*/
|
||||||
|
|
||||||
public int getCounter() {
|
public int getCounter() {
|
||||||
return this.recordCounter;
|
return this.recordCounter;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
public void startElement(final String uri, final String name, final String tag, final Attributes atts) throws SAXException {
|
public void startElement(final String uri, final String name, final String tag, final Attributes atts) throws SAXException {
|
||||||
if ("baseURL".equals(tag)) {
|
if ("baseURL".equals(tag)) {
|
||||||
this.recordCounter++;
|
this.recordCounter++;
|
||||||
|
@ -175,6 +176,7 @@ public class OAIListFriendsLoader implements Serializable {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
public void endElement(final String uri, final String name, final String tag) {
|
public void endElement(final String uri, final String name, final String tag) {
|
||||||
if (tag == null) return;
|
if (tag == null) return;
|
||||||
if ("baseURL".equals(tag)) {
|
if ("baseURL".equals(tag)) {
|
||||||
|
@ -184,6 +186,7 @@ public class OAIListFriendsLoader implements Serializable {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
public void characters(final char ch[], final int start, final int length) {
|
public void characters(final char ch[], final int start, final int length) {
|
||||||
if (this.parsingValue) {
|
if (this.parsingValue) {
|
||||||
this.buffer.append(ch, start, length);
|
this.buffer.append(ch, start, length);
|
||||||
|
|
|
@ -54,7 +54,7 @@ public class OAIPMHLoader {
|
||||||
for (int i = 0; i < 5; i++) {
|
for (int i = 0; i < 5; i++) {
|
||||||
// make some retries if first attempt fails
|
// make some retries if first attempt fails
|
||||||
try {
|
try {
|
||||||
response = loader.load(loader.request(source, false, true), CacheStrategy.NOCACHE, Integer.MAX_VALUE, true);
|
response = loader.load(loader.request(source, false, true), CacheStrategy.NOCACHE, Integer.MAX_VALUE, null);
|
||||||
break;
|
break;
|
||||||
} catch (IOException e) {
|
} catch (IOException e) {
|
||||||
Log.logWarning("OAIPMHLoader", "loading failed at attempt " + (i + 1) + ": " + source.toNormalform(true, false));
|
Log.logWarning("OAIPMHLoader", "loading failed at attempt " + (i + 1) + ": " + source.toNormalform(true, false));
|
||||||
|
|
|
@ -83,6 +83,7 @@ public class OSMTile {
|
||||||
public Place(final RasterPlotter m, final int xt, final int yt, final int xc, final int yc, final int z) {
|
public Place(final RasterPlotter m, final int xt, final int yt, final int xc, final int yc, final int z) {
|
||||||
this.m = m; this.xt = xt; this.yt = yt; this.xc = xc; this.yc = yc; this.z = z;
|
this.m = m; this.xt = xt; this.yt = yt; this.xc = xc; this.yc = yc; this.z = z;
|
||||||
}
|
}
|
||||||
|
@Override
|
||||||
public void run() {
|
public void run() {
|
||||||
final tileCoordinates t = new tileCoordinates(this.xt, this.yt, this.z);
|
final tileCoordinates t = new tileCoordinates(this.xt, this.yt, this.z);
|
||||||
BufferedImage bi = null;
|
BufferedImage bi = null;
|
||||||
|
@ -111,7 +112,7 @@ public class OSMTile {
|
||||||
// download resource using the crawler and keep resource in memory if possible
|
// download resource using the crawler and keep resource in memory if possible
|
||||||
Response entry = null;
|
Response entry = null;
|
||||||
try {
|
try {
|
||||||
entry = Switchboard.getSwitchboard().loader.load(Switchboard.getSwitchboard().loader.request(tileURL, false, false), CacheStrategy.IFEXIST, Integer.MAX_VALUE, true);
|
entry = Switchboard.getSwitchboard().loader.load(Switchboard.getSwitchboard().loader.request(tileURL, false, false), CacheStrategy.IFEXIST, Integer.MAX_VALUE, null);
|
||||||
} catch (final IOException e) {
|
} catch (final IOException e) {
|
||||||
Log.logWarning("OSMTile", "cannot load: " + e.getMessage());
|
Log.logWarning("OSMTile", "cannot load: " + e.getMessage());
|
||||||
return null;
|
return null;
|
||||||
|
|
|
@ -240,7 +240,7 @@ public final class yacyRelease extends yacyVersion {
|
||||||
try {
|
try {
|
||||||
final DigestURI uri = location.getLocationURL();
|
final DigestURI uri = location.getLocationURL();
|
||||||
Thread.currentThread().setName("allReleaseFrom - host " + uri.getHost()); // makes it more easy to see which release blocks process in thread dump
|
Thread.currentThread().setName("allReleaseFrom - host " + uri.getHost()); // makes it more easy to see which release blocks process in thread dump
|
||||||
scraper = Switchboard.getSwitchboard().loader.loadDocument(uri, CacheStrategy.NOCACHE);
|
scraper = Switchboard.getSwitchboard().loader.loadDocument(uri, CacheStrategy.NOCACHE, null);
|
||||||
} catch (final IOException e) {
|
} catch (final IOException e) {
|
||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
|
|
|
@ -133,9 +133,9 @@ public final class LoaderDispatcher {
|
||||||
0);
|
0);
|
||||||
}
|
}
|
||||||
|
|
||||||
public void load(final DigestURI url, final CacheStrategy cacheStratgy, final int maxFileSize, final File targetFile) throws IOException {
|
public void load(final DigestURI url, final CacheStrategy cacheStratgy, final int maxFileSize, final File targetFile, BlacklistType blacklistType) throws IOException {
|
||||||
|
|
||||||
final byte[] b = load(request(url, false, true), cacheStratgy, maxFileSize, true).getContent();
|
final byte[] b = load(request(url, false, true), cacheStratgy, maxFileSize, blacklistType).getContent();
|
||||||
if (b == null) throw new IOException("load == null");
|
if (b == null) throw new IOException("load == null");
|
||||||
final File tmp = new File(targetFile.getAbsolutePath() + ".tmp");
|
final File tmp = new File(targetFile.getAbsolutePath() + ".tmp");
|
||||||
|
|
||||||
|
@ -146,11 +146,11 @@ public final class LoaderDispatcher {
|
||||||
tmp.renameTo(targetFile);
|
tmp.renameTo(targetFile);
|
||||||
}
|
}
|
||||||
|
|
||||||
public Response load(final Request request, final CacheStrategy cacheStrategy, final boolean checkBlacklist) throws IOException {
|
public Response load(final Request request, final CacheStrategy cacheStrategy, final BlacklistType blacklistType) throws IOException {
|
||||||
return load(request, cacheStrategy, protocolMaxFileSize(request.url()), checkBlacklist);
|
return load(request, cacheStrategy, protocolMaxFileSize(request.url()), blacklistType);
|
||||||
}
|
}
|
||||||
|
|
||||||
public Response load(final Request request, final CacheStrategy cacheStrategy, final int maxFileSize, final boolean checkBlacklist) throws IOException {
|
public Response load(final Request request, final CacheStrategy cacheStrategy, final int maxFileSize, final BlacklistType blacklistType) throws IOException {
|
||||||
Semaphore check = this.loaderSteering.get(request.url());
|
Semaphore check = this.loaderSteering.get(request.url());
|
||||||
if (check != null) {
|
if (check != null) {
|
||||||
// a loading process may be going on for that url
|
// a loading process may be going on for that url
|
||||||
|
@ -161,7 +161,7 @@ public final class LoaderDispatcher {
|
||||||
|
|
||||||
this.loaderSteering.put(request.url(), new Semaphore(0));
|
this.loaderSteering.put(request.url(), new Semaphore(0));
|
||||||
try {
|
try {
|
||||||
final Response response = loadInternal(request, cacheStrategy, maxFileSize, checkBlacklist);
|
final Response response = loadInternal(request, cacheStrategy, maxFileSize, blacklistType);
|
||||||
check = this.loaderSteering.remove(request.url());
|
check = this.loaderSteering.remove(request.url());
|
||||||
if (check != null) check.release(1000);
|
if (check != null) check.release(1000);
|
||||||
return response;
|
return response;
|
||||||
|
@ -181,7 +181,7 @@ public final class LoaderDispatcher {
|
||||||
* @return the loaded entity in a Response object
|
* @return the loaded entity in a Response object
|
||||||
* @throws IOException
|
* @throws IOException
|
||||||
*/
|
*/
|
||||||
private Response loadInternal(final Request request, CacheStrategy cacheStrategy, final int maxFileSize, final boolean checkBlacklist) throws IOException {
|
private Response loadInternal(final Request request, CacheStrategy cacheStrategy, final int maxFileSize, final BlacklistType blacklistType) throws IOException {
|
||||||
// get the protocol of the next URL
|
// get the protocol of the next URL
|
||||||
final DigestURI url = request.url();
|
final DigestURI url = request.url();
|
||||||
if (url.isFile() || url.isSMB()) cacheStrategy = CacheStrategy.NOCACHE; // load just from the file system
|
if (url.isFile() || url.isSMB()) cacheStrategy = CacheStrategy.NOCACHE; // load just from the file system
|
||||||
|
@ -189,7 +189,7 @@ public final class LoaderDispatcher {
|
||||||
final String host = url.getHost();
|
final String host = url.getHost();
|
||||||
|
|
||||||
// check if url is in blacklist
|
// check if url is in blacklist
|
||||||
if (checkBlacklist && host != null && Switchboard.urlBlacklist.isListed(BlacklistType.CRAWLER, host.toLowerCase(), url.getFile())) {
|
if (blacklistType != null && host != null && Switchboard.urlBlacklist.isListed(blacklistType, host.toLowerCase(), url.getFile())) {
|
||||||
this.sb.crawlQueues.errorURL.push(request, this.sb.peers.mySeed().hash.getBytes(), new Date(), 1, FailCategory.FINAL_LOAD_CONTEXT, "url in blacklist", -1);
|
this.sb.crawlQueues.errorURL.push(request, this.sb.peers.mySeed().hash.getBytes(), new Date(), 1, FailCategory.FINAL_LOAD_CONTEXT, "url in blacklist", -1);
|
||||||
throw new IOException("DISPATCHER Rejecting URL '" + request.url().toString() + "'. URL is in blacklist.");
|
throw new IOException("DISPATCHER Rejecting URL '" + request.url().toString() + "'. URL is in blacklist.");
|
||||||
}
|
}
|
||||||
|
@ -271,7 +271,7 @@ public final class LoaderDispatcher {
|
||||||
// load resource from the internet
|
// load resource from the internet
|
||||||
Response response = null;
|
Response response = null;
|
||||||
if (protocol.equals("http") || protocol.equals("https")) {
|
if (protocol.equals("http") || protocol.equals("https")) {
|
||||||
response = this.httpLoader.load(request, maxFileSize, checkBlacklist);
|
response = this.httpLoader.load(request, maxFileSize, blacklistType);
|
||||||
} else if (protocol.equals("ftp")) {
|
} else if (protocol.equals("ftp")) {
|
||||||
response = this.ftpLoader.load(request, true);
|
response = this.ftpLoader.load(request, true);
|
||||||
} else if (protocol.equals("smb")) {
|
} else if (protocol.equals("smb")) {
|
||||||
|
@ -326,19 +326,19 @@ public final class LoaderDispatcher {
|
||||||
* @return the content as {@link byte[]}
|
* @return the content as {@link byte[]}
|
||||||
* @throws IOException
|
* @throws IOException
|
||||||
*/
|
*/
|
||||||
public byte[] loadContent(final Request request, final CacheStrategy cacheStrategy) throws IOException {
|
public byte[] loadContent(final Request request, final CacheStrategy cacheStrategy, BlacklistType blacklistType) throws IOException {
|
||||||
// try to download the resource using the loader
|
// try to download the resource using the loader
|
||||||
final Response entry = load(request, cacheStrategy, true);
|
final Response entry = load(request, cacheStrategy, blacklistType);
|
||||||
if (entry == null) return null; // not found in web
|
if (entry == null) return null; // not found in web
|
||||||
|
|
||||||
// read resource body (if it is there)
|
// read resource body (if it is there)
|
||||||
return entry.getContent();
|
return entry.getContent();
|
||||||
}
|
}
|
||||||
|
|
||||||
public Document[] loadDocuments(final Request request, final CacheStrategy cacheStrategy, final int timeout, final int maxFileSize) throws IOException, Parser.Failure {
|
public Document[] loadDocuments(final Request request, final CacheStrategy cacheStrategy, final int timeout, final int maxFileSize, BlacklistType blacklistType) throws IOException, Parser.Failure {
|
||||||
|
|
||||||
// load resource
|
// load resource
|
||||||
final Response response = load(request, cacheStrategy, maxFileSize, true);
|
final Response response = load(request, cacheStrategy, maxFileSize, blacklistType);
|
||||||
final DigestURI url = request.url();
|
final DigestURI url = request.url();
|
||||||
if (response == null) throw new IOException("no Response for url " + url);
|
if (response == null) throw new IOException("no Response for url " + url);
|
||||||
|
|
||||||
|
@ -349,10 +349,10 @@ public final class LoaderDispatcher {
|
||||||
return response.parse();
|
return response.parse();
|
||||||
}
|
}
|
||||||
|
|
||||||
public Document loadDocument(final DigestURI location, final CacheStrategy cachePolicy) throws IOException {
|
public Document loadDocument(final DigestURI location, final CacheStrategy cachePolicy, BlacklistType blacklistType) throws IOException {
|
||||||
// load resource
|
// load resource
|
||||||
Request request = request(location, true, false);
|
Request request = request(location, true, false);
|
||||||
final Response response = this.load(request, cachePolicy, true);
|
final Response response = this.load(request, cachePolicy, blacklistType);
|
||||||
final DigestURI url = request.url();
|
final DigestURI url = request.url();
|
||||||
if (response == null) throw new IOException("no Response for url " + url);
|
if (response == null) throw new IOException("no Response for url " + url);
|
||||||
|
|
||||||
|
@ -375,8 +375,8 @@ public final class LoaderDispatcher {
|
||||||
* @return a map from URLs to the anchor texts of the urls
|
* @return a map from URLs to the anchor texts of the urls
|
||||||
* @throws IOException
|
* @throws IOException
|
||||||
*/
|
*/
|
||||||
public final Map<MultiProtocolURI, String> loadLinks(final DigestURI url, final CacheStrategy cacheStrategy) throws IOException {
|
public final Map<MultiProtocolURI, String> loadLinks(final DigestURI url, final CacheStrategy cacheStrategy, BlacklistType blacklistType) throws IOException {
|
||||||
final Response response = load(request(url, true, false), cacheStrategy, Integer.MAX_VALUE, true);
|
final Response response = load(request(url, true, false), cacheStrategy, Integer.MAX_VALUE, blacklistType);
|
||||||
if (response == null) throw new IOException("response == null");
|
if (response == null) throw new IOException("response == null");
|
||||||
final ResponseHeader responseHeader = response.getResponseHeader();
|
final ResponseHeader responseHeader = response.getResponseHeader();
|
||||||
if (response.getContent() == null) throw new IOException("resource == null");
|
if (response.getContent() == null) throw new IOException("resource == null");
|
||||||
|
@ -405,12 +405,12 @@ public final class LoaderDispatcher {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
public void loadIfNotExistBackground(final DigestURI url, final File cache, final int maxFileSize) {
|
public void loadIfNotExistBackground(final DigestURI url, final File cache, final int maxFileSize, BlacklistType blacklistType) {
|
||||||
new Loader(url, cache, maxFileSize, CacheStrategy.IFEXIST).start();
|
new Loader(url, cache, maxFileSize, CacheStrategy.IFEXIST, blacklistType).start();
|
||||||
}
|
}
|
||||||
|
|
||||||
public void loadIfNotExistBackground(final DigestURI url, final int maxFileSize) {
|
public void loadIfNotExistBackground(final DigestURI url, final int maxFileSize, BlacklistType blacklistType) {
|
||||||
new Loader(url, null, maxFileSize, CacheStrategy.IFEXIST).start();
|
new Loader(url, null, maxFileSize, CacheStrategy.IFEXIST, blacklistType).start();
|
||||||
}
|
}
|
||||||
|
|
||||||
private class Loader extends Thread {
|
private class Loader extends Thread {
|
||||||
|
@ -419,12 +419,14 @@ public final class LoaderDispatcher {
|
||||||
private final File cache;
|
private final File cache;
|
||||||
private final int maxFileSize;
|
private final int maxFileSize;
|
||||||
private final CacheStrategy cacheStrategy;
|
private final CacheStrategy cacheStrategy;
|
||||||
|
private final BlacklistType blacklistType;
|
||||||
|
|
||||||
public Loader(final DigestURI url, final File cache, final int maxFileSize, final CacheStrategy cacheStrategy) {
|
public Loader(final DigestURI url, final File cache, final int maxFileSize, final CacheStrategy cacheStrategy, BlacklistType blacklistType) {
|
||||||
this.url = url;
|
this.url = url;
|
||||||
this.cache = cache;
|
this.cache = cache;
|
||||||
this.maxFileSize = maxFileSize;
|
this.maxFileSize = maxFileSize;
|
||||||
this.cacheStrategy = cacheStrategy;
|
this.cacheStrategy = cacheStrategy;
|
||||||
|
this.blacklistType = blacklistType;
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
|
@ -432,7 +434,7 @@ public final class LoaderDispatcher {
|
||||||
if (this.cache != null && this.cache.exists()) return;
|
if (this.cache != null && this.cache.exists()) return;
|
||||||
try {
|
try {
|
||||||
// load from the net
|
// load from the net
|
||||||
final Response response = load(request(this.url, false, true), this.cacheStrategy, this.maxFileSize, true);
|
final Response response = load(request(this.url, false, true), this.cacheStrategy, this.maxFileSize, this.blacklistType);
|
||||||
final byte[] b = response.getContent();
|
final byte[] b = response.getContent();
|
||||||
if (this.cache != null) FileUtils.copy(b, this.cache);
|
if (this.cache != null) FileUtils.copy(b, this.cache);
|
||||||
} catch (final MalformedURLException e) {} catch (final IOException e) {}
|
} catch (final MalformedURLException e) {} catch (final IOException e) {}
|
||||||
|
|
|
@ -141,6 +141,7 @@ import net.yacy.peers.operation.yacyBuildProperties;
|
||||||
import net.yacy.peers.operation.yacyRelease;
|
import net.yacy.peers.operation.yacyRelease;
|
||||||
import net.yacy.peers.operation.yacyUpdateLocation;
|
import net.yacy.peers.operation.yacyUpdateLocation;
|
||||||
import net.yacy.repository.Blacklist;
|
import net.yacy.repository.Blacklist;
|
||||||
|
import net.yacy.repository.Blacklist.BlacklistType;
|
||||||
import net.yacy.repository.FilterEngine;
|
import net.yacy.repository.FilterEngine;
|
||||||
import net.yacy.repository.LoaderDispatcher;
|
import net.yacy.repository.LoaderDispatcher;
|
||||||
import net.yacy.search.index.Segment;
|
import net.yacy.search.index.Segment;
|
||||||
|
@ -2746,7 +2747,7 @@ public final class Switchboard extends serverSwitch
|
||||||
Thread.currentThread().setName("Switchboard.addToIndex:" + urls);
|
Thread.currentThread().setName("Switchboard.addToIndex:" + urls);
|
||||||
try {
|
try {
|
||||||
final Response response =
|
final Response response =
|
||||||
Switchboard.this.loader.load(request, CacheStrategy.IFFRESH, true);
|
Switchboard.this.loader.load(request, CacheStrategy.IFFRESH, BlacklistType.CRAWLER);
|
||||||
if ( response == null ) {
|
if ( response == null ) {
|
||||||
throw new IOException("response == null");
|
throw new IOException("response == null");
|
||||||
}
|
}
|
||||||
|
@ -3173,7 +3174,7 @@ public final class Switchboard extends serverSwitch
|
||||||
final Map<MultiProtocolURI, String> links;
|
final Map<MultiProtocolURI, String> links;
|
||||||
searchEvent.getRankingResult().oneFeederStarted();
|
searchEvent.getRankingResult().oneFeederStarted();
|
||||||
try {
|
try {
|
||||||
links = Switchboard.this.loader.loadLinks(url, CacheStrategy.NOCACHE);
|
links = Switchboard.this.loader.loadLinks(url, CacheStrategy.NOCACHE, BlacklistType.SEARCH);
|
||||||
if ( links != null ) {
|
if ( links != null ) {
|
||||||
final Iterator<MultiProtocolURI> i = links.keySet().iterator();
|
final Iterator<MultiProtocolURI> i = links.keySet().iterator();
|
||||||
while ( i.hasNext() ) {
|
while ( i.hasNext() ) {
|
||||||
|
@ -3212,7 +3213,7 @@ public final class Switchboard extends serverSwitch
|
||||||
final Map<MultiProtocolURI, String> links;
|
final Map<MultiProtocolURI, String> links;
|
||||||
DigestURI url;
|
DigestURI url;
|
||||||
try {
|
try {
|
||||||
links = Switchboard.this.loader.loadLinks(startUrl, CacheStrategy.IFFRESH);
|
links = Switchboard.this.loader.loadLinks(startUrl, CacheStrategy.IFFRESH, BlacklistType.SEARCH);
|
||||||
if (links != null) {
|
if (links != null) {
|
||||||
if (links.size() < 1000) { // limit to 1000 to skip large index pages
|
if (links.size() < 1000) { // limit to 1000 to skip large index pages
|
||||||
final Iterator<MultiProtocolURI> i = links.keySet().iterator();
|
final Iterator<MultiProtocolURI> i = links.keySet().iterator();
|
||||||
|
@ -3276,7 +3277,7 @@ public final class Switchboard extends serverSwitch
|
||||||
searchEvent.getRankingResult().oneFeederStarted();
|
searchEvent.getRankingResult().oneFeederStarted();
|
||||||
try {
|
try {
|
||||||
final Response response =
|
final Response response =
|
||||||
sb.loader.load(sb.loader.request(url, true, false), CacheStrategy.NOCACHE, true);
|
sb.loader.load(sb.loader.request(url, true, false), CacheStrategy.NOCACHE, BlacklistType.SEARCH);
|
||||||
final byte[] resource = (response == null) ? null : response.getContent();
|
final byte[] resource = (response == null) ? null : response.getContent();
|
||||||
//System.out.println("BLEKKO: " + UTF8.String(resource));
|
//System.out.println("BLEKKO: " + UTF8.String(resource));
|
||||||
rss = resource == null ? null : RSSReader.parse(RSSFeed.DEFAULT_MAXSIZE, resource);
|
rss = resource == null ? null : RSSReader.parse(RSSFeed.DEFAULT_MAXSIZE, resource);
|
||||||
|
|
|
@ -538,7 +538,7 @@ public class Segment {
|
||||||
|
|
||||||
try {
|
try {
|
||||||
// parse the resource
|
// parse the resource
|
||||||
final Document document = Document.mergeDocuments(entry.url(), null, loader.loadDocuments(loader.request(entry.url(), true, false), cacheStrategy, 10000, Integer.MAX_VALUE));
|
final Document document = Document.mergeDocuments(entry.url(), null, loader.loadDocuments(loader.request(entry.url(), true, false), cacheStrategy, 10000, Integer.MAX_VALUE, null));
|
||||||
if (document == null) {
|
if (document == null) {
|
||||||
// delete just the url entry
|
// delete just the url entry
|
||||||
urlMetadata().remove(urlhash);
|
urlMetadata().remove(urlhash);
|
||||||
|
|
|
@ -142,7 +142,7 @@ public class MediaSnippet implements Comparable<MediaSnippet>, Comparator<MediaS
|
||||||
|
|
||||||
Document document;
|
Document document;
|
||||||
try {
|
try {
|
||||||
document = Document.mergeDocuments(url, null, Switchboard.getSwitchboard().loader.loadDocuments(Switchboard.getSwitchboard().loader.request(url, false, reindexing), cacheStrategy, timeout, Integer.MAX_VALUE));
|
document = Document.mergeDocuments(url, null, Switchboard.getSwitchboard().loader.loadDocuments(Switchboard.getSwitchboard().loader.request(url, false, reindexing), cacheStrategy, timeout, Integer.MAX_VALUE, BlacklistType.SEARCH));
|
||||||
} catch (final IOException e) {
|
} catch (final IOException e) {
|
||||||
Log.logFine("snippet fetch", "load error: " + e.getMessage());
|
Log.logFine("snippet fetch", "load error: " + e.getMessage());
|
||||||
return new ArrayList<MediaSnippet>();
|
return new ArrayList<MediaSnippet>();
|
||||||
|
|
|
@ -53,6 +53,7 @@ import net.yacy.kelondro.order.Base64Order;
|
||||||
import net.yacy.kelondro.util.ByteArray;
|
import net.yacy.kelondro.util.ByteArray;
|
||||||
import net.yacy.kelondro.util.ByteBuffer;
|
import net.yacy.kelondro.util.ByteBuffer;
|
||||||
import net.yacy.peers.RemoteSearch;
|
import net.yacy.peers.RemoteSearch;
|
||||||
|
import net.yacy.repository.Blacklist.BlacklistType;
|
||||||
import net.yacy.repository.LoaderDispatcher;
|
import net.yacy.repository.LoaderDispatcher;
|
||||||
import net.yacy.search.Switchboard;
|
import net.yacy.search.Switchboard;
|
||||||
import de.anomic.crawler.retrieval.Request;
|
import de.anomic.crawler.retrieval.Request;
|
||||||
|
@ -209,7 +210,7 @@ public class TextSnippet implements Comparable<TextSnippet>, Comparator<TextSnip
|
||||||
final Request request = loader == null ? null : loader.request(url, true, reindexing);
|
final Request request = loader == null ? null : loader.request(url, true, reindexing);
|
||||||
Response response;
|
Response response;
|
||||||
try {
|
try {
|
||||||
response = loader == null || request == null ? null : loader.load(request, CacheStrategy.CACHEONLY, true);
|
response = loader == null || request == null ? null : loader.load(request, CacheStrategy.CACHEONLY, BlacklistType.SEARCH);
|
||||||
} catch (IOException e1) {
|
} catch (IOException e1) {
|
||||||
response = null;
|
response = null;
|
||||||
}
|
}
|
||||||
|
@ -242,7 +243,7 @@ public class TextSnippet implements Comparable<TextSnippet>, Comparator<TextSnip
|
||||||
// try to load the resource from the cache
|
// try to load the resource from the cache
|
||||||
Response response = null;
|
Response response = null;
|
||||||
try {
|
try {
|
||||||
response = loader == null ? null : loader.load(loader.request(url, true, reindexing), (url.isFile() || url.isSMB() || cacheStrategy == null) ? CacheStrategy.NOCACHE : cacheStrategy, true);
|
response = loader == null ? null : loader.load(loader.request(url, true, reindexing), (url.isFile() || url.isSMB() || cacheStrategy == null) ? CacheStrategy.NOCACHE : cacheStrategy, BlacklistType.SEARCH);
|
||||||
} catch (IOException e) {
|
} catch (IOException e) {
|
||||||
response = null;
|
response = null;
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue
Block a user