Mirror of https://github.com/yacy/yacy_search_server.git (synced 2024-09-19 00:01:41 +02:00)

Commit 6fa439c82b (parent 1ea0bc775c)

- refactoring of robots
- added an option to the crawler to send error URLs to Solr
- changed the Solr scheme slightly (no multi-value fields where there are no multiple values)

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@7693 6c8d7289-2bf4-0310-a012-ef5d649a1542
@@ -385,7 +385,7 @@ public class Crawler_p {
                             sb.peers.mySeed().hash.getBytes(),
                             new Date(),
                             1,
-                            reasonString);
+                            reasonString, -1);
                     }
                 } catch (final PatternSyntaxException e) {
                     prop.put("info", "4"); // crawlfilter does not match url
@@ -105,7 +105,7 @@ public class WebStructurePicture_p {
         } else {
             // find start hash
             String hash = null;
-            try {
+            if (host != null && host.length() > 0) try {
                 hash = UTF8.String((new DigestURI("http://" + host)).hash(), 6, 6);
             } catch (final MalformedURLException e) {Log.logException(e);}
             //assert (sb.webStructure.outgoingReferences(hash) != null);
@@ -9,7 +9,7 @@ import net.yacy.document.parser.html.ContentScraper;
 import net.yacy.kelondro.data.meta.DigestURI;
 
 import de.anomic.crawler.CrawlProfile;
-import de.anomic.crawler.RobotsEntry;
+import de.anomic.crawler.RobotsTxtEntry;
 import de.anomic.search.Switchboard;
 import de.anomic.server.serverObjects;
 import de.anomic.server.serverSwitch;

@@ -106,7 +106,7 @@ public class getpageinfo_p {
         final DigestURI theURL = new DigestURI(url);
 
         // determine if crawling of the current URL is allowed
-        RobotsEntry robotsEntry;
+        RobotsTxtEntry robotsEntry;
         try {
             robotsEntry = sb.robots.getEntry(theURL, sb.peers.myBotIDs());
         } catch (IOException e) {
@@ -162,7 +162,7 @@ public final class crawlReceipt {
                     youare.getBytes(),
                     null,
                     0,
-                    result + ":" + reason);
+                    result + ":" + reason, -1);
             //switchboard.noticeURL.remove(receivedUrlhash);
             prop.put("delay", "3600");
             return prop;
@@ -85,7 +85,8 @@ public class urls {
                             sb.peers.mySeed().hash.getBytes(),
                             new Date(),
                             0,
-                            "client=____________");
+                            "client=____________",
+                            -1);
 
                     // create RSS entry
                     prop.put("item_" + c + "_title", "");
@@ -80,8 +80,8 @@ public class CrawlQueues {
         log.logConfig("Starting Crawling Management");
         noticeURL = new NoticedURL(queuePath, sb.peers.myBotIDs(), sb.useTailCache, sb.exceed134217727);
         FileUtils.deletedelete(new File(queuePath, ERROR_DB_FILENAME));
-        errorURL = new ZURL(queuePath, ERROR_DB_FILENAME, false, sb.useTailCache, sb.exceed134217727);
-        delegatedURL = new ZURL(queuePath, DELEGATED_DB_FILENAME, true, sb.useTailCache, sb.exceed134217727);
+        errorURL = new ZURL(sb.solrConnector, queuePath, ERROR_DB_FILENAME, false, sb.useTailCache, sb.exceed134217727);
+        delegatedURL = new ZURL(sb.solrConnector, queuePath, DELEGATED_DB_FILENAME, true, sb.useTailCache, sb.exceed134217727);
     }
 
     public void relocate(final File newQueuePath) {

@@ -92,8 +92,8 @@ public class CrawlQueues {
 
         noticeURL = new NoticedURL(newQueuePath, sb.peers.myBotIDs(), sb.useTailCache, sb.exceed134217727);
         FileUtils.deletedelete(new File(newQueuePath, ERROR_DB_FILENAME));
-        errorURL = new ZURL(newQueuePath, ERROR_DB_FILENAME, false, sb.useTailCache, sb.exceed134217727);
-        delegatedURL = new ZURL(newQueuePath, DELEGATED_DB_FILENAME, true, sb.useTailCache, sb.exceed134217727);
+        errorURL = new ZURL(sb.solrConnector, newQueuePath, ERROR_DB_FILENAME, false, sb.useTailCache, sb.exceed134217727);
+        delegatedURL = new ZURL(sb.solrConnector, newQueuePath, DELEGATED_DB_FILENAME, true, sb.useTailCache, sb.exceed134217727);
     }
 
     public void close() {

@@ -571,7 +571,7 @@ public class CrawlQueues {
             try {
                 // checking robots.txt for http(s) resources
                 this.request.setStatus("worker-checkingrobots", WorkflowJob.STATUS_STARTED);
-                RobotsEntry robotsEntry;
+                RobotsTxtEntry robotsEntry;
                 if ((request.url().getProtocol().equals("http") || request.url().getProtocol().equals("https")) &&
                     (robotsEntry = sb.robots.getEntry(request.url(), sb.peers.myBotIDs())) != null &&
                     robotsEntry.isDisallowed(request.url())) {

@@ -581,7 +581,7 @@ public class CrawlQueues {
                         UTF8.getBytes(sb.peers.mySeed().hash),
                         new Date(),
                         1,
-                        "denied by robots.txt");
+                        "denied by robots.txt", -1);
                     this.request.setStatus("worker-disallowed", WorkflowJob.STATUS_FINISHED);
                 } else {
                     // starting a load from the internet

@@ -617,7 +617,7 @@ public class CrawlQueues {
                             UTF8.getBytes(sb.peers.mySeed().hash),
                             new Date(),
                             1,
-                            "cannot load: " + result);
+                            "cannot load: " + result, -1);
                         this.request.setStatus("worker-error", WorkflowJob.STATUS_FINISHED);
                     } else {
                         this.request.setStatus("worker-processed", WorkflowJob.STATUS_FINISHED);

@@ -629,7 +629,7 @@ public class CrawlQueues {
                     UTF8.getBytes(sb.peers.mySeed().hash),
                     new Date(),
                     1,
-                    e.getMessage() + " - in worker");
+                    e.getMessage() + " - in worker", -1);
                 Log.logException(e);
                 // Client.initConnectionManager();
                 this.request.setStatus("worker-exception", WorkflowJob.STATUS_FINISHED);
@@ -202,7 +202,7 @@ public final class CrawlStacker {
 
             // if the url was rejected we store it into the error URL db
             if (rejectReason != null) {
-                nextQueue.errorURL.push(entry, UTF8.getBytes(peers.mySeed().hash), new Date(), 1, rejectReason);
+                nextQueue.errorURL.push(entry, UTF8.getBytes(peers.mySeed().hash), new Date(), 1, rejectReason, -1);
             }
         } catch (final Exception e) {
             CrawlStacker.this.log.logWarning("Error while processing stackCrawl entry.\n" + "Entry: " + entry.toString() + "Error: " + e.toString(), e);

@@ -469,9 +469,9 @@ public final class CrawlStacker {
         }
 
         // deny cgi
-        if (url.isIndividual()) {
+        if (url.isIndividual() && !(profile.crawlingQ())) { // TODO: make special property for crawlingIndividual
             if (this.log.isFine()) this.log.logFine("URL '" + url.toString() + "' is CGI URL.");
-            return "cgi url not allowed";
+            return "individual url (sessionid etc) not wanted";
         }
 
         // deny post properties
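The CGI check above is relaxed: session-style ("individual") URLs are now rejected only when the crawl profile does not already permit query URLs (profile.crawlingQ()), and the reject reason is reworded accordingly. As a rough illustration of what isIndividual() screens for, here is a toy predicate; the pattern and names are assumptions, the real check lives in YaCy's URI class:

```java
import java.util.regex.Pattern;

public class IndividualUrlDemo {
    // toy heuristic: query parameters that usually identify a session,
    // not a distinct document (assumption, not the YaCy implementation)
    private static final Pattern SESSION_PARAM =
            Pattern.compile("(?i)[?&](sid|sessionid|phpsessid|jsessionid)=");

    static boolean looksIndividual(String url) {
        return SESSION_PARAM.matcher(url).find();
    }

    public static void main(String[] args) {
        // with this commit such URLs pass if the profile allows query URLs
        System.out.println(looksIndividual("http://example.com/p?PHPSESSID=abc")); // true
        System.out.println(looksIndividual("http://example.com/p?id=42"));         // false
    }
}
```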
@@ -186,7 +186,7 @@ public class Latency {
         // find the delay as given by robots.txt on target site
         long robotsDelay = 0;
         if (!local) {
-            RobotsEntry robotsEntry;
+            RobotsTxtEntry robotsEntry;
             try {
                 robotsEntry = Switchboard.getSwitchboard().robots.getEntry(url, thisAgents);
             } catch (IOException e) {

@@ -239,7 +239,7 @@ public class Latency {
         // find the delay as given by robots.txt on target site
         long robotsDelay = 0;
         if (!local) {
-            RobotsEntry robotsEntry;
+            RobotsTxtEntry robotsEntry;
             try {
                 robotsEntry = Switchboard.getSwitchboard().robots.getEntry(url, thisAgents);
             } catch (IOException e) {
@@ -11,16 +11,16 @@
 //Revision: $LastChangedRevision$
 //
 //This program is free software; you can redistribute it and/or modify
-//it under the terms of the GNU General Public License as published by
+//it under the terms of the GNU General public License as published by
 //the Free Software Foundation; either version 2 of the License, or
 //(at your option) any later version.
 //
 //This program is distributed in the hope that it will be useful,
 //but WITHOUT ANY WARRANTY; without even the implied warranty of
 //MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-//GNU General Public License for more details.
+//GNU General public License for more details.
 //
-//You should have received a copy of the GNU General Public License
+//You should have received a copy of the GNU General public License
 //along with this program; if not, write to the Free Software
 //Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 

@@ -51,15 +51,15 @@ public class RobotsTxt {
 
     private static Logger log = Logger.getLogger(RobotsTxt.class);
 
-    public static final String ROBOTS_DB_PATH_SEPARATOR = ";";
-    public static final Pattern ROBOTS_DB_PATH_SEPARATOR_MATCHER = Pattern.compile(ROBOTS_DB_PATH_SEPARATOR);
+    protected static final String ROBOTS_DB_PATH_SEPARATOR = ";";
+    protected static final Pattern ROBOTS_DB_PATH_SEPARATOR_MATCHER = Pattern.compile(ROBOTS_DB_PATH_SEPARATOR);
 
     BEncodedHeap robotsTable;
     private final ConcurrentHashMap<String, DomSync> syncObjects;
     //private static final HashSet<String> loadedRobots = new HashSet<String>(); // only for debugging
 
     private static class DomSync {
-        public DomSync() {}
+        private DomSync() {}
     }
 
     public RobotsTxt(final BEncodedHeap robotsTable) {

@@ -78,16 +78,16 @@ public class RobotsTxt {
         return this.robotsTable.size();
     }
 
-    public RobotsEntry getEntry(final MultiProtocolURI theURL, final Set<String> thisAgents) throws IOException {
+    public RobotsTxtEntry getEntry(final MultiProtocolURI theURL, final Set<String> thisAgents) throws IOException {
         if (theURL == null) throw new IllegalArgumentException();
         if (!theURL.getProtocol().startsWith("http")) return null;
         return getEntry(theURL, thisAgents, true);
     }
 
-    private RobotsEntry getEntry(final MultiProtocolURI theURL, final Set<String> thisAgents, final boolean fetchOnlineIfNotAvailableOrNotFresh) throws IOException {
+    private RobotsTxtEntry getEntry(final MultiProtocolURI theURL, final Set<String> thisAgents, final boolean fetchOnlineIfNotAvailableOrNotFresh) throws IOException {
         // this method will always return a non-null value
         String urlHostPort = getHostPort(theURL);
-        RobotsEntry robotsTxt4Host = null;
+        RobotsTxtEntry robotsTxt4Host = null;
         Map<String, byte[]> record;
         try {
             record = this.robotsTable.get(this.robotsTable.encodedKey(urlHostPort));

@@ -95,7 +95,7 @@ public class RobotsTxt {
             log.warn("memory exhausted", e);
             record = null;
         }
-        if (record != null) robotsTxt4Host = new RobotsEntry(urlHostPort, record);
+        if (record != null) robotsTxt4Host = new RobotsTxtEntry(urlHostPort, record);
 
         if (fetchOnlineIfNotAvailableOrNotFresh && (
             robotsTxt4Host == null ||

@@ -123,7 +123,7 @@ public class RobotsTxt {
                     log.warn("memory exhausted", e);
                     record = null;
                 }
-                if (record != null) robotsTxt4Host = new RobotsEntry(urlHostPort, record);
+                if (record != null) robotsTxt4Host = new RobotsTxtEntry(urlHostPort, record);
                 if (robotsTxt4Host != null &&
                     robotsTxt4Host.getLoadedDate() != null &&
                     System.currentTimeMillis() - robotsTxt4Host.getLoadedDate().getTime() <= 1*24*60*60*1000) {

@@ -160,7 +160,7 @@ public class RobotsTxt {
                     // no robots.txt available, make an entry to prevent that the robots loading is done twice
                     if (robotsTxt4Host == null) {
                         // generate artificial entry
-                        robotsTxt4Host = new RobotsEntry(
+                        robotsTxt4Host = new RobotsTxtEntry(
                                 robotsURL,
                                 new ArrayList<String>(),
                                 new ArrayList<String>(),

@@ -183,7 +183,7 @@ public class RobotsTxt {
                         addEntry(robotsTxt4Host);
                     }
                 } else {
-                    final robotsParser parserResult = new robotsParser((byte[]) result[DOWNLOAD_ROBOTS_TXT], thisAgents);
+                    final RobotsTxtParser parserResult = new RobotsTxtParser((byte[]) result[DOWNLOAD_ROBOTS_TXT], thisAgents);
                     ArrayList<String> denyPath = parserResult.denyList();
                     if (((Boolean) result[DOWNLOAD_ACCESS_RESTRICTED]).booleanValue()) {
                         denyPath = new ArrayList<String>();

@@ -208,7 +208,7 @@ public class RobotsTxt {
         return robotsTxt4Host;
     }
 
-    private RobotsEntry addEntry(
+    private RobotsTxtEntry addEntry(
             final MultiProtocolURI theURL,
             final ArrayList<String> allowPathList,
             final ArrayList<String> denyPathList,

@@ -219,7 +219,7 @@ public class RobotsTxt {
             final long crawlDelayMillis,
             final String agentName
             ) {
-        final RobotsEntry entry = new RobotsEntry(
+        final RobotsTxtEntry entry = new RobotsTxtEntry(
                 theURL, allowPathList, denyPathList,
                 loadedDate, modDate,
                 eTag, sitemap, crawlDelayMillis, agentName);

@@ -227,7 +227,7 @@ public class RobotsTxt {
         return entry;
     }
 
-    private String addEntry(final RobotsEntry entry) {
+    private String addEntry(final RobotsTxtEntry entry) {
         // writes a new page and returns key
         try {
             this.robotsTable.insert(this.robotsTable.encodedKey(entry.getHostName()), entry.getMem());

@@ -240,10 +240,10 @@ public class RobotsTxt {
 
     // methods that had been in robotsParser.java:
 
-    public static final int DOWNLOAD_ACCESS_RESTRICTED = 0;
-    public static final int DOWNLOAD_ROBOTS_TXT = 1;
-    public static final int DOWNLOAD_ETAG = 2;
-    public static final int DOWNLOAD_MODDATE = 3;
+    private static final int DOWNLOAD_ACCESS_RESTRICTED = 0;
+    private static final int DOWNLOAD_ROBOTS_TXT = 1;
+    private static final int DOWNLOAD_ETAG = 2;
+    private static final int DOWNLOAD_MODDATE = 3;
 
     static final String getHostPort(final MultiProtocolURI theURL) {
         String urlHostPort = null;

@@ -267,7 +267,7 @@ public class RobotsTxt {
         return port;
     }
 
-    private static Object[] downloadRobotsTxt(final MultiProtocolURI robotsURL, int redirectionCount, final RobotsEntry entry) throws Exception {
+    private static Object[] downloadRobotsTxt(final MultiProtocolURI robotsURL, int redirectionCount, final RobotsTxtEntry entry) throws Exception {
         if (robotsURL == null || !robotsURL.getProtocol().startsWith("http")) return null;
 
         if (redirectionCount < 0) return new Object[]{Boolean.FALSE,null,null};
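The getEntry() hunks above show the caching rule: a stored robots.txt record is trusted only while it is younger than one day (1*24*60*60*1000 ms), otherwise it is re-fetched. A minimal, self-contained restatement of that freshness test:

```java
import java.util.Date;

public class RobotsFreshnessDemo {
    // 1*24*60*60*1000 as in the RobotsTxt hunk: one day in milliseconds
    static final long MAX_AGE_MS = 1L * 24 * 60 * 60 * 1000;

    static boolean isFresh(Date loadedDate) {
        return loadedDate != null
                && System.currentTimeMillis() - loadedDate.getTime() <= MAX_AGE_MS;
    }

    public static void main(String[] args) {
        System.out.println(isFresh(new Date()));                     // true
        System.out.println(isFresh(new Date(System.currentTimeMillis()
                - 2 * MAX_AGE_MS)));                                 // false
    }
}
```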
@@ -13,16 +13,16 @@
 //Revision: $LastChangedRevision$
 //
 //This program is free software; you can redistribute it and/or modify
-//it under the terms of the GNU General Public License as published by
+//it under the terms of the GNU General public License as published by
 //the Free Software Foundation; either version 2 of the License, or
 //(at your option) any later version.
 //
 //This program is distributed in the hope that it will be useful,
 //but WITHOUT ANY WARRANTY; without even the implied warranty of
 //MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-//GNU General Public License for more details.
+//GNU General public License for more details.
 //
-//You should have received a copy of the GNU General Public License
+//You should have received a copy of the GNU General public License
 //along with this program; if not, write to the Free Software
 //Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 

@@ -41,25 +41,25 @@ import net.yacy.cora.document.UTF8;
 import net.yacy.kelondro.util.ByteArray;
 
 
-public class RobotsEntry {
+public class RobotsTxtEntry {
 
-    public static final String HOST_NAME = "hostname";
-    public static final String ALLOW_PATH_LIST = "allow";
-    public static final String DISALLOW_PATH_LIST = "disallow";
-    public static final String LOADED_DATE = "date";
-    public static final String MOD_DATE = "modDate";
-    public static final String ETAG = "etag";
-    public static final String SITEMAP = "sitemap";
-    public static final String CRAWL_DELAY = "crawlDelay";
-    public static final String CRAWL_DELAY_MILLIS = "crawlDelayMillis";
-    public static final String AGENT_NAME = "agentname";
+    private static final String HOST_NAME = "hostname";
+    private static final String ALLOW_PATH_LIST = "allow";
+    private static final String DISALLOW_PATH_LIST = "disallow";
+    private static final String LOADED_DATE = "date";
+    private static final String MOD_DATE = "modDate";
+    private static final String ETAG = "etag";
+    private static final String SITEMAP = "sitemap";
+    private static final String CRAWL_DELAY = "crawlDelay";
+    private static final String CRAWL_DELAY_MILLIS = "crawlDelayMillis";
+    private static final String AGENT_NAME = "agentname";
 
     // this is a simple record structure that holds all properties of a single crawl start
     private final Map<String, byte[]> mem;
     private final List<String> allowPathList, denyPathList;
     private final String hostName, agentName;
 
-    public RobotsEntry(final String hostName, final Map<String, byte[]> mem) {
+    protected RobotsTxtEntry(final String hostName, final Map<String, byte[]> mem) {
         this.hostName = hostName.toLowerCase();
         this.mem = mem;

@@ -90,7 +90,7 @@ public class RobotsEntry {
         this.agentName = this.mem.containsKey(AGENT_NAME) ? UTF8.String(this.mem.get(AGENT_NAME)) : null;
     }
 
-    public RobotsEntry(
+    protected RobotsTxtEntry(
             final MultiProtocolURI theURL,
             final List<String> allowPathList,
             final List<String> disallowPathList,

@@ -140,15 +140,15 @@ public class RobotsEntry {
         }
     }
 
-    public String getHostName() {
+    protected String getHostName() {
         return this.hostName;
     }
 
-    public String getAgentName() {
+    protected String getAgentName() {
         return this.agentName;
     }
 
-    public Map<String, byte[]> getMem() {
+    protected Map<String, byte[]> getMem() {
         if (!this.mem.containsKey(HOST_NAME)) this.mem.put(HOST_NAME, UTF8.getBytes(this.hostName));
         return this.mem;
     }

@@ -175,34 +175,34 @@ public class RobotsEntry {
         }
     }
 
-    public Date getLoadedDate() {
+    protected Date getLoadedDate() {
         if (this.mem.containsKey(LOADED_DATE)) {
             return new Date(ByteArray.parseDecimal(this.mem.get(LOADED_DATE)));
         }
         return null;
     }
 
-    public void setLoadedDate(final Date newLoadedDate) {
+    protected void setLoadedDate(final Date newLoadedDate) {
         if (newLoadedDate != null) {
             this.mem.put(LOADED_DATE, UTF8.getBytes(Long.toString(newLoadedDate.getTime())));
         }
     }
 
-    public Date getModDate() {
+    protected Date getModDate() {
         if (this.mem.containsKey(MOD_DATE)) {
             return new Date(ByteArray.parseDecimal(this.mem.get(MOD_DATE)));
         }
         return null;
     }
 
-    public String getETag() {
+    protected String getETag() {
         if (this.mem.containsKey(ETAG)) {
             return UTF8.String(this.mem.get(ETAG));
         }
         return null;
     }
 
-    public long getCrawlDelayMillis() {
+    protected long getCrawlDelayMillis() {
         if (this.mem.containsKey(CRAWL_DELAY_MILLIS)) try {
             return ByteArray.parseDecimal(this.mem.get(CRAWL_DELAY_MILLIS));
         } catch (final NumberFormatException e) {
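RobotsTxtEntry keeps every property as UTF-8 bytes in a Map<String, byte[]> so the record can go straight into the BEncodedHeap; dates are stored as decimal epoch-millis strings, as getLoadedDate()/setLoadedDate() above show. The same round trip in plain JDK code (a sketch, not the YaCy helpers):

```java
import java.nio.charset.StandardCharsets;
import java.util.Date;
import java.util.HashMap;
import java.util.Map;

public class ByteMapRecordDemo {
    public static void main(String[] args) {
        Map<String, byte[]> mem = new HashMap<String, byte[]>();
        Date loaded = new Date();
        // store: date -> decimal string -> UTF-8 bytes
        mem.put("date", Long.toString(loaded.getTime()).getBytes(StandardCharsets.UTF_8));
        // load: UTF-8 bytes -> decimal string -> date
        long millis = Long.parseLong(new String(mem.get("date"), StandardCharsets.UTF_8));
        System.out.println(new Date(millis).equals(loaded)); // true
    }
}
```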
@@ -10,16 +10,16 @@
 Revision: $LastChangedRevision$
 
 This program is free software; you can redistribute it and/or modify
-it under the terms of the GNU General Public License as published by
+it under the terms of the GNU General public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-GNU General Public License for more details.
+GNU General public License for more details.
 
-You should have received a copy of the GNU General Public License
+You should have received a copy of the GNU General private License
 along with this program; if not, write to the Free Software
 Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 

@@ -59,16 +59,16 @@ import java.util.regex.Pattern;
  * See: http://www.kollar.com/robots.html
  */
 
-public final class robotsParser {
+public final class RobotsTxtParser {
 
     private static final Pattern patternTab = Pattern.compile("\t");
 
-    public static final String ROBOTS_USER_AGENT = "User-agent:".toUpperCase();
-    public static final String ROBOTS_DISALLOW = "Disallow:".toUpperCase();
-    public static final String ROBOTS_ALLOW = "Allow:".toUpperCase();
-    public static final String ROBOTS_COMMENT = "#";
-    public static final String ROBOTS_SITEMAP = "Sitemap:".toUpperCase();
-    public static final String ROBOTS_CRAWL_DELAY = "Crawl-delay:".toUpperCase();
+    private static final String ROBOTS_USER_AGENT = "User-agent:".toUpperCase();
+    private static final String ROBOTS_DISALLOW = "Disallow:".toUpperCase();
+    private static final String ROBOTS_ALLOW = "Allow:".toUpperCase();
+    private static final String ROBOTS_COMMENT = "#";
+    private static final String ROBOTS_SITEMAP = "Sitemap:".toUpperCase();
+    private static final String ROBOTS_CRAWL_DELAY = "Crawl-delay:".toUpperCase();
 
     private final ArrayList<String> allowList;
     private final ArrayList<String> denyList;

@@ -77,7 +77,7 @@ public final class robotsParser {
     private final Set<String> myNames; // a list of own name lists
     private String agentName; // the name of the agent that was used to return the result
 
-    public robotsParser(final byte[] robotsTxt, final Set<String> myNames) {
+    protected RobotsTxtParser(final byte[] robotsTxt, final Set<String> myNames) {
         this.allowList = new ArrayList<String>(0);
         this.denyList = new ArrayList<String>(0);
         this.sitemap = "";

@@ -91,16 +91,6 @@ public final class robotsParser {
         }
     }
 
-    public robotsParser(final BufferedReader reader, final Set<String> myNames) {
-        this.allowList = new ArrayList<String>(0);
-        this.denyList = new ArrayList<String>(0);
-        this.sitemap = "";
-        this.crawlDelayMillis = 0;
-        this.myNames = myNames;
-        this.agentName = null;
-        if (reader != null) parse(reader);
-    }
-
     private void parse(final BufferedReader reader) {
         final ArrayList<String> deny4AllAgents = new ArrayList<String>();
         final ArrayList<String> deny4ThisAgents = new ArrayList<String>();

@@ -260,7 +250,7 @@ public final class robotsParser {
      * does not make any no-DOS-forced crawl pause.
      * @return the crawl delay between two crawl access times in milliseconds
      */
-    public long crawlDelayMillis() {
+    protected long crawlDelayMillis() {
         return this.crawlDelayMillis;
     }
 

@@ -271,19 +261,19 @@ public final class robotsParser {
      * Effects: see also comment to crawlDelayMillis()
      * @return the name of the user agent that was used for the result properties or null if no user agent name was used to identify the agent
      */
-    public String agentName() {
+    protected String agentName() {
         return this.agentName;
     }
 
-    public String sitemap() {
+    protected String sitemap() {
         return this.sitemap;
     }
 
-    public ArrayList<String> allowList() {
+    protected ArrayList<String> allowList() {
        return this.allowList;
    }
 
-    public ArrayList<String> denyList() {
+    protected ArrayList<String> denyList() {
         return this.denyList;
     }
 }
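For orientation, this is the kind of output the renamed RobotsTxtParser produces from the prefixes listed above (User-agent:, Disallow:, Allow:, Sitemap:, Crawl-delay:, all matched case-insensitively): per-agent allow/deny path lists plus a crawl delay in milliseconds. A deliberately tiny stand-in, not the YaCy parser:

```java
import java.util.ArrayList;
import java.util.List;

public class TinyRobotsParse {
    public static void main(String[] args) {
        String robotsTxt = "User-agent: *\nDisallow: /private/\nCrawl-delay: 10\n";
        List<String> denyList = new ArrayList<String>();
        long crawlDelayMillis = 0;
        for (String line : robotsTxt.split("\n")) {
            String upper = line.toUpperCase();
            if (upper.startsWith("DISALLOW:")) denyList.add(line.substring(9).trim());
            if (upper.startsWith("CRAWL-DELAY:"))
                crawlDelayMillis = 1000L * Integer.parseInt(line.substring(12).trim());
        }
        System.out.println(denyList);         // [/private/]
        System.out.println(crawlDelayMillis); // 10000
    }
}
```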
@@ -34,6 +34,7 @@ import java.util.Iterator;
 import java.util.concurrent.ConcurrentLinkedQueue;
 
 import net.yacy.cora.document.UTF8;
+import net.yacy.cora.services.federated.solr.SolrSingleConnector;
 import net.yacy.kelondro.data.meta.DigestURI;
 import net.yacy.kelondro.data.word.Word;
 import net.yacy.kelondro.index.Index;

@@ -66,13 +67,16 @@ public class ZURL implements Iterable<ZURL.Entry> {
     // the class object
     private Index urlIndex;
     private final ConcurrentLinkedQueue<byte[]> stack;
+    private final SolrSingleConnector solrConnector;
 
     public ZURL(
+            final SolrSingleConnector solrConnector,
             final File cachePath,
             final String tablename,
             final boolean startWithEmptyFile,
             final boolean useTailCache,
             final boolean exceed134217727) {
+        this.solrConnector = solrConnector;
         // creates a new ZURL in a file
         cachePath.mkdirs();
         final File f = new File(cachePath, tablename);

@@ -94,7 +98,8 @@ public class ZURL implements Iterable<ZURL.Entry> {
         this.stack = new ConcurrentLinkedQueue<byte[]>();
     }
 
-    public ZURL() {
+    public ZURL(final SolrSingleConnector solrConnector) {
+        this.solrConnector = solrConnector;
         // creates a new ZUR in RAM
         this.urlIndex = new RowSet(rowdef);
         this.stack = new ConcurrentLinkedQueue<byte[]>();

@@ -126,14 +131,24 @@ public class ZURL implements Iterable<ZURL.Entry> {
             final byte[] executor,
             final Date workdate,
             final int workcount,
-            String anycause) {
+            String anycause,
+            int httpcode) {
         // assert executor != null; // null == proxy !
         if (exists(bentry.url().hash())) return; // don't insert double causes
         if (anycause == null) anycause = "unknown";
-        Entry entry = new Entry(bentry, executor, workdate, workcount, anycause);
+        String reason = anycause + ((httpcode >= 0) ? " (http return code = " + httpcode + ")" : "");
+        Entry entry = new Entry(bentry, executor, workdate, workcount, reason);
         put(entry);
         stack.add(entry.hash());
-        Log.logInfo("Rejected URL", bentry.url().toNormalform(false, false) + " - " + anycause);
+        Log.logInfo("Rejected URL", bentry.url().toNormalform(false, false) + " - " + reason);
+        if (this.solrConnector != null) {
+            // send the error to solr
+            try {
+                this.solrConnector.err(bentry.url(), reason, httpcode);
+            } catch (IOException e) {
+                Log.logWarning("SOLR", "failed to send error " + bentry.url().toNormalform(true, false) + " to solr: " + e.getMessage());
+            }
+        }
         while (stack.size() > maxStackSize) stack.poll();
     }
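This push() hunk is the core of the commit: the new httpcode parameter is folded into the stored reject reason, and, when a connector is configured, the same failure is mirrored into Solr via err(). The reason-string rule, restated as a runnable snippet (logic copied from the hunk; class and method names are mine):

```java
public class ReasonFormatDemo {
    static String formatReason(String anycause, int httpcode) {
        if (anycause == null) anycause = "unknown";
        // httpcode < 0 (the trailing -1 at most call sites) = no HTTP status known
        return anycause + ((httpcode >= 0) ? " (http return code = " + httpcode + ")" : "");
    }

    public static void main(String[] args) {
        System.out.println(formatReason("denied by robots.txt", -1));    // no suffix
        System.out.println(formatReason("wrong http status code", 404)); // with suffix
    }
}
```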
@@ -152,7 +152,7 @@ public class FTPLoader {
         if (berr.size() > 0 || response == null) {
             // some error logging
             final String detail = (berr.size() > 0) ? "Errorlog: " + berr.toString() : "";
-            sb.crawlQueues.errorURL.push(request, sb.peers.mySeed().hash.getBytes(), new Date(), 1, " ftp server download, " + detail);
+            sb.crawlQueues.errorURL.push(request, sb.peers.mySeed().hash.getBytes(), new Date(), 1, " ftp server download, " + detail, -1);
             throw new IOException("FTPLoader: Unable to download URL '" + request.url().toString() + "': " + detail);
         }
 
@@ -78,7 +78,7 @@ public final class HTTPLoader {
     private Response load(final Request request, final int retryCount, final long maxFileSize, final boolean checkBlacklist) throws IOException {
 
         if (retryCount < 0) {
-            sb.crawlQueues.errorURL.push(request, sb.peers.mySeed().hash.getBytes(), new Date(), 1, "redirection counter exceeded");
+            sb.crawlQueues.errorURL.push(request, sb.peers.mySeed().hash.getBytes(), new Date(), 1, "redirection counter exceeded", -1);
             throw new IOException("Redirection counter exceeded for URL " + request.url().toString() + ". Processing aborted.");
         }
 

@@ -94,7 +94,7 @@ public final class HTTPLoader {
         // check if url is in blacklist
         final String hostlow = host.toLowerCase();
         if (checkBlacklist && Switchboard.urlBlacklist.isListed(Blacklist.BLACKLIST_CRAWLER, hostlow, path)) {
-            sb.crawlQueues.errorURL.push(request, sb.peers.mySeed().hash.getBytes(), new Date(), 1, "url in blacklist");
+            sb.crawlQueues.errorURL.push(request, sb.peers.mySeed().hash.getBytes(), new Date(), 1, "url in blacklist", -1);
             throw new IOException("CRAWLER Rejecting URL '" + request.url().toString() + "'. URL is in blacklist.");
         }
 

@@ -138,7 +138,7 @@ public final class HTTPLoader {
                 redirectionUrlString = redirectionUrlString.trim();
 
                 if (redirectionUrlString.length() == 0) {
-                    sb.crawlQueues.errorURL.push(request, sb.peers.mySeed().hash.getBytes(), new Date(), 1, "redirection header empy");
+                    sb.crawlQueues.errorURL.push(request, sb.peers.mySeed().hash.getBytes(), new Date(), 1, "redirection header empy", code);
                     throw new IOException("CRAWLER Redirection of URL=" + request.url().toString() + " aborted. Location header is empty.");
                 }
 

@@ -151,14 +151,14 @@ public final class HTTPLoader {
 
                 // if we are already doing a shutdown we don't need to retry crawling
                 if (Thread.currentThread().isInterrupted()) {
-                    sb.crawlQueues.errorURL.push(request, sb.peers.mySeed().hash.getBytes(), new Date(), 1, "server shutdown");
+                    sb.crawlQueues.errorURL.push(request, sb.peers.mySeed().hash.getBytes(), new Date(), 1, "server shutdown", code);
                     throw new IOException("CRAWLER Retry of URL=" + request.url().toString() + " aborted because of server shutdown.");
                 }
 
                 // check if the url was already indexed
                 final String dbname = sb.urlExists(Segments.Process.LOCALCRAWLING, redirectionUrl.hash());
                 if (dbname != null) {
-                    sb.crawlQueues.errorURL.push(request, sb.peers.mySeed().hash.getBytes(), new Date(), 1, "redirection to double content");
+                    sb.crawlQueues.errorURL.push(request, sb.peers.mySeed().hash.getBytes(), new Date(), 1, "redirection to double content", code);
                     throw new IOException("CRAWLER Redirection of URL=" + request.url().toString() + " ignored. The url appears already in db " + dbname);
                 }
 

@@ -167,12 +167,12 @@ public final class HTTPLoader {
                 return load(request, retryCount - 1, maxFileSize, checkBlacklist);
             } else {
                 // no redirection url provided
-                sb.crawlQueues.errorURL.push(request, sb.peers.mySeed().hash.getBytes(), new Date(), 1, "no redirection url provided");
+                sb.crawlQueues.errorURL.push(request, sb.peers.mySeed().hash.getBytes(), new Date(), 1, "no redirection url provided", code);
                 throw new IOException("REJECTED EMTPY REDIRECTION '" + client.getHttpResponse().getStatusLine() + "' for URL " + request.url().toString());
             }
         } else if (responseBody == null) {
             // no response, reject file
-            sb.crawlQueues.errorURL.push(request, sb.peers.mySeed().hash.getBytes(), new Date(), 1, "no response body (code = " + code + ")");
+            sb.crawlQueues.errorURL.push(request, sb.peers.mySeed().hash.getBytes(), new Date(), 1, "no response body", code);
             throw new IOException("REJECTED EMPTY RESPONSE BODY '" + client.getHttpResponse().getStatusLine() + "' for URL " + request.url().toString());
         } else if (code == 200 || code == 203) {
             // the transfer is ok

@@ -183,7 +183,7 @@ public final class HTTPLoader {
 
             // check length again in case it was not possible to get the length before loading
             if (maxFileSize > 0 && contentLength > maxFileSize) {
-                sb.crawlQueues.errorURL.push(request, sb.peers.mySeed().hash.getBytes(), new Date(), 1, "file size limit exceeded");
+                sb.crawlQueues.errorURL.push(request, sb.peers.mySeed().hash.getBytes(), new Date(), 1, "file size limit exceeded", code);
                 throw new IOException("REJECTED URL " + request.url() + " because file size '" + contentLength + "' exceeds max filesize limit of " + maxFileSize + " bytes. (GET)");
             }
 

@@ -201,7 +201,7 @@ public final class HTTPLoader {
             return response;
         } else {
             // if the response has not the right response type then reject file
-            sb.crawlQueues.errorURL.push(request, sb.peers.mySeed().hash.getBytes(), new Date(), 1, "wrong http status code " + code);
+            sb.crawlQueues.errorURL.push(request, sb.peers.mySeed().hash.getBytes(), new Date(), 1, "wrong http status code", code);
             throw new IOException("REJECTED WRONG STATUS TYPE '" + client.getHttpResponse().getStatusLine() + "' for URL " + request.url().toString());
         }
     }
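All HTTPLoader rejections now report the real response code where one exists; only the pre-network failures ("redirection counter exceeded", "url in blacklist") keep -1. Note also the recursion shape: each followed redirect re-enters load() with retryCount - 1 until the budget is spent. A toy model of that budget (names and redirect rule invented for the demo):

```java
import java.io.IOException;

public class RedirectBudgetDemo {
    // stand-in for HTTPLoader.load(request, retryCount, ...)
    static String load(String url, int retryCount) throws IOException {
        if (retryCount < 0) throw new IOException("redirection counter exceeded");
        // pretend every URL under /r/ redirects one level further
        if (url.contains("/r/")) return load(url.replaceFirst("/r/", "/"), retryCount - 1);
        return "200 OK for " + url;
    }

    public static void main(String[] args) throws IOException {
        System.out.println(load("http://example.com/r/r/page", 2)); // succeeds
        try { load("http://example.com/r/r/r/page", 1); }
        catch (IOException e) { System.out.println(e.getMessage()); } // budget exhausted
    }
}
```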
@@ -133,7 +133,8 @@ public class YMarkEntry extends TreeMap<String, String> {
             case DATE_MODIFIED:
             case DATE_VISITED:
                 this.put(b.key(), String.valueOf(System.currentTimeMillis()));
+                break;
             default:
                 break;
             }
         }
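A classic switch fall-through fix: without the added break, setting the timestamp fell straight into the default branch. Minimal reproduction of the pattern:

```java
public class FallThroughDemo {
    static String handle(int c) {
        StringBuilder sb = new StringBuilder();
        switch (c) {
            case 1:
                sb.append("timestamp set;");
                break; // without this line, execution falls into default
            default:
                sb.append("default;");
                break;
        }
        return sb.toString();
    }

    public static void main(String[] args) {
        System.out.println(handle(1)); // "timestamp set;" (no default leakage)
        System.out.println(handle(2)); // "default;"
    }
}
```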
@@ -112,7 +112,7 @@ public class YMarkTables {
         this.deleteBookmark(bmk_user, YMarkUtil.getBookmarkId(url));
     }
 
-    public TreeMap<String, YMarkTag> getTags(final Iterator<Row> rowIterator) throws IOException {
+    public TreeMap<String, YMarkTag> getTags(final Iterator<Row> rowIterator) {
         final TreeMap<String,YMarkTag> tags = new TreeMap<String,YMarkTag>();
         Tables.Row bmk_row = null;
         Iterator<String> tit = null;
@@ -305,7 +305,7 @@ public final class HTTPDFileHandler {
         final boolean accountEmpty = adminAccountBase64MD5.length() == 0;
         final boolean softauth = accessFromLocalhost && authorization != null && authorization.length() > 6 && (adminAccountBase64MD5.equals(authorization.substring(6)));
 
-        if (protectedPage && ((!softauth && !grantedForLocalhost && !accountEmpty) || requestHeader.userAgent().startsWith("yacybot"))) {
+        if (protectedPage && !softauth && ((!grantedForLocalhost && !accountEmpty) || requestHeader.userAgent().startsWith("yacybot"))) {
             // authentication required
             if (authorization == null) {
                 // no authorization given in response. Ask for that
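The rewritten condition hoists !softauth out of the disjunction, so a request that already carries valid admin credentials from localhost is no longer challenged just because its user agent starts with "yacybot". Comparing the two predicates on that exact case:

```java
public class AuthPredicateDemo {
    public static void main(String[] args) {
        boolean protectedPage = true, softauth = true,
                grantedForLocalhost = false, accountEmpty = false, yacybot = true;

        boolean before = protectedPage &&
                ((!softauth && !grantedForLocalhost && !accountEmpty) || yacybot);
        boolean after = protectedPage && !softauth &&
                ((!grantedForLocalhost && !accountEmpty) || yacybot);

        System.out.println(before); // true  -> auth demanded despite softauth
        System.out.println(after);  // false -> softauth now suffices
    }
}
```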
@@ -523,6 +523,11 @@ public final class Switchboard extends serverSwitch {
         log.logConfig("Parser: Initializing Mime Type deny list");
         TextParser.setDenyMime(getConfig(SwitchboardConstants.PARSER_MIME_DENY, ""));
 
+        // set up the solr interface
+        String solrurl = this.getConfig("federated.service.solr.indexing.url", "http://127.0.0.1:8983/solr");
+        boolean usesolr = this.getConfigBool("federated.service.solr.indexing.enabled", false) & solrurl.length() > 0;
+        this.solrConnector = (usesolr) ? new SolrSingleConnector(solrurl, SolrScheme.SolrCell) : null;
+
         // start a loader
         log.logConfig("Starting Crawl Loader");
         this.loader = new LoaderDispatcher(this);

@@ -605,11 +610,6 @@ public final class Switchboard extends serverSwitch {
             }
         }
 
-        // set up the solr interface
-        String solrurl = this.getConfig("federated.service.solr.indexing.url", "http://127.0.0.1:8983/solr");
-        boolean usesolr = this.getConfigBool("federated.service.solr.indexing.enabled", false) & solrurl.length() > 0;
-        this.solrConnector = (usesolr) ? new SolrSingleConnector(solrurl, SolrScheme.SolrCell) : null;
-
         // initializing dht chunk generation
         this.dhtMaxReferenceCount = (int) getConfigLong(SwitchboardConstants.INDEX_DIST_CHUNK_SIZE_START, 50);
 

@@ -2423,7 +2423,7 @@ public final class Switchboard extends serverSwitch {
             0,
             0,
             0);
-        crawlQueues.errorURL.push(bentry, initiator, new Date(), 0, failreason);
+        crawlQueues.errorURL.push(bentry, initiator, new Date(), 0, failreason, -1);
     }
 
     public final void heuristicSite(final SearchEvent searchEvent, final String host) {
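The Solr setup block moves up in startup order because the CrawlQueues/ZURL construction that follows now reads this.solrConnector. The gate itself combines two settings, shown standalone below with system properties standing in for YaCy's getConfig/getConfigBool (an assumption for the demo; the keys and defaults are the real ones from the hunk):

```java
public class SolrGateDemo {
    public static void main(String[] args) {
        // stand-ins for getConfig("federated.service.solr.indexing.url", ...)
        // and getConfigBool("federated.service.solr.indexing.enabled", false)
        String solrurl = System.getProperty("solr.url", "http://127.0.0.1:8983/solr");
        boolean enabled = Boolean.getBoolean("solr.enabled");
        boolean usesolr = enabled & solrurl.length() > 0; // non-short-circuit &, as in the hunk
        System.out.println(usesolr ? "connecting to " + solrurl : "solr disabled");
    }
}
```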
@@ -59,8 +59,8 @@ public enum SolrScheme {
         solrdoc.addField("id", id);
         solrdoc.addField("sku", digestURI.toNormalform(true, false), 3.0f);
         InetAddress address = Domains.dnsResolve(digestURI.getHost());
-        if (address != null) solrdoc.addField("attr_ip", address.getHostAddress());
-        if (digestURI.getHost() != null) solrdoc.addField("attr_host", digestURI.getHost());
+        if (address != null) solrdoc.addField("ip_s", address.getHostAddress());
+        if (digestURI.getHost() != null) solrdoc.addField("host_s", digestURI.getHost());
         solrdoc.addField("title", yacydoc.dc_title());
         solrdoc.addField("author", yacydoc.dc_creator());
         solrdoc.addField("description", yacydoc.dc_description());

@@ -68,7 +68,7 @@ public enum SolrScheme {
         solrdoc.addField("last_modified", header.lastModified());
         solrdoc.addField("keywords", yacydoc.dc_subject(' '));
         String content = UTF8.String(yacydoc.getTextBytes());
-        solrdoc.addField("attr_text", content);
+        solrdoc.addField("text_t", content);
         int contentwc = content.split(" ").length;
         solrdoc.addField("wordcount_i", contentwc);
 

@@ -111,14 +111,14 @@ public enum SolrScheme {
         solrdoc.addField("attr_outboundlinks", yacydoc.outboundLinks().toArray());
 
         // charset
-        solrdoc.addField("attr_charset", yacydoc.getCharset());
+        solrdoc.addField("charset_s", yacydoc.getCharset());
 
         // coordinates
         if (yacydoc.lat() != 0.0f && yacydoc.lon() != 0.0f) {
             solrdoc.addField("lon_coordinate", yacydoc.lon());
             solrdoc.addField("lat_coordinate", yacydoc.lat());
         }
-        solrdoc.addField("attr_httpstatus", "200");
+        solrdoc.addField("httpstatus_i", 200);
         Object parser = yacydoc.getParserObject();
         if (parser instanceof ContentScraper) {
             ContentScraper html = (ContentScraper) parser;

@@ -137,9 +137,9 @@ public enum SolrScheme {
             // meta tags
             Map<String, String> metas = html.getMetas();
             String robots = metas.get("robots");
-            if (robots != null) solrdoc.addField("attr_meta_robots", robots);
+            if (robots != null) solrdoc.addField("metarobots_t", robots);
             String generator = metas.get("generator");
-            if (generator != null) solrdoc.addField("attr_meta_generator", generator);
+            if (generator != null) solrdoc.addField("metagenerator_t", generator);
 
             // bold, italic
             String[] bold = html.getBold();
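These renames implement the commit message's "no multi-value fields where no multi values are": single-valued data moves off the attr_* wildcard (multi-valued in YaCy's schema) onto conventional single-valued dynamic-field suffixes, assuming the usual schema.xml conventions (*_s string, *_i integer, *_t tokenized text). The same fields written with SolrJ:

```java
import org.apache.solr.common.SolrInputDocument;

public class DynamicFieldDemo {
    public static void main(String[] args) {
        SolrInputDocument doc = new SolrInputDocument();
        doc.addField("host_s", "example.com");    // was attr_host
        doc.addField("httpstatus_i", 200);        // was attr_httpstatus (the string "200")
        doc.addField("text_t", "page body ...");  // was attr_text
        System.out.println(doc.getFieldNames());
    }
}
```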
@@ -26,6 +26,7 @@ package net.yacy.cora.services.federated.solr;
 
 import java.io.File;
 import java.io.IOException;
+import java.net.InetAddress;
 import java.net.MalformedURLException;
 import java.util.ArrayList;
 import java.util.Collection;

@@ -41,8 +42,11 @@ import org.apache.solr.client.solrj.response.QueryResponse;
 import org.apache.solr.common.SolrDocumentList;
 import org.apache.solr.common.SolrInputDocument;
 
+import net.yacy.cora.document.UTF8;
+import net.yacy.cora.protocol.Domains;
 import net.yacy.cora.protocol.ResponseHeader;
 import net.yacy.document.Document;
+import net.yacy.kelondro.data.meta.DigestURI;
 import net.yacy.kelondro.logging.Log;
 
 

@@ -189,11 +193,10 @@ public class SolrSingleConnector {
      */
 
     public void add(String id, ResponseHeader header, Document doc) throws IOException {
-        add(id, header, doc, this.scheme);
+        add(this.scheme.yacy2solr(id, header, doc));
     }
 
-    public void add(String id, ResponseHeader header, Document doc, SolrScheme tempScheme) throws IOException {
-        SolrInputDocument solrdoc = tempScheme.yacy2solr(id, header, doc);
+    private void add(SolrInputDocument solrdoc) throws IOException {
         int thisrrc = this.transmissionRoundRobinCounter;
         int nextrrc = thisrrc++;
         if (nextrrc >= transmissionQueueCount) nextrrc = 0;

@@ -223,6 +226,28 @@ public class SolrSingleConnector {
         }
     }
 
+    public void err(DigestURI digestURI, String failReason, int httpstatus) throws IOException {
+
+        SolrInputDocument solrdoc = new SolrInputDocument();
+        solrdoc.addField("id", UTF8.String(digestURI.hash()));
+        solrdoc.addField("sku", digestURI.toNormalform(true, false), 3.0f);
+        InetAddress address = Domains.dnsResolve(digestURI.getHost());
+        if (address != null) solrdoc.addField("ip_s", address.getHostAddress());
+        if (digestURI.getHost() != null) solrdoc.addField("host_s", digestURI.getHost());
+
+        // path elements of link
+        String path = digestURI.getPath();
+        if (path != null) {
+            String[] paths = path.split("/");
+            if (paths.length > 0) solrdoc.addField("attr_paths", paths);
+        }
+
+        solrdoc.addField("failreason_t", failReason);
+        solrdoc.addField("httpstatus_i", httpstatus);
+
+        add(solrdoc);
+    }
+
     private void flushTransmissionQueue(int idx) throws IOException {
         Collection<SolrInputDocument> c = new ArrayList<SolrInputDocument>();
         while (this.transmissionQueue[idx].size() > 0) {
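With err() in place, every rejected URL becomes a Solr document carrying failreason_t and httpstatus_i, so crawl failures can be inspected with an ordinary query. A sketch using the SolrJ client of that era (CommonsHttpSolrServer; the core URL is an assumption, and later SolrJ versions call this HttpSolrClient):

```java
import org.apache.solr.client.solrj.SolrQuery;
import org.apache.solr.client.solrj.impl.CommonsHttpSolrServer;
import org.apache.solr.client.solrj.response.QueryResponse;

public class QueryErrorsDemo {
    public static void main(String[] args) throws Exception {
        // assumed endpoint; matches the default in the Switchboard hunk
        CommonsHttpSolrServer server =
                new CommonsHttpSolrServer("http://127.0.0.1:8983/solr");
        // find all documents that were written by err(), i.e. have a fail reason
        QueryResponse rsp = server.query(new SolrQuery("failreason_t:[* TO *]"));
        System.out.println(rsp.getResults().getNumFound() + " error URLs recorded");
    }
}
```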