- refactoring of the robots.txt handling classes (RobotsEntry -> RobotsTxtEntry, robotsParser -> RobotsTxtParser, tightened visibility of fields and methods)

- added an option to the crawler to send error URLs to Solr (see the sketch below)
- changed the Solr scheme slightly (single-valued fields instead of multi-valued ones where only single values occur)

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@7693 6c8d7289-2bf4-0310-a012-ef5d649a1542
orbiter 2011-05-02 14:05:51 +00:00
parent 1ea0bc775c
commit 6fa439c82b
20 changed files with 156 additions and 124 deletions
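For orientation, here is a minimal sketch of the new error path: when a URL is rejected, ZURL.push(...) now also receives an HTTP status code and, if a Solr connector is configured, forwards the failure as a Solr document using the new single-valued fields (ip_s, host_s, failreason_t, httpstatus_i). The class HypotheticalErrorDoc and its buildErrorDoc() helper below are illustrative only and not part of the YaCy code; the field names and the -1 convention for "no HTTP status received" are taken from the diff that follows.

import java.net.InetAddress;
import java.net.UnknownHostException;

import org.apache.solr.common.SolrInputDocument;

public final class HypotheticalErrorDoc {

    /** Builds a single-valued Solr document describing a failed crawl attempt. */
    public static SolrInputDocument buildErrorDoc(final String urlHash, final String url,
                                                  final String host, final String failReason,
                                                  final int httpstatus) {
        final SolrInputDocument doc = new SolrInputDocument();
        doc.addField("id", urlHash);              // URL hash identifies the document
        doc.addField("sku", url);                 // the normalized URL itself
        if (host != null) {
            doc.addField("host_s", host);         // single-valued host name (was attr_host)
            try {
                doc.addField("ip_s", InetAddress.getByName(host).getHostAddress());
            } catch (final UnknownHostException e) {
                // unresolved host: simply omit ip_s
            }
        }
        doc.addField("failreason_t", failReason); // e.g. "denied by robots.txt"
        doc.addField("httpstatus_i", httpstatus); // -1 when no HTTP status was received
        return doc;
    }

    public static void main(final String[] args) {
        System.out.println(buildErrorDoc("abcdef123456", "http://example.org/cgi?session=1",
                "example.org", "individual url (sessionid etc) not wanted", -1));
    }
}

In the actual commit the corresponding logic lives in ZURL.push(...) and SolrSingleConnector.err(...), shown in the hunks below.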

View File

@ -385,7 +385,7 @@ public class Crawler_p {
sb.peers.mySeed().hash.getBytes(),
new Date(),
1,
reasonString);
reasonString, -1);
}
} catch (final PatternSyntaxException e) {
prop.put("info", "4"); // crawlfilter does not match url

View File

@ -105,7 +105,7 @@ public class WebStructurePicture_p {
} else {
// find start hash
String hash = null;
try {
if (host != null && host.length() > 0) try {
hash = UTF8.String((new DigestURI("http://" + host)).hash(), 6, 6);
} catch (final MalformedURLException e) {Log.logException(e);}
//assert (sb.webStructure.outgoingReferences(hash) != null);

View File

@ -9,7 +9,7 @@ import net.yacy.document.parser.html.ContentScraper;
import net.yacy.kelondro.data.meta.DigestURI;
import de.anomic.crawler.CrawlProfile;
import de.anomic.crawler.RobotsEntry;
import de.anomic.crawler.RobotsTxtEntry;
import de.anomic.search.Switchboard;
import de.anomic.server.serverObjects;
import de.anomic.server.serverSwitch;
@ -106,7 +106,7 @@ public class getpageinfo_p {
final DigestURI theURL = new DigestURI(url);
// determine if crawling of the current URL is allowed
RobotsEntry robotsEntry;
RobotsTxtEntry robotsEntry;
try {
robotsEntry = sb.robots.getEntry(theURL, sb.peers.myBotIDs());
} catch (IOException e) {

View File

@ -162,7 +162,7 @@ public final class crawlReceipt {
youare.getBytes(),
null,
0,
result + ":" + reason);
result + ":" + reason, -1);
//switchboard.noticeURL.remove(receivedUrlhash);
prop.put("delay", "3600");
return prop;

View File

@ -85,7 +85,8 @@ public class urls {
sb.peers.mySeed().hash.getBytes(),
new Date(),
0,
"client=____________");
"client=____________",
-1);
// create RSS entry
prop.put("item_" + c + "_title", "");

View File

@ -80,8 +80,8 @@ public class CrawlQueues {
log.logConfig("Starting Crawling Management");
noticeURL = new NoticedURL(queuePath, sb.peers.myBotIDs(), sb.useTailCache, sb.exceed134217727);
FileUtils.deletedelete(new File(queuePath, ERROR_DB_FILENAME));
errorURL = new ZURL(queuePath, ERROR_DB_FILENAME, false, sb.useTailCache, sb.exceed134217727);
delegatedURL = new ZURL(queuePath, DELEGATED_DB_FILENAME, true, sb.useTailCache, sb.exceed134217727);
errorURL = new ZURL(sb.solrConnector, queuePath, ERROR_DB_FILENAME, false, sb.useTailCache, sb.exceed134217727);
delegatedURL = new ZURL(sb.solrConnector, queuePath, DELEGATED_DB_FILENAME, true, sb.useTailCache, sb.exceed134217727);
}
public void relocate(final File newQueuePath) {
@ -92,8 +92,8 @@ public class CrawlQueues {
noticeURL = new NoticedURL(newQueuePath, sb.peers.myBotIDs(), sb.useTailCache, sb.exceed134217727);
FileUtils.deletedelete(new File(newQueuePath, ERROR_DB_FILENAME));
errorURL = new ZURL(newQueuePath, ERROR_DB_FILENAME, false, sb.useTailCache, sb.exceed134217727);
delegatedURL = new ZURL(newQueuePath, DELEGATED_DB_FILENAME, true, sb.useTailCache, sb.exceed134217727);
errorURL = new ZURL(sb.solrConnector, newQueuePath, ERROR_DB_FILENAME, false, sb.useTailCache, sb.exceed134217727);
delegatedURL = new ZURL(sb.solrConnector, newQueuePath, DELEGATED_DB_FILENAME, true, sb.useTailCache, sb.exceed134217727);
}
public void close() {
@ -571,7 +571,7 @@ public class CrawlQueues {
try {
// checking robots.txt for http(s) resources
this.request.setStatus("worker-checkingrobots", WorkflowJob.STATUS_STARTED);
RobotsEntry robotsEntry;
RobotsTxtEntry robotsEntry;
if ((request.url().getProtocol().equals("http") || request.url().getProtocol().equals("https")) &&
(robotsEntry = sb.robots.getEntry(request.url(), sb.peers.myBotIDs())) != null &&
robotsEntry.isDisallowed(request.url())) {
@ -581,7 +581,7 @@ public class CrawlQueues {
UTF8.getBytes(sb.peers.mySeed().hash),
new Date(),
1,
"denied by robots.txt");
"denied by robots.txt", -1);
this.request.setStatus("worker-disallowed", WorkflowJob.STATUS_FINISHED);
} else {
// starting a load from the internet
@ -617,7 +617,7 @@ public class CrawlQueues {
UTF8.getBytes(sb.peers.mySeed().hash),
new Date(),
1,
"cannot load: " + result);
"cannot load: " + result, -1);
this.request.setStatus("worker-error", WorkflowJob.STATUS_FINISHED);
} else {
this.request.setStatus("worker-processed", WorkflowJob.STATUS_FINISHED);
@ -629,7 +629,7 @@ public class CrawlQueues {
UTF8.getBytes(sb.peers.mySeed().hash),
new Date(),
1,
e.getMessage() + " - in worker");
e.getMessage() + " - in worker", -1);
Log.logException(e);
// Client.initConnectionManager();
this.request.setStatus("worker-exception", WorkflowJob.STATUS_FINISHED);

View File

@ -202,7 +202,7 @@ public final class CrawlStacker {
// if the url was rejected we store it into the error URL db
if (rejectReason != null) {
nextQueue.errorURL.push(entry, UTF8.getBytes(peers.mySeed().hash), new Date(), 1, rejectReason);
nextQueue.errorURL.push(entry, UTF8.getBytes(peers.mySeed().hash), new Date(), 1, rejectReason, -1);
}
} catch (final Exception e) {
CrawlStacker.this.log.logWarning("Error while processing stackCrawl entry.\n" + "Entry: " + entry.toString() + "Error: " + e.toString(), e);
@ -469,9 +469,9 @@ public final class CrawlStacker {
}
// deny cgi
if (url.isIndividual()) {
if (url.isIndividual() && !(profile.crawlingQ())) { // TODO: make special property for crawlingIndividual
if (this.log.isFine()) this.log.logFine("URL '" + url.toString() + "' is CGI URL.");
return "cgi url not allowed";
return "individual url (sessionid etc) not wanted";
}
// deny post properties

View File

@ -186,7 +186,7 @@ public class Latency {
// find the delay as given by robots.txt on target site
long robotsDelay = 0;
if (!local) {
RobotsEntry robotsEntry;
RobotsTxtEntry robotsEntry;
try {
robotsEntry = Switchboard.getSwitchboard().robots.getEntry(url, thisAgents);
} catch (IOException e) {
@ -239,7 +239,7 @@ public class Latency {
// find the delay as given by robots.txt on target site
long robotsDelay = 0;
if (!local) {
RobotsEntry robotsEntry;
RobotsTxtEntry robotsEntry;
try {
robotsEntry = Switchboard.getSwitchboard().robots.getEntry(url, thisAgents);
} catch (IOException e) {

View File

@ -11,16 +11,16 @@
//Revision: $LastChangedRevision$
//
//This program is free software; you can redistribute it and/or modify
//it under the terms of the GNU General Public License as published by
//it under the terms of the GNU General public License as published by
//the Free Software Foundation; either version 2 of the License, or
//(at your option) any later version.
//
//This program is distributed in the hope that it will be useful,
//but WITHOUT ANY WARRANTY; without even the implied warranty of
//MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
//GNU General Public License for more details.
//GNU General public License for more details.
//
//You should have received a copy of the GNU General Public License
//You should have received a copy of the GNU General public License
//along with this program; if not, write to the Free Software
//Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
@ -51,15 +51,15 @@ public class RobotsTxt {
private static Logger log = Logger.getLogger(RobotsTxt.class);
public static final String ROBOTS_DB_PATH_SEPARATOR = ";";
public static final Pattern ROBOTS_DB_PATH_SEPARATOR_MATCHER = Pattern.compile(ROBOTS_DB_PATH_SEPARATOR);
protected static final String ROBOTS_DB_PATH_SEPARATOR = ";";
protected static final Pattern ROBOTS_DB_PATH_SEPARATOR_MATCHER = Pattern.compile(ROBOTS_DB_PATH_SEPARATOR);
BEncodedHeap robotsTable;
private final ConcurrentHashMap<String, DomSync> syncObjects;
//private static final HashSet<String> loadedRobots = new HashSet<String>(); // only for debugging
private static class DomSync {
public DomSync() {}
private DomSync() {}
}
public RobotsTxt(final BEncodedHeap robotsTable) {
@ -78,16 +78,16 @@ public class RobotsTxt {
return this.robotsTable.size();
}
public RobotsEntry getEntry(final MultiProtocolURI theURL, final Set<String> thisAgents) throws IOException {
public RobotsTxtEntry getEntry(final MultiProtocolURI theURL, final Set<String> thisAgents) throws IOException {
if (theURL == null) throw new IllegalArgumentException();
if (!theURL.getProtocol().startsWith("http")) return null;
return getEntry(theURL, thisAgents, true);
}
private RobotsEntry getEntry(final MultiProtocolURI theURL, final Set<String> thisAgents, final boolean fetchOnlineIfNotAvailableOrNotFresh) throws IOException {
private RobotsTxtEntry getEntry(final MultiProtocolURI theURL, final Set<String> thisAgents, final boolean fetchOnlineIfNotAvailableOrNotFresh) throws IOException {
// this method will always return a non-null value
String urlHostPort = getHostPort(theURL);
RobotsEntry robotsTxt4Host = null;
RobotsTxtEntry robotsTxt4Host = null;
Map<String, byte[]> record;
try {
record = this.robotsTable.get(this.robotsTable.encodedKey(urlHostPort));
@ -95,7 +95,7 @@ public class RobotsTxt {
log.warn("memory exhausted", e);
record = null;
}
if (record != null) robotsTxt4Host = new RobotsEntry(urlHostPort, record);
if (record != null) robotsTxt4Host = new RobotsTxtEntry(urlHostPort, record);
if (fetchOnlineIfNotAvailableOrNotFresh && (
robotsTxt4Host == null ||
@ -123,7 +123,7 @@ public class RobotsTxt {
log.warn("memory exhausted", e);
record = null;
}
if (record != null) robotsTxt4Host = new RobotsEntry(urlHostPort, record);
if (record != null) robotsTxt4Host = new RobotsTxtEntry(urlHostPort, record);
if (robotsTxt4Host != null &&
robotsTxt4Host.getLoadedDate() != null &&
System.currentTimeMillis() - robotsTxt4Host.getLoadedDate().getTime() <= 1*24*60*60*1000) {
@ -160,7 +160,7 @@ public class RobotsTxt {
// no robots.txt available, make an entry to prevent that the robots loading is done twice
if (robotsTxt4Host == null) {
// generate artificial entry
robotsTxt4Host = new RobotsEntry(
robotsTxt4Host = new RobotsTxtEntry(
robotsURL,
new ArrayList<String>(),
new ArrayList<String>(),
@ -183,7 +183,7 @@ public class RobotsTxt {
addEntry(robotsTxt4Host);
}
} else {
final robotsParser parserResult = new robotsParser((byte[]) result[DOWNLOAD_ROBOTS_TXT], thisAgents);
final RobotsTxtParser parserResult = new RobotsTxtParser((byte[]) result[DOWNLOAD_ROBOTS_TXT], thisAgents);
ArrayList<String> denyPath = parserResult.denyList();
if (((Boolean) result[DOWNLOAD_ACCESS_RESTRICTED]).booleanValue()) {
denyPath = new ArrayList<String>();
@ -208,7 +208,7 @@ public class RobotsTxt {
return robotsTxt4Host;
}
private RobotsEntry addEntry(
private RobotsTxtEntry addEntry(
final MultiProtocolURI theURL,
final ArrayList<String> allowPathList,
final ArrayList<String> denyPathList,
@ -219,7 +219,7 @@ public class RobotsTxt {
final long crawlDelayMillis,
final String agentName
) {
final RobotsEntry entry = new RobotsEntry(
final RobotsTxtEntry entry = new RobotsTxtEntry(
theURL, allowPathList, denyPathList,
loadedDate, modDate,
eTag, sitemap, crawlDelayMillis, agentName);
@ -227,7 +227,7 @@ public class RobotsTxt {
return entry;
}
private String addEntry(final RobotsEntry entry) {
private String addEntry(final RobotsTxtEntry entry) {
// writes a new page and returns key
try {
this.robotsTable.insert(this.robotsTable.encodedKey(entry.getHostName()), entry.getMem());
@ -240,10 +240,10 @@ public class RobotsTxt {
// methods that had been in robotsParser.java:
public static final int DOWNLOAD_ACCESS_RESTRICTED = 0;
public static final int DOWNLOAD_ROBOTS_TXT = 1;
public static final int DOWNLOAD_ETAG = 2;
public static final int DOWNLOAD_MODDATE = 3;
private static final int DOWNLOAD_ACCESS_RESTRICTED = 0;
private static final int DOWNLOAD_ROBOTS_TXT = 1;
private static final int DOWNLOAD_ETAG = 2;
private static final int DOWNLOAD_MODDATE = 3;
static final String getHostPort(final MultiProtocolURI theURL) {
String urlHostPort = null;
@ -267,7 +267,7 @@ public class RobotsTxt {
return port;
}
private static Object[] downloadRobotsTxt(final MultiProtocolURI robotsURL, int redirectionCount, final RobotsEntry entry) throws Exception {
private static Object[] downloadRobotsTxt(final MultiProtocolURI robotsURL, int redirectionCount, final RobotsTxtEntry entry) throws Exception {
if (robotsURL == null || !robotsURL.getProtocol().startsWith("http")) return null;
if (redirectionCount < 0) return new Object[]{Boolean.FALSE,null,null};

View File

@ -13,16 +13,16 @@
//Revision: $LastChangedRevision$
//
//This program is free software; you can redistribute it and/or modify
//it under the terms of the GNU General Public License as published by
//it under the terms of the GNU General public License as published by
//the Free Software Foundation; either version 2 of the License, or
//(at your option) any later version.
//
//This program is distributed in the hope that it will be useful,
//but WITHOUT ANY WARRANTY; without even the implied warranty of
//MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
//GNU General Public License for more details.
//GNU General public License for more details.
//
//You should have received a copy of the GNU General Public License
//You should have received a copy of the GNU General public License
//along with this program; if not, write to the Free Software
//Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
@ -41,25 +41,25 @@ import net.yacy.cora.document.UTF8;
import net.yacy.kelondro.util.ByteArray;
public class RobotsEntry {
public class RobotsTxtEntry {
public static final String HOST_NAME = "hostname";
public static final String ALLOW_PATH_LIST = "allow";
public static final String DISALLOW_PATH_LIST = "disallow";
public static final String LOADED_DATE = "date";
public static final String MOD_DATE = "modDate";
public static final String ETAG = "etag";
public static final String SITEMAP = "sitemap";
public static final String CRAWL_DELAY = "crawlDelay";
public static final String CRAWL_DELAY_MILLIS = "crawlDelayMillis";
public static final String AGENT_NAME = "agentname";
private static final String HOST_NAME = "hostname";
private static final String ALLOW_PATH_LIST = "allow";
private static final String DISALLOW_PATH_LIST = "disallow";
private static final String LOADED_DATE = "date";
private static final String MOD_DATE = "modDate";
private static final String ETAG = "etag";
private static final String SITEMAP = "sitemap";
private static final String CRAWL_DELAY = "crawlDelay";
private static final String CRAWL_DELAY_MILLIS = "crawlDelayMillis";
private static final String AGENT_NAME = "agentname";
// this is a simple record structure that holds all properties of a single crawl start
private final Map<String, byte[]> mem;
private final List<String> allowPathList, denyPathList;
private final String hostName, agentName;
public RobotsEntry(final String hostName, final Map<String, byte[]> mem) {
protected RobotsTxtEntry(final String hostName, final Map<String, byte[]> mem) {
this.hostName = hostName.toLowerCase();
this.mem = mem;
@ -90,7 +90,7 @@ public class RobotsEntry {
this.agentName = this.mem.containsKey(AGENT_NAME) ? UTF8.String(this.mem.get(AGENT_NAME)) : null;
}
public RobotsEntry(
protected RobotsTxtEntry(
final MultiProtocolURI theURL,
final List<String> allowPathList,
final List<String> disallowPathList,
@ -140,15 +140,15 @@ public class RobotsEntry {
}
}
public String getHostName() {
protected String getHostName() {
return this.hostName;
}
public String getAgentName() {
protected String getAgentName() {
return this.agentName;
}
public Map<String, byte[]> getMem() {
protected Map<String, byte[]> getMem() {
if (!this.mem.containsKey(HOST_NAME)) this.mem.put(HOST_NAME, UTF8.getBytes(this.hostName));
return this.mem;
}
@ -175,34 +175,34 @@ public class RobotsEntry {
}
}
public Date getLoadedDate() {
protected Date getLoadedDate() {
if (this.mem.containsKey(LOADED_DATE)) {
return new Date(ByteArray.parseDecimal(this.mem.get(LOADED_DATE)));
}
return null;
}
public void setLoadedDate(final Date newLoadedDate) {
protected void setLoadedDate(final Date newLoadedDate) {
if (newLoadedDate != null) {
this.mem.put(LOADED_DATE, UTF8.getBytes(Long.toString(newLoadedDate.getTime())));
}
}
public Date getModDate() {
protected Date getModDate() {
if (this.mem.containsKey(MOD_DATE)) {
return new Date(ByteArray.parseDecimal(this.mem.get(MOD_DATE)));
}
return null;
}
public String getETag() {
protected String getETag() {
if (this.mem.containsKey(ETAG)) {
return UTF8.String(this.mem.get(ETAG));
}
return null;
}
public long getCrawlDelayMillis() {
protected long getCrawlDelayMillis() {
if (this.mem.containsKey(CRAWL_DELAY_MILLIS)) try {
return ByteArray.parseDecimal(this.mem.get(CRAWL_DELAY_MILLIS));
} catch (final NumberFormatException e) {

View File

@ -10,16 +10,16 @@
Revision: $LastChangedRevision$
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
it under the terms of the GNU General public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
GNU General public License for more details.
You should have received a copy of the GNU General Public License
You should have received a copy of the GNU General private License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
@ -59,16 +59,16 @@ import java.util.regex.Pattern;
* See: http://www.kollar.com/robots.html
*/
public final class robotsParser {
public final class RobotsTxtParser {
private static final Pattern patternTab = Pattern.compile("\t");
public static final String ROBOTS_USER_AGENT = "User-agent:".toUpperCase();
public static final String ROBOTS_DISALLOW = "Disallow:".toUpperCase();
public static final String ROBOTS_ALLOW = "Allow:".toUpperCase();
public static final String ROBOTS_COMMENT = "#";
public static final String ROBOTS_SITEMAP = "Sitemap:".toUpperCase();
public static final String ROBOTS_CRAWL_DELAY = "Crawl-delay:".toUpperCase();
private static final String ROBOTS_USER_AGENT = "User-agent:".toUpperCase();
private static final String ROBOTS_DISALLOW = "Disallow:".toUpperCase();
private static final String ROBOTS_ALLOW = "Allow:".toUpperCase();
private static final String ROBOTS_COMMENT = "#";
private static final String ROBOTS_SITEMAP = "Sitemap:".toUpperCase();
private static final String ROBOTS_CRAWL_DELAY = "Crawl-delay:".toUpperCase();
private final ArrayList<String> allowList;
private final ArrayList<String> denyList;
@ -77,7 +77,7 @@ public final class robotsParser {
private final Set<String> myNames; // a list of own name lists
private String agentName; // the name of the agent that was used to return the result
public robotsParser(final byte[] robotsTxt, final Set<String> myNames) {
protected RobotsTxtParser(final byte[] robotsTxt, final Set<String> myNames) {
this.allowList = new ArrayList<String>(0);
this.denyList = new ArrayList<String>(0);
this.sitemap = "";
@ -91,16 +91,6 @@ public final class robotsParser {
}
}
public robotsParser(final BufferedReader reader, final Set<String> myNames) {
this.allowList = new ArrayList<String>(0);
this.denyList = new ArrayList<String>(0);
this.sitemap = "";
this.crawlDelayMillis = 0;
this.myNames = myNames;
this.agentName = null;
if (reader != null) parse(reader);
}
private void parse(final BufferedReader reader) {
final ArrayList<String> deny4AllAgents = new ArrayList<String>();
final ArrayList<String> deny4ThisAgents = new ArrayList<String>();
@ -260,7 +250,7 @@ public final class robotsParser {
* does not make any no-DOS-forced crawl pause.
* @return the crawl delay between two crawl access times in milliseconds
*/
public long crawlDelayMillis() {
protected long crawlDelayMillis() {
return this.crawlDelayMillis;
}
@ -271,19 +261,19 @@ public final class robotsParser {
* Effects: see also comment to crawlDelayMillis()
* @return the name of the user agent that was used for the result properties or null if no user agent name was used to identify the agent
*/
public String agentName() {
protected String agentName() {
return this.agentName;
}
public String sitemap() {
protected String sitemap() {
return this.sitemap;
}
public ArrayList<String> allowList() {
protected ArrayList<String> allowList() {
return this.allowList;
}
public ArrayList<String> denyList() {
protected ArrayList<String> denyList() {
return this.denyList;
}
}

View File

@ -34,6 +34,7 @@ import java.util.Iterator;
import java.util.concurrent.ConcurrentLinkedQueue;
import net.yacy.cora.document.UTF8;
import net.yacy.cora.services.federated.solr.SolrSingleConnector;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.data.word.Word;
import net.yacy.kelondro.index.Index;
@ -66,13 +67,16 @@ public class ZURL implements Iterable<ZURL.Entry> {
// the class object
private Index urlIndex;
private final ConcurrentLinkedQueue<byte[]> stack;
private final SolrSingleConnector solrConnector;
public ZURL(
final SolrSingleConnector solrConnector,
final File cachePath,
final String tablename,
final boolean startWithEmptyFile,
final boolean useTailCache,
final boolean exceed134217727) {
this.solrConnector = solrConnector;
// creates a new ZURL in a file
cachePath.mkdirs();
final File f = new File(cachePath, tablename);
@ -94,7 +98,8 @@ public class ZURL implements Iterable<ZURL.Entry> {
this.stack = new ConcurrentLinkedQueue<byte[]>();
}
public ZURL() {
public ZURL(final SolrSingleConnector solrConnector) {
this.solrConnector = solrConnector;
// creates a new ZUR in RAM
this.urlIndex = new RowSet(rowdef);
this.stack = new ConcurrentLinkedQueue<byte[]>();
@ -126,14 +131,24 @@ public class ZURL implements Iterable<ZURL.Entry> {
final byte[] executor,
final Date workdate,
final int workcount,
String anycause) {
String anycause,
int httpcode) {
// assert executor != null; // null == proxy !
if (exists(bentry.url().hash())) return; // don't insert double causes
if (anycause == null) anycause = "unknown";
Entry entry = new Entry(bentry, executor, workdate, workcount, anycause);
String reason = anycause + ((httpcode >= 0) ? " (http return code = " + httpcode + ")" : "");
Entry entry = new Entry(bentry, executor, workdate, workcount, reason);
put(entry);
stack.add(entry.hash());
Log.logInfo("Rejected URL", bentry.url().toNormalform(false, false) + " - " + anycause);
Log.logInfo("Rejected URL", bentry.url().toNormalform(false, false) + " - " + reason);
if (this.solrConnector != null) {
// send the error to solr
try {
this.solrConnector.err(bentry.url(), reason, httpcode);
} catch (IOException e) {
Log.logWarning("SOLR", "failed to send error " + bentry.url().toNormalform(true, false) + " to solr: " + e.getMessage());
}
}
while (stack.size() > maxStackSize) stack.poll();
}

View File

@ -152,7 +152,7 @@ public class FTPLoader {
if (berr.size() > 0 || response == null) {
// some error logging
final String detail = (berr.size() > 0) ? "Errorlog: " + berr.toString() : "";
sb.crawlQueues.errorURL.push(request, sb.peers.mySeed().hash.getBytes(), new Date(), 1, " ftp server download, " + detail);
sb.crawlQueues.errorURL.push(request, sb.peers.mySeed().hash.getBytes(), new Date(), 1, " ftp server download, " + detail, -1);
throw new IOException("FTPLoader: Unable to download URL '" + request.url().toString() + "': " + detail);
}

View File

@ -78,7 +78,7 @@ public final class HTTPLoader {
private Response load(final Request request, final int retryCount, final long maxFileSize, final boolean checkBlacklist) throws IOException {
if (retryCount < 0) {
sb.crawlQueues.errorURL.push(request, sb.peers.mySeed().hash.getBytes(), new Date(), 1, "redirection counter exceeded");
sb.crawlQueues.errorURL.push(request, sb.peers.mySeed().hash.getBytes(), new Date(), 1, "redirection counter exceeded", -1);
throw new IOException("Redirection counter exceeded for URL " + request.url().toString() + ". Processing aborted.");
}
@ -94,7 +94,7 @@ public final class HTTPLoader {
// check if url is in blacklist
final String hostlow = host.toLowerCase();
if (checkBlacklist && Switchboard.urlBlacklist.isListed(Blacklist.BLACKLIST_CRAWLER, hostlow, path)) {
sb.crawlQueues.errorURL.push(request, sb.peers.mySeed().hash.getBytes(), new Date(), 1, "url in blacklist");
sb.crawlQueues.errorURL.push(request, sb.peers.mySeed().hash.getBytes(), new Date(), 1, "url in blacklist", -1);
throw new IOException("CRAWLER Rejecting URL '" + request.url().toString() + "'. URL is in blacklist.");
}
@ -138,7 +138,7 @@ public final class HTTPLoader {
redirectionUrlString = redirectionUrlString.trim();
if (redirectionUrlString.length() == 0) {
sb.crawlQueues.errorURL.push(request, sb.peers.mySeed().hash.getBytes(), new Date(), 1, "redirection header empy");
sb.crawlQueues.errorURL.push(request, sb.peers.mySeed().hash.getBytes(), new Date(), 1, "redirection header empy", code);
throw new IOException("CRAWLER Redirection of URL=" + request.url().toString() + " aborted. Location header is empty.");
}
@ -151,14 +151,14 @@ public final class HTTPLoader {
// if we are already doing a shutdown we don't need to retry crawling
if (Thread.currentThread().isInterrupted()) {
sb.crawlQueues.errorURL.push(request, sb.peers.mySeed().hash.getBytes(), new Date(), 1, "server shutdown");
sb.crawlQueues.errorURL.push(request, sb.peers.mySeed().hash.getBytes(), new Date(), 1, "server shutdown", code);
throw new IOException("CRAWLER Retry of URL=" + request.url().toString() + " aborted because of server shutdown.");
}
// check if the url was already indexed
final String dbname = sb.urlExists(Segments.Process.LOCALCRAWLING, redirectionUrl.hash());
if (dbname != null) {
sb.crawlQueues.errorURL.push(request, sb.peers.mySeed().hash.getBytes(), new Date(), 1, "redirection to double content");
sb.crawlQueues.errorURL.push(request, sb.peers.mySeed().hash.getBytes(), new Date(), 1, "redirection to double content", code);
throw new IOException("CRAWLER Redirection of URL=" + request.url().toString() + " ignored. The url appears already in db " + dbname);
}
@ -167,12 +167,12 @@ public final class HTTPLoader {
return load(request, retryCount - 1, maxFileSize, checkBlacklist);
} else {
// no redirection url provided
sb.crawlQueues.errorURL.push(request, sb.peers.mySeed().hash.getBytes(), new Date(), 1, "no redirection url provided");
sb.crawlQueues.errorURL.push(request, sb.peers.mySeed().hash.getBytes(), new Date(), 1, "no redirection url provided", code);
throw new IOException("REJECTED EMTPY REDIRECTION '" + client.getHttpResponse().getStatusLine() + "' for URL " + request.url().toString());
}
} else if (responseBody == null) {
// no response, reject file
sb.crawlQueues.errorURL.push(request, sb.peers.mySeed().hash.getBytes(), new Date(), 1, "no response body (code = " + code + ")");
sb.crawlQueues.errorURL.push(request, sb.peers.mySeed().hash.getBytes(), new Date(), 1, "no response body", code);
throw new IOException("REJECTED EMPTY RESPONSE BODY '" + client.getHttpResponse().getStatusLine() + "' for URL " + request.url().toString());
} else if (code == 200 || code == 203) {
// the transfer is ok
@ -183,7 +183,7 @@ public final class HTTPLoader {
// check length again in case it was not possible to get the length before loading
if (maxFileSize > 0 && contentLength > maxFileSize) {
sb.crawlQueues.errorURL.push(request, sb.peers.mySeed().hash.getBytes(), new Date(), 1, "file size limit exceeded");
sb.crawlQueues.errorURL.push(request, sb.peers.mySeed().hash.getBytes(), new Date(), 1, "file size limit exceeded", code);
throw new IOException("REJECTED URL " + request.url() + " because file size '" + contentLength + "' exceeds max filesize limit of " + maxFileSize + " bytes. (GET)");
}
@ -201,7 +201,7 @@ public final class HTTPLoader {
return response;
} else {
// if the response has not the right response type then reject file
sb.crawlQueues.errorURL.push(request, sb.peers.mySeed().hash.getBytes(), new Date(), 1, "wrong http status code " + code);
sb.crawlQueues.errorURL.push(request, sb.peers.mySeed().hash.getBytes(), new Date(), 1, "wrong http status code", code);
throw new IOException("REJECTED WRONG STATUS TYPE '" + client.getHttpResponse().getStatusLine() + "' for URL " + request.url().toString());
}
}

View File

@ -133,6 +133,7 @@ public class YMarkEntry extends TreeMap<String, String> {
case DATE_MODIFIED:
case DATE_VISITED:
this.put(b.key(), String.valueOf(System.currentTimeMillis()));
break;
default:
break;
}

View File

@ -112,7 +112,7 @@ public class YMarkTables {
this.deleteBookmark(bmk_user, YMarkUtil.getBookmarkId(url));
}
public TreeMap<String, YMarkTag> getTags(final Iterator<Row> rowIterator) throws IOException {
public TreeMap<String, YMarkTag> getTags(final Iterator<Row> rowIterator) {
final TreeMap<String,YMarkTag> tags = new TreeMap<String,YMarkTag>();
Tables.Row bmk_row = null;
Iterator<String> tit = null;

View File

@ -305,7 +305,7 @@ public final class HTTPDFileHandler {
final boolean accountEmpty = adminAccountBase64MD5.length() == 0;
final boolean softauth = accessFromLocalhost && authorization != null && authorization.length() > 6 && (adminAccountBase64MD5.equals(authorization.substring(6)));
if (protectedPage && ((!softauth && !grantedForLocalhost && !accountEmpty) || requestHeader.userAgent().startsWith("yacybot"))) {
if (protectedPage && !softauth && ((!grantedForLocalhost && !accountEmpty) || requestHeader.userAgent().startsWith("yacybot"))) {
// authentication required
if (authorization == null) {
// no authorization given in response. Ask for that

View File

@ -523,6 +523,11 @@ public final class Switchboard extends serverSwitch {
log.logConfig("Parser: Initializing Mime Type deny list");
TextParser.setDenyMime(getConfig(SwitchboardConstants.PARSER_MIME_DENY, ""));
// set up the solr interface
String solrurl = this.getConfig("federated.service.solr.indexing.url", "http://127.0.0.1:8983/solr");
boolean usesolr = this.getConfigBool("federated.service.solr.indexing.enabled", false) & solrurl.length() > 0;
this.solrConnector = (usesolr) ? new SolrSingleConnector(solrurl, SolrScheme.SolrCell) : null;
// start a loader
log.logConfig("Starting Crawl Loader");
this.loader = new LoaderDispatcher(this);
@ -605,11 +610,6 @@ public final class Switchboard extends serverSwitch {
}
}
// set up the solr interface
String solrurl = this.getConfig("federated.service.solr.indexing.url", "http://127.0.0.1:8983/solr");
boolean usesolr = this.getConfigBool("federated.service.solr.indexing.enabled", false) & solrurl.length() > 0;
this.solrConnector = (usesolr) ? new SolrSingleConnector(solrurl, SolrScheme.SolrCell) : null;
// initializing dht chunk generation
this.dhtMaxReferenceCount = (int) getConfigLong(SwitchboardConstants.INDEX_DIST_CHUNK_SIZE_START, 50);
@ -2423,7 +2423,7 @@ public final class Switchboard extends serverSwitch {
0,
0,
0);
crawlQueues.errorURL.push(bentry, initiator, new Date(), 0, failreason);
crawlQueues.errorURL.push(bentry, initiator, new Date(), 0, failreason, -1);
}
public final void heuristicSite(final SearchEvent searchEvent, final String host) {

View File

@ -59,8 +59,8 @@ public enum SolrScheme {
solrdoc.addField("id", id);
solrdoc.addField("sku", digestURI.toNormalform(true, false), 3.0f);
InetAddress address = Domains.dnsResolve(digestURI.getHost());
if (address != null) solrdoc.addField("attr_ip", address.getHostAddress());
if (digestURI.getHost() != null) solrdoc.addField("attr_host", digestURI.getHost());
if (address != null) solrdoc.addField("ip_s", address.getHostAddress());
if (digestURI.getHost() != null) solrdoc.addField("host_s", digestURI.getHost());
solrdoc.addField("title", yacydoc.dc_title());
solrdoc.addField("author", yacydoc.dc_creator());
solrdoc.addField("description", yacydoc.dc_description());
@ -68,7 +68,7 @@ public enum SolrScheme {
solrdoc.addField("last_modified", header.lastModified());
solrdoc.addField("keywords", yacydoc.dc_subject(' '));
String content = UTF8.String(yacydoc.getTextBytes());
solrdoc.addField("attr_text", content);
solrdoc.addField("text_t", content);
int contentwc = content.split(" ").length;
solrdoc.addField("wordcount_i", contentwc);
@ -111,14 +111,14 @@ public enum SolrScheme {
solrdoc.addField("attr_outboundlinks", yacydoc.outboundLinks().toArray());
// charset
solrdoc.addField("attr_charset", yacydoc.getCharset());
solrdoc.addField("charset_s", yacydoc.getCharset());
// coordinates
if (yacydoc.lat() != 0.0f && yacydoc.lon() != 0.0f) {
solrdoc.addField("lon_coordinate", yacydoc.lon());
solrdoc.addField("lat_coordinate", yacydoc.lat());
}
solrdoc.addField("attr_httpstatus", "200");
solrdoc.addField("httpstatus_i", 200);
Object parser = yacydoc.getParserObject();
if (parser instanceof ContentScraper) {
ContentScraper html = (ContentScraper) parser;
@ -137,9 +137,9 @@ public enum SolrScheme {
// meta tags
Map<String, String> metas = html.getMetas();
String robots = metas.get("robots");
if (robots != null) solrdoc.addField("attr_meta_robots", robots);
if (robots != null) solrdoc.addField("metarobots_t", robots);
String generator = metas.get("generator");
if (generator != null) solrdoc.addField("attr_meta_generator", generator);
if (generator != null) solrdoc.addField("metagenerator_t", generator);
// bold, italic
String[] bold = html.getBold();

View File

@ -26,6 +26,7 @@ package net.yacy.cora.services.federated.solr;
import java.io.File;
import java.io.IOException;
import java.net.InetAddress;
import java.net.MalformedURLException;
import java.util.ArrayList;
import java.util.Collection;
@ -41,8 +42,11 @@ import org.apache.solr.client.solrj.response.QueryResponse;
import org.apache.solr.common.SolrDocumentList;
import org.apache.solr.common.SolrInputDocument;
import net.yacy.cora.document.UTF8;
import net.yacy.cora.protocol.Domains;
import net.yacy.cora.protocol.ResponseHeader;
import net.yacy.document.Document;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.logging.Log;
@ -189,11 +193,10 @@ public class SolrSingleConnector {
*/
public void add(String id, ResponseHeader header, Document doc) throws IOException {
add(id, header, doc, this.scheme);
add(this.scheme.yacy2solr(id, header, doc));
}
public void add(String id, ResponseHeader header, Document doc, SolrScheme tempScheme) throws IOException {
SolrInputDocument solrdoc = tempScheme.yacy2solr(id, header, doc);
private void add(SolrInputDocument solrdoc) throws IOException {
int thisrrc = this.transmissionRoundRobinCounter;
int nextrrc = thisrrc++;
if (nextrrc >= transmissionQueueCount) nextrrc = 0;
@ -223,6 +226,28 @@ public class SolrSingleConnector {
}
}
public void err(DigestURI digestURI, String failReason, int httpstatus) throws IOException {
SolrInputDocument solrdoc = new SolrInputDocument();
solrdoc.addField("id", UTF8.String(digestURI.hash()));
solrdoc.addField("sku", digestURI.toNormalform(true, false), 3.0f);
InetAddress address = Domains.dnsResolve(digestURI.getHost());
if (address != null) solrdoc.addField("ip_s", address.getHostAddress());
if (digestURI.getHost() != null) solrdoc.addField("host_s", digestURI.getHost());
// path elements of link
String path = digestURI.getPath();
if (path != null) {
String[] paths = path.split("/");
if (paths.length > 0) solrdoc.addField("attr_paths", paths);
}
solrdoc.addField("failreason_t", failReason);
solrdoc.addField("httpstatus_i", httpstatus);
add(solrdoc);
}
private void flushTransmissionQueue(int idx) throws IOException {
Collection<SolrInputDocument> c = new ArrayList<SolrInputDocument>();
while (this.transmissionQueue[idx].size() > 0) {