diff --git a/htroot/Crawler_p.java b/htroot/Crawler_p.java
index 7056a4e03..49ac31300 100644
--- a/htroot/Crawler_p.java
+++ b/htroot/Crawler_p.java
@@ -385,7 +385,7 @@ public class Crawler_p {
                             sb.peers.mySeed().hash.getBytes(),
                             new Date(),
                             1,
-                            reasonString);
+                            reasonString, -1);
                 }
             } catch (final PatternSyntaxException e) {
                 prop.put("info", "4"); // crawlfilter does not match url
diff --git a/htroot/WebStructurePicture_p.java b/htroot/WebStructurePicture_p.java
index 310a203d6..966ab461d 100644
--- a/htroot/WebStructurePicture_p.java
+++ b/htroot/WebStructurePicture_p.java
@@ -105,7 +105,7 @@ public class WebStructurePicture_p {
         } else {
             // find start hash
             String hash = null;
-            try {
+            if (host != null && host.length() > 0) try {
                 hash = UTF8.String((new DigestURI("http://" + host)).hash(), 6, 6);
             } catch (final MalformedURLException e) {Log.logException(e);}
             //assert (sb.webStructure.outgoingReferences(hash) != null);
diff --git a/htroot/api/util/getpageinfo_p.java b/htroot/api/util/getpageinfo_p.java
index 5e2662334..f4aadb818 100755
--- a/htroot/api/util/getpageinfo_p.java
+++ b/htroot/api/util/getpageinfo_p.java
@@ -9,7 +9,7 @@ import net.yacy.document.parser.html.ContentScraper;
 import net.yacy.kelondro.data.meta.DigestURI;
 
 import de.anomic.crawler.CrawlProfile;
-import de.anomic.crawler.RobotsEntry;
+import de.anomic.crawler.RobotsTxtEntry;
 import de.anomic.search.Switchboard;
 import de.anomic.server.serverObjects;
 import de.anomic.server.serverSwitch;
@@ -106,7 +106,7 @@ public class getpageinfo_p {
             final DigestURI theURL = new DigestURI(url);
 
             // determine if crawling of the current URL is allowed
-            RobotsEntry robotsEntry;
+            RobotsTxtEntry robotsEntry;
             try {
                 robotsEntry = sb.robots.getEntry(theURL, sb.peers.myBotIDs());
             } catch (IOException e) {
diff --git a/htroot/yacy/crawlReceipt.java b/htroot/yacy/crawlReceipt.java
index 7e41d7e69..a12dcc8ce 100644
--- a/htroot/yacy/crawlReceipt.java
+++ b/htroot/yacy/crawlReceipt.java
@@ -162,7 +162,7 @@ public final class crawlReceipt {
                     youare.getBytes(),
                     null,
                     0,
-                    result + ":" + reason);
+                    result + ":" + reason, -1);
             //switchboard.noticeURL.remove(receivedUrlhash);
             prop.put("delay", "3600");
             return prop;
diff --git a/htroot/yacy/urls.java b/htroot/yacy/urls.java
index 4c749e0e1..f4b972377 100644
--- a/htroot/yacy/urls.java
+++ b/htroot/yacy/urls.java
@@ -85,7 +85,8 @@ public class urls {
                         sb.peers.mySeed().hash.getBytes(),
                         new Date(),
                         0,
-                        "client=____________");
+                        "client=____________",
+                        -1);
 
                 // create RSS entry
                 prop.put("item_" + c + "_title", "");
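Every `errorURL.push(...)` call site in the hunks above gains a trailing `httpcode` argument; `-1` is the sentinel for "no HTTP status available". A minimal sketch of the new call shape (`sb` and `request` stand in for the switchboard and queue entry available at each call site):

```java
// Hedged sketch of the extended ZURL.push(...) call; "sb" and "request" are
// stand-ins for the objects in scope at the call sites above.
sb.crawlQueues.errorURL.push(
        request,                           // the queue entry being rejected
        sb.peers.mySeed().hash.getBytes(), // executor: this peer
        new Date(),                        // time of rejection
        1,                                 // work count
        "denied by robots.txt",            // human-readable cause
        -1);                               // HTTP status code; -1 = none available
```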
diff --git a/source/de/anomic/crawler/CrawlQueues.java b/source/de/anomic/crawler/CrawlQueues.java
index 6a59bfbe5..2d00c4f7f 100644
--- a/source/de/anomic/crawler/CrawlQueues.java
+++ b/source/de/anomic/crawler/CrawlQueues.java
@@ -80,8 +80,8 @@ public class CrawlQueues {
         log.logConfig("Starting Crawling Management");
         noticeURL = new NoticedURL(queuePath, sb.peers.myBotIDs(), sb.useTailCache, sb.exceed134217727);
         FileUtils.deletedelete(new File(queuePath, ERROR_DB_FILENAME));
-        errorURL = new ZURL(queuePath, ERROR_DB_FILENAME, false, sb.useTailCache, sb.exceed134217727);
-        delegatedURL = new ZURL(queuePath, DELEGATED_DB_FILENAME, true, sb.useTailCache, sb.exceed134217727);
+        errorURL = new ZURL(sb.solrConnector, queuePath, ERROR_DB_FILENAME, false, sb.useTailCache, sb.exceed134217727);
+        delegatedURL = new ZURL(sb.solrConnector, queuePath, DELEGATED_DB_FILENAME, true, sb.useTailCache, sb.exceed134217727);
     }
 
     public void relocate(final File newQueuePath) {
@@ -92,8 +92,8 @@ public class CrawlQueues {
         noticeURL = new NoticedURL(newQueuePath, sb.peers.myBotIDs(), sb.useTailCache, sb.exceed134217727);
         FileUtils.deletedelete(new File(newQueuePath, ERROR_DB_FILENAME));
-        errorURL = new ZURL(newQueuePath, ERROR_DB_FILENAME, false, sb.useTailCache, sb.exceed134217727);
-        delegatedURL = new ZURL(newQueuePath, DELEGATED_DB_FILENAME, true, sb.useTailCache, sb.exceed134217727);
+        errorURL = new ZURL(sb.solrConnector, newQueuePath, ERROR_DB_FILENAME, false, sb.useTailCache, sb.exceed134217727);
+        delegatedURL = new ZURL(sb.solrConnector, newQueuePath, DELEGATED_DB_FILENAME, true, sb.useTailCache, sb.exceed134217727);
     }
 
     public void close() {
@@ -571,7 +571,7 @@ public class CrawlQueues {
             try {
                 // checking robots.txt for http(s) resources
                 this.request.setStatus("worker-checkingrobots", WorkflowJob.STATUS_STARTED);
-                RobotsEntry robotsEntry;
+                RobotsTxtEntry robotsEntry;
                 if ((request.url().getProtocol().equals("http") || request.url().getProtocol().equals("https")) &&
                     (robotsEntry = sb.robots.getEntry(request.url(), sb.peers.myBotIDs())) != null &&
                     robotsEntry.isDisallowed(request.url())) {
@@ -581,7 +581,7 @@ public class CrawlQueues {
                         UTF8.getBytes(sb.peers.mySeed().hash),
                         new Date(),
                         1,
-                        "denied by robots.txt");
+                        "denied by robots.txt", -1);
                     this.request.setStatus("worker-disallowed", WorkflowJob.STATUS_FINISHED);
                 } else {
                     // starting a load from the internet
@@ -617,7 +617,7 @@ public class CrawlQueues {
                             UTF8.getBytes(sb.peers.mySeed().hash),
                             new Date(),
                             1,
-                            "cannot load: " + result);
+                            "cannot load: " + result, -1);
                         this.request.setStatus("worker-error", WorkflowJob.STATUS_FINISHED);
                     } else {
                         this.request.setStatus("worker-processed", WorkflowJob.STATUS_FINISHED);
@@ -629,7 +629,7 @@ public class CrawlQueues {
                     UTF8.getBytes(sb.peers.mySeed().hash),
                     new Date(),
                     1,
-                    e.getMessage() + " - in worker");
+                    e.getMessage() + " - in worker", -1);
                 Log.logException(e);
//                Client.initConnectionManager();
                 this.request.setStatus("worker-exception", WorkflowJob.STATUS_FINISHED);
diff --git a/source/de/anomic/crawler/CrawlStacker.java b/source/de/anomic/crawler/CrawlStacker.java
index e6dae888e..85952dc9c 100644
--- a/source/de/anomic/crawler/CrawlStacker.java
+++ b/source/de/anomic/crawler/CrawlStacker.java
@@ -202,7 +202,7 @@ public final class CrawlStacker {
                     // if the url was rejected we store it into the error URL db
                     if (rejectReason != null) {
-                        nextQueue.errorURL.push(entry, UTF8.getBytes(peers.mySeed().hash), new Date(), 1, rejectReason);
+                        nextQueue.errorURL.push(entry, UTF8.getBytes(peers.mySeed().hash), new Date(), 1, rejectReason, -1);
                     }
                 } catch (final Exception e) {
                     CrawlStacker.this.log.logWarning("Error while processing stackCrawl entry.\n" + "Entry: " + entry.toString() + "Error: " + e.toString(), e);
@@ -469,9 +469,9 @@ public final class CrawlStacker {
         }
 
         // deny cgi
-        if (url.isIndividual()) {
+        if (url.isIndividual() && !(profile.crawlingQ())) { // TODO: make special property for crawlingIndividual
             if (this.log.isFine()) this.log.logFine("URL '" + url.toString() + "' is CGI URL.");
-            return "cgi url not allowed";
+            return "individual url (sessionid etc) not wanted";
         }
 
         // deny post properties
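The CrawlStacker change reuses the crawl profile's `crawlingQ` flag (originally "crawl URLs with a '?' query part") to also admit "individual" URLs such as those carrying session IDs; the TODO notes that a dedicated profile property would be cleaner. A hedged sketch of the new rule (this helper is not YaCy code, just an illustration of the gate):

```java
/**
 * Minimal sketch of the relaxed CGI rule: "individual" URLs (session ids,
 * per-user tokens) are now accepted whenever the profile permits query URLs.
 */
static String checkIndividual(boolean urlIsIndividual, boolean profileCrawlingQ) {
    if (urlIsIndividual && !profileCrawlingQ) {
        // reject: per-visitor URLs would otherwise flood the index with duplicates
        return "individual url (sessionid etc) not wanted";
    }
    return null; // null = this check passes, URL continues down the filter chain
}
```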
diff --git a/source/de/anomic/crawler/Latency.java b/source/de/anomic/crawler/Latency.java
index ad7f58426..2ce81adaf 100644
--- a/source/de/anomic/crawler/Latency.java
+++ b/source/de/anomic/crawler/Latency.java
@@ -186,7 +186,7 @@ public class Latency {
         // find the delay as given by robots.txt on target site
         long robotsDelay = 0;
         if (!local) {
-            RobotsEntry robotsEntry;
+            RobotsTxtEntry robotsEntry;
             try {
                 robotsEntry = Switchboard.getSwitchboard().robots.getEntry(url, thisAgents);
             } catch (IOException e) {
@@ -239,7 +239,7 @@ public class Latency {
         // find the delay as given by robots.txt on target site
         long robotsDelay = 0;
         if (!local) {
-            RobotsEntry robotsEntry;
+            RobotsTxtEntry robotsEntry;
             try {
                 robotsEntry = Switchboard.getSwitchboard().robots.getEntry(url, thisAgents);
             } catch (IOException e) {
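Both Latency hunks only rename the type; the logic that merges the robots.txt `Crawl-delay` into the per-host wait time is unchanged. For orientation, an illustrative sketch of that merge; the method and `minimumDelay` are invented names, not code from this commit:

```java
// Illustrative only: how a robots.txt Crawl-delay typically folds into the
// host latency. Works because Latency shares the de.anomic.crawler package
// with the now protected RobotsTxtEntry accessors.
long waitingTime(long minimumDelay, RobotsTxtEntry robotsEntry) {
    long robotsDelay = (robotsEntry == null) ? 0 : robotsEntry.getCrawlDelayMillis();
    // never wait less than robots.txt demands, nor less than our own minimum
    return Math.max(minimumDelay, robotsDelay);
}
```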
diff --git a/source/de/anomic/crawler/RobotsTxt.java b/source/de/anomic/crawler/RobotsTxt.java
index 44efbd7dd..6952e65c8 100644
--- a/source/de/anomic/crawler/RobotsTxt.java
+++ b/source/de/anomic/crawler/RobotsTxt.java
@@ -11,16 +11,16 @@
 //Revision: $LastChangedRevision$
 //
 //This program is free software; you can redistribute it and/or modify
-//it under the terms of the GNU General Public License as published by
+//it under the terms of the GNU General public License as published by
 //the Free Software Foundation; either version 2 of the License, or
 //(at your option) any later version.
 //
 //This program is distributed in the hope that it will be useful,
 //but WITHOUT ANY WARRANTY; without even the implied warranty of
 //MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-//GNU General Public License for more details.
+//GNU General public License for more details.
 //
-//You should have received a copy of the GNU General Public License
+//You should have received a copy of the GNU General public License
 //along with this program; if not, write to the Free Software
 //Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 
@@ -51,15 +51,15 @@ public class RobotsTxt {
 
     private static Logger log = Logger.getLogger(RobotsTxt.class);
 
-    public static final String ROBOTS_DB_PATH_SEPARATOR = ";";
-    public static final Pattern ROBOTS_DB_PATH_SEPARATOR_MATCHER = Pattern.compile(ROBOTS_DB_PATH_SEPARATOR);
+    protected static final String ROBOTS_DB_PATH_SEPARATOR = ";";
+    protected static final Pattern ROBOTS_DB_PATH_SEPARATOR_MATCHER = Pattern.compile(ROBOTS_DB_PATH_SEPARATOR);
 
     BEncodedHeap robotsTable;
     private final ConcurrentHashMap syncObjects;
     //private static final HashSet loadedRobots = new HashSet(); // only for debugging
 
     private static class DomSync {
-        public DomSync() {}
+        private DomSync() {}
     }
 
     public RobotsTxt(final BEncodedHeap robotsTable) {
@@ -78,16 +78,16 @@ public class RobotsTxt {
         return this.robotsTable.size();
     }
 
-    public RobotsEntry getEntry(final MultiProtocolURI theURL, final Set thisAgents) throws IOException {
+    public RobotsTxtEntry getEntry(final MultiProtocolURI theURL, final Set thisAgents) throws IOException {
         if (theURL == null) throw new IllegalArgumentException();
         if (!theURL.getProtocol().startsWith("http")) return null;
         return getEntry(theURL, thisAgents, true);
     }
 
-    private RobotsEntry getEntry(final MultiProtocolURI theURL, final Set thisAgents, final boolean fetchOnlineIfNotAvailableOrNotFresh) throws IOException {
+    private RobotsTxtEntry getEntry(final MultiProtocolURI theURL, final Set thisAgents, final boolean fetchOnlineIfNotAvailableOrNotFresh) throws IOException {
         // this method will always return a non-null value
         String urlHostPort = getHostPort(theURL);
-        RobotsEntry robotsTxt4Host = null;
+        RobotsTxtEntry robotsTxt4Host = null;
         Map record;
         try {
             record = this.robotsTable.get(this.robotsTable.encodedKey(urlHostPort));
@@ -95,7 +95,7 @@ public class RobotsTxt {
             log.warn("memory exhausted", e);
             record = null;
         }
-        if (record != null) robotsTxt4Host = new RobotsEntry(urlHostPort, record);
+        if (record != null) robotsTxt4Host = new RobotsTxtEntry(urlHostPort, record);
 
         if (fetchOnlineIfNotAvailableOrNotFresh && (
             robotsTxt4Host == null ||
@@ -123,7 +123,7 @@ public class RobotsTxt {
                     log.warn("memory exhausted", e);
                     record = null;
                 }
-                if (record != null) robotsTxt4Host = new RobotsEntry(urlHostPort, record);
+                if (record != null) robotsTxt4Host = new RobotsTxtEntry(urlHostPort, record);
                 if (robotsTxt4Host != null &&
                     robotsTxt4Host.getLoadedDate() != null &&
                     System.currentTimeMillis() - robotsTxt4Host.getLoadedDate().getTime() <= 1*24*60*60*1000) {
@@ -160,7 +160,7 @@ public class RobotsTxt {
                     // no robots.txt available, make an entry to prevent that the robots loading is done twice
                     if (robotsTxt4Host == null) {
                         // generate artificial entry
-                        robotsTxt4Host = new RobotsEntry(
+                        robotsTxt4Host = new RobotsTxtEntry(
                                 robotsURL,
                                 new ArrayList(),
                                 new ArrayList(),
@@ -183,7 +183,7 @@ public class RobotsTxt {
                         addEntry(robotsTxt4Host);
                     }
                 } else {
-                    final robotsParser parserResult = new robotsParser((byte[]) result[DOWNLOAD_ROBOTS_TXT], thisAgents);
+                    final RobotsTxtParser parserResult = new RobotsTxtParser((byte[]) result[DOWNLOAD_ROBOTS_TXT], thisAgents);
                     ArrayList denyPath = parserResult.denyList();
                     if (((Boolean) result[DOWNLOAD_ACCESS_RESTRICTED]).booleanValue()) {
                         denyPath = new ArrayList();
@@ -208,7 +208,7 @@ public class RobotsTxt {
         return robotsTxt4Host;
     }
 
-    private RobotsEntry addEntry(
+    private RobotsTxtEntry addEntry(
             final MultiProtocolURI theURL,
             final ArrayList allowPathList,
             final ArrayList denyPathList,
@@ -219,7 +219,7 @@ public class RobotsTxt {
             final long crawlDelayMillis,
             final String agentName
     ) {
-        final RobotsEntry entry = new RobotsEntry(
+        final RobotsTxtEntry entry = new RobotsTxtEntry(
                 theURL, allowPathList, denyPathList,
                 loadedDate, modDate, eTag, sitemap,
                 crawlDelayMillis, agentName);
@@ -227,7 +227,7 @@ public class RobotsTxt {
         return entry;
     }
 
-    private String addEntry(final RobotsEntry entry) {
+    private String addEntry(final RobotsTxtEntry entry) {
         // writes a new page and returns key
         try {
             this.robotsTable.insert(this.robotsTable.encodedKey(entry.getHostName()), entry.getMem());
@@ -240,10 +240,10 @@ public class RobotsTxt {
 
     // methods that had been in robotsParser.java:
 
-    public static final int DOWNLOAD_ACCESS_RESTRICTED = 0;
-    public static final int DOWNLOAD_ROBOTS_TXT = 1;
-    public static final int DOWNLOAD_ETAG = 2;
-    public static final int DOWNLOAD_MODDATE = 3;
+    private static final int DOWNLOAD_ACCESS_RESTRICTED = 0;
+    private static final int DOWNLOAD_ROBOTS_TXT = 1;
+    private static final int DOWNLOAD_ETAG = 2;
+    private static final int DOWNLOAD_MODDATE = 3;
 
     static final String getHostPort(final MultiProtocolURI theURL) {
         String urlHostPort = null;
@@ -267,7 +267,7 @@ public class RobotsTxt {
         return port;
     }
 
-    private static Object[] downloadRobotsTxt(final MultiProtocolURI robotsURL, int redirectionCount, final RobotsEntry entry) throws Exception {
+    private static Object[] downloadRobotsTxt(final MultiProtocolURI robotsURL, int redirectionCount, final RobotsTxtEntry entry) throws Exception {
         if (robotsURL == null || !robotsURL.getProtocol().startsWith("http")) return null;
         if (redirectionCount < 0) return new Object[]{Boolean.FALSE,null,null};
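Callers interact with the renamed class exactly as before; only the type name changes. A minimal usage sketch mirroring the pattern at the call sites in this diff (`sb` is the usual switchboard singleton, `theURL` a `DigestURI`):

```java
// Usage pattern after the rename, as seen in CrawlQueues and getpageinfo_p:
RobotsTxtEntry robotsEntry;
try {
    robotsEntry = sb.robots.getEntry(theURL, sb.peers.myBotIDs());
} catch (final IOException e) {
    robotsEntry = null; // treat an unreadable robots table as "no entry"
}
if (robotsEntry != null && robotsEntry.isDisallowed(theURL)) {
    // the URL must not be fetched; callers push it to errorURL with the
    // reason "denied by robots.txt" and httpcode -1 (no HTTP status involved)
}
```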
diff --git a/source/de/anomic/crawler/RobotsEntry.java b/source/de/anomic/crawler/RobotsTxtEntry.java
similarity index 86%
rename from source/de/anomic/crawler/RobotsEntry.java
rename to source/de/anomic/crawler/RobotsTxtEntry.java
index c5f04dec1..c0e65be83 100644
--- a/source/de/anomic/crawler/RobotsEntry.java
+++ b/source/de/anomic/crawler/RobotsTxtEntry.java
@@ -13,16 +13,16 @@
 //Revision: $LastChangedRevision$
 //
 //This program is free software; you can redistribute it and/or modify
-//it under the terms of the GNU General Public License as published by
+//it under the terms of the GNU General public License as published by
 //the Free Software Foundation; either version 2 of the License, or
 //(at your option) any later version.
 //
 //This program is distributed in the hope that it will be useful,
 //but WITHOUT ANY WARRANTY; without even the implied warranty of
 //MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-//GNU General Public License for more details.
+//GNU General public License for more details.
 //
-//You should have received a copy of the GNU General Public License
+//You should have received a copy of the GNU General public License
 //along with this program; if not, write to the Free Software
 //Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 
@@ -41,25 +41,25 @@
 import net.yacy.cora.document.UTF8;
 import net.yacy.kelondro.util.ByteArray;
 
-public class RobotsEntry {
+public class RobotsTxtEntry {
 
-    public static final String HOST_NAME = "hostname";
-    public static final String ALLOW_PATH_LIST = "allow";
-    public static final String DISALLOW_PATH_LIST = "disallow";
-    public static final String LOADED_DATE = "date";
-    public static final String MOD_DATE = "modDate";
-    public static final String ETAG = "etag";
-    public static final String SITEMAP = "sitemap";
-    public static final String CRAWL_DELAY = "crawlDelay";
-    public static final String CRAWL_DELAY_MILLIS = "crawlDelayMillis";
-    public static final String AGENT_NAME = "agentname";
+    private static final String HOST_NAME = "hostname";
+    private static final String ALLOW_PATH_LIST = "allow";
+    private static final String DISALLOW_PATH_LIST = "disallow";
+    private static final String LOADED_DATE = "date";
+    private static final String MOD_DATE = "modDate";
+    private static final String ETAG = "etag";
+    private static final String SITEMAP = "sitemap";
+    private static final String CRAWL_DELAY = "crawlDelay";
+    private static final String CRAWL_DELAY_MILLIS = "crawlDelayMillis";
+    private static final String AGENT_NAME = "agentname";
 
     // this is a simple record structure that holds all properties of a single crawl start
     private final Map mem;
     private final List allowPathList, denyPathList;
     private final String hostName, agentName;
 
-    public RobotsEntry(final String hostName, final Map mem) {
+    protected RobotsTxtEntry(final String hostName, final Map mem) {
         this.hostName = hostName.toLowerCase();
         this.mem = mem;
 
@@ -90,7 +90,7 @@ public class RobotsEntry {
         this.agentName = this.mem.containsKey(AGENT_NAME) ? UTF8.String(this.mem.get(AGENT_NAME)) : null;
     }
 
-    public RobotsEntry(
+    protected RobotsTxtEntry(
             final MultiProtocolURI theURL,
             final List allowPathList,
             final List disallowPathList,
@@ -140,15 +140,15 @@ public class RobotsEntry {
         }
     }
 
-    public String getHostName() {
+    protected String getHostName() {
         return this.hostName;
     }
 
-    public String getAgentName() {
+    protected String getAgentName() {
         return this.agentName;
     }
 
-    public Map getMem() {
+    protected Map getMem() {
         if (!this.mem.containsKey(HOST_NAME)) this.mem.put(HOST_NAME, UTF8.getBytes(this.hostName));
         return this.mem;
     }
@@ -175,34 +175,34 @@ public class RobotsEntry {
         }
     }
 
-    public Date getLoadedDate() {
+    protected Date getLoadedDate() {
         if (this.mem.containsKey(LOADED_DATE)) {
             return new Date(ByteArray.parseDecimal(this.mem.get(LOADED_DATE)));
         }
         return null;
     }
 
-    public void setLoadedDate(final Date newLoadedDate) {
+    protected void setLoadedDate(final Date newLoadedDate) {
         if (newLoadedDate != null) {
             this.mem.put(LOADED_DATE, UTF8.getBytes(Long.toString(newLoadedDate.getTime())));
         }
     }
 
-    public Date getModDate() {
+    protected Date getModDate() {
         if (this.mem.containsKey(MOD_DATE)) {
             return new Date(ByteArray.parseDecimal(this.mem.get(MOD_DATE)));
         }
         return null;
     }
 
-    public String getETag() {
+    protected String getETag() {
         if (this.mem.containsKey(ETAG)) {
             return UTF8.String(this.mem.get(ETAG));
         }
         return null;
     }
 
-    public long getCrawlDelayMillis() {
+    protected long getCrawlDelayMillis() {
         if (this.mem.containsKey(CRAWL_DELAY_MILLIS)) try {
             return ByteArray.parseDecimal(this.mem.get(CRAWL_DELAY_MILLIS));
         } catch (final NumberFormatException e) {
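`RobotsTxtEntry` keeps its persistence model: a flat `String -> byte[]` map (`mem`) stored in the shared `BEncodedHeap` robots table, with dates and the crawl delay serialized as decimal strings. A hedged sketch of what one stored record might look like; the keys are the constants above, the values are invented for this example:

```java
import java.nio.charset.StandardCharsets;
import java.util.HashMap;
import java.util.Map;

// Illustrative only: shape of one serialized robots record for a host.
static Map<String, byte[]> exampleRecord() {
    Map<String, byte[]> mem = new HashMap<String, byte[]>();
    mem.put("hostname", "example.org:80".getBytes(StandardCharsets.UTF_8));
    mem.put("disallow", "/private/;/tmp/".getBytes(StandardCharsets.UTF_8)); // ';' = ROBOTS_DB_PATH_SEPARATOR
    mem.put("date", Long.toString(System.currentTimeMillis()).getBytes(StandardCharsets.UTF_8));
    mem.put("crawlDelayMillis", "2000".getBytes(StandardCharsets.UTF_8));    // from "Crawl-delay: 2"
    return mem;
}
```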
diff --git a/source/de/anomic/crawler/robotsParser.java b/source/de/anomic/crawler/RobotsTxtParser.java
similarity index 89%
rename from source/de/anomic/crawler/robotsParser.java
rename to source/de/anomic/crawler/RobotsTxtParser.java
index 7d09de258..390010da3 100644
--- a/source/de/anomic/crawler/robotsParser.java
+++ b/source/de/anomic/crawler/RobotsTxtParser.java
@@ -10,16 +10,16 @@
 Revision: $LastChangedRevision$
 
 This program is free software; you can redistribute it and/or modify
-it under the terms of the GNU General Public License as published by
+it under the terms of the GNU General public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-GNU General Public License for more details.
+GNU General public License for more details.
 
-You should have received a copy of the GNU General Public License
+You should have received a copy of the GNU General public License
 along with this program; if not, write to the Free Software
 Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 
@@ -59,16 +59,16 @@
  * See: http://www.kollar.com/robots.html
  */
-public final class robotsParser {
+public final class RobotsTxtParser {
 
     private static final Pattern patternTab = Pattern.compile("\t");
 
-    public static final String ROBOTS_USER_AGENT = "User-agent:".toUpperCase();
-    public static final String ROBOTS_DISALLOW = "Disallow:".toUpperCase();
-    public static final String ROBOTS_ALLOW = "Allow:".toUpperCase();
-    public static final String ROBOTS_COMMENT = "#";
-    public static final String ROBOTS_SITEMAP = "Sitemap:".toUpperCase();
-    public static final String ROBOTS_CRAWL_DELAY = "Crawl-delay:".toUpperCase();
+    private static final String ROBOTS_USER_AGENT = "User-agent:".toUpperCase();
+    private static final String ROBOTS_DISALLOW = "Disallow:".toUpperCase();
+    private static final String ROBOTS_ALLOW = "Allow:".toUpperCase();
+    private static final String ROBOTS_COMMENT = "#";
+    private static final String ROBOTS_SITEMAP = "Sitemap:".toUpperCase();
+    private static final String ROBOTS_CRAWL_DELAY = "Crawl-delay:".toUpperCase();
 
     private final ArrayList allowList;
     private final ArrayList denyList;
@@ -77,7 +77,7 @@ public final class robotsParser {
     private final Set myNames; // a list of own name lists
     private String agentName; // the name of the agent that was used to return the result
 
-    public robotsParser(final byte[] robotsTxt, final Set myNames) {
+    protected RobotsTxtParser(final byte[] robotsTxt, final Set myNames) {
         this.allowList = new ArrayList(0);
         this.denyList = new ArrayList(0);
         this.sitemap = "";
@@ -91,16 +91,6 @@ public final class robotsParser {
         }
     }
 
-    public robotsParser(final BufferedReader reader, final Set myNames) {
-        this.allowList = new ArrayList(0);
-        this.denyList = new ArrayList(0);
-        this.sitemap = "";
-        this.crawlDelayMillis = 0;
-        this.myNames = myNames;
-        this.agentName = null;
-        if (reader != null) parse(reader);
-    }
-
     private void parse(final BufferedReader reader) {
         final ArrayList deny4AllAgents = new ArrayList();
         final ArrayList deny4ThisAgents = new ArrayList();
@@ -260,7 +250,7 @@ public final class robotsParser {
      * does not make any no-DOS-forced crawl pause.
      * @return the crawl delay between two crawl access times in milliseconds
      */
-    public long crawlDelayMillis() {
+    protected long crawlDelayMillis() {
         return this.crawlDelayMillis;
     }
 
@@ -271,19 +261,19 @@ public final class robotsParser {
      * Effects: see also comment to crawlDelayMillis()
      * @return the name of the user agent that was used for the result properties or null if no user agent name was used to identify the agent
      */
-    public String agentName() {
+    protected String agentName() {
         return this.agentName;
     }
 
-    public String sitemap() {
+    protected String sitemap() {
         return this.sitemap;
     }
 
-    public ArrayList allowList() {
+    protected ArrayList allowList() {
         return this.allowList;
     }
 
-    public ArrayList denyList() {
+    protected ArrayList denyList() {
         return this.denyList;
     }
 }
diff --git a/source/de/anomic/crawler/ZURL.java b/source/de/anomic/crawler/ZURL.java
index b5035e1f2..e0daf3fb0 100755
--- a/source/de/anomic/crawler/ZURL.java
+++ b/source/de/anomic/crawler/ZURL.java
@@ -34,6 +34,7 @@ import java.util.Iterator;
 import java.util.concurrent.ConcurrentLinkedQueue;
 
 import net.yacy.cora.document.UTF8;
+import net.yacy.cora.services.federated.solr.SolrSingleConnector;
 import net.yacy.kelondro.data.meta.DigestURI;
 import net.yacy.kelondro.data.word.Word;
 import net.yacy.kelondro.index.Index;
@@ -66,13 +67,16 @@ public class ZURL implements Iterable {
     // the class object
     private Index urlIndex;
     private final ConcurrentLinkedQueue stack;
+    private final SolrSingleConnector solrConnector;
 
     public ZURL(
+            final SolrSingleConnector solrConnector,
             final File cachePath,
             final String tablename,
             final boolean startWithEmptyFile,
             final boolean useTailCache,
             final boolean exceed134217727) {
+        this.solrConnector = solrConnector;
         // creates a new ZURL in a file
         cachePath.mkdirs();
         final File f = new File(cachePath, tablename);
@@ -94,7 +98,8 @@ public class ZURL implements Iterable {
         this.stack = new ConcurrentLinkedQueue();
     }
 
-    public ZURL() {
+    public ZURL(final SolrSingleConnector solrConnector) {
+        this.solrConnector = solrConnector;
         // creates a new ZUR in RAM
         this.urlIndex = new RowSet(rowdef);
         this.stack = new ConcurrentLinkedQueue();
@@ -126,14 +131,24 @@ public class ZURL implements Iterable {
             final byte[] executor,
             final Date workdate,
             final int workcount,
-            String anycause) {
+            String anycause,
+            int httpcode) {
         // assert executor != null; // null == proxy !
         if (exists(bentry.url().hash())) return; // don't insert double causes
         if (anycause == null) anycause = "unknown";
-        Entry entry = new Entry(bentry, executor, workdate, workcount, anycause);
+        String reason = anycause + ((httpcode >= 0) ? " (http return code = " + httpcode + ")" : "");
+        Entry entry = new Entry(bentry, executor, workdate, workcount, reason);
         put(entry);
         stack.add(entry.hash());
-        Log.logInfo("Rejected URL", bentry.url().toNormalform(false, false) + " - " + anycause);
+        Log.logInfo("Rejected URL", bentry.url().toNormalform(false, false) + " - " + reason);
+        if (this.solrConnector != null) {
+            // send the error to solr
+            try {
+                this.solrConnector.err(bentry.url(), reason, httpcode);
+            } catch (IOException e) {
+                Log.logWarning("SOLR", "failed to send error " + bentry.url().toNormalform(true, false) + " to solr: " + e.getMessage());
+            }
+        }
         while (stack.size() > maxStackSize) stack.poll();
     }
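With the extra parameter, `push` now folds a real HTTP status into the stored and logged reason, and mirrors every rejected URL into Solr when a connector is configured. A minimal sketch of the reason formatting introduced above (the format string comes straight from the hunk; the helper itself is illustrative):

```java
// Minimal sketch of the reason formatting in ZURL.push(...).
static String formatReason(String anycause, int httpcode) {
    if (anycause == null) anycause = "unknown";
    return anycause + ((httpcode >= 0) ? " (http return code = " + httpcode + ")" : "");
}

// formatReason("wrong http status code", 404) -> "wrong http status code (http return code = 404)"
// formatReason("denied by robots.txt", -1)    -> "denied by robots.txt"
```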
" (http return code = " + httpcode + ")" : ""); + Entry entry = new Entry(bentry, executor, workdate, workcount, reason); put(entry); stack.add(entry.hash()); - Log.logInfo("Rejected URL", bentry.url().toNormalform(false, false) + " - " + anycause); + Log.logInfo("Rejected URL", bentry.url().toNormalform(false, false) + " - " + reason); + if (this.solrConnector != null) { + // send the error to solr + try { + this.solrConnector.err(bentry.url(), reason, httpcode); + } catch (IOException e) { + Log.logWarning("SOLR", "failed to send error " + bentry.url().toNormalform(true, false) + " to solr: " + e.getMessage()); + } + } while (stack.size() > maxStackSize) stack.poll(); } diff --git a/source/de/anomic/crawler/retrieval/FTPLoader.java b/source/de/anomic/crawler/retrieval/FTPLoader.java index 1775ed83c..020e05bff 100644 --- a/source/de/anomic/crawler/retrieval/FTPLoader.java +++ b/source/de/anomic/crawler/retrieval/FTPLoader.java @@ -152,7 +152,7 @@ public class FTPLoader { if (berr.size() > 0 || response == null) { // some error logging final String detail = (berr.size() > 0) ? "Errorlog: " + berr.toString() : ""; - sb.crawlQueues.errorURL.push(request, sb.peers.mySeed().hash.getBytes(), new Date(), 1, " ftp server download, " + detail); + sb.crawlQueues.errorURL.push(request, sb.peers.mySeed().hash.getBytes(), new Date(), 1, " ftp server download, " + detail, -1); throw new IOException("FTPLoader: Unable to download URL '" + request.url().toString() + "': " + detail); } diff --git a/source/de/anomic/crawler/retrieval/HTTPLoader.java b/source/de/anomic/crawler/retrieval/HTTPLoader.java index f33b6da4c..038607a71 100644 --- a/source/de/anomic/crawler/retrieval/HTTPLoader.java +++ b/source/de/anomic/crawler/retrieval/HTTPLoader.java @@ -78,7 +78,7 @@ public final class HTTPLoader { private Response load(final Request request, final int retryCount, final long maxFileSize, final boolean checkBlacklist) throws IOException { if (retryCount < 0) { - sb.crawlQueues.errorURL.push(request, sb.peers.mySeed().hash.getBytes(), new Date(), 1, "redirection counter exceeded"); + sb.crawlQueues.errorURL.push(request, sb.peers.mySeed().hash.getBytes(), new Date(), 1, "redirection counter exceeded", -1); throw new IOException("Redirection counter exceeded for URL " + request.url().toString() + ". Processing aborted."); } @@ -94,7 +94,7 @@ public final class HTTPLoader { // check if url is in blacklist final String hostlow = host.toLowerCase(); if (checkBlacklist && Switchboard.urlBlacklist.isListed(Blacklist.BLACKLIST_CRAWLER, hostlow, path)) { - sb.crawlQueues.errorURL.push(request, sb.peers.mySeed().hash.getBytes(), new Date(), 1, "url in blacklist"); + sb.crawlQueues.errorURL.push(request, sb.peers.mySeed().hash.getBytes(), new Date(), 1, "url in blacklist", -1); throw new IOException("CRAWLER Rejecting URL '" + request.url().toString() + "'. URL is in blacklist."); } @@ -138,7 +138,7 @@ public final class HTTPLoader { redirectionUrlString = redirectionUrlString.trim(); if (redirectionUrlString.length() == 0) { - sb.crawlQueues.errorURL.push(request, sb.peers.mySeed().hash.getBytes(), new Date(), 1, "redirection header empy"); + sb.crawlQueues.errorURL.push(request, sb.peers.mySeed().hash.getBytes(), new Date(), 1, "redirection header empy", code); throw new IOException("CRAWLER Redirection of URL=" + request.url().toString() + " aborted. 
diff --git a/source/de/anomic/crawler/retrieval/HTTPLoader.java b/source/de/anomic/crawler/retrieval/HTTPLoader.java
index f33b6da4c..038607a71 100644
--- a/source/de/anomic/crawler/retrieval/HTTPLoader.java
+++ b/source/de/anomic/crawler/retrieval/HTTPLoader.java
@@ -78,7 +78,7 @@ public final class HTTPLoader {
     private Response load(final Request request, final int retryCount, final long maxFileSize, final boolean checkBlacklist) throws IOException {
 
         if (retryCount < 0) {
-            sb.crawlQueues.errorURL.push(request, sb.peers.mySeed().hash.getBytes(), new Date(), 1, "redirection counter exceeded");
+            sb.crawlQueues.errorURL.push(request, sb.peers.mySeed().hash.getBytes(), new Date(), 1, "redirection counter exceeded", -1);
             throw new IOException("Redirection counter exceeded for URL " + request.url().toString() + ". Processing aborted.");
         }
@@ -94,7 +94,7 @@ public final class HTTPLoader {
         // check if url is in blacklist
         final String hostlow = host.toLowerCase();
         if (checkBlacklist && Switchboard.urlBlacklist.isListed(Blacklist.BLACKLIST_CRAWLER, hostlow, path)) {
-            sb.crawlQueues.errorURL.push(request, sb.peers.mySeed().hash.getBytes(), new Date(), 1, "url in blacklist");
+            sb.crawlQueues.errorURL.push(request, sb.peers.mySeed().hash.getBytes(), new Date(), 1, "url in blacklist", -1);
             throw new IOException("CRAWLER Rejecting URL '" + request.url().toString() + "'. URL is in blacklist.");
         }
@@ -138,7 +138,7 @@ public final class HTTPLoader {
                 redirectionUrlString = redirectionUrlString.trim();
 
                 if (redirectionUrlString.length() == 0) {
-                    sb.crawlQueues.errorURL.push(request, sb.peers.mySeed().hash.getBytes(), new Date(), 1, "redirection header empy");
+                    sb.crawlQueues.errorURL.push(request, sb.peers.mySeed().hash.getBytes(), new Date(), 1, "redirection header empy", code);
                     throw new IOException("CRAWLER Redirection of URL=" + request.url().toString() + " aborted. Location header is empty.");
                 }
@@ -151,14 +151,14 @@ public final class HTTPLoader {
 
                 // if we are already doing a shutdown we don't need to retry crawling
                 if (Thread.currentThread().isInterrupted()) {
-                    sb.crawlQueues.errorURL.push(request, sb.peers.mySeed().hash.getBytes(), new Date(), 1, "server shutdown");
+                    sb.crawlQueues.errorURL.push(request, sb.peers.mySeed().hash.getBytes(), new Date(), 1, "server shutdown", code);
                     throw new IOException("CRAWLER Retry of URL=" + request.url().toString() + " aborted because of server shutdown.");
                 }
 
                 // check if the url was already indexed
                 final String dbname = sb.urlExists(Segments.Process.LOCALCRAWLING, redirectionUrl.hash());
                 if (dbname != null) {
-                    sb.crawlQueues.errorURL.push(request, sb.peers.mySeed().hash.getBytes(), new Date(), 1, "redirection to double content");
+                    sb.crawlQueues.errorURL.push(request, sb.peers.mySeed().hash.getBytes(), new Date(), 1, "redirection to double content", code);
                     throw new IOException("CRAWLER Redirection of URL=" + request.url().toString() + " ignored. The url appears already in db " + dbname);
                 }
@@ -167,12 +167,12 @@ public final class HTTPLoader {
                 return load(request, retryCount - 1, maxFileSize, checkBlacklist);
             } else {
                 // no redirection url provided
-                sb.crawlQueues.errorURL.push(request, sb.peers.mySeed().hash.getBytes(), new Date(), 1, "no redirection url provided");
+                sb.crawlQueues.errorURL.push(request, sb.peers.mySeed().hash.getBytes(), new Date(), 1, "no redirection url provided", code);
                 throw new IOException("REJECTED EMTPY REDIRECTION '" + client.getHttpResponse().getStatusLine() + "' for URL " + request.url().toString());
             }
         } else if (responseBody == null) {
             // no response, reject file
-            sb.crawlQueues.errorURL.push(request, sb.peers.mySeed().hash.getBytes(), new Date(), 1, "no response body (code = " + code + ")");
+            sb.crawlQueues.errorURL.push(request, sb.peers.mySeed().hash.getBytes(), new Date(), 1, "no response body", code);
             throw new IOException("REJECTED EMPTY RESPONSE BODY '" + client.getHttpResponse().getStatusLine() + "' for URL " + request.url().toString());
         } else if (code == 200 || code == 203) {
             // the transfer is ok
@@ -183,7 +183,7 @@ public final class HTTPLoader {
 
             // check length again in case it was not possible to get the length before loading
             if (maxFileSize > 0 && contentLength > maxFileSize) {
-                sb.crawlQueues.errorURL.push(request, sb.peers.mySeed().hash.getBytes(), new Date(), 1, "file size limit exceeded");
+                sb.crawlQueues.errorURL.push(request, sb.peers.mySeed().hash.getBytes(), new Date(), 1, "file size limit exceeded", code);
                 throw new IOException("REJECTED URL " + request.url() + " because file size '" + contentLength + "' exceeds max filesize limit of " + maxFileSize + " bytes. (GET)");
             }
@@ -201,7 +201,7 @@ public final class HTTPLoader {
             return response;
         } else {
             // if the response has not the right response type then reject file
-            sb.crawlQueues.errorURL.push(request, sb.peers.mySeed().hash.getBytes(), new Date(), 1, "wrong http status code " + code);
+            sb.crawlQueues.errorURL.push(request, sb.peers.mySeed().hash.getBytes(), new Date(), 1, "wrong http status code", code);
             throw new IOException("REJECTED WRONG STATUS TYPE '" + client.getHttpResponse().getStatusLine() + "' for URL " + request.url().toString());
         }
     }
diff --git a/source/de/anomic/data/ymark/YMarkEntry.java b/source/de/anomic/data/ymark/YMarkEntry.java
index e3f083c19..bbed53bf1 100644
--- a/source/de/anomic/data/ymark/YMarkEntry.java
+++ b/source/de/anomic/data/ymark/YMarkEntry.java
@@ -133,7 +133,8 @@ public class YMarkEntry extends TreeMap {
                 case DATE_MODIFIED:
                 case DATE_VISITED:
                     this.put(b.key(), String.valueOf(System.currentTimeMillis()));
-                default:
+                    break;
+                default:
                     break;
             }
         }
diff --git a/source/de/anomic/data/ymark/YMarkTables.java b/source/de/anomic/data/ymark/YMarkTables.java
index 9aa7f2eb3..68ae94736 100644
--- a/source/de/anomic/data/ymark/YMarkTables.java
+++ b/source/de/anomic/data/ymark/YMarkTables.java
@@ -112,7 +112,7 @@ public class YMarkTables {
         this.deleteBookmark(bmk_user, YMarkUtil.getBookmarkId(url));
     }
 
-    public TreeMap getTags(final Iterator rowIterator) throws IOException {
+    public TreeMap getTags(final Iterator rowIterator) {
         final TreeMap tags = new TreeMap();
         Tables.Row bmk_row = null;
         Iterator tit = null;
diff --git a/source/de/anomic/http/server/HTTPDFileHandler.java b/source/de/anomic/http/server/HTTPDFileHandler.java
index 03a6ffee1..612a51694 100644
--- a/source/de/anomic/http/server/HTTPDFileHandler.java
+++ b/source/de/anomic/http/server/HTTPDFileHandler.java
@@ -305,7 +305,7 @@ public final class HTTPDFileHandler {
         final boolean accountEmpty = adminAccountBase64MD5.length() == 0;
         final boolean softauth = accessFromLocalhost && authorization != null && authorization.length() > 6 && (adminAccountBase64MD5.equals(authorization.substring(6)));
 
-        if (protectedPage && ((!softauth && !grantedForLocalhost && !accountEmpty) || requestHeader.userAgent().startsWith("yacybot"))) {
+        if (protectedPage && !softauth && ((!grantedForLocalhost && !accountEmpty) || requestHeader.userAgent().startsWith("yacybot"))) {
             // authentication required
             if (authorization == null) {
                 // no authorization given in response. Ask for that
diff --git a/source/de/anomic/search/Switchboard.java b/source/de/anomic/search/Switchboard.java
index 24be89906..4f8387f6b 100644
--- a/source/de/anomic/search/Switchboard.java
+++ b/source/de/anomic/search/Switchboard.java
@@ -523,6 +523,11 @@ public final class Switchboard extends serverSwitch {
         log.logConfig("Parser: Initializing Mime Type deny list");
         TextParser.setDenyMime(getConfig(SwitchboardConstants.PARSER_MIME_DENY, ""));
 
+        // set up the solr interface
+        String solrurl = this.getConfig("federated.service.solr.indexing.url", "http://127.0.0.1:8983/solr");
+        boolean usesolr = this.getConfigBool("federated.service.solr.indexing.enabled", false) & solrurl.length() > 0;
+        this.solrConnector = (usesolr) ? new SolrSingleConnector(solrurl, SolrScheme.SolrCell) : null;
+
         // start a loader
         log.logConfig("Starting Crawl Loader");
         this.loader = new LoaderDispatcher(this);
@@ -605,11 +610,6 @@ public final class Switchboard extends serverSwitch {
             }
         }
 
-        // set up the solr interface
-        String solrurl = this.getConfig("federated.service.solr.indexing.url", "http://127.0.0.1:8983/solr");
-        boolean usesolr = this.getConfigBool("federated.service.solr.indexing.enabled", false) & solrurl.length() > 0;
-        this.solrConnector = (usesolr) ? new SolrSingleConnector(solrurl, SolrScheme.SolrCell) : null;
-
         // initializing dht chunk generation
         this.dhtMaxReferenceCount = (int) getConfigLong(SwitchboardConstants.INDEX_DIST_CHUNK_SIZE_START, 50);
 
@@ -2423,7 +2423,7 @@ public final class Switchboard extends serverSwitch {
                 0,
                 0,
                 0);
-        crawlQueues.errorURL.push(bentry, initiator, new Date(), 0, failreason);
+        crawlQueues.errorURL.push(bentry, initiator, new Date(), 0, failreason, -1);
     }
 
     public final void heuristicSite(final SearchEvent searchEvent, final String host) {
diff --git a/source/net/yacy/cora/services/federated/solr/SolrScheme.java b/source/net/yacy/cora/services/federated/solr/SolrScheme.java
index b080849db..a94d0759d 100644
--- a/source/net/yacy/cora/services/federated/solr/SolrScheme.java
+++ b/source/net/yacy/cora/services/federated/solr/SolrScheme.java
@@ -59,8 +59,8 @@ public enum SolrScheme {
         solrdoc.addField("id", id);
         solrdoc.addField("sku", digestURI.toNormalform(true, false), 3.0f);
         InetAddress address = Domains.dnsResolve(digestURI.getHost());
-        if (address != null) solrdoc.addField("attr_ip", address.getHostAddress());
-        if (digestURI.getHost() != null) solrdoc.addField("attr_host", digestURI.getHost());
+        if (address != null) solrdoc.addField("ip_s", address.getHostAddress());
+        if (digestURI.getHost() != null) solrdoc.addField("host_s", digestURI.getHost());
         solrdoc.addField("title", yacydoc.dc_title());
         solrdoc.addField("author", yacydoc.dc_creator());
         solrdoc.addField("description", yacydoc.dc_description());
@@ -68,7 +68,7 @@ public enum SolrScheme {
         solrdoc.addField("last_modified", header.lastModified());
         solrdoc.addField("keywords", yacydoc.dc_subject(' '));
         String content = UTF8.String(yacydoc.getTextBytes());
-        solrdoc.addField("attr_text", content);
+        solrdoc.addField("text_t", content);
         int contentwc = content.split(" ").length;
         solrdoc.addField("wordcount_i", contentwc);
 
@@ -111,14 +111,14 @@ public enum SolrScheme {
         solrdoc.addField("attr_outboundlinks", yacydoc.outboundLinks().toArray());
 
         // charset
-        solrdoc.addField("attr_charset", yacydoc.getCharset());
+        solrdoc.addField("charset_s", yacydoc.getCharset());
 
         // coordinates
         if (yacydoc.lat() != 0.0f && yacydoc.lon() != 0.0f) {
             solrdoc.addField("lon_coordinate", yacydoc.lon());
             solrdoc.addField("lat_coordinate", yacydoc.lat());
         }
-        solrdoc.addField("attr_httpstatus", "200");
+        solrdoc.addField("httpstatus_i", 200);
         Object parser = yacydoc.getParserObject();
         if (parser instanceof ContentScraper) {
             ContentScraper html = (ContentScraper) parser;
@@ -137,9 +137,9 @@ public enum SolrScheme {
             // meta tags
             Map metas = html.getMetas();
             String robots = metas.get("robots");
-            if (robots != null) solrdoc.addField("attr_meta_robots", robots);
+            if (robots != null) solrdoc.addField("metarobots_t", robots);
             String generator = metas.get("generator");
-            if (generator != null) solrdoc.addField("attr_meta_generator", generator);
+            if (generator != null) solrdoc.addField("metagenerator_t", generator);
 
             // bold, italic
             String[] bold = html.getBold();
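The field renames switch to Solr's conventional dynamic-field suffixes: in the stock example `schema.xml`, `*_s` maps to a string field, `*_i` to an integer, and `*_t` to a tokenized text field, so these fields need no explicit schema entries. Note also that `httpstatus_i` is now written as the integer `200` rather than the string `"200"`. A hedged sketch of the resulting document shape (values invented):

```java
import org.apache.solr.common.SolrInputDocument;

// Illustrative document using the renamed dynamic fields; values are made up.
SolrInputDocument solrdoc = new SolrInputDocument();
solrdoc.addField("host_s", "example.org");    // *_s -> string, exact-match field
solrdoc.addField("ip_s", "192.0.2.1");
solrdoc.addField("text_t", "full text ...");  // *_t -> tokenized, full-text searchable
solrdoc.addField("httpstatus_i", 200);        // *_i -> integer, range-queryable
```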
diff --git a/source/net/yacy/cora/services/federated/solr/SolrSingleConnector.java b/source/net/yacy/cora/services/federated/solr/SolrSingleConnector.java
index adc0faf15..c8c20d8ba 100644
--- a/source/net/yacy/cora/services/federated/solr/SolrSingleConnector.java
+++ b/source/net/yacy/cora/services/federated/solr/SolrSingleConnector.java
@@ -26,6 +26,7 @@ package net.yacy.cora.services.federated.solr;
 
 import java.io.File;
 import java.io.IOException;
+import java.net.InetAddress;
 import java.net.MalformedURLException;
 import java.util.ArrayList;
 import java.util.Collection;
@@ -41,8 +42,11 @@ import org.apache.solr.client.solrj.response.QueryResponse;
 import org.apache.solr.common.SolrDocumentList;
 import org.apache.solr.common.SolrInputDocument;
 
+import net.yacy.cora.document.UTF8;
+import net.yacy.cora.protocol.Domains;
 import net.yacy.cora.protocol.ResponseHeader;
 import net.yacy.document.Document;
+import net.yacy.kelondro.data.meta.DigestURI;
 import net.yacy.kelondro.logging.Log;
 
@@ -189,11 +193,10 @@ public class SolrSingleConnector {
      */
 
     public void add(String id, ResponseHeader header, Document doc) throws IOException {
-        add(id, header, doc, this.scheme);
+        add(this.scheme.yacy2solr(id, header, doc));
     }
-
-    public void add(String id, ResponseHeader header, Document doc, SolrScheme tempScheme) throws IOException {
-        SolrInputDocument solrdoc = tempScheme.yacy2solr(id, header, doc);
+
+    private void add(SolrInputDocument solrdoc) throws IOException {
         int thisrrc = this.transmissionRoundRobinCounter;
         int nextrrc = thisrrc++;
         if (nextrrc >= transmissionQueueCount) nextrrc = 0;
@@ -223,6 +226,28 @@ public class SolrSingleConnector {
         }
     }
 
+    public void err(DigestURI digestURI, String failReason, int httpstatus) throws IOException {
+
+        SolrInputDocument solrdoc = new SolrInputDocument();
+        solrdoc.addField("id", UTF8.String(digestURI.hash()));
+        solrdoc.addField("sku", digestURI.toNormalform(true, false), 3.0f);
+        InetAddress address = Domains.dnsResolve(digestURI.getHost());
+        if (address != null) solrdoc.addField("ip_s", address.getHostAddress());
+        if (digestURI.getHost() != null) solrdoc.addField("host_s", digestURI.getHost());
+
+        // path elements of link
+        String path = digestURI.getPath();
+        if (path != null) {
+            String[] paths = path.split("/");
+            if (paths.length > 0) solrdoc.addField("attr_paths", paths);
+        }
+
+        solrdoc.addField("failreason_t", failReason);
+        solrdoc.addField("httpstatus_i", httpstatus);
+
+        add(solrdoc);
+    }
+
     private void flushTransmissionQueue(int idx) throws IOException {
         Collection c = new ArrayList();
         while (this.transmissionQueue[idx].size() > 0) {
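Taken together, the pieces above close the loop: a crawl failure lands in `ZURL.push(...)`, which, when a `SolrSingleConnector` is configured, mirrors the rejection into Solr as a small "error document" carrying `failreason_t` and `httpstatus_i`. A hedged end-to-end sketch; the URL, reason, and `request` object are invented stand-ins, `sb` is the switchboard:

```java
// Illustrative flow, not code from this commit:
DigestURI url = new DigestURI("http://example.org/missing");
// the loader sees a 404 and pushes the failure with the real status code ...
sb.crawlQueues.errorURL.push(request, sb.peers.mySeed().hash.getBytes(),
        new Date(), 1, "wrong http status code", 404);
// ... push() logs "wrong http status code (http return code = 404)" and,
// if sb.solrConnector != null, issues the equivalent of:
//     sb.solrConnector.err(url, "wrong http status code (http return code = 404)", 404);
// producing a Solr document with failreason_t and httpstatus_i = 404,
// queryable e.g. via  q=httpstatus_i:[400 TO 599]
```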