Mirror of https://github.com/yacy/yacy_search_server.git (synced 2024-09-19 00:01:41 +02:00)

commit 6fa439c82b, parent 1ea0bc775c

- refactoring of robots
- added option to crawler to send error-URLs to solr
- changed solr scheme slightly (no multi-value fields where no multi values are)

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@7693 6c8d7289-2bf4-0310-a012-ef5d649a1542

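The recurring API change in the hunks below widens the error-URL push call by one trailing int argument carrying the HTTP status code of the failed fetch; callers pass -1 where no HTTP response is involved (robots.txt denials, blacklist hits, local errors). A minimal sketch of the new call shape, with YaCy's types reduced to placeholders (this interface is illustrative, not the real class):

    import java.util.Date;

    // placeholder mirroring the widened ZURL.push(...) seen in the diff below
    interface ErrorURLSink {
        void push(Object entry,     // the failed crawl request
                  byte[] executor,  // hash of the peer that executed the crawl
                  Date workdate,
                  int workcount,
                  String anycause,  // human-readable failure reason
                  int httpcode);    // HTTP status of the failed fetch, or -1 if none applies
    }
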
@@ -385,7 +385,7 @@ public class Crawler_p {
                             sb.peers.mySeed().hash.getBytes(),
                             new Date(),
                             1,
-                            reasonString);
+                            reasonString, -1);
                     }
                 } catch (final PatternSyntaxException e) {
                     prop.put("info", "4"); // crawlfilter does not match url

@@ -105,7 +105,7 @@ public class WebStructurePicture_p {
         } else {
             // find start hash
             String hash = null;
-            try {
+            if (host != null && host.length() > 0) try {
                 hash = UTF8.String((new DigestURI("http://" + host)).hash(), 6, 6);
             } catch (final MalformedURLException e) {Log.logException(e);}
             //assert (sb.webStructure.outgoingReferences(hash) != null);

@@ -9,7 +9,7 @@ import net.yacy.document.parser.html.ContentScraper;
 import net.yacy.kelondro.data.meta.DigestURI;
 
 import de.anomic.crawler.CrawlProfile;
-import de.anomic.crawler.RobotsEntry;
+import de.anomic.crawler.RobotsTxtEntry;
 import de.anomic.search.Switchboard;
 import de.anomic.server.serverObjects;
 import de.anomic.server.serverSwitch;

@@ -106,7 +106,7 @@ public class getpageinfo_p {
             final DigestURI theURL = new DigestURI(url);
 
             // determine if crawling of the current URL is allowed
-            RobotsEntry robotsEntry;
+            RobotsTxtEntry robotsEntry;
             try {
                 robotsEntry = sb.robots.getEntry(theURL, sb.peers.myBotIDs());
             } catch (IOException e) {

@@ -162,7 +162,7 @@ public final class crawlReceipt {
                     youare.getBytes(),
                     null,
                     0,
-                    result + ":" + reason);
+                    result + ":" + reason, -1);
             //switchboard.noticeURL.remove(receivedUrlhash);
             prop.put("delay", "3600");
             return prop;

@@ -85,7 +85,8 @@ public class urls {
                             sb.peers.mySeed().hash.getBytes(),
                             new Date(),
                             0,
-                            "client=____________");
+                            "client=____________",
+                            -1);
 
                     // create RSS entry
                     prop.put("item_" + c + "_title", "");

@@ -80,8 +80,8 @@ public class CrawlQueues {
         log.logConfig("Starting Crawling Management");
         noticeURL = new NoticedURL(queuePath, sb.peers.myBotIDs(), sb.useTailCache, sb.exceed134217727);
         FileUtils.deletedelete(new File(queuePath, ERROR_DB_FILENAME));
-        errorURL = new ZURL(queuePath, ERROR_DB_FILENAME, false, sb.useTailCache, sb.exceed134217727);
-        delegatedURL = new ZURL(queuePath, DELEGATED_DB_FILENAME, true, sb.useTailCache, sb.exceed134217727);
+        errorURL = new ZURL(sb.solrConnector, queuePath, ERROR_DB_FILENAME, false, sb.useTailCache, sb.exceed134217727);
+        delegatedURL = new ZURL(sb.solrConnector, queuePath, DELEGATED_DB_FILENAME, true, sb.useTailCache, sb.exceed134217727);
     }
 
     public void relocate(final File newQueuePath) {

@@ -92,8 +92,8 @@ public class CrawlQueues {
 
         noticeURL = new NoticedURL(newQueuePath, sb.peers.myBotIDs(), sb.useTailCache, sb.exceed134217727);
         FileUtils.deletedelete(new File(newQueuePath, ERROR_DB_FILENAME));
-        errorURL = new ZURL(newQueuePath, ERROR_DB_FILENAME, false, sb.useTailCache, sb.exceed134217727);
-        delegatedURL = new ZURL(newQueuePath, DELEGATED_DB_FILENAME, true, sb.useTailCache, sb.exceed134217727);
+        errorURL = new ZURL(sb.solrConnector, newQueuePath, ERROR_DB_FILENAME, false, sb.useTailCache, sb.exceed134217727);
+        delegatedURL = new ZURL(sb.solrConnector, newQueuePath, DELEGATED_DB_FILENAME, true, sb.useTailCache, sb.exceed134217727);
     }
 
     public void close() {

@@ -571,7 +571,7 @@ public class CrawlQueues {
             try {
                 // checking robots.txt for http(s) resources
                 this.request.setStatus("worker-checkingrobots", WorkflowJob.STATUS_STARTED);
-                RobotsEntry robotsEntry;
+                RobotsTxtEntry robotsEntry;
                 if ((request.url().getProtocol().equals("http") || request.url().getProtocol().equals("https")) &&
                     (robotsEntry = sb.robots.getEntry(request.url(), sb.peers.myBotIDs())) != null &&
                     robotsEntry.isDisallowed(request.url())) {

@@ -581,7 +581,7 @@ public class CrawlQueues {
                             UTF8.getBytes(sb.peers.mySeed().hash),
                             new Date(),
                             1,
-                            "denied by robots.txt");
+                            "denied by robots.txt", -1);
                     this.request.setStatus("worker-disallowed", WorkflowJob.STATUS_FINISHED);
                 } else {
                     // starting a load from the internet

@@ -617,7 +617,7 @@ public class CrawlQueues {
                                 UTF8.getBytes(sb.peers.mySeed().hash),
                                 new Date(),
                                 1,
-                                "cannot load: " + result);
+                                "cannot load: " + result, -1);
                         this.request.setStatus("worker-error", WorkflowJob.STATUS_FINISHED);
                     } else {
                         this.request.setStatus("worker-processed", WorkflowJob.STATUS_FINISHED);

@@ -629,7 +629,7 @@ public class CrawlQueues {
                         UTF8.getBytes(sb.peers.mySeed().hash),
                         new Date(),
                         1,
-                        e.getMessage() + " - in worker");
+                        e.getMessage() + " - in worker", -1);
                 Log.logException(e);
                 // Client.initConnectionManager();
                 this.request.setStatus("worker-exception", WorkflowJob.STATUS_FINISHED);

@@ -202,7 +202,7 @@ public final class CrawlStacker {
 
                     // if the url was rejected we store it into the error URL db
                     if (rejectReason != null) {
-                        nextQueue.errorURL.push(entry, UTF8.getBytes(peers.mySeed().hash), new Date(), 1, rejectReason);
+                        nextQueue.errorURL.push(entry, UTF8.getBytes(peers.mySeed().hash), new Date(), 1, rejectReason, -1);
                     }
                 } catch (final Exception e) {
                     CrawlStacker.this.log.logWarning("Error while processing stackCrawl entry.\n" + "Entry: " + entry.toString() + "Error: " + e.toString(), e);

@@ -469,9 +469,9 @@ public final class CrawlStacker {
         }
 
         // deny cgi
-        if (url.isIndividual()) {
+        if (url.isIndividual() && !(profile.crawlingQ())) { // TODO: make special property for crawlingIndividual
             if (this.log.isFine()) this.log.logFine("URL '" + url.toString() + "' is CGI URL.");
-            return "cgi url not allowed";
+            return "individual url (sessionid etc) not wanted";
         }
 
         // deny post properties

@@ -186,7 +186,7 @@ public class Latency {
         // find the delay as given by robots.txt on target site
         long robotsDelay = 0;
         if (!local) {
-            RobotsEntry robotsEntry;
+            RobotsTxtEntry robotsEntry;
             try {
                 robotsEntry = Switchboard.getSwitchboard().robots.getEntry(url, thisAgents);
             } catch (IOException e) {

@@ -239,7 +239,7 @@ public class Latency {
         // find the delay as given by robots.txt on target site
         long robotsDelay = 0;
         if (!local) {
-            RobotsEntry robotsEntry;
+            RobotsTxtEntry robotsEntry;
             try {
                 robotsEntry = Switchboard.getSwitchboard().robots.getEntry(url, thisAgents);
             } catch (IOException e) {

@@ -11,16 +11,16 @@
 //Revision: $LastChangedRevision$
 //
 //This program is free software; you can redistribute it and/or modify
-//it under the terms of the GNU General Public License as published by
+//it under the terms of the GNU General public License as published by
 //the Free Software Foundation; either version 2 of the License, or
 //(at your option) any later version.
 //
 //This program is distributed in the hope that it will be useful,
 //but WITHOUT ANY WARRANTY; without even the implied warranty of
 //MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-//GNU General Public License for more details.
+//GNU General public License for more details.
 //
-//You should have received a copy of the GNU General Public License
+//You should have received a copy of the GNU General public License
 //along with this program; if not, write to the Free Software
 //Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 

@@ -51,15 +51,15 @@ public class RobotsTxt {
 
     private static Logger log = Logger.getLogger(RobotsTxt.class);
 
-    public static final String ROBOTS_DB_PATH_SEPARATOR = ";";
-    public static final Pattern ROBOTS_DB_PATH_SEPARATOR_MATCHER = Pattern.compile(ROBOTS_DB_PATH_SEPARATOR);
+    protected static final String ROBOTS_DB_PATH_SEPARATOR = ";";
+    protected static final Pattern ROBOTS_DB_PATH_SEPARATOR_MATCHER = Pattern.compile(ROBOTS_DB_PATH_SEPARATOR);
 
     BEncodedHeap robotsTable;
     private final ConcurrentHashMap<String, DomSync> syncObjects;
     //private static final HashSet<String> loadedRobots = new HashSet<String>(); // only for debugging
 
     private static class DomSync {
-        public DomSync() {}
+        private DomSync() {}
     }
 
     public RobotsTxt(final BEncodedHeap robotsTable) {

@@ -78,16 +78,16 @@ public class RobotsTxt {
         return this.robotsTable.size();
     }
 
-    public RobotsEntry getEntry(final MultiProtocolURI theURL, final Set<String> thisAgents) throws IOException {
+    public RobotsTxtEntry getEntry(final MultiProtocolURI theURL, final Set<String> thisAgents) throws IOException {
         if (theURL == null) throw new IllegalArgumentException();
         if (!theURL.getProtocol().startsWith("http")) return null;
         return getEntry(theURL, thisAgents, true);
     }
 
-    private RobotsEntry getEntry(final MultiProtocolURI theURL, final Set<String> thisAgents, final boolean fetchOnlineIfNotAvailableOrNotFresh) throws IOException {
+    private RobotsTxtEntry getEntry(final MultiProtocolURI theURL, final Set<String> thisAgents, final boolean fetchOnlineIfNotAvailableOrNotFresh) throws IOException {
         // this method will always return a non-null value
         String urlHostPort = getHostPort(theURL);
-        RobotsEntry robotsTxt4Host = null;
+        RobotsTxtEntry robotsTxt4Host = null;
         Map<String, byte[]> record;
         try {
             record = this.robotsTable.get(this.robotsTable.encodedKey(urlHostPort));

@@ -95,7 +95,7 @@ public class RobotsTxt {
             log.warn("memory exhausted", e);
             record = null;
         }
-        if (record != null) robotsTxt4Host = new RobotsEntry(urlHostPort, record);
+        if (record != null) robotsTxt4Host = new RobotsTxtEntry(urlHostPort, record);
 
         if (fetchOnlineIfNotAvailableOrNotFresh && (
             robotsTxt4Host == null ||

@@ -123,7 +123,7 @@ public class RobotsTxt {
                     log.warn("memory exhausted", e);
                     record = null;
                 }
-                if (record != null) robotsTxt4Host = new RobotsEntry(urlHostPort, record);
+                if (record != null) robotsTxt4Host = new RobotsTxtEntry(urlHostPort, record);
                 if (robotsTxt4Host != null &&
                     robotsTxt4Host.getLoadedDate() != null &&
                     System.currentTimeMillis() - robotsTxt4Host.getLoadedDate().getTime() <= 1*24*60*60*1000) {

@@ -160,7 +160,7 @@ public class RobotsTxt {
                     // no robots.txt available, make an entry to prevent that the robots loading is done twice
                     if (robotsTxt4Host == null) {
                         // generate artificial entry
-                        robotsTxt4Host = new RobotsEntry(
+                        robotsTxt4Host = new RobotsTxtEntry(
                                 robotsURL,
                                 new ArrayList<String>(),
                                 new ArrayList<String>(),

@@ -183,7 +183,7 @@ public class RobotsTxt {
                         addEntry(robotsTxt4Host);
                     }
                 } else {
-                    final robotsParser parserResult = new robotsParser((byte[]) result[DOWNLOAD_ROBOTS_TXT], thisAgents);
+                    final RobotsTxtParser parserResult = new RobotsTxtParser((byte[]) result[DOWNLOAD_ROBOTS_TXT], thisAgents);
                     ArrayList<String> denyPath = parserResult.denyList();
                     if (((Boolean) result[DOWNLOAD_ACCESS_RESTRICTED]).booleanValue()) {
                         denyPath = new ArrayList<String>();

@@ -208,7 +208,7 @@ public class RobotsTxt {
         return robotsTxt4Host;
     }
 
-    private RobotsEntry addEntry(
+    private RobotsTxtEntry addEntry(
             final MultiProtocolURI theURL,
             final ArrayList<String> allowPathList,
             final ArrayList<String> denyPathList,

@@ -219,7 +219,7 @@ public class RobotsTxt {
             final long crawlDelayMillis,
             final String agentName
             ) {
-        final RobotsEntry entry = new RobotsEntry(
+        final RobotsTxtEntry entry = new RobotsTxtEntry(
                 theURL, allowPathList, denyPathList,
                 loadedDate, modDate,
                 eTag, sitemap, crawlDelayMillis, agentName);

@@ -227,7 +227,7 @@ public class RobotsTxt {
         return entry;
     }
 
-    private String addEntry(final RobotsEntry entry) {
+    private String addEntry(final RobotsTxtEntry entry) {
         // writes a new page and returns key
         try {
             this.robotsTable.insert(this.robotsTable.encodedKey(entry.getHostName()), entry.getMem());

@@ -240,10 +240,10 @@ public class RobotsTxt {
 
     // methods that had been in robotsParser.java:
 
-    public static final int DOWNLOAD_ACCESS_RESTRICTED = 0;
-    public static final int DOWNLOAD_ROBOTS_TXT = 1;
-    public static final int DOWNLOAD_ETAG = 2;
-    public static final int DOWNLOAD_MODDATE = 3;
+    private static final int DOWNLOAD_ACCESS_RESTRICTED = 0;
+    private static final int DOWNLOAD_ROBOTS_TXT = 1;
+    private static final int DOWNLOAD_ETAG = 2;
+    private static final int DOWNLOAD_MODDATE = 3;
 
     static final String getHostPort(final MultiProtocolURI theURL) {
         String urlHostPort = null;

@@ -267,7 +267,7 @@ public class RobotsTxt {
         return port;
     }
 
-    private static Object[] downloadRobotsTxt(final MultiProtocolURI robotsURL, int redirectionCount, final RobotsEntry entry) throws Exception {
+    private static Object[] downloadRobotsTxt(final MultiProtocolURI robotsURL, int redirectionCount, final RobotsTxtEntry entry) throws Exception {
         if (robotsURL == null || !robotsURL.getProtocol().startsWith("http")) return null;
 
         if (redirectionCount < 0) return new Object[]{Boolean.FALSE,null,null};

@@ -13,16 +13,16 @@
 //Revision: $LastChangedRevision$
 //
 //This program is free software; you can redistribute it and/or modify
-//it under the terms of the GNU General Public License as published by
+//it under the terms of the GNU General public License as published by
 //the Free Software Foundation; either version 2 of the License, or
 //(at your option) any later version.
 //
 //This program is distributed in the hope that it will be useful,
 //but WITHOUT ANY WARRANTY; without even the implied warranty of
 //MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-//GNU General Public License for more details.
+//GNU General public License for more details.
 //
-//You should have received a copy of the GNU General Public License
+//You should have received a copy of the GNU General public License
 //along with this program; if not, write to the Free Software
 //Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 

@@ -41,25 +41,25 @@ import net.yacy.cora.document.UTF8;
 import net.yacy.kelondro.util.ByteArray;
 
 
-public class RobotsEntry {
+public class RobotsTxtEntry {
 
-    public static final String HOST_NAME = "hostname";
-    public static final String ALLOW_PATH_LIST = "allow";
-    public static final String DISALLOW_PATH_LIST = "disallow";
-    public static final String LOADED_DATE = "date";
-    public static final String MOD_DATE = "modDate";
-    public static final String ETAG = "etag";
-    public static final String SITEMAP = "sitemap";
-    public static final String CRAWL_DELAY = "crawlDelay";
-    public static final String CRAWL_DELAY_MILLIS = "crawlDelayMillis";
-    public static final String AGENT_NAME = "agentname";
+    private static final String HOST_NAME = "hostname";
+    private static final String ALLOW_PATH_LIST = "allow";
+    private static final String DISALLOW_PATH_LIST = "disallow";
+    private static final String LOADED_DATE = "date";
+    private static final String MOD_DATE = "modDate";
+    private static final String ETAG = "etag";
+    private static final String SITEMAP = "sitemap";
+    private static final String CRAWL_DELAY = "crawlDelay";
+    private static final String CRAWL_DELAY_MILLIS = "crawlDelayMillis";
+    private static final String AGENT_NAME = "agentname";
 
     // this is a simple record structure that holds all properties of a single crawl start
     private final Map<String, byte[]> mem;
     private final List<String> allowPathList, denyPathList;
     private final String hostName, agentName;
 
-    public RobotsEntry(final String hostName, final Map<String, byte[]> mem) {
+    protected RobotsTxtEntry(final String hostName, final Map<String, byte[]> mem) {
         this.hostName = hostName.toLowerCase();
         this.mem = mem;
 

@@ -90,7 +90,7 @@ public class RobotsEntry {
         this.agentName = this.mem.containsKey(AGENT_NAME) ? UTF8.String(this.mem.get(AGENT_NAME)) : null;
     }
 
-    public RobotsEntry(
+    protected RobotsTxtEntry(
             final MultiProtocolURI theURL,
             final List<String> allowPathList,
             final List<String> disallowPathList,

@@ -140,15 +140,15 @@ public class RobotsEntry {
         }
     }
 
-    public String getHostName() {
+    protected String getHostName() {
         return this.hostName;
     }
 
-    public String getAgentName() {
+    protected String getAgentName() {
         return this.agentName;
     }
 
-    public Map<String, byte[]> getMem() {
+    protected Map<String, byte[]> getMem() {
         if (!this.mem.containsKey(HOST_NAME)) this.mem.put(HOST_NAME, UTF8.getBytes(this.hostName));
         return this.mem;
     }

@@ -175,34 +175,34 @@ public class RobotsEntry {
         }
     }
 
-    public Date getLoadedDate() {
+    protected Date getLoadedDate() {
         if (this.mem.containsKey(LOADED_DATE)) {
             return new Date(ByteArray.parseDecimal(this.mem.get(LOADED_DATE)));
         }
         return null;
     }
 
-    public void setLoadedDate(final Date newLoadedDate) {
+    protected void setLoadedDate(final Date newLoadedDate) {
         if (newLoadedDate != null) {
             this.mem.put(LOADED_DATE, UTF8.getBytes(Long.toString(newLoadedDate.getTime())));
         }
     }
 
-    public Date getModDate() {
+    protected Date getModDate() {
         if (this.mem.containsKey(MOD_DATE)) {
             return new Date(ByteArray.parseDecimal(this.mem.get(MOD_DATE)));
         }
         return null;
     }
 
-    public String getETag() {
+    protected String getETag() {
         if (this.mem.containsKey(ETAG)) {
             return UTF8.String(this.mem.get(ETAG));
         }
         return null;
     }
 
-    public long getCrawlDelayMillis() {
+    protected long getCrawlDelayMillis() {
         if (this.mem.containsKey(CRAWL_DELAY_MILLIS)) try {
             return ByteArray.parseDecimal(this.mem.get(CRAWL_DELAY_MILLIS));
         } catch (final NumberFormatException e) {

@@ -10,16 +10,16 @@
 Revision: $LastChangedRevision$
 
 This program is free software; you can redistribute it and/or modify
-it under the terms of the GNU General Public License as published by
+it under the terms of the GNU General public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-GNU General Public License for more details.
+GNU General public License for more details.
 
-You should have received a copy of the GNU General Public License
+You should have received a copy of the GNU General private License
 along with this program; if not, write to the Free Software
 Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 

@@ -59,16 +59,16 @@ import java.util.regex.Pattern;
  * See: http://www.kollar.com/robots.html
  */
 
-public final class robotsParser {
+public final class RobotsTxtParser {
 
     private static final Pattern patternTab = Pattern.compile("\t");
 
-    public static final String ROBOTS_USER_AGENT = "User-agent:".toUpperCase();
-    public static final String ROBOTS_DISALLOW = "Disallow:".toUpperCase();
-    public static final String ROBOTS_ALLOW = "Allow:".toUpperCase();
-    public static final String ROBOTS_COMMENT = "#";
-    public static final String ROBOTS_SITEMAP = "Sitemap:".toUpperCase();
-    public static final String ROBOTS_CRAWL_DELAY = "Crawl-delay:".toUpperCase();
+    private static final String ROBOTS_USER_AGENT = "User-agent:".toUpperCase();
+    private static final String ROBOTS_DISALLOW = "Disallow:".toUpperCase();
+    private static final String ROBOTS_ALLOW = "Allow:".toUpperCase();
+    private static final String ROBOTS_COMMENT = "#";
+    private static final String ROBOTS_SITEMAP = "Sitemap:".toUpperCase();
+    private static final String ROBOTS_CRAWL_DELAY = "Crawl-delay:".toUpperCase();
 
     private final ArrayList<String> allowList;
     private final ArrayList<String> denyList;

@@ -77,7 +77,7 @@ public final class robotsParser {
     private final Set<String> myNames; // a list of own name lists
     private String agentName; // the name of the agent that was used to return the result
 
-    public robotsParser(final byte[] robotsTxt, final Set<String> myNames) {
+    protected RobotsTxtParser(final byte[] robotsTxt, final Set<String> myNames) {
         this.allowList = new ArrayList<String>(0);
         this.denyList = new ArrayList<String>(0);
         this.sitemap = "";

@@ -91,16 +91,6 @@ public final class robotsParser {
         }
     }
 
-    public robotsParser(final BufferedReader reader, final Set<String> myNames) {
-        this.allowList = new ArrayList<String>(0);
-        this.denyList = new ArrayList<String>(0);
-        this.sitemap = "";
-        this.crawlDelayMillis = 0;
-        this.myNames = myNames;
-        this.agentName = null;
-        if (reader != null) parse(reader);
-    }
-
     private void parse(final BufferedReader reader) {
         final ArrayList<String> deny4AllAgents = new ArrayList<String>();
         final ArrayList<String> deny4ThisAgents = new ArrayList<String>();

@@ -260,7 +250,7 @@ public final class robotsParser {
      * does not make any no-DOS-forced crawl pause.
      * @return the crawl delay between two crawl access times in milliseconds
      */
-    public long crawlDelayMillis() {
+    protected long crawlDelayMillis() {
         return this.crawlDelayMillis;
     }
 

@@ -271,19 +261,19 @@ public final class robotsParser {
      * Effects: see also comment to crawlDelayMillis()
      * @return the name of the user agent that was used for the result properties or null if no user agent name was used to identify the agent
      */
-    public String agentName() {
+    protected String agentName() {
         return this.agentName;
     }
 
-    public String sitemap() {
+    protected String sitemap() {
         return this.sitemap;
     }
 
-    public ArrayList<String> allowList() {
+    protected ArrayList<String> allowList() {
         return this.allowList;
     }
 
-    public ArrayList<String> denyList() {
+    protected ArrayList<String> denyList() {
         return this.denyList;
     }
 }

@@ -34,6 +34,7 @@ import java.util.Iterator;
 import java.util.concurrent.ConcurrentLinkedQueue;
 
 import net.yacy.cora.document.UTF8;
+import net.yacy.cora.services.federated.solr.SolrSingleConnector;
 import net.yacy.kelondro.data.meta.DigestURI;
 import net.yacy.kelondro.data.word.Word;
 import net.yacy.kelondro.index.Index;

@@ -66,13 +67,16 @@ public class ZURL implements Iterable<ZURL.Entry> {
     // the class object
     private Index urlIndex;
     private final ConcurrentLinkedQueue<byte[]> stack;
+    private final SolrSingleConnector solrConnector;
 
     public ZURL(
+            final SolrSingleConnector solrConnector,
             final File cachePath,
             final String tablename,
             final boolean startWithEmptyFile,
             final boolean useTailCache,
             final boolean exceed134217727) {
+        this.solrConnector = solrConnector;
         // creates a new ZURL in a file
         cachePath.mkdirs();
         final File f = new File(cachePath, tablename);

@@ -94,7 +98,8 @@ public class ZURL implements Iterable<ZURL.Entry> {
         this.stack = new ConcurrentLinkedQueue<byte[]>();
     }
 
-    public ZURL() {
+    public ZURL(final SolrSingleConnector solrConnector) {
+        this.solrConnector = solrConnector;
         // creates a new ZUR in RAM
         this.urlIndex = new RowSet(rowdef);
         this.stack = new ConcurrentLinkedQueue<byte[]>();

@@ -126,14 +131,24 @@ public class ZURL implements Iterable<ZURL.Entry> {
             final byte[] executor,
             final Date workdate,
             final int workcount,
-            String anycause) {
+            String anycause,
+            int httpcode) {
         // assert executor != null; // null == proxy !
         if (exists(bentry.url().hash())) return; // don't insert double causes
         if (anycause == null) anycause = "unknown";
-        Entry entry = new Entry(bentry, executor, workdate, workcount, anycause);
+        String reason = anycause + ((httpcode >= 0) ? " (http return code = " + httpcode + ")" : "");
+        Entry entry = new Entry(bentry, executor, workdate, workcount, reason);
         put(entry);
         stack.add(entry.hash());
-        Log.logInfo("Rejected URL", bentry.url().toNormalform(false, false) + " - " + anycause);
+        Log.logInfo("Rejected URL", bentry.url().toNormalform(false, false) + " - " + reason);
+        if (this.solrConnector != null) {
+            // send the error to solr
+            try {
+                this.solrConnector.err(bentry.url(), reason, httpcode);
+            } catch (IOException e) {
+                Log.logWarning("SOLR", "failed to send error " + bentry.url().toNormalform(true, false) + " to solr: " + e.getMessage());
+            }
+        }
         while (stack.size() > maxStackSize) stack.poll();
     }
 

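The push hunk above composes the stored failure reason from the cause string plus the HTTP code when one is known. A self-contained sketch of just that composition logic (demo class, not YaCy code):

    // ReasonDemo.java - mirrors the reason-building line from the ZURL hunk above
    public class ReasonDemo {
        static String reason(String anycause, int httpcode) {
            if (anycause == null) anycause = "unknown";
            return anycause + ((httpcode >= 0) ? " (http return code = " + httpcode + ")" : "");
        }
        public static void main(String[] args) {
            System.out.println(reason("cannot load: connection refused", 503));
            // -> cannot load: connection refused (http return code = 503)
            System.out.println(reason("denied by robots.txt", -1));
            // -> denied by robots.txt
        }
    }
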
@@ -152,7 +152,7 @@ public class FTPLoader {
         if (berr.size() > 0 || response == null) {
             // some error logging
             final String detail = (berr.size() > 0) ? "Errorlog: " + berr.toString() : "";
-            sb.crawlQueues.errorURL.push(request, sb.peers.mySeed().hash.getBytes(), new Date(), 1, " ftp server download, " + detail);
+            sb.crawlQueues.errorURL.push(request, sb.peers.mySeed().hash.getBytes(), new Date(), 1, " ftp server download, " + detail, -1);
             throw new IOException("FTPLoader: Unable to download URL '" + request.url().toString() + "': " + detail);
         }
 

@@ -78,7 +78,7 @@ public final class HTTPLoader {
     private Response load(final Request request, final int retryCount, final long maxFileSize, final boolean checkBlacklist) throws IOException {
 
         if (retryCount < 0) {
-            sb.crawlQueues.errorURL.push(request, sb.peers.mySeed().hash.getBytes(), new Date(), 1, "redirection counter exceeded");
+            sb.crawlQueues.errorURL.push(request, sb.peers.mySeed().hash.getBytes(), new Date(), 1, "redirection counter exceeded", -1);
             throw new IOException("Redirection counter exceeded for URL " + request.url().toString() + ". Processing aborted.");
         }
 

@@ -94,7 +94,7 @@ public final class HTTPLoader {
         // check if url is in blacklist
         final String hostlow = host.toLowerCase();
         if (checkBlacklist && Switchboard.urlBlacklist.isListed(Blacklist.BLACKLIST_CRAWLER, hostlow, path)) {
-            sb.crawlQueues.errorURL.push(request, sb.peers.mySeed().hash.getBytes(), new Date(), 1, "url in blacklist");
+            sb.crawlQueues.errorURL.push(request, sb.peers.mySeed().hash.getBytes(), new Date(), 1, "url in blacklist", -1);
             throw new IOException("CRAWLER Rejecting URL '" + request.url().toString() + "'. URL is in blacklist.");
         }
 

@@ -138,7 +138,7 @@ public final class HTTPLoader {
             redirectionUrlString = redirectionUrlString.trim();
 
             if (redirectionUrlString.length() == 0) {
-                sb.crawlQueues.errorURL.push(request, sb.peers.mySeed().hash.getBytes(), new Date(), 1, "redirection header empy");
+                sb.crawlQueues.errorURL.push(request, sb.peers.mySeed().hash.getBytes(), new Date(), 1, "redirection header empy", code);
                 throw new IOException("CRAWLER Redirection of URL=" + request.url().toString() + " aborted. Location header is empty.");
             }
 

@@ -151,14 +151,14 @@ public final class HTTPLoader {
 
             // if we are already doing a shutdown we don't need to retry crawling
             if (Thread.currentThread().isInterrupted()) {
-                sb.crawlQueues.errorURL.push(request, sb.peers.mySeed().hash.getBytes(), new Date(), 1, "server shutdown");
+                sb.crawlQueues.errorURL.push(request, sb.peers.mySeed().hash.getBytes(), new Date(), 1, "server shutdown", code);
                 throw new IOException("CRAWLER Retry of URL=" + request.url().toString() + " aborted because of server shutdown.");
             }
 
             // check if the url was already indexed
             final String dbname = sb.urlExists(Segments.Process.LOCALCRAWLING, redirectionUrl.hash());
             if (dbname != null) {
-                sb.crawlQueues.errorURL.push(request, sb.peers.mySeed().hash.getBytes(), new Date(), 1, "redirection to double content");
+                sb.crawlQueues.errorURL.push(request, sb.peers.mySeed().hash.getBytes(), new Date(), 1, "redirection to double content", code);
                 throw new IOException("CRAWLER Redirection of URL=" + request.url().toString() + " ignored. The url appears already in db " + dbname);
             }
 

@@ -167,12 +167,12 @@ public final class HTTPLoader {
                 return load(request, retryCount - 1, maxFileSize, checkBlacklist);
             } else {
                 // no redirection url provided
-                sb.crawlQueues.errorURL.push(request, sb.peers.mySeed().hash.getBytes(), new Date(), 1, "no redirection url provided");
+                sb.crawlQueues.errorURL.push(request, sb.peers.mySeed().hash.getBytes(), new Date(), 1, "no redirection url provided", code);
                 throw new IOException("REJECTED EMTPY REDIRECTION '" + client.getHttpResponse().getStatusLine() + "' for URL " + request.url().toString());
             }
         } else if (responseBody == null) {
             // no response, reject file
-            sb.crawlQueues.errorURL.push(request, sb.peers.mySeed().hash.getBytes(), new Date(), 1, "no response body (code = " + code + ")");
+            sb.crawlQueues.errorURL.push(request, sb.peers.mySeed().hash.getBytes(), new Date(), 1, "no response body", code);
             throw new IOException("REJECTED EMPTY RESPONSE BODY '" + client.getHttpResponse().getStatusLine() + "' for URL " + request.url().toString());
         } else if (code == 200 || code == 203) {
             // the transfer is ok

@@ -183,7 +183,7 @@ public final class HTTPLoader {
 
             // check length again in case it was not possible to get the length before loading
             if (maxFileSize > 0 && contentLength > maxFileSize) {
-                sb.crawlQueues.errorURL.push(request, sb.peers.mySeed().hash.getBytes(), new Date(), 1, "file size limit exceeded");
+                sb.crawlQueues.errorURL.push(request, sb.peers.mySeed().hash.getBytes(), new Date(), 1, "file size limit exceeded", code);
                 throw new IOException("REJECTED URL " + request.url() + " because file size '" + contentLength + "' exceeds max filesize limit of " + maxFileSize + " bytes. (GET)");
             }
 

@@ -201,7 +201,7 @@ public final class HTTPLoader {
             return response;
         } else {
             // if the response has not the right response type then reject file
-            sb.crawlQueues.errorURL.push(request, sb.peers.mySeed().hash.getBytes(), new Date(), 1, "wrong http status code " + code);
+            sb.crawlQueues.errorURL.push(request, sb.peers.mySeed().hash.getBytes(), new Date(), 1, "wrong http status code", code);
             throw new IOException("REJECTED WRONG STATUS TYPE '" + client.getHttpResponse().getStatusLine() + "' for URL " + request.url().toString());
         }
     }

@@ -133,7 +133,8 @@ public class YMarkEntry extends TreeMap<String, String> {
                 case DATE_MODIFIED:
                 case DATE_VISITED:
                     this.put(b.key(), String.valueOf(System.currentTimeMillis()));
-                default:
+                    break;
+                default:
                     break;
             }
         }

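The YMarkEntry hunk above is a switch-fallthrough fix: the DATE_* cases previously fell through into default, which was harmless only because default did nothing but break; the explicit break documents intent and guards against future default logic. A standalone illustration of the hazard (demo code, not YaCy's):

    public class FallthroughDemo {
        enum Key { DATE_MODIFIED, DATE_VISITED, OTHER }

        static String handle(Key k) {
            String result = "unset";
            switch (k) {
                case DATE_MODIFIED:
                case DATE_VISITED:
                    result = "timestamp";
                    break; // without this, execution falls through into default
                default:
                    result = "other"; // on fallthrough this would overwrite "timestamp"
            }
            return result;
        }

        public static void main(String[] args) {
            System.out.println(handle(Key.DATE_VISITED)); // timestamp
            System.out.println(handle(Key.OTHER));        // other
        }
    }
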
@@ -112,7 +112,7 @@ public class YMarkTables {
         this.deleteBookmark(bmk_user, YMarkUtil.getBookmarkId(url));
     }
 
-    public TreeMap<String, YMarkTag> getTags(final Iterator<Row> rowIterator) throws IOException {
+    public TreeMap<String, YMarkTag> getTags(final Iterator<Row> rowIterator) {
         final TreeMap<String,YMarkTag> tags = new TreeMap<String,YMarkTag>();
         Tables.Row bmk_row = null;
         Iterator<String> tit = null;

@@ -305,7 +305,7 @@ public final class HTTPDFileHandler {
         final boolean accountEmpty = adminAccountBase64MD5.length() == 0;
         final boolean softauth = accessFromLocalhost && authorization != null && authorization.length() > 6 && (adminAccountBase64MD5.equals(authorization.substring(6)));
 
-        if (protectedPage && ((!softauth && !grantedForLocalhost && !accountEmpty) || requestHeader.userAgent().startsWith("yacybot"))) {
+        if (protectedPage && !softauth && ((!grantedForLocalhost && !accountEmpty) || requestHeader.userAgent().startsWith("yacybot"))) {
             // authentication required
             if (authorization == null) {
                 // no authorization given in response. Ask for that

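The HTTPDFileHandler hunk above regroups the boolean condition so that a valid softauth (admin credentials presented from localhost) now also bypasses the yacybot user-agent check, which the old grouping did not. A standalone sketch of the two predicates, with variable names taken from the diff:

    public class AuthPredicateDemo {
        static boolean oldRule(boolean softauth, boolean grantedForLocalhost, boolean accountEmpty, boolean isYacybot) {
            return (!softauth && !grantedForLocalhost && !accountEmpty) || isYacybot;
        }
        static boolean newRule(boolean softauth, boolean grantedForLocalhost, boolean accountEmpty, boolean isYacybot) {
            return !softauth && ((!grantedForLocalhost && !accountEmpty) || isYacybot);
        }
        public static void main(String[] args) {
            // softauth=true together with a yacybot user agent:
            // the old rule still demanded authentication, the new one does not
            System.out.println(oldRule(true, false, false, true)); // true
            System.out.println(newRule(true, false, false, true)); // false
        }
    }
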
@@ -523,6 +523,11 @@ public final class Switchboard extends serverSwitch {
         log.logConfig("Parser: Initializing Mime Type deny list");
         TextParser.setDenyMime(getConfig(SwitchboardConstants.PARSER_MIME_DENY, ""));
 
+        // set up the solr interface
+        String solrurl = this.getConfig("federated.service.solr.indexing.url", "http://127.0.0.1:8983/solr");
+        boolean usesolr = this.getConfigBool("federated.service.solr.indexing.enabled", false) & solrurl.length() > 0;
+        this.solrConnector = (usesolr) ? new SolrSingleConnector(solrurl, SolrScheme.SolrCell) : null;
+
         // start a loader
         log.logConfig("Starting Crawl Loader");
         this.loader = new LoaderDispatcher(this);

@@ -605,11 +610,6 @@ public final class Switchboard extends serverSwitch {
             }
         }
 
-        // set up the solr interface
-        String solrurl = this.getConfig("federated.service.solr.indexing.url", "http://127.0.0.1:8983/solr");
-        boolean usesolr = this.getConfigBool("federated.service.solr.indexing.enabled", false) & solrurl.length() > 0;
-        this.solrConnector = (usesolr) ? new SolrSingleConnector(solrurl, SolrScheme.SolrCell) : null;
-
         // initializing dht chunk generation
         this.dhtMaxReferenceCount = (int) getConfigLong(SwitchboardConstants.INDEX_DIST_CHUNK_SIZE_START, 50);
 

@@ -2423,7 +2423,7 @@ public final class Switchboard extends serverSwitch {
                 0,
                 0,
                 0);
-        crawlQueues.errorURL.push(bentry, initiator, new Date(), 0, failreason);
+        crawlQueues.errorURL.push(bentry, initiator, new Date(), 0, failreason, -1);
     }
 
     public final void heuristicSite(final SearchEvent searchEvent, final String host) {

@@ -59,8 +59,8 @@ public enum SolrScheme {
         solrdoc.addField("id", id);
         solrdoc.addField("sku", digestURI.toNormalform(true, false), 3.0f);
         InetAddress address = Domains.dnsResolve(digestURI.getHost());
-        if (address != null) solrdoc.addField("attr_ip", address.getHostAddress());
-        if (digestURI.getHost() != null) solrdoc.addField("attr_host", digestURI.getHost());
+        if (address != null) solrdoc.addField("ip_s", address.getHostAddress());
+        if (digestURI.getHost() != null) solrdoc.addField("host_s", digestURI.getHost());
         solrdoc.addField("title", yacydoc.dc_title());
         solrdoc.addField("author", yacydoc.dc_creator());
         solrdoc.addField("description", yacydoc.dc_description());

@@ -68,7 +68,7 @@ public enum SolrScheme {
         solrdoc.addField("last_modified", header.lastModified());
         solrdoc.addField("keywords", yacydoc.dc_subject(' '));
         String content = UTF8.String(yacydoc.getTextBytes());
-        solrdoc.addField("attr_text", content);
+        solrdoc.addField("text_t", content);
         int contentwc = content.split(" ").length;
         solrdoc.addField("wordcount_i", contentwc);
 

@@ -111,14 +111,14 @@ public enum SolrScheme {
         solrdoc.addField("attr_outboundlinks", yacydoc.outboundLinks().toArray());
 
         // charset
-        solrdoc.addField("attr_charset", yacydoc.getCharset());
+        solrdoc.addField("charset_s", yacydoc.getCharset());
 
         // coordinates
         if (yacydoc.lat() != 0.0f && yacydoc.lon() != 0.0f) {
             solrdoc.addField("lon_coordinate", yacydoc.lon());
             solrdoc.addField("lat_coordinate", yacydoc.lat());
         }
-        solrdoc.addField("attr_httpstatus", "200");
+        solrdoc.addField("httpstatus_i", 200);
         Object parser = yacydoc.getParserObject();
         if (parser instanceof ContentScraper) {
             ContentScraper html = (ContentScraper) parser;

|
|||
// meta tags
|
||||
Map<String, String> metas = html.getMetas();
|
||||
String robots = metas.get("robots");
|
||||
if (robots != null) solrdoc.addField("attr_meta_robots", robots);
|
||||
if (robots != null) solrdoc.addField("metarobots_t", robots);
|
||||
String generator = metas.get("generator");
|
||||
if (generator != null) solrdoc.addField("attr_meta_generator", generator);
|
||||
if (generator != null) solrdoc.addField("metagenerator_t", generator);
|
||||
|
||||
// bold, italic
|
||||
String[] bold = html.getBold();
|
||||
|
|
|
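The SolrScheme hunks above rename attr_* fields to Solr dynamic-field suffixes that declare a single-valued type (_s string, _t text, _i integer), matching the commit message's "no multi-value fields where no multi values are". Assuming a standard schema with such dynamic fields, the renamed fields become directly filterable and sortable; a hedged SolrJ sketch (field names from the diff, query values invented):

    import org.apache.solr.client.solrj.SolrQuery;

    public class ErrorQueryDemo {
        public static void main(String[] args) {
            // hypothetical query against the renamed single-valued fields
            SolrQuery query = new SolrQuery("host_s:example.org");
            query.addFilterQuery("httpstatus_i:[400 TO *]"); // e.g. failures recorded via err()
            query.setSortField("wordcount_i", SolrQuery.ORDER.desc);
            System.out.println(query.toString());
        }
    }
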
@@ -26,6 +26,7 @@ package net.yacy.cora.services.federated.solr;
 
 import java.io.File;
 import java.io.IOException;
+import java.net.InetAddress;
 import java.net.MalformedURLException;
 import java.util.ArrayList;
 import java.util.Collection;

@@ -41,8 +42,11 @@ import org.apache.solr.client.solrj.response.QueryResponse;
 import org.apache.solr.common.SolrDocumentList;
 import org.apache.solr.common.SolrInputDocument;
 
+import net.yacy.cora.document.UTF8;
+import net.yacy.cora.protocol.Domains;
 import net.yacy.cora.protocol.ResponseHeader;
 import net.yacy.document.Document;
+import net.yacy.kelondro.data.meta.DigestURI;
 import net.yacy.kelondro.logging.Log;
 
 

@@ -189,11 +193,10 @@ public class SolrSingleConnector {
      */
 
     public void add(String id, ResponseHeader header, Document doc) throws IOException {
-        add(id, header, doc, this.scheme);
+        add(this.scheme.yacy2solr(id, header, doc));
     }
 
-    public void add(String id, ResponseHeader header, Document doc, SolrScheme tempScheme) throws IOException {
-        SolrInputDocument solrdoc = tempScheme.yacy2solr(id, header, doc);
+    private void add(SolrInputDocument solrdoc) throws IOException {
         int thisrrc = this.transmissionRoundRobinCounter;
         int nextrrc = thisrrc++;
         if (nextrrc >= transmissionQueueCount) nextrrc = 0;

@@ -223,6 +226,28 @@ public class SolrSingleConnector {
         }
     }
 
+    public void err(DigestURI digestURI, String failReason, int httpstatus) throws IOException {
+
+        SolrInputDocument solrdoc = new SolrInputDocument();
+        solrdoc.addField("id", UTF8.String(digestURI.hash()));
+        solrdoc.addField("sku", digestURI.toNormalform(true, false), 3.0f);
+        InetAddress address = Domains.dnsResolve(digestURI.getHost());
+        if (address != null) solrdoc.addField("ip_s", address.getHostAddress());
+        if (digestURI.getHost() != null) solrdoc.addField("host_s", digestURI.getHost());
+
+        // path elements of link
+        String path = digestURI.getPath();
+        if (path != null) {
+            String[] paths = path.split("/");
+            if (paths.length > 0) solrdoc.addField("attr_paths", paths);
+        }
+
+        solrdoc.addField("failreason_t", failReason);
+        solrdoc.addField("httpstatus_i", httpstatus);
+
+        add(solrdoc);
+    }
+
     private void flushTransmissionQueue(int idx) throws IOException {
         Collection<SolrInputDocument> c = new ArrayList<SolrInputDocument>();
         while (this.transmissionQueue[idx].size() > 0) {