AJAX Check for robots.txt before crawling.

Icons from herrlich. TODO: Style it nicely ;-)

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@1689 6c8d7289-2bf4-0310-a012-ef5d649a1542
2024-09-19 00:01:41 +02:00 · 2006-02-17 20:55:31 +00:00 · 2006-02-17 20:55:31 +00:00 · 62664d7252
commit 62664d7252
parent 0b5a736280
7 changed files with 56 additions and 23 deletions
--- a/htroot/IndexCreate_p.html
+++ b/htroot/IndexCreate_p.html
@ -3,6 +3,8 @@
 <head>
 <title>YaCy '#[clientname]#': Index Creation</title>
 #%env/templates/metas.template%#
+<script src="/js/ajax.js"></script>
+<script src="/js/IndexCreate.js"></script>
 </head>
 <body marginheight="0" marginwidth="0" leftmargin="0" topmargin="0">
 #%env/templates/header.template%#
@ -124,7 +126,13 @@ You can define URLs as start points for Web page crawling and start crawling her
    	</tr>
    	<tr><td class="small">From&nbsp;URL:</td>
    		<td class="small"><input type="radio" name="crawlingMode" value="url" checked="checked"></td>
-    	    <td class="small"><input name="crawlingURL" type="text" size="41" maxlength="256" value="http://"></td>
+    	    <td class="small">
+    	    	<input name="crawlingURL" type="text" size="41" maxlength="256" value="http://" onkeypress="changed()">
+    	      	<span id="robotsOK"></span>
+    	    </td>
+    	</tr>
+    	<tr>
+    		<td colspan="2"><span id="title"></span></td>
    	</tr>
    	</table>
    </td>
--- a/htroot/env/grafics/failed.png
+++ b/htroot/env/grafics/failed.png
--- a/htroot/env/grafics/ok.png
+++ b/htroot/env/grafics/ok.png
--- a/htroot/js/Bookmarks.js
+++ b/htroot/js/Bookmarks.js
@ -8,6 +8,6 @@ function handleResponse(){
 function loadTitle(){
 	url=document.getElementsByName("url")[0].value;
 	if(document.getElementsByName("title")[0].value==""){
-		sndReq('/xml/util/gettitle_p.xml?url='+url);
+		sndReq('/xml/util/getpageinfo_p.xml?actions=title&url='+encodeURIComponent(url)); // encode the user-entered URL so '&', '#', '+' inside it are not parsed as part of this request's own query string
 	}
 }
--- a/htroot/xml/util/getpageinfo_p.java
+++ b/htroot/xml/util/getpageinfo_p.java
@ -51,36 +51,58 @@ import java.net.URL;
 import java.util.ArrayList;
 import java.util.Iterator;

+import de.anomic.data.robotsParser;
 import de.anomic.http.httpHeader;
 import de.anomic.http.httpc;
 import de.anomic.server.serverObjects;
 import de.anomic.server.serverSwitch;

-public class gettitle_p {
+public class getpageinfo_p {
    public static serverObjects respond(httpHeader header, serverObjects post, serverSwitch env) {
        serverObjects prop = new serverObjects();
        prop.put("title", "");
+        prop.put("robots-allowed", 3); //unknown
+        String actions="title";
        if(post!=null && post.containsKey("url")){
+            if(post.containsKey("actions"))
+                actions=(String)post.get("actions");
            ArrayList content;
-            String url;
-            try {
-                url=(String) post.get("url");
-                if(!url.toLowerCase().startsWith("http://")){
-                    url="http://"+url;
+            String url=(String) post.get("url");
+            if (!url.toLowerCase().startsWith("http://")) {
+                url = "http://" + url;
+            }
+            if (actions.indexOf("title")>=0) {
+                try {
+                    content = httpc.wget(new URL(url));
+
+                    Iterator it = content.iterator();
+                    String line;
+                    String title;
+                    while (it.hasNext()) {
+                        line = (String) it.next();
+                        try {
+                            title = line.substring(line.toLowerCase().indexOf(
+                                    "<title>") + 7, line.toLowerCase().indexOf(
+                                    "</title>"));
+                            prop.put("title", title);
+                        } catch (IndexOutOfBoundsException e) {
+                        }
+                    }
+
+                } catch (MalformedURLException e) {
+                } catch (IOException e) {
                }
-                content = httpc.wget(new URL(url));
-                Iterator it=content.iterator();
-                String line;
-                String title;
-                while(it.hasNext()){
-                    line=(String) it.next();
-                    try{
-                        title=line.substring(line.toLowerCase().indexOf("<title>")+7, line.toLowerCase().indexOf("</title>"));
-                        prop.put("title", title);
-                        return prop;
-                    }catch(IndexOutOfBoundsException e){}
-                }
-            } catch (MalformedURLException e) {} catch (IOException e) {}
+            }
+            if(actions.indexOf("robots")>=0){
+                try {
+                    if(robotsParser.isDisallowed(new URL(url))){
+                        prop.put("robots-allowed", 0);
+                    }else{
+                        prop.put("robots-allowed", 1);
+                    }
+                } catch (MalformedURLException e) {}
+            }
+            
        }
        // return rewrite properties
        return prop;
--- a/htroot/xml/util/getpageinfo_p.xml
+++ b/htroot/xml/util/getpageinfo_p.xml
@ -0,0 +1,5 @@
+<?xml version='1.0' standalone='yes'?>
+<pageinfo>
+  <title>#[title]#</title>
+  <robots>#(robots-allowed)#0::1::#(/robots-allowed)#</robots>
+</pageinfo>
--- a/htroot/xml/util/gettitle_p.xml
+++ b/htroot/xml/util/gettitle_p.xml
@ -1,2 +0,0 @@
-<?xml version='1.0' standalone='yes'?>
-<title>#[title]#</title>