mirror of
https://github.com/yacy/yacy_search_server.git
synced 2024-09-19 00:01:41 +02:00
AJAX Check for robots.txt before crawling.
Icons from herrlich. TODO: Style it nicely ;-)
git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@1689 6c8d7289-2bf4-0310-a012-ef5d649a1542
This commit is contained in:
parent
0b5a736280
commit
62664d7252
|
@ -3,6 +3,8 @@
|
|||
<head>
|
||||
<title>YaCy '#[clientname]#': Index Creation</title>
|
||||
#%env/templates/metas.template%#
|
||||
<script src="/js/ajax.js"></script>
|
||||
<script src="/js/IndexCreate.js"></script>
|
||||
</head>
|
||||
<body marginheight="0" marginwidth="0" leftmargin="0" topmargin="0">
|
||||
#%env/templates/header.template%#
|
||||
|
@ -124,7 +126,13 @@ You can define URLs as start points for Web page crawling and start crawling her
|
|||
</tr>
|
||||
<tr><td class="small">From URL:</td>
|
||||
<td class="small"><input type="radio" name="crawlingMode" value="url" checked="checked"></td>
|
||||
<td class="small"><input name="crawlingURL" type="text" size="41" maxlength="256" value="http://"></td>
|
||||
<td class="small">
|
||||
<input name="crawlingURL" type="text" size="41" maxlength="256" value="http://" onkeypress="changed()">
|
||||
<span id="robotsOK"></span>
|
||||
</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td colspan="2"><span id="title"></span></td>
|
||||
</tr>
|
||||
</table>
|
||||
</td>
|
||||
|
|
BIN
htroot/env/grafics/failed.png
vendored
Normal file
BIN
htroot/env/grafics/failed.png
vendored
Normal file
Binary file not shown.
After Width: | Height: | Size: 5.9 KiB |
BIN
htroot/env/grafics/ok.png
vendored
Normal file
BIN
htroot/env/grafics/ok.png
vendored
Normal file
Binary file not shown.
After Width: | Height: | Size: 7.0 KiB |
|
@ -8,6 +8,6 @@ function handleResponse(){
|
|||
function loadTitle(){
|
||||
url=document.getElementsByName("url")[0].value;
|
||||
if(document.getElementsByName("title")[0].value==""){
|
||||
sndReq('/xml/util/gettitle_p.xml?url='+url);
|
||||
sndReq('/xml/util/getpageinfo_p.xml?actions=title&url='+url);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -51,36 +51,58 @@ import java.net.URL;
|
|||
import java.util.ArrayList;
|
||||
import java.util.Iterator;
|
||||
|
||||
import de.anomic.data.robotsParser;
|
||||
import de.anomic.http.httpHeader;
|
||||
import de.anomic.http.httpc;
|
||||
import de.anomic.server.serverObjects;
|
||||
import de.anomic.server.serverSwitch;
|
||||
|
||||
public class gettitle_p {
|
||||
public class getpageinfo_p {
|
||||
public static serverObjects respond(httpHeader header, serverObjects post, serverSwitch env) {
|
||||
serverObjects prop = new serverObjects();
|
||||
prop.put("title", "");
|
||||
prop.put("robots-allowed", 3); //unknown
|
||||
String actions="title";
|
||||
if(post!=null && post.containsKey("url")){
|
||||
if(post.containsKey("actions"))
|
||||
actions=(String)post.get("actions");
|
||||
ArrayList content;
|
||||
String url;
|
||||
try {
|
||||
url=(String) post.get("url");
|
||||
if(!url.toLowerCase().startsWith("http://")){
|
||||
url="http://"+url;
|
||||
String url=(String) post.get("url");
|
||||
if (!url.toLowerCase().startsWith("http://")) {
|
||||
url = "http://" + url;
|
||||
}
|
||||
if (actions.indexOf("title")>=0) {
|
||||
try {
|
||||
content = httpc.wget(new URL(url));
|
||||
|
||||
Iterator it = content.iterator();
|
||||
String line;
|
||||
String title;
|
||||
while (it.hasNext()) {
|
||||
line = (String) it.next();
|
||||
try {
|
||||
title = line.substring(line.toLowerCase().indexOf(
|
||||
"<title>") + 7, line.toLowerCase().indexOf(
|
||||
"</title>"));
|
||||
prop.put("title", title);
|
||||
} catch (IndexOutOfBoundsException e) {
|
||||
}
|
||||
}
|
||||
|
||||
} catch (MalformedURLException e) {
|
||||
} catch (IOException e) {
|
||||
}
|
||||
content = httpc.wget(new URL(url));
|
||||
Iterator it=content.iterator();
|
||||
String line;
|
||||
String title;
|
||||
while(it.hasNext()){
|
||||
line=(String) it.next();
|
||||
try{
|
||||
title=line.substring(line.toLowerCase().indexOf("<title>")+7, line.toLowerCase().indexOf("</title>"));
|
||||
prop.put("title", title);
|
||||
return prop;
|
||||
}catch(IndexOutOfBoundsException e){}
|
||||
}
|
||||
} catch (MalformedURLException e) {} catch (IOException e) {}
|
||||
}
|
||||
if(actions.indexOf("robots")>=0){
|
||||
try {
|
||||
if(robotsParser.isDisallowed(new URL(url))){
|
||||
prop.put("robots-allowed", 0);
|
||||
}else{
|
||||
prop.put("robots-allowed", 1);
|
||||
}
|
||||
} catch (MalformedURLException e) {}
|
||||
}
|
||||
|
||||
}
|
||||
// return rewrite properties
|
||||
return prop;
|
5
htroot/xml/util/getpageinfo_p.xml
Normal file
5
htroot/xml/util/getpageinfo_p.xml
Normal file
|
@ -0,0 +1,5 @@
|
|||
<?xml version='1.0' standalone='yes'?>
|
||||
<pageinfo>
|
||||
<title>#[title]#</title>
|
||||
<robots>#(robots-allowed)#0::1::#(/robots-allowed)#</robots>
|
||||
</pageinfo>
|
|
@ -1,2 +0,0 @@
|
|||
<?xml version='1.0' standalone='yes'?>
|
||||
<title>#[title]#</title>
|
Loading…
Reference in New Issue
Block a user