*) First version of urlRedirector.pl script

- with this script it's possible to pass URLs from squid
     to yacy via the squid redirector interface
   - these URLs are then used by YaCy to feed the crawler

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@1141 6c8d7289-2bf4-0310-a012-ef5d649a1542
This commit is contained in:
theli 2005-11-29 12:27:03 +00:00
parent bdf30117c1
commit b35c5a48bf
4 changed files with 324 additions and 3 deletions

View File

@ -227,7 +227,7 @@ public final class plasmaCrawlStacker {
*/
URL nexturl = null;
if ((initiatorHash == null) || (initiatorHash.length() == 0)) initiatorHash = plasmaURL.dummyHash;
String referrerHash = plasmaURL.urlHash(referrerString);
String referrerHash = (referrerString==null)?null:plasmaURL.urlHash(referrerString);
try {
nexturl = new URL(nexturlString);
} catch (MalformedURLException e) {

View File

@ -79,6 +79,7 @@ import org.apache.commons.pool.impl.GenericObjectPool.Config;
import de.anomic.http.httpc;
import de.anomic.icap.icapd;
import de.anomic.server.logging.serverLog;
import de.anomic.urlRedirector.urlRedirectord;
import de.anomic.yacy.yacyCore;
import de.anomic.yacy.yacySeed;
import de.anomic.yacy.yacySeedDB;
@ -1042,12 +1043,15 @@ public final class serverCore extends serverAbstractThread implements serverThre
// now we need to initialize the session
if (this.commandCounter == 0) {
// first we need to determine the proper protocol handler
if (this.request.indexOf("ICAP") >= 0) reqProtocol = "ICAP";
else reqProtocol = "HTTP";
if (this.request.indexOf("ICAP") >= 0) reqProtocol = "ICAP";
else if (this.request.startsWith("REDIRECTOR")) reqProtocol = "REDIRECTOR";
else reqProtocol = "HTTP";
// next we need to get the proper protocol handler
if (reqProtocol.equals("ICAP")) {
this.commandObj = new icapd();
} else if (reqProtocol.equals("REDIRECTOR")) {
this.commandObj = new urlRedirectord();
} else {
// if ((this.commandObj != null) &&
// (this.commandObj.getClass().getName().equals(serverCore.this.handlerPrototype.getClass().getName()))) {

View File

@ -0,0 +1,131 @@
#!/usr/bin/perl -w
#
# This is a URL Redirector Script for squid that can be
# used to bundle YaCy and Squid together via the squid
# redirector support.
# See: http://www.squid-cache.org/Doc/FAQ/FAQ-15.html
#
# This script forwards URLs from squid to YaCy where the
# URLs are used to download and index the content of the URLs.
use strict;
use Socket qw(:DEFAULT :crlf);
use IO::Handle;
use Digest::MD5;
# Connection settings: YaCy administrator account (user name + password)
# and the host name + port of the YaCy peer that is contacted.
my ($user, $pwd)  = ("admin", "");
my ($host, $port) = ("localhost", "8080");

# Media file extensions as reported by YaCy; filled after connecting.
my %mediaExt = ();

# Whitespace separated fields of the request line currently processed.
my @requestData = ();

# Switch the currently selected output handle to unbuffered mode.
$| = 1;
# Tells whether a URL looks like a CGI / session bound resource.
#
# The URL is lower-cased and then searched for a fixed set of markers
# (".cgi", ".exe" and several session id notations).
#
# Parameter: the URL to test
# Returns:   a true value if at least one marker occurs anywhere in the
#            URL, a false value otherwise
sub isCGI {
    my ($candidate) = @_;
    my $lowered = lc $candidate;

    for my $marker (".cgi", ".exe", ";jsessionid=", "sessionid/", "phpsessid=") {
        return 1 if index($lowered, $marker) >= 0;
    }
    return !1;
}
# Tells whether a URL carries request parameters.
#
# Parameter: the URL to test
# Returns:   a true value if the URL contains a "?" or a "&" character,
#            a false value otherwise
sub isPOST {
    my ($candidate) = @_;
    my $lowered = lc $candidate;

    # tr/// without a replacement list only counts the listed characters
    return ($lowered =~ tr/?&//) > 0;
}
# Tells whether a URL ends with one of the known media file extensions.
#
# The extension is everything that follows the last "." of the lower-cased
# URL; it is looked up as a key of the file level hash %mediaExt.
#
# Parameter: the URL to test
# Returns:   the result of the hash lookup if the URL contains a dot,
#            0 if the URL contains no dot at all
sub isMediaExt {
    my ($candidate) = @_;
    my $lowered = lc $candidate;

    if ($lowered =~ /\.([^.]*)\z/s) {
        my $suffix = $1;
        return exists($mediaExt{$suffix});
    }
    return 0;
}
# Traffic counters; they are reported on STDERR when the script ends.
my ($bytes_out,$bytes_in) = (0,0);
# $msg_in holds the line last read from YaCy, $msg_out the line sent to it.
my ($msg_in,$msg_out);

# Opening a TCP connection to YaCy.
my $protocol = getprotobyname('tcp');
# The packed address gets its own variable: overwriting $host before the
# check would leave the error message without the host name.
my $host_addr = inet_aton($host) or die "$host: unknown host";
socket(SOCK, AF_INET, SOCK_STREAM, $protocol) or die "socket() failed: $!";
my $dest_addr = sockaddr_in($port,$host_addr);
connect(SOCK,$dest_addr) or die "connect() failed: $!";

# enabling autoflush
SOCK->autoflush(1);

# sending the REDIRECTOR command to yacy to enable the proper
# command handler
print SOCK "REDIRECTOR".CRLF;

# Doing authentication: the password is transferred as the hex encoded
# MD5 digest of "user:password"
my $ctx = Digest::MD5->new;
$ctx->add($user.":".$pwd);
my $md5Pwd = $ctx->hexdigest;
print SOCK "USER ".$user.CRLF;
print SOCK "PWD ".$md5Pwd.CRLF;

# Getting a list of file extensions that should be ignored
print SOCK "MEDIAEXT".CRLF;
$msg_in = <SOCK>;
defined($msg_in) or die "no response to MEDIAEXT command";

# The answer is one line holding a comma separated list. The line ending
# has to be removed first, otherwise it would stick to the last extension.
$msg_in = lc $msg_in;
$msg_in =~ s/^\s+|\s+$//g;

# Every extension must become a hash KEY. Assigning the plain list to the
# hash would pair the entries up as key/value and lose every second one.
%mediaExt = map { ($_ => 1) } grep { length } split(/,\s*/, $msg_in);
# Main loop, executed once per line read from STDIN / the files in @ARGV:
# 1) split the request line into its fields
# 2) answer locally if the URL matches one of the skip rules
# 3) otherwise send the whole line to YaCy ...
# 4) ... and copy the answer of YaCy to STDOUT
#
# One squid redirector request line typically looks like this:
# http://www.pageresource.com/styles/tuts.css 192.168.0.5/- - GET

# Skip rules in the order they are tested: [ test function, message prefix ]
my @skipRules = (
    [ \&isCGI,      "URL is cgi: "              ],
    [ \&isPOST,     "URL is post: "             ],
    [ \&isMediaExt, "URL has media extension: " ],
);

REQUEST:
while (defined($msg_out = <>)) {
    chomp $msg_out;

    # splitting the request into its various parts; the URL comes first
    @requestData = split(/\s+/, $msg_out);

    # the first matching rule answers the request without contacting YaCy
    for my $rule (@skipRules) {
        my ($matches, $notice) = @$rule;
        next unless $matches->($requestData[0]);
        print STDOUT $notice.$msg_out.CRLF;
        next REQUEST;
    }

    # sending the whole request line to YaCy
    $msg_out .= CRLF;
    print SOCK $msg_out;

    # reading the response; a missing response ends the script
    $msg_in = <SOCK>;
    unless (defined $msg_in) {
        close SOCK;
        exit(1);
    }
    print STDOUT $msg_in;

    # updating the traffic counters
    $bytes_out += length($msg_out);
    $bytes_in  += length($msg_in);
}

# no more input: telling YaCy to end the session
print SOCK "EXIT".CRLF;
close SOCK;
print STDERR "bytes_sent = $bytes_out, bytes_received = $bytes_in\n";

View File

@ -0,0 +1,186 @@
package de.anomic.urlRedirector;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.PrintWriter;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.Date;
import de.anomic.data.userDB;
import de.anomic.plasma.plasmaCrawlProfile;
import de.anomic.plasma.plasmaParser;
import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.server.serverCore;
import de.anomic.server.serverHandler;
import de.anomic.server.logging.serverLog;
import de.anomic.server.serverCore.Session;
import de.anomic.yacy.yacyCore;
/**
 * Command handler for sessions that start with the <code>REDIRECTOR</code>
 * command.
 *
 * After the initial command the handler reads the session input line by
 * line and understands the following lines:
 * <ul>
 * <li><code>EXIT</code> - ends the session</li>
 * <li>lines starting with <code>#</code> - ignored</li>
 * <li><code>USER &lt;name&gt;</code> - remembers the user name</li>
 * <li><code>PWD &lt;md5&gt;</code> - compares the value with the MD5 encoded
 *     password stored for the remembered user</li>
 * <li><code>MEDIAEXT</code> - answers with the media extension list</li>
 * <li>anything else - treated as request line; the text up to the first
 *     blank is taken as URL and handed over to the crawl stacker</li>
 * </ul>
 */
public class urlRedirectord implements serverHandler {

    /** The session this handler is currently bound to; set by {@link #initSession(Session)}. */
    private serverCore.Session session;

    /** Switchboard shared by all handler instances; looked up by the first constructor call. */
    private static plasmaSwitchboard switchboard = null;

    private serverLog theLogger = new serverLog("URL-REDIRECTOR");

    /**
     * Crawl profile used for every URL that is enqueued by this handler.
     * Stays <code>null</code> if the profile could not be created.
     */
    private static plasmaCrawlProfile.entry profile = null;

    /**
     * Creates a new handler. The shared switchboard reference and the shared
     * crawl profile are initialized if that has not happened yet.
     */
    public urlRedirectord() {
        if (switchboard == null) {
            switchboard = plasmaSwitchboard.getSwitchboard();
        }

        if (profile == null) {
            try {
                profile = switchboard.profiles.newEntry(
                        // name
                        "URL Redirector",
                        // start URL
                        "",
                        // crawling filter
                        ".*",
                        ".*",
                        // depth
                        0,
                        0,
                        // crawlDynamic
                        false,
                        // storeHTCache
                        false,
                        // storeTxCache
                        true,
                        //localIndexing
                        true,
                        // remoteIndexing
                        false,
                        // xsstopw
                        true,
                        // xdstopw
                        true,
                        // xpstopw
                        true
                );
            } catch (IOException e) {
                this.theLogger.logSevere("Unable to create a crawling profile for the URL-Redirector",e);
            }
        }
    }

    /**
     * Binds this handler to a session.
     * @param theSession the session whose streams and socket are used by {@link #REDIRECTOR(String)}
     */
    public void initSession(Session theSession){
        // getting current session
        this.session = theSession;
    }

    /** @return always <code>null</code> */
    public String greeting() {
        return null;
    }

    /** @return always <code>null</code> */
    public String error(Throwable e) {
        return null;
    }

    /** @return always <code>null</code>; no copy of the handler is created */
    public Object clone() {
        return null;
    }

    /** Releases the reference to the current session. */
    public void reset() {
        this.session = null;
    }

    /** @return always <code>null</code> */
    public Boolean EMPTY(String arg) throws IOException {
        return null;
    }

    /** @return always <code>null</code> */
    public Boolean UNKNOWN(String requestLine) throws IOException {
        return null;
    }

    /**
     * Extracts the argument of a command line, e.g. <code>admin</code> from
     * <code>USER admin</code>.
     *
     * @param line the command line
     * @return the trimmed text that follows the first blank, or
     *         <code>null</code> if the line contains no blank
     */
    private static String argumentOf(String line) {
        int pos = line.indexOf(" ");
        // substring(-1) would throw; a command without argument has no value
        return (pos == -1) ? null : line.substring(pos).trim();
    }

    /**
     * Handles a complete redirector session: reads lines from the session
     * input until <code>EXIT</code> is received or the stream ends.
     *
     * Every request line is answered with an empty line BEFORE the URL is
     * handed over to the crawl stacker; a rejection is only logged.
     *
     * @param requestLine the line that selected this command; not evaluated
     * @return {@link Boolean#FALSE} if a request line arrives before a
     *         successful authentication, otherwise
     *         {@link serverCore#TERMINATE_CONNECTION} (also after an
     *         unexpected error, which is logged)
     * @throws IOException declared by the signature; the body itself catches every exception
     */
    public Boolean REDIRECTOR(String requestLine) throws IOException {
        try {
            boolean authenticated = false;
            String userName = null;

            // setting timeout
            // NOTE(review): presumably controlSocket is a java.net.Socket,
            // where 0 stands for "no read timeout" - confirm
            this.session.controlSocket.setSoTimeout(0);

            String line = null;
            BufferedReader inputReader = new BufferedReader(new InputStreamReader(this.session.in));
            PrintWriter outputWriter = new PrintWriter(this.session.out);

            while ((line = inputReader.readLine()) != null) {
                if (line.equals("EXIT")) {
                    break;
                } else if (line.startsWith("#")) {
                    continue;
                } else if (line.startsWith("USER")) {
                    // a USER command without argument resets the user name
                    userName = argumentOf(line);
                } else if (line.startsWith("PWD")) {
                    String md5Pwd = argumentOf(line);
                    if ((userName != null) && (md5Pwd != null)) {
                        userDB.Entry userEntry = switchboard.userDB.getEntry(userName);
                        // comparing from the received value, which is known to be non-null
                        if ((userEntry != null) && md5Pwd.equals(userEntry.getMD5EncodedUserPwd())) {
                            authenticated = true;
                        }
                    }
                } else if (line.startsWith("MEDIAEXT")) {
                    String transferIgnoreList = plasmaParser.getMediaExtList();
                    // dropping the first and the last character of the list string
                    // NOTE(review): presumably these are enclosing brackets - confirm
                    transferIgnoreList = transferIgnoreList.substring(1,transferIgnoreList.length()-1);

                    outputWriter.print(transferIgnoreList);
                    outputWriter.print("\r\n");
                    outputWriter.flush();
                } else {
                    if (!authenticated) {
                        return Boolean.FALSE;
                    }

                    // the URL is the text up to the first blank of the request line
                    int pos = line.indexOf(" ");
                    String nextURL = (pos != -1) ? line.substring(0,pos):line;

                    this.theLogger.logFine("Receiving request " + line);

                    // answering with an empty line
                    outputWriter.print("\r\n");
                    outputWriter.flush();

                    String reasonString = null;
                    try {
                        if (plasmaParser.supportedFileExt(new URL(nextURL))) {
                            // enqueuing URL for crawling
                            reasonString = switchboard.sbStackCrawlThread.stackCrawl(
                                    nextURL,
                                    null,
                                    yacyCore.seedDB.mySeed.hash,
                                    "URL Redirector",
                                    new Date(),
                                    0,
                                    profile
                            );
                        } else {
                            reasonString = "Unsupported file extension";
                        }
                    } catch (MalformedURLException badUrlEx) {
                        reasonString = "Malformed URL";
                    }

                    // a non-null reason means that the URL was not enqueued
                    if (reasonString != null) {
                        this.theLogger.logFine("URL " + nextURL + " rejected. Reason: " + reasonString);
                    }
                }
            }

            this.theLogger.logFine("Connection terminated");

            // Terminating connection
            return serverCore.TERMINATE_CONNECTION;
        } catch (Exception e) {
            this.theLogger.logSevere("Unexpected Error: " + e.getMessage(),e);
            return serverCore.TERMINATE_CONNECTION;
        }
    }
}