fixes to SMB crawler

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@6900 6c8d7289-2bf4-0310-a012-ef5d649a1542
This commit is contained in:
orbiter 2010-05-23 01:17:44 +00:00
parent 0eafd94b22
commit 6950d8a33d
4 changed files with 61 additions and 15 deletions

View File

@ -29,10 +29,16 @@ package de.anomic.crawler.retrieval;
import java.io.IOException;
import java.io.InputStream;
import java.net.MalformedURLException;
import java.net.UnknownHostException;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;
import jcifs.smb.SmbException;
import jcifs.smb.SmbFile;
import jcifs.smb.SmbFileInputStream;
import de.anomic.http.server.HeaderFramework;
import de.anomic.http.server.RequestHeader;
import de.anomic.http.server.ResponseHeader;
@ -72,8 +78,6 @@ public class SMBLoader {
// process directories: transform them to html with meta robots=noindex (using the ftpc lib)
if (url.isDirectory()) {
List<String> list = new ArrayList<String>();
String u = url.toNormalform(true, true);
String[] l = url.list();
if (l == null) {
// this can only happen if there is no connection or the directory does not exist
@ -81,7 +85,16 @@ public class SMBLoader {
sb.crawlQueues.errorURL.push(request, this.sb.peers.mySeed().hash.getBytes(), new Date(), 1, "directory listing not available. URL = " + request.url().toString());
throw new IOException("directory listing not available. URL = " + request.url().toString());
}
for (String s: l) list.add(u + s);
String u = url.toNormalform(true, true);
List<String> list = new ArrayList<String>();
for (String s: l) {
if (!s.endsWith("/") && !s.endsWith("\\")) {
// check if this is a directory
SmbFile sf = new SmbFile(u + s);
if (sf.isDirectory()) s = s + "/";
}
list.add(u + s);
}
StringBuilder content = ftpc.dirhtml(u, null, null, null, list, true);
@ -147,5 +160,32 @@ public class SMBLoader {
b);
return response;
}
/**
 * Command-line smoke test for SMB access.
 * If the given URL denotes a directory, the names returned by the listing
 * are printed one per line; otherwise the content of the file is copied
 * to standard output.
 *
 * @param args args[0] must be the SMB URL to read
 */
public static void main(String[] args) {
    //jcifs.Config.setProperty( "jcifs.netbios.wins", "192.168.1.220" );
    //NtlmPasswordAuthentication auth = new NtlmPasswordAuthentication("domain", "username", "password");
    // fail with a readable message instead of an ArrayIndexOutOfBoundsException
    if (args == null || args.length == 0) {
        System.err.println("usage: SMBLoader <smb-url>");
        return;
    }
    SmbFileInputStream in = null;
    try {
        SmbFile sf = new SmbFile(args[0]);
        if (sf.isDirectory()) {
            String[] s = sf.list();
            for (String t: s) System.out.println(t);
        } else {
            in = new SmbFileInputStream(sf);
            byte[] b = new byte[8192];
            int n;
            while ((n = in.read(b)) > 0) {
                System.out.write(b, 0, n);
            }
            // make sure the raw bytes are not left in the PrintStream buffer
            System.out.flush();
        }
    } catch (SmbException e) {
        e.printStackTrace();
    } catch (MalformedURLException e) {
        e.printStackTrace();
    } catch (UnknownHostException e) {
        e.printStackTrace();
    } catch (IOException e) {
        e.printStackTrace();
    } finally {
        // release the SMB stream even if reading or writing failed
        if (in != null) {
            try {
                in.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }
}
}

View File

@ -2665,7 +2665,7 @@ public class ftpc {
entryInfo info;
for (final String line : list) {
info = parseListData(line);
if(info != null) {
if (info != null) {
// with link
nameStart = line.indexOf(info.name);
page.append(line.substring(0, nameStart));
@ -2674,9 +2674,11 @@ public class ftpc {
if (line.length() > nameEnd) {
page.append(line.substring(nameEnd));
}
} else if (line.startsWith("http://") || line.startsWith("ftp://") || line.startsWith("smb://")) {
page.append("<a href=\"" + line + "\">" + line + "</a>");
} else {
// raw
page.append(line);
// raw
page.append(line);
}
page.append('\n');
}

View File

@ -1749,7 +1749,7 @@ public final class Switchboard extends serverSwitch {
// process the next hyperlink
nextUrl = nextEntry.getKey();
String u = nextUrl.toNormalform(true, true, true);
if (!(u.startsWith("http") || u.startsWith("ftp"))) continue;
if (!(u.startsWith("http") || u.startsWith("ftp") || u.startsWith("smb"))) continue;
// enqueue the hyperlink into the pre-notice-url db
try {
crawlStacker.enqueueEntry(new Request(
@ -1829,13 +1829,13 @@ public final class Switchboard extends serverSwitch {
if (condenser == null || document.indexingDenied()) {
if (this.log.isInfo()) log.logInfo("Not Indexed Resource '" + queueEntry.url().toNormalform(false, true) + "': denied by rule in document, process case=" + processCase);
addURLtoErrorDB(queueEntry.url(), (referrerURL == null) ? null : referrerURL.hash(), queueEntry.initiator(), dc_title, "unknown indexing process case" + processCase);
addURLtoErrorDB(queueEntry.url(), (referrerURL == null) ? null : referrerURL.hash(), queueEntry.initiator(), dc_title, "denied by rule in document");
return;
}
if (!queueEntry.profile().indexText() && !queueEntry.profile().indexMedia()) {
if (this.log.isInfo()) log.logInfo("Not Indexed Resource '" + queueEntry.url().toNormalform(false, true) + "': denied by profile rule, process case=" + processCase);
addURLtoErrorDB(queueEntry.url(), (referrerURL == null) ? null : referrerURL.hash(), queueEntry.initiator(), dc_title, "unknown indexing process case" + processCase);
addURLtoErrorDB(queueEntry.url(), (referrerURL == null) ? null : referrerURL.hash(), queueEntry.initiator(), dc_title, "denied by profile rule");
return;
}

View File

@ -1037,7 +1037,8 @@ public class DigestURI implements Serializable {
*/
public SmbFile getSmbFile() throws MalformedURLException {
// only SMB URLs can be turned into an SmbFile; every other protocol is rejected
if (!isSMB()) throw new UnsupportedOperationException();
// NOTE(review): this listing is a diff view without +/- markers; the next line is presumably
// the pre-change statement that the two lines after it replace - confirm against the real file
return new SmbFile(this.toNormalform(false, true));
// the normal form of this URL is the string handed to the SmbFile constructor
String url = this.toNormalform(false, true);
return new SmbFile(url);
}
// some methods that let the DigestURI look like a java.io.File object
@ -1162,10 +1163,13 @@ public class DigestURI implements Serializable {
public String[] list() {
if (isFile()) return getFSFile().list();
if (isSMB()) try {
return getSmbFile().list();
} catch (SmbException e) {
Log.logWarning("DigestURI", "SMB.list SmbException for " + this.toString() + ": " + e.getMessage());
return null;
SmbFile sf = getSmbFile();
try {
return sf.list();
} catch (SmbException e) {
Log.logWarning("DigestURI", "SMB.list SmbException for " + sf.toString() + ": " + e.getMessage());
return null;
}
} catch (MalformedURLException e) {
Log.logWarning("DigestURI", "SMB.list MalformedURLException for " + this.toString() + ": " + e.getMessage());
return null;