mirror of
https://github.com/yacy/yacy_search_server.git
synced 2024-09-19 00:01:41 +02:00
fixes to SMB crawler
git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@6900 6c8d7289-2bf4-0310-a012-ef5d649a1542
This commit is contained in:
parent
0eafd94b22
commit
6950d8a33d
|
@ -29,10 +29,16 @@ package de.anomic.crawler.retrieval;
|
|||
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.net.MalformedURLException;
|
||||
import java.net.UnknownHostException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Date;
|
||||
import java.util.List;
|
||||
|
||||
import jcifs.smb.SmbException;
|
||||
import jcifs.smb.SmbFile;
|
||||
import jcifs.smb.SmbFileInputStream;
|
||||
|
||||
import de.anomic.http.server.HeaderFramework;
|
||||
import de.anomic.http.server.RequestHeader;
|
||||
import de.anomic.http.server.ResponseHeader;
|
||||
|
@ -72,8 +78,6 @@ public class SMBLoader {
|
|||
|
||||
// process directories: transform them to html with meta robots=noindex (using the ftpc lib)
|
||||
if (url.isDirectory()) {
|
||||
List<String> list = new ArrayList<String>();
|
||||
String u = url.toNormalform(true, true);
|
||||
String[] l = url.list();
|
||||
if (l == null) {
|
||||
// this can only happen if there is no connection or the directory does not exist
|
||||
|
@ -81,7 +85,16 @@ public class SMBLoader {
|
|||
sb.crawlQueues.errorURL.push(request, this.sb.peers.mySeed().hash.getBytes(), new Date(), 1, "directory listing not available. URL = " + request.url().toString());
|
||||
throw new IOException("directory listing not available. URL = " + request.url().toString());
|
||||
}
|
||||
for (String s: l) list.add(u + s);
|
||||
String u = url.toNormalform(true, true);
|
||||
List<String> list = new ArrayList<String>();
|
||||
for (String s: l) {
|
||||
if (!s.endsWith("/") && !s.endsWith("\\")) {
|
||||
// check if this is a directory
|
||||
SmbFile sf = new SmbFile(u + s);
|
||||
if (sf.isDirectory()) s = s + "/";
|
||||
}
|
||||
list.add(u + s);
|
||||
}
|
||||
|
||||
StringBuilder content = ftpc.dirhtml(u, null, null, null, list, true);
|
||||
|
||||
|
@ -147,5 +160,32 @@ public class SMBLoader {
|
|||
b);
|
||||
return response;
|
||||
}
|
||||
|
||||
|
||||
public static void main(String[] args) {
|
||||
//jcifs.Config.setProperty( "jcifs.netbios.wins", "192.168.1.220" );
|
||||
//NtlmPasswordAuthentication auth = new NtlmPasswordAuthentication("domain", "username", "password");
|
||||
SmbFileInputStream in;
|
||||
try {
|
||||
SmbFile sf = new SmbFile(args[0]);
|
||||
if (sf.isDirectory()) {
|
||||
String[] s = sf.list();
|
||||
for (String t: s) System.out.println(t);
|
||||
} else {
|
||||
in = new SmbFileInputStream(sf);
|
||||
byte[] b = new byte[8192];
|
||||
int n;
|
||||
while(( n = in.read( b )) > 0 ) {
|
||||
System.out.write( b, 0, n );
|
||||
}
|
||||
}
|
||||
} catch (SmbException e) {
|
||||
e.printStackTrace();
|
||||
} catch (MalformedURLException e) {
|
||||
e.printStackTrace();
|
||||
} catch (UnknownHostException e) {
|
||||
e.printStackTrace();
|
||||
} catch (IOException e) {
|
||||
e.printStackTrace();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -2665,7 +2665,7 @@ public class ftpc {
|
|||
entryInfo info;
|
||||
for (final String line : list) {
|
||||
info = parseListData(line);
|
||||
if(info != null) {
|
||||
if (info != null) {
|
||||
// with link
|
||||
nameStart = line.indexOf(info.name);
|
||||
page.append(line.substring(0, nameStart));
|
||||
|
@ -2674,9 +2674,11 @@ public class ftpc {
|
|||
if (line.length() > nameEnd) {
|
||||
page.append(line.substring(nameEnd));
|
||||
}
|
||||
} else if (line.startsWith("http://") || line.startsWith("ftp://") || line.startsWith("smb://")) {
|
||||
page.append("<a href=\"" + line + "\">" + line + "</a>");
|
||||
} else {
|
||||
// raw
|
||||
page.append(line);
|
||||
// raw
|
||||
page.append(line);
|
||||
}
|
||||
page.append('\n');
|
||||
}
|
||||
|
|
|
@ -1749,7 +1749,7 @@ public final class Switchboard extends serverSwitch {
|
|||
// process the next hyperlink
|
||||
nextUrl = nextEntry.getKey();
|
||||
String u = nextUrl.toNormalform(true, true, true);
|
||||
if (!(u.startsWith("http") || u.startsWith("ftp"))) continue;
|
||||
if (!(u.startsWith("http") || u.startsWith("ftp") || u.startsWith("smb"))) continue;
|
||||
// enqueue the hyperlink into the pre-notice-url db
|
||||
try {
|
||||
crawlStacker.enqueueEntry(new Request(
|
||||
|
@ -1829,13 +1829,13 @@ public final class Switchboard extends serverSwitch {
|
|||
|
||||
if (condenser == null || document.indexingDenied()) {
|
||||
if (this.log.isInfo()) log.logInfo("Not Indexed Resource '" + queueEntry.url().toNormalform(false, true) + "': denied by rule in document, process case=" + processCase);
|
||||
addURLtoErrorDB(queueEntry.url(), (referrerURL == null) ? null : referrerURL.hash(), queueEntry.initiator(), dc_title, "unknown indexing process case" + processCase);
|
||||
addURLtoErrorDB(queueEntry.url(), (referrerURL == null) ? null : referrerURL.hash(), queueEntry.initiator(), dc_title, "denied by rule in document");
|
||||
return;
|
||||
}
|
||||
|
||||
if (!queueEntry.profile().indexText() && !queueEntry.profile().indexMedia()) {
|
||||
if (this.log.isInfo()) log.logInfo("Not Indexed Resource '" + queueEntry.url().toNormalform(false, true) + "': denied by profile rule, process case=" + processCase);
|
||||
addURLtoErrorDB(queueEntry.url(), (referrerURL == null) ? null : referrerURL.hash(), queueEntry.initiator(), dc_title, "unknown indexing process case" + processCase);
|
||||
addURLtoErrorDB(queueEntry.url(), (referrerURL == null) ? null : referrerURL.hash(), queueEntry.initiator(), dc_title, "denied by profile rule");
|
||||
return;
|
||||
}
|
||||
|
||||
|
|
|
@ -1037,7 +1037,8 @@ public class DigestURI implements Serializable {
|
|||
*/
|
||||
public SmbFile getSmbFile() throws MalformedURLException {
|
||||
if (!isSMB()) throw new UnsupportedOperationException();
|
||||
return new SmbFile(this.toNormalform(false, true));
|
||||
String url = this.toNormalform(false, true);
|
||||
return new SmbFile(url);
|
||||
}
|
||||
|
||||
// some methods that let the DigestURI look like a java.io.File object
|
||||
|
@ -1162,10 +1163,13 @@ public class DigestURI implements Serializable {
|
|||
public String[] list() {
|
||||
if (isFile()) return getFSFile().list();
|
||||
if (isSMB()) try {
|
||||
return getSmbFile().list();
|
||||
} catch (SmbException e) {
|
||||
Log.logWarning("DigestURI", "SMB.list SmbException for " + this.toString() + ": " + e.getMessage());
|
||||
return null;
|
||||
SmbFile sf = getSmbFile();
|
||||
try {
|
||||
return sf.list();
|
||||
} catch (SmbException e) {
|
||||
Log.logWarning("DigestURI", "SMB.list SmbException for " + sf.toString() + ": " + e.getMessage());
|
||||
return null;
|
||||
}
|
||||
} catch (MalformedURLException e) {
|
||||
Log.logWarning("DigestURI", "SMB.list MalformedURLException for " + this.toString() + ": " + e.getMessage());
|
||||
return null;
|
||||
|
|
Loading…
Reference in New Issue
Block a user