mirror of
https://github.com/yacy/yacy_search_server.git
synced 2024-09-19 00:01:41 +02:00
added a check in zim importer which tests if import URLs actually exist
This commit is contained in:
parent
496f768c44
commit
70e29937ef
|
@ -37,6 +37,7 @@ import java.io.UnsupportedEncodingException;
|
|||
import java.net.InetAddress;
|
||||
import java.net.MalformedURLException;
|
||||
import java.net.URLDecoder;
|
||||
import java.net.http.HttpResponse;
|
||||
import java.nio.ByteBuffer;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
import java.util.BitSet;
|
||||
|
@ -2578,6 +2579,32 @@ public class MultiProtocolURL implements Serializable, Comparable<MultiProtocolU
|
|||
return null;
|
||||
}
|
||||
|
||||
public boolean exists(final ClientIdentification.Agent agent) {
|
||||
try {
|
||||
if (isFile()) {
|
||||
return getFSFile().exists();
|
||||
}
|
||||
if (isSMB()) {
|
||||
return getSmbFile().exists();
|
||||
}
|
||||
if (isFTP()) {
|
||||
final FTPClient client = new FTPClient();
|
||||
client.open(this.host, this.port < 0 ? 21 : this.port);
|
||||
return client.fileSize(path) > 0;
|
||||
}
|
||||
if (isHTTP() || isHTTPS()) {
|
||||
try (final HTTPClient client = new HTTPClient(agent)) {
|
||||
client.setHost(getHost());
|
||||
org.apache.http.HttpResponse response = client.HEADResponse(this, true);
|
||||
return response != null && (response.getStatusLine().getStatusCode() == 200 || response.getStatusLine().getStatusCode() == 301);
|
||||
}
|
||||
}
|
||||
return false;
|
||||
} catch (IOException e) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Read fully the source, close it and return its content as a bytes array.
|
||||
* @param source the source to read
|
||||
|
|
|
@ -30,6 +30,7 @@ import java.util.Map;
|
|||
import java.util.TreeMap;
|
||||
|
||||
import net.yacy.cora.document.id.DigestURL;
|
||||
import net.yacy.cora.protocol.ClientIdentification;
|
||||
import net.yacy.cora.protocol.RequestHeader;
|
||||
import net.yacy.cora.protocol.ResponseHeader;
|
||||
import net.yacy.cora.util.ConcurrentLog;
|
||||
|
@ -84,6 +85,12 @@ public class ZimImporter extends Thread implements Importer {
|
|||
this.reader = new ZIMReader(this.file);
|
||||
this.guessedSource = getSource(this.reader);
|
||||
|
||||
// verify the source
|
||||
DirectoryEntry mainEntry = this.reader.getMainDirectoryEntry();
|
||||
DigestURL url = new DigestURL(mainEntry.url);
|
||||
if (!url.exists(ClientIdentification.browserAgent)) return;
|
||||
|
||||
// read all documents
|
||||
for (int i = 0; i < this.file.header_entryCount; i++) {
|
||||
if (this.abort) break;
|
||||
DirectoryEntry de = this.reader.getDirectoryInfo(i);
|
||||
|
@ -304,7 +311,9 @@ public class ZimImporter extends Thread implements Importer {
|
|||
System.out.println("guessed domain: " + guessDomainName(f.getName()));
|
||||
String source = getSource(r);
|
||||
System.out.println("guessed Source: " + source);
|
||||
System.out.println("guessed main article: " + guessURL(source, de));
|
||||
String mainURL = guessURL(source, de);
|
||||
System.out.println("guessed main article: " + mainURL);
|
||||
System.out.println("main article exists: " + new DigestURL(mainURL).exists(ClientIdentification.browserAgent));
|
||||
System.out.println();
|
||||
} catch (IOException e) {
|
||||
e.printStackTrace();
|
||||
|
|
Loading…
Reference in New Issue
Block a user