Added a check to the ZIM importer that tests whether import URLs actually exist

This commit is contained in:
Michael Peter Christen 2023-11-04 19:07:50 +01:00
parent 496f768c44
commit 70e29937ef
2 changed files with 37 additions and 1 deletions

View File

@ -37,6 +37,7 @@ import java.io.UnsupportedEncodingException;
import java.net.InetAddress;
import java.net.MalformedURLException;
import java.net.URLDecoder;
import java.net.http.HttpResponse;
import java.nio.ByteBuffer;
import java.nio.charset.StandardCharsets;
import java.util.BitSet;
@ -2578,6 +2579,32 @@ public class MultiProtocolURL implements Serializable, Comparable<MultiProtocolU
return null;
}
/**
 * Check whether the resource addressed by this URL exists.
 * <ul>
 * <li>file URLs: result of {@code getFSFile().exists()}</li>
 * <li>SMB URLs: result of {@code getSmbFile().exists()}</li>
 * <li>FTP URLs: true if the server reports a file size greater than zero for the path</li>
 * <li>HTTP/HTTPS URLs: true if a HEAD request is answered with status 200 or 301</li>
 * </ul>
 * Any other protocol, and any {@link IOException} raised while checking, yields {@code false}.
 *
 * @param agent the client identification used for the HTTP(S) HEAD request
 * @return true if the resource was found according to the rules above, false otherwise
 */
public boolean exists(final ClientIdentification.Agent agent) {
    try {
        if (isFile()) {
            return getFSFile().exists();
        }
        if (isSMB()) {
            return getSmbFile().exists();
        }
        if (isFTP()) {
            final FTPClient client = new FTPClient();
            client.open(this.host, this.port < 0 ? 21 : this.port);
            try {
                // a reported size of zero or less is treated as "does not exist"
                return client.fileSize(path) > 0;
            } finally {
                // always release the FTP connection, also when fileSize fails
                client.CLOSE();
            }
        }
        if (isHTTP() || isHTTPS()) {
            try (final HTTPClient client = new HTTPClient(agent)) {
                client.setHost(getHost());
                final org.apache.http.HttpResponse response = client.HEADResponse(this, true);
                if (response == null) {
                    return false;
                }
                // read the status once; only 200 and 301 count as "exists"
                final int status = response.getStatusLine().getStatusCode();
                return status == 200 || status == 301;
            }
        }
        return false;
    } catch (final IOException e) {
        // existence could not be verified; report the resource as missing
        return false;
    }
}
/**
* Read fully the source, close it and return its content as a bytes array.
* @param source the source to read

View File

@ -30,6 +30,7 @@ import java.util.Map;
import java.util.TreeMap;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.protocol.ClientIdentification;
import net.yacy.cora.protocol.RequestHeader;
import net.yacy.cora.protocol.ResponseHeader;
import net.yacy.cora.util.ConcurrentLog;
@ -84,6 +85,12 @@ public class ZimImporter extends Thread implements Importer {
this.reader = new ZIMReader(this.file);
this.guessedSource = getSource(this.reader);
// verify the source
DirectoryEntry mainEntry = this.reader.getMainDirectoryEntry();
DigestURL url = new DigestURL(mainEntry.url);
if (!url.exists(ClientIdentification.browserAgent)) return;
// read all documents
for (int i = 0; i < this.file.header_entryCount; i++) {
if (this.abort) break;
DirectoryEntry de = this.reader.getDirectoryInfo(i);
@ -304,7 +311,9 @@ public class ZimImporter extends Thread implements Importer {
System.out.println("guessed domain: " + guessDomainName(f.getName()));
String source = getSource(r);
System.out.println("guessed Source: " + source);
System.out.println("guessed main article: " + guessURL(source, de));
String mainURL = guessURL(source, de);
System.out.println("guessed main article: " + mainURL);
System.out.println("main article exists: " + new DigestURL(mainURL).exists(ClientIdentification.browserAgent));
System.out.println();
} catch (IOException e) {
e.printStackTrace();