Added a ZIM importer that can be used for surrogate imports.

It cannot be used yet because it requires some security additions
to verify that the given URLs actually work.
This commit is contained in:
Michael Peter Christen 2023-11-01 18:48:40 +01:00
parent b9912ff50d
commit 1c0df28bfb
3 changed files with 339 additions and 7 deletions

View File

@ -0,0 +1,306 @@
/**
* ZimImporter.java
* (C) 2023 by Michael Peter Christen @orbiter
*
* This is a part of YaCy, a peer-to-peer based web search engine
*
* LICENSE
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program.
* If not, see <http://www.gnu.org/licenses/>.
*/
package net.yacy.document.importer;
import java.io.File;
import java.io.IOException;
import java.util.Collection;
import java.util.Map;
import java.util.TreeMap;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.protocol.RequestHeader;
import net.yacy.cora.protocol.ResponseHeader;
import net.yacy.cora.util.ConcurrentLog;
import net.yacy.crawler.retrieval.Request;
import net.yacy.crawler.retrieval.Response;
import net.yacy.document.TextParser;
import net.yacy.search.Switchboard;
import org.openzim.ZIMFile;
import org.openzim.ZIMReader;
import org.openzim.ZIMReader.ArticleEntry;
import org.openzim.ZIMReader.DirectoryEntry;
/**
* ZIM importer
* Can import ZIM files, e.g. from https://download.kiwix.org/zim/ or mirrors like https://ftp.fau.de/kiwix/zim/
* These files contain identifiers named "URL" which are not actually full URLs but just paths inside well-known domains.
* These domains are sometimes given by a "Source" metadata field, but that is rare - we have to guess them.
* For that we have a guessing function, but we must check if the guess was correct by testing some of the given
* URLs against the actual internet-hosted document. Only if that check succeeds should we import the files.
* In all other cases the import should work as well, but should only be done in a non-p2p environment to prevent
* such links from being shared.
*/
public class ZimImporter extends Thread implements Importer {
static public ZimImporter job;
private ZIMFile file;
private ZIMReader reader;
private String path;
private String guessedSource;
private int recordCnt;
private long startTime;
private final long sourceSize;
private long consumed;
private boolean abort = false;
public ZimImporter(String path) throws IOException {
super("ZimImporter - from file " + path);
this.path = path;
this.file = new ZIMFile(this.path); // this will read already some of the metadata and could consume some time
this.sourceSize = this.file.length();
}
/**
 * Walks over all directory entries of the ZIM file and hands every article with a
 * parseable mime type to the indexer as an artificial (surrogate) response.
 * <p>
 * The static {@code job} reference points to this importer while it is running and is
 * always cleared when the method ends, even if an unexpected runtime exception occurs.
 * The loop stops early once {@link #quit()} has been called.
 */
@Override
public void run() {
    job = this;
    this.startTime = System.currentTimeMillis();
    try {
        this.reader = new ZIMReader(this.file);
        this.guessedSource = getSource(this.reader);

        for (int i = 0; i < this.file.header_entryCount; i++) {
            if (this.abort) break;

            DirectoryEntry de = this.reader.getDirectoryInfo(i);
            if (!(de instanceof ZIMReader.ArticleEntry)) continue; // only articles carry content
            ArticleEntry ae = (ArticleEntry) de;

            // check url
            String guessedUrl = guessURL(this.guessedSource, de);
            assert guessedUrl.startsWith("http");

            // check availability of text parser
            // NOTE(review): supportsMime presumably returns null when the mime type is supported
            // and an error text otherwise - confirm against TextParser
            String mimeType = this.file.getMimeType(ae.mimetype);
            if (TextParser.supportsMime(mimeType) != null) continue;

            // read the content; the reader may deliver null if the article data cannot be found,
            // such entries cannot be indexed and are skipped
            byte[] b = this.reader.getArticleData(ae);
            if (b == null) continue;

            // create artificial request and response headers for the indexer
            RequestHeader requestHeader = new RequestHeader();
            ResponseHeader responseHeader = new ResponseHeader(200);
            final Request request = new Request(new DigestURL(guessedUrl), null);
            final Response response = new Response(
                    request,
                    requestHeader,
                    responseHeader,
                    Switchboard.getSwitchboard().crawler.defaultSurrogateProfile,
                    false,
                    b
            );

            // throw this to the indexer
            String error = Switchboard.getSwitchboard().toIndexer(response);
            if (error != null) ConcurrentLog.info("ZimImporter", "error parsing: " + error);
            this.recordCnt++;
        }
    } catch (IOException e) {
        ConcurrentLog.info("ZimImporter", "error reading: " + e.getMessage());
    } finally {
        // always report and release the job slot, also after unexpected runtime exceptions
        ConcurrentLog.info("ZimImporter", "Indexed " + this.recordCnt + " documents");
        job = null;
    }
}
/**
 * Requests the running import to stop. The import loop checks the flag before it
 * processes the next directory entry.
 */
public void quit() {
    abort = true;
}
/**
 * @return the path of the ZIM file as it was given to the constructor
 */
@Override
public String source() {
    return path;
}
/**
 * @return the number of articles that have been handed over to the indexer so far
 */
@Override
public int count() {
    return recordCnt;
}
/**
 * Computes the import speed.
 *
 * @return the number of imported records per second of running time, or 0 if nothing was imported yet
 */
@Override
public int speed() {
    if (this.recordCnt == 0) return 0;
    // runningTime() is 0 during the first second; clamp to 1 to avoid a division by zero
    return (int) (this.recordCnt / Math.max(1L, runningTime()));
}
/**
 * @return the whole seconds elapsed since the recorded start time
 */
@Override
public long runningTime() {
    final long elapsedMillis = System.currentTimeMillis() - this.startTime;
    return elapsedMillis / 1000L;
}
/**
 * Estimates the remaining import time from the consumed amount and the elapsed time.
 *
 * @return the estimated remaining time in seconds, or 0 if no estimate is possible yet
 *         (nothing consumed, less than one second of running time, or a throughput below
 *         one unit per second); never negative
 */
@Override
public long remainingTime() {
    if (this.consumed <= 0) {
        return 0;
    }
    final long elapsedSeconds = runningTime();
    if (elapsedSeconds <= 0) {
        return 0; // would divide by zero during the first second
    }
    final long consumedPerSecond = this.consumed / elapsedSeconds;
    if (consumedPerSecond <= 0) {
        return 0; // integer division rounded the throughput down to zero
    }
    return Math.max(0L, (this.sourceSize - this.consumed) / consumedPerSecond);
}
/**
 * @return always the empty string; this importer does not produce a status text
 */
@Override
public String status() {
    final String noStatus = "";
    return noStatus;
}
/**
 * Guesses the internet host (and possibly a path prefix) that a ZIM file was scraped from,
 * using only the file name. ZIM file names are made of parts separated by '_' where the
 * first part names the project, e.g. "wikipedia_en_all_maxi_2023-10.zim".
 *
 * @param fileName the ZIM file name, with or without the ".zim" extension; may be null
 * @return the guessed host name (for wikipedia the host followed by "/wiki"),
 *         or null if the name is null/empty or no guess is possible
 */
public static String guessDomainName(String fileName) {
    if (fileName == null || fileName.isEmpty()) {
        return null; // Handle null or empty input
    }

    // the file extension is not part of the project name and must not leak into the guess
    if (fileName.endsWith(".zim")) {
        fileName = fileName.substring(0, fileName.length() - ".zim".length());
    }

    // project names which contain the part separator '_' themselves can never match
    // after splitting, so they must be recognized on the whole name
    if (fileName.startsWith("practical_action")) {
        return "practicalaction.org";
    }
    if (fileName.startsWith("rapsberry_pi_docs") || fileName.startsWith("raspberry_pi_docs")) {
        return "raspberrypi.org";
    }

    // split drops trailing empty strings: a name consisting only of '_' yields no parts
    String[] parts = fileName.split("_");
    if (parts.length == 0) {
        return null;
    }
    String firstPart = parts[0];

    // Handling special cases where the domain name might not be obvious
    switch (firstPart) {
        case "100r-off-the-grid":
            return "100resilientcities.org";
        case "armypubs":
            return "armypubs.army.mil";
        case "artofproblemsolving":
            return "artofproblemsolving.com";
        case "based":
            return "based.cooking";
        case "booksdash":
            return "booksdash.com";
        case "coopmaths":
            return "coopmaths.fr";
        case "fas-military-medicine":
            return "fas.org";
        case "fonts":
            return "fonts.google.com";
        case "gutenberg":
            return "gutenberg.org";
        case "ifixit":
            return "ifixit.com";
        case "lesfondamentaux":
            return "reseau-canope.fr";
        case "lowtechmagazine":
            return "lowtechmagazine.com";
        case "mutopiaproject":
            return "mutopiaproject.org";
        case "openstreetmap-wiki":
            return "wiki.openstreetmap.org";
        case "opentextbooks":
            return "opentextbooks.org";
        case "phet":
            return "phet.colorado.edu";
        case "ted":
            return "ted.com";
        case "vikidia":
            return "vikidia.org";
        case "westeros":
            return "westeros.org";
        case "wikipedia":
            // the second part is the language, which selects the wikipedia host
            if (parts.length < 2 || parts[1].isEmpty()) {
                return null;
            }
            return parts[1] + ".wikipedia.org/wiki";
        case "www.ready.gov":
            return "ready.gov";
        default:
            break;
    }

    // Handling domain patterns: the first part already looks like a host name
    if (firstPart.contains(".stackexchange.com")) {
        return firstPart;
    }
    final String[] knownSuffixes = {".com", ".org", ".de", ".fr", ".pt", ".it", ".ja", ".es", ".eo"};
    for (String suffix : knownSuffixes) {
        if (firstPart.endsWith(suffix)) {
            return firstPart;
        }
    }
    if (firstPart.contains("-")) {
        return firstPart.substring(0, firstPart.indexOf("-"));
    }

    // Additional general logic: any name with a dot that is neither the first
    // nor the last character is taken as a host name
    int lastDotIndex = firstPart.lastIndexOf('.');
    if (lastDotIndex > 0 && lastDotIndex < firstPart.length() - 1) {
        return firstPart;
    }

    // Default return if none of the above conditions meet
    return null;
}
/**
 * Determines the base URL under which the content of a ZIM file is (supposedly) hosted.
 * The "Source" metadata entry is used if present; otherwise the host is guessed from the
 * file name and turned into "https://&lt;guess&gt;/".
 *
 * @param r reader of the ZIM file
 * @return the value of the "Source" metadata entry, or the base URL built from the guessed domain
 * @throws IOException if reading the metadata fails or if no domain can be guessed from the
 *         file name (instead of producing the bogus base URL "https://null/")
 */
public static String getSource(ZIMReader r) throws IOException {
    String source = r.getMetadata("Source");
    if (source != null) return source;

    final String fileName = r.getZIMFile().getName();
    final String domain = guessDomainName(fileName);
    if (domain == null) {
        throw new IOException("cannot guess the source domain for ZIM file " + fileName);
    }
    return "https://" + domain + "/";
}
/**
 * Builds the presumed original URL of a directory entry by appending the entry's
 * "URL" (actually a path) to the guessed source. The entry named "Main_Page" maps
 * to the source itself.
 *
 * @param guessedSource base URL as delivered by getSource
 * @param de directory entry whose url field is appended
 * @return the concatenation of guessedSource and the entry path
 */
public static String guessURL(String guessedSource, DirectoryEntry de) {
    final String entryPath = de.url.equals("Main_Page") ? "" : de.url;
    return guessedSource + entryPath;
}
/**
 * ZIM file import test: prints metadata, the main directory entry and the guessed
 * source/URL for every ".zim" file in a directory, smallest file first.
 * It mostly tests whether domain names can be derived for the ZIM file URLs.
 *
 * @param args args[0] is the path of a directory containing ZIM files
 */
public static void main(String[] args) {
    if (args.length == 0) {
        System.err.println("usage: ZimImporter <directory with zim files>");
        return;
    }
    File zimFiles = new File(args[0]);

    // File.list() returns null if the path is not a directory or cannot be read
    String[] filelist = zimFiles.list();
    if (filelist == null) {
        System.err.println("not a readable directory: " + zimFiles.getAbsolutePath());
        return;
    }

    // make ordered file list; order by file size (start with smallest)
    // the list index is folded into the key to keep files of equal size apart; using the
    // list length as multiplier keeps the keys unique for any number of files
    Map<Long, File> orderedFileMap = new TreeMap<>();
    for (int i = 0; i < filelist.length; i++) {
        if (!filelist[i].endsWith(".zim")) continue;
        File f = new File(zimFiles, filelist[i]);
        orderedFileMap.put(f.length() * filelist.length + i, f);
    }

    for (File f : orderedFileMap.values()) {
        try {
            ZIMFile z = new ZIMFile(f.getAbsolutePath());
            ZIMReader r = new ZIMReader(z);
            DirectoryEntry de = r.getMainDirectoryEntry();
            System.out.println("ZIM file: " + f.getAbsolutePath());
            for (String key : ZIMReader.METADATA_KEYS) {
                String s = r.getMetadata(key);
                if (s != null) System.out.println("Metadata " + key + ": " + s);
            }
            System.out.println("Namespace: " + de.namespace);
            System.out.println("Title: " + de.title);
            System.out.println("URL: " + de.url);
            System.out.println("guessed domain: " + guessDomainName(f.getName()));
            String source = getSource(r);
            System.out.println("guessed Source: " + source);
            System.out.println("guessed main article: " + guessURL(source, de));
            System.out.println();
        } catch (IOException e) {
            // test tool: report the failing file and go on with the next one
            e.printStackTrace();
        }
    }
}
}

View File

@ -106,7 +106,7 @@ public class ZIMFile extends File {
break;
}
String mimeType = mimeBuffer.toString();
System.out.println(mimeType);
//System.out.println(mimeType);
mList.add(mimeType);
}
this.mimeTypeList = mList.toArray(new String[mList.size()]);

View File

@ -20,6 +20,7 @@ package org.openzim;
import java.io.IOException;
import java.io.InputStream;
import java.nio.charset.StandardCharsets;
import org.tukaani.xz.SingleXZInputStream;
import com.github.luben.zstd.ZstdInputStream;
@ -39,6 +40,11 @@ import com.github.luben.zstd.ZstdInputStream;
*/
public class ZIMReader {
/**
 * Names of metadata entries that can be requested with getMetadata(String);
 * getMetadata looks each key up as a directory entry in namespace 'M'.
 * A ZIM file does not necessarily contain all of them.
 */
public final static String[] METADATA_KEYS = new String[] {
"Name", "Title", "Creator", "Publisher", "Date", "Description", "LongDescription",
"Language", "License", "Tags", "Relation", "Flavour", "Source", "Counter", "Scraper"
};
private final ZIMFile mFile;
public static abstract class DirectoryEntry {
@ -48,13 +54,13 @@ public class ZIMReader {
public final int cluster_number;
public final String url;
public final String title;
public final long urlListindex;
public final int urlListindex;
public DirectoryEntry(
final int mimeType, final char namespace,
final int cluster_number,
final String url, final String title,
final long index) {
final int index) {
this.mimetype = mimeType;
this.namespace = namespace;
this.cluster_number = cluster_number;
@ -74,7 +80,7 @@ public class ZIMReader {
final int mimeType, final char namespace,
final int cluster_number, final int blob_number,
final String url, final String title,
final long urlListindex) {
final int urlListindex) {
super(mimeType, namespace, cluster_number, url, title, urlListindex);
this.cluster_number = cluster_number;
this.blob_number = blob_number;
@ -84,11 +90,11 @@ public class ZIMReader {
public static class RedirectEntry extends DirectoryEntry {
public final long redirect_index;
public final int redirect_index;
public RedirectEntry(final int mimeType, final char namespace,
final long redirect_index, final String url, final String title,
final long urlListindex) {
final int redirect_index, final String url, final String title,
final int urlListindex) {
super(mimeType, namespace, 0, url, title, urlListindex);
this.redirect_index = redirect_index;
}
@ -103,6 +109,25 @@ public class ZIMReader {
return this.mFile;
}
/**
 * Reads one metadata value; metadata is stored as entries in namespace 'M'.
 *
 * @param key name of the metadata entry, e.g. one of METADATA_KEYS
 * @return the value decoded as UTF-8, or null if the entry does not exist,
 *         its data cannot be found, or the value is empty
 * @throws IOException if reading from the ZIM file fails
 */
public final String getMetadata(String key) throws IOException {
    final DirectoryEntry entry = getDirectoryInfo('M', key);
    if (entry == null) {
        return null; // metadata not found; that would be normal
    }
    final byte[] content = getArticleData(entry);
    // null: article data not found, that is not normal
    // empty: a proper value, however, not usable for a client
    if (content == null || content.length == 0) {
        return null;
    }
    return new String(content, StandardCharsets.UTF_8);
}
/**
 * Delivers the directory entry of the main page as named in the file header.
 * If that entry is the redirect "mainPage" in namespace 'W', the redirect is
 * followed once to get the actual main page.
 *
 * @return the main page entry, with a "mainPage" redirect resolved one level
 * @throws IOException if reading from the ZIM file fails
 */
public DirectoryEntry getMainDirectoryEntry() throws IOException {
    final DirectoryEntry mainEntry = getDirectoryInfo(this.mFile.header_mainPage);
    final boolean isMainPageRedirect =
            mainEntry.namespace == 'W' && mainEntry.url.equals("mainPage") && mainEntry instanceof RedirectEntry;
    if (!isMainPageRedirect) {
        return mainEntry;
    }
    // resolve redirect to get the actual main page
    final int target = ((RedirectEntry) mainEntry).redirect_index;
    return getDirectoryInfo(target);
}
public String getURLByURLOrder(final int entryNumber) throws IOException {
// The position of URL i
@ -283,6 +308,7 @@ public class ZIMReader {
is.read(buffer);
long offset2 = extended? RandomAccessFileZIMInputStream.toEightLittleEndianLong(buffer) : RandomAccessFileZIMInputStream.toFourLittleEndianInteger(buffer);
long blob_size = offset2 - offset1;
if (blob_size == 0) return new byte[0]; // skip the skipping to get to a zero-length object (they exist!)
byte[] entry = new byte[(int) blob_size]; // TODO: we should be able to read blobs larger than MAXINT
// we must do two skip steps: first to the end of the offset list and second to the start of the blob
// - the whole number of offset list entries is numberOfBlobs1, which includes the extra entry for the end offset