added a cluster cache but it requires more testing

Michael Peter Christen 2023-11-01 19:52:44 +01:00
parent 53b01dbf2e
commit 54fa5d3c2e
2 changed files with 104 additions and 10 deletions


@@ -47,9 +47,9 @@ public class ZIMFile extends File {
public final int header_minorVersion;
public final int header_entryCount;
public final int header_clusterCount;
- public final long header_urlPtrPos;
- public final long header_titlePtrPos;
- public final long header_clusterPtrPos;
+ private final long header_urlPtrPos;
+ private final long header_titlePtrPos;
+ private final long header_clusterPtrPos;
public final long header_mimeListPos;
public final int header_mainPage;
public final int header_layoutPage;


@@ -38,20 +38,24 @@ import com.github.luben.zstd.ZstdInputStream;
* Proof-Reading, unclustering, refactoring,
* naming adoption to https://wiki.openzim.org/wiki/ZIM_file_format,
* change of Exception handling,
- * extension to more attributes as defined in spec (bugfix for mime type loading)
+ * extension to more attributes as defined in spec (bugfix for mime type loading),
* bugfix to long parsing (prevented reading of large files),
- * added extended cluster size parsing
- * added ZStandard compression parsing (cluster type 5)
- * added cluster index
+ * added extended cluster size parsing,
+ * added ZStandard compression parsing (cluster type 5),
+ * added cluster index and cluster iteration for efficient blob extraction
*/
public class ZIMReader {
private final static int MAX_CLUSTER_CACHE_SIZE = 10;
public final static String[] METADATA_KEYS = new String[] {
"Name", "Title", "Creator", "Publisher", "Date", "Description", "LongDescription",
"Language", "License", "Tags", "Relation", "Flavour", "Source", "Counter", "Scraper"
};
private final ZIMFile mFile;
private List<ArticleEntry> allArticlesCache = null;
private Map<Integer, Map<Integer, ArticleEntry>> indexedArticlesCache = null;
private final ArrayList<Cluster> clusterCache = new ArrayList<>();
public class DirectoryEntry {
@@ -132,15 +136,18 @@ public class ZIMReader {
}
public List<ArticleEntry> getAllArticles() throws IOException {
if (this.allArticlesCache != null) return allArticlesCache;
List<ArticleEntry> list = new ArrayList<>();
for (int i = 0; i < this.mFile.header_entryCount; i++) {
DirectoryEntry de = getDirectoryInfo(i);
if (de instanceof ArticleEntry) list.add((ArticleEntry) de);
}
this.allArticlesCache = list;
return list;
}
public Map<Integer, Map<Integer, ArticleEntry>> getIndexedArticles(List<ArticleEntry> list) {
if (this.indexedArticlesCache != null) return indexedArticlesCache;
Map<Integer, Map<Integer, ArticleEntry>> index = new HashMap<>();
for (ArticleEntry entry: list) {
Map<Integer, ArticleEntry> cluster = index.get(entry.cluster_number);
@@ -150,9 +157,23 @@
}
cluster.put(entry.blob_number, entry);
}
this.indexedArticlesCache = index;
return index;
}
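
A self-contained sketch of the grouping idea behind getIndexedArticles: the index maps a cluster number to a map from blob number to entry, so all documents of one cluster can be looked up together. Only the shape of the map and the field names cluster_number/blob_number come from the code above; the Entry record and everything else is a placeholder for illustration.

import java.util.*;

class IndexSketch {
    // stand-in for ArticleEntry, reduced to the two fields the index needs
    record Entry(int clusterNumber, int blobNumber, String url) {}

    // cluster number -> (blob number -> entry), the same shape as the indexedArticlesCache
    static Map<Integer, Map<Integer, Entry>> index(List<Entry> entries) {
        Map<Integer, Map<Integer, Entry>> index = new HashMap<>();
        for (Entry e : entries) {
            index.computeIfAbsent(e.clusterNumber(), k -> new HashMap<>())
                 .put(e.blobNumber(), e);
        }
        return index;
    }
}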
/**
* A cluster iterator is the most efficient way to read all documents.
* Iterating over the documents directly would decompress each cluster
* many times (once for every document it contains), so it makes more
* sense to iterate over the clusters rather than over the documents.
* This requires an index of document entries that tells which documents
* are contained in each cluster. Reading all document entries up front
* causes some waiting time at the beginning of the iteration, but this
* is not additional computing time, only work that is concentrated once
* at the start instead of being spread over all document fetches. If the
* ZIM file is very large, this requires some extra RAM to cache the
* indexed document entries.
*/
public class ClusterIterator implements Iterator<ArticleBlobEntry> {
private Map<Integer, Map<Integer, ArticleEntry>> index;
@@ -191,7 +212,7 @@
Map<Integer, ArticleEntry> clusterMap = this.index.get(this.clusterCounter);
ArticleEntry ae = clusterMap.get(this.blobCounter);
loadCluster(); // ensure cluster is loaded
- ArticleBlobEntry abe = new ArticleBlobEntry(ae, this.cluster.blobs.get(this.blobCounter));
+ ArticleBlobEntry abe = new ArticleBlobEntry(ae, this.cluster.getBlob(this.blobCounter));
// increase the counter(s)
this.blobCounter++;
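
The javadoc above argues for iterating cluster by cluster so that every cluster is decompressed only once. A self-contained sketch of that iteration order, using toy data instead of the actual ZIM classes (all names and values below are invented for illustration):

import java.util.*;

public class ClusterOrderSketch {
    public static void main(String[] args) {
        // toy mapping: document name -> cluster number
        Map<String, Integer> docToCluster = Map.of(
                "A/a.html", 0, "A/b.html", 1, "A/c.html", 0, "A/d.html", 1);

        // group documents by cluster, mirroring the index the ClusterIterator walks
        Map<Integer, List<String>> byCluster = new TreeMap<>();
        docToCluster.forEach((doc, cluster) ->
                byCluster.computeIfAbsent(cluster, k -> new ArrayList<>()).add(doc));

        // visit each cluster once and emit all of its blobs before moving on
        byCluster.forEach((cluster, docs) -> {
            System.out.println("decompress cluster " + cluster + " (once)");
            docs.forEach(doc -> System.out.println("  blob for " + doc));
        });
    }
}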
@@ -313,6 +334,35 @@
return null;
}
public Cluster getCluster(int clusterNumber) throws IOException {
for (int i = 0; i < this.clusterCache.size(); i++) {
Cluster c = clusterCache.get(i);
if (c.cluster_number == clusterNumber) {
c.incUsage(); // cache hit
return c;
}
}
// cache miss
Cluster c = new Cluster(clusterNumber);
// check cache size
if (clusterCache.size() >= MAX_CLUSTER_CACHE_SIZE) {
// remove one entry
double minEntry = Double.MAX_VALUE;
int pos = -1;
for (int i = 0; i < clusterCache.size(); i++) {
double r = this.clusterCache.get(i).getUsageRatio();
if (r < minEntry) {minEntry = r; pos = i;}
}
if (pos >= 0) this.clusterCache.remove(pos);
}
c.incUsage();
this.clusterCache.add(c);
return c;
}
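
The eviction rule above keeps the cache at MAX_CLUSTER_CACHE_SIZE by dropping the cluster with the lowest usage ratio (usage count divided by blob count), so a large cluster needs proportionally more hits than a small one to stay cached. A stand-alone sketch of that selection with a placeholder record instead of the Cluster class (all values invented):

import java.util.List;

public class EvictionSketch {
    // stand-in for Cluster, reduced to what the eviction decision looks at
    record CachedCluster(int clusterNumber, int usageCounter, int blobCount) {
        double usageRatio() { return (double) usageCounter / blobCount; }
    }

    // index of the entry with the smallest usage ratio, mirroring the loop in getCluster
    static int slotToEvict(List<CachedCluster> cache) {
        double min = Double.MAX_VALUE;
        int pos = -1;
        for (int i = 0; i < cache.size(); i++) {
            double r = cache.get(i).usageRatio();
            if (r < min) { min = r; pos = i; }
        }
        return pos;
    }

    public static void main(String[] args) {
        List<CachedCluster> cache = List.of(
                new CachedCluster(7, 12, 40),  // ratio 0.30
                new CachedCluster(8,  3,  5),  // ratio 0.60
                new CachedCluster(9,  2, 30)); // ratio ~0.07, evicted first
        System.out.println("evict slot " + slotToEvict(cache));
    }
}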
/**
* The Cluster class is required to read a whole cluster with all documents inside at once.
* This is a good thing because reading single documents from a cluster requires that the
@@ -324,10 +374,14 @@
*/
private class Cluster {
private int cluster_number; // used to identify the correct cache entry
private List<byte[]> blobs;
private int usageCounter; // used for efficient caching and cache stale detection
private boolean extended;
public Cluster(int cluster_number) throws IOException {
this.cluster_number = cluster_number;
this.usageCounter = 0;
// open the cluster and make an input stream with the proper decompression type
final long clusterPos = mFile.geClusterPtr(cluster_number);
@@ -357,6 +411,7 @@
offsets.add(end_offset);
int offset_count = (int) ((end_offset - 1) / (extended ? 8 : 4));
for (int i = 0; i < offset_count - 1; i++) {
is.read(buffer);
long l = extended ? RandomAccessFileZIMInputStream.toEightLittleEndianLong(buffer) : RandomAccessFileZIMInputStream.toFourLittleEndianInteger(buffer);
offsets.add(l);
}
@@ -365,14 +420,54 @@
// the seek position should be now at the beginning of the first document
this.blobs = new ArrayList<>();
for (int i = 0; i < offsets.size() - 1; i++) { // loop until the size - 1 because the last offset is the end of the last document
- int length = (int) (offsets.get(i + 1) + offsets.get(i)); // yes the maximum document length is 2GB, for now
+ int length = (int) (offsets.get(i + 1) - offsets.get(i)); // yes the maximum document length is 2GB, for now
byte[] b = new byte[length];
RandomAccessFileZIMInputStream.readFully(is, b);
this.blobs.add(b);
}
}
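
A small worked example of the offset arithmetic in the loop above, which is what the fix from '+' to '-' in the length computation is about: consecutive offsets delimit the blobs, so the length of blob i is offsets[i + 1] - offsets[i]. The numbers are made up.

public class OffsetSketch {
    public static void main(String[] args) {
        // toy offsets as they could appear in a cluster header (invented values);
        // the last offset marks the end of the last blob
        long[] offsets = {16L, 74L, 144L, 150L};
        for (int i = 0; i < offsets.length - 1; i++) {
            int length = (int) (offsets[i + 1] - offsets[i]); // 58, 70, 6
            System.out.println("blob " + i + " has length " + length);
        }
    }
}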
public byte[] getBlob(int i) {
return this.blobs.get(i);
}
public void incUsage() {
this.usageCounter++;
}
public int getUsage() {
return this.usageCounter;
}
public int getSize() {
return this.blobs.size();
}
public double getUsageRatio() {
return ((double) this.usageCounter) / ((double) this.blobs.size());
}
}
/*
public byte[] getArticleData(final DirectoryEntry directoryInfo) throws IOException {
// fail fast
if (directoryInfo == null) return null;
if (directoryInfo.getClass() != ArticleEntry.class) return null;
// This is now an article, so thus we can cast to ArticleEntry
final ArticleEntry article = (ArticleEntry) directoryInfo;
// Read the cluster
Cluster c = getCluster(article.cluster_number);
// read the blob
byte[] blob = c.getBlob(article.blob_number);
return blob;
}
*/
public byte[] getArticleData(final DirectoryEntry directoryInfo) throws IOException {
// fail fast
@@ -461,5 +556,4 @@ public class ZIMReader {
return entry;
}
}