Increased the default maximum file size for database files to 2GB

Maximum file sizes can now be configured with the attributes
filesize.max.win and filesize.max.other;
the default maximum file size for non-Windows operating systems is now 32GB.

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@5974 6c8d7289-2bf4-0310-a012-ef5d649a1542
This commit is contained in:
orbiter 2009-05-25 06:59:21 +00:00
parent eb36c9a092
commit 26a46b5521
8 changed files with 36 additions and 18 deletions

View File

@ -73,6 +73,12 @@ server.maxTrackingCount = 1000
# maximum number of hosts that are tracked
server.maxTrackingHostCount = 100
# maximum file sizes: since some users experience problems with files that are
# too large, the size of database files can be limited. Larger files yield
# better IO performance and use less RAM; however, if the size must be limited
# because of file-system limitations, the maximum size can be set here
filesize.max.win = 2147483647
filesize.max.other = 34359738367
# Network Definition
# There can be separate YaCy networks, and managed sub-groups of the general network.

View File

@ -483,7 +483,7 @@ public class URLAnalysis {
} else if (args[0].equals("-diffurlcol") && args.length >= 3) {
// make a diff-file that contains hashes from the url database that do not occur in the collection reference dump
// example:
// java -Xmx1000m -cp classes de.anomic.data.URLAnalysis -diffurlcol DATA/INDEX/freeworld/TEXT used.dump diffurlcol.dump
// java -Xmx1000m -cp classes de.anomic.data.URLAnalysis -diffurlcol DATA/INDEX/freeworld/TEXT/METADATA used.dump diffurlcol.dump
try {
diffurlcol(args[1], args[2], args[3]);
} catch (IOException e) {

View File

@ -66,7 +66,6 @@ public class BLOBArray implements BLOB {
*/
public static final long oneMonth = 1000L * 60L * 60L * 24L * 365L / 12L;
public static final long oneGigabyte = 1024L * 1024L * 1024L;
private int keylength;
private ByteOrder ordering;
@ -91,7 +90,7 @@ public class BLOBArray implements BLOB {
this.buffersize = buffersize;
this.heapLocation = heapLocation;
this.fileAgeLimit = oneMonth;
this.fileSizeLimit = oneGigabyte;
this.fileSizeLimit = (long) Integer.MAX_VALUE;
this.repositoryAgeMax = Long.MAX_VALUE;
this.repositorySizeMax = Long.MAX_VALUE;
@ -327,7 +326,7 @@ public class BLOBArray implements BLOB {
public void setMaxSize(long maxSize) {
this.repositorySizeMax = maxSize;
this.fileSizeLimit = Math.min(oneGigabyte, maxSize / 10);
this.fileSizeLimit = Math.min((long) Integer.MAX_VALUE, maxSize / 10L);
}
private void executeLimits() {

View File

@ -90,7 +90,7 @@ public class SplitTable implements ObjectIndex {
final String tablename,
final Row rowdef,
final boolean resetOnFail) {
this(path, tablename, rowdef, BLOBArray.oneMonth, BLOBArray.oneGigabyte, resetOnFail);
this(path, tablename, rowdef, BLOBArray.oneMonth, (long) Integer.MAX_VALUE, resetOnFail);
}
public SplitTable(

View File

@ -68,7 +68,6 @@ public final class MetadataRepository implements Iterable<byte[]> {
this.urlIndexFile = new Cache(new SplitTable(this.location, "urls", URLMetadataRow.rowdef, false));
this.exportthread = null; // will have a export thread assigned if exporter is running
this.statsDump = null;
}
public void clearCache() {

View File

@ -170,6 +170,7 @@ import de.anomic.server.serverProcessorJob;
import de.anomic.server.serverProfiling;
import de.anomic.server.serverSemaphore;
import de.anomic.server.serverSwitch;
import de.anomic.server.serverSystem;
import de.anomic.server.serverThread;
import de.anomic.tools.crypt;
import de.anomic.tools.CryptoLib;
@ -319,8 +320,8 @@ public final class plasmaSwitchboard extends serverAbstractSwitch<IndexingStack.
// start indexing management
log.logConfig("Starting Indexing Management");
final String networkName = getConfig(plasmaSwitchboardConstants.NETWORK_NAME, "");
final boolean useCommons = getConfigBool("index.storeCommons", false);
final int redundancy = (int) sb.getConfigLong("network.unit.dhtredundancy.senior", 1);
final long fileSizeMax = (serverSystem.isWindows) ? sb.getConfigLong("filesize.max.win", (long) Integer.MAX_VALUE) : sb.getConfigLong("filesize.max.other", (long) Integer.MAX_VALUE);
final int redundancy = (int) sb.getConfigLong("network.unit.dhtredundancy.senior", 1);
final int paritionExponent = (int) sb.getConfigLong("network.unit.dht.partitionExponent", 0);
try {
webIndex = new plasmaWordIndex(
@ -329,7 +330,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch<IndexingStack.
indexPrimaryPath,
indexSecondaryPath,
wordCacheMaxCount,
useCommons,
fileSizeMax,
redundancy,
paritionExponent);
} catch (IOException e1) {
@ -795,7 +796,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch<IndexingStack.
final File indexPrimaryPath = getConfigPath(plasmaSwitchboardConstants.INDEX_PRIMARY_PATH, plasmaSwitchboardConstants.INDEX_PATH_DEFAULT);
final File indexSecondaryPath = (getConfig(plasmaSwitchboardConstants.INDEX_SECONDARY_PATH, "").length() == 0) ? indexPrimaryPath : new File(getConfig(plasmaSwitchboardConstants.INDEX_SECONDARY_PATH, ""));
final int wordCacheMaxCount = (int) getConfigLong(plasmaSwitchboardConstants.WORDCACHE_MAX_COUNT, 20000);
final boolean useCommons = getConfigBool("index.storeCommons", false);
final long fileSizeMax = (serverSystem.isWindows) ? sb.getConfigLong("filesize.max.win", (long) Integer.MAX_VALUE) : sb.getConfigLong("filesize.max.other", (long) Integer.MAX_VALUE);
final int redundancy = (int) sb.getConfigLong("network.unit.dhtredundancy.senior", 1);
final int paritionExponent = (int) sb.getConfigLong("network.unit.dht.partitionExponent", 0);
try {
@ -805,7 +806,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch<IndexingStack.
indexPrimaryPath,
indexSecondaryPath,
wordCacheMaxCount,
useCommons,
fileSizeMax,
redundancy,
paritionExponent);
} catch (IOException e) {

View File

@ -41,7 +41,6 @@ import de.anomic.crawler.IndexingStack;
import de.anomic.data.Blacklist;
import de.anomic.htmlFilter.htmlFilterContentScraper;
import de.anomic.http.httpdProxyCacheEntry;
import de.anomic.kelondro.blob.BLOBArray;
import de.anomic.kelondro.order.Base64Order;
import de.anomic.kelondro.order.ByteOrder;
import de.anomic.kelondro.text.BufferedIndex;
@ -71,9 +70,7 @@ public final class plasmaWordIndex {
public static final long wCacheMaxAge = 1000 * 60 * 30; // milliseconds; 30 minutes
public static final int wCacheMaxChunk = 800; // maximum number of references for each urlhash
public static final int lowcachedivisor = 900;
public static final int maxCollectionPartition = 7; // should be 7
public static final long targetFileSize = 100 * 1024 * 1024; // 100 MB
public static final long maxFileSize = BLOBArray.oneGigabyte; // 1GB
public static final long targetFileSize = 256 * 1024 * 1024; // 256 MB
public static final int writeBufferSize = 4 * 1024 * 1024;
// the reference factory
@ -120,7 +117,7 @@ public final class plasmaWordIndex {
final File indexPrimaryRoot,
final File indexSecondaryRoot,
final int entityCacheMaxSize,
final boolean useCommons,
final long maxFileSize,
final int redundancy,
final int partitionExponent) throws IOException {

View File

@ -676,7 +676,15 @@ public final class yacy {
final int cacheMem = (int)(MemoryControl.maxMemory - MemoryControl.total());
if (cacheMem < 2048000) throw new OutOfMemoryError("Not enough memory available to start clean up.");
final plasmaWordIndex wordIndex = new plasmaWordIndex(networkName, log, indexPrimaryRoot, indexSecondaryRoot, 10000, false, 1, 0);
final plasmaWordIndex wordIndex = new plasmaWordIndex(
networkName,
log,
indexPrimaryRoot,
indexSecondaryRoot,
10000,
(long) Integer.MAX_VALUE,
0,
0);
final Iterator<ReferenceContainer<WordReference>> indexContainerIterator = wordIndex.index().references("AAAAAAAAAAAA".getBytes(), false, false);
long urlCounter = 0, wordCounter = 0;
@ -867,7 +875,15 @@ public final class yacy {
try {
Iterator<ReferenceContainer<WordReference>> indexContainerIterator = null;
if (resource.equals("all")) {
WordIndex = new plasmaWordIndex("freeworld", log, indexPrimaryRoot, indexSecondaryRoot, 10000, false, 1, 0);
WordIndex = new plasmaWordIndex(
"freeworld",
log,
indexPrimaryRoot,
indexSecondaryRoot,
10000,
(long) Integer.MAX_VALUE,
1,
0);
indexContainerIterator = WordIndex.index().references(wordChunkStartHash.getBytes(), false, false);
}
int counter = 0;