diff --git a/source/de/anomic/crawler/Balancer.java b/source/de/anomic/crawler/Balancer.java index 69aed1e2a..a9956196e 100644 --- a/source/de/anomic/crawler/Balancer.java +++ b/source/de/anomic/crawler/Balancer.java @@ -230,12 +230,7 @@ public class Balancer { } public synchronized boolean has(String urlhash) { - try { - return urlFileIndex.has(urlhash.getBytes()); - } catch (IOException e) { - e.printStackTrace(); - return false; - } + return urlFileIndex.has(urlhash.getBytes()); } public boolean notEmpty() { @@ -345,7 +340,7 @@ public class Balancer { urlFileIndex.put(entry.toRow()); // check size of domainStacks and flush - if ((domainStacks.size() > 20) || (sizeDomainStacks() > 1000)) { + if ((domainStacks.size() > 100) || (sizeDomainStacks() > 1000)) { flushOnceDomStacks(1, urlRAMStack.size() < 100); // when the ram stack is small, flush it there } } diff --git a/source/de/anomic/crawler/CrawlStacker.java b/source/de/anomic/crawler/CrawlStacker.java index ba2ea7c49..fa0a4c05d 100644 --- a/source/de/anomic/crawler/CrawlStacker.java +++ b/source/de/anomic/crawler/CrawlStacker.java @@ -238,6 +238,11 @@ public final class CrawlStacker extends Thread { int currentdepth, CrawlProfile.entry profile) { if (profile == null) return; + + // check first before we create a big object + if (this.urlEntryCache.has(nexturl.hash().getBytes())) return; + + // now create the big object before we enter the synchronized block CrawlEntry newEntry = new CrawlEntry( initiatorHash, nexturl, @@ -249,15 +254,15 @@ public final class CrawlStacker extends Thread { 0, 0 ); - if (newEntry == null) return; + kelondroRow.Entry newEntryRow = newEntry.toRow(); - synchronized(this.urlEntryHashCache) { + synchronized(this.urlEntryHashCache) { kelondroRow.Entry oldValue; boolean hostknown = true; if (prequeue) hostknown = prefetchHost(nexturl.getHost()); try { - oldValue = this.urlEntryCache.put(newEntry.toRow()); + oldValue = this.urlEntryCache.put(newEntryRow); } catch (IOException e) { oldValue = null; } @@ -346,7 +351,7 @@ public final class CrawlStacker extends Thread { synchronized (this.urlEntryHashCache) { urlHash = this.urlEntryHashCache.removeFirst(); if (urlHash == null) throw new IOException("urlHash is null"); - entry = this.urlEntryCache.remove(urlHash.getBytes(), false); + entry = this.urlEntryCache.remove(urlHash.getBytes(), true); } if ((urlHash == null) || (entry == null)) return null; diff --git a/source/de/anomic/crawler/ResultURLs.java b/source/de/anomic/crawler/ResultURLs.java index 45bec1286..7072ff126 100644 --- a/source/de/anomic/crawler/ResultURLs.java +++ b/source/de/anomic/crawler/ResultURLs.java @@ -90,15 +90,6 @@ public final class ResultURLs { assert executorHash != null; if (e == null) { return; } try { -// switch (stackType) { -// case 0: break; -// case 1: externResultStack.add(e.hash() + initiatorHash + executorHash); break; -// case 2: searchResultStack.add(e.hash() + initiatorHash + executorHash); break; -// case 3: transfResultStack.add(e.hash() + initiatorHash + executorHash); break; -// case 4: proxyResultStack.add(e.hash() + initiatorHash + executorHash); break; -// case 5: lcrawlResultStack.add(e.hash() + initiatorHash + executorHash); break; -// case 6: gcrawlResultStack.add(e.hash() + initiatorHash + executorHash); break; -// } final List resultStack = getStack(stackType); if(resultStack != null) { resultStack.add(e.hash() + initiatorHash + executorHash); @@ -121,54 +112,18 @@ public final class ResultURLs { } else { return -1; } -// switch (stack) { -// case 1: return externResultStack.size(); -// case 2: return searchResultStack.size(); -// case 3: return transfResultStack.size(); -// case 4: return proxyResultStack.size(); -// case 5: return lcrawlResultStack.size(); -// case 6: return gcrawlResultStack.size(); -// } -// return -1; } public synchronized String getUrlHash(int stack, int pos) { return getHashNo(stack, pos, 0); -// switch (stack) { -// case 1: return (externResultStack.get(pos)).substring(0, yacySeedDB.commonHashLength); -// case 2: return (searchResultStack.get(pos)).substring(0, yacySeedDB.commonHashLength); -// case 3: return (transfResultStack.get(pos)).substring(0, yacySeedDB.commonHashLength); -// case 4: return (proxyResultStack.get(pos)).substring(0, yacySeedDB.commonHashLength); -// case 5: return (lcrawlResultStack.get(pos)).substring(0, yacySeedDB.commonHashLength); -// case 6: return (gcrawlResultStack.get(pos)).substring(0, yacySeedDB.commonHashLength); -// } -// return null; } public synchronized String getInitiatorHash(int stack, int pos) { return getHashNo(stack, pos, 1); -// switch (stack) { -// case 1: return (externResultStack.get(pos)).substring(yacySeedDB.commonHashLength, yacySeedDB.commonHashLength * 2); -// case 2: return (searchResultStack.get(pos)).substring(yacySeedDB.commonHashLength, yacySeedDB.commonHashLength * 2); -// case 3: return (transfResultStack.get(pos)).substring(yacySeedDB.commonHashLength, yacySeedDB.commonHashLength * 2); -// case 4: return (proxyResultStack.get(pos)).substring(yacySeedDB.commonHashLength, yacySeedDB.commonHashLength * 2); -// case 5: return (lcrawlResultStack.get(pos)).substring(yacySeedDB.commonHashLength, yacySeedDB.commonHashLength * 2); -// case 6: return (gcrawlResultStack.get(pos)).substring(yacySeedDB.commonHashLength, yacySeedDB.commonHashLength * 2); -// } -// return null; } public synchronized String getExecutorHash(final int stack, int pos) { return getHashNo(stack, pos, 2); -// switch (stack) { -// case 1: return (externResultStack.get(pos)).substring(yacySeedDB.commonHashLength * 2, yacySeedDB.commonHashLength * 3); -// case 2: return (searchResultStack.get(pos)).substring(yacySeedDB.commonHashLength * 2, yacySeedDB.commonHashLength * 3); -// case 3: return (transfResultStack.get(pos)).substring(yacySeedDB.commonHashLength * 2, yacySeedDB.commonHashLength * 3); -// case 4: return (proxyResultStack.get(pos)).substring(yacySeedDB.commonHashLength * 2, yacySeedDB.commonHashLength * 3); -// case 5: return (lcrawlResultStack.get(pos)).substring(yacySeedDB.commonHashLength * 2, yacySeedDB.commonHashLength * 3); -// case 6: return (gcrawlResultStack.get(pos)).substring(yacySeedDB.commonHashLength * 2, yacySeedDB.commonHashLength * 3); -// } -// return null; } /** @@ -221,6 +176,7 @@ public final class ResultURLs { final List resultStack = getStack(stack); if(resultStack != null) { + assert pos < resultStack.size() : "pos = " + pos + ", resultStack.size() = " + resultStack.size(); if(pos < resultStack.size()) { return resultStack.get(pos); } else { diff --git a/source/de/anomic/crawler/ZURL.java b/source/de/anomic/crawler/ZURL.java index 1821fa34d..41303cc39 100755 --- a/source/de/anomic/crawler/ZURL.java +++ b/source/de/anomic/crawler/ZURL.java @@ -141,11 +141,7 @@ public class ZURL { } public boolean exists(String urlHash) { - try { - return urlIndex.has(urlHash.getBytes()); - } catch (IOException e) { - return false; - } + return urlIndex.has(urlHash.getBytes()); } public void clearStack() { diff --git a/source/de/anomic/index/indexRAMRI.java b/source/de/anomic/index/indexRAMRI.java index 8a56dc598..5ee381aeb 100644 --- a/source/de/anomic/index/indexRAMRI.java +++ b/source/de/anomic/index/indexRAMRI.java @@ -28,6 +28,7 @@ package de.anomic.index; import java.io.File; import java.io.IOException; +import java.util.ArrayList; import java.util.Set; import de.anomic.kelondro.kelondroCloneableIterator; @@ -159,7 +160,7 @@ public final class indexRAMRI implements indexRI, indexRIReader { return null; } - public synchronized String bestFlushWordHash() { + private String bestFlushWordHash() { // select appropriate hash // we have 2 different methods to find a good hash: // - the oldest entry in the cache @@ -189,7 +190,8 @@ public final class indexRAMRI implements indexRI, indexRIReader { hash = hashDate.getMinObject(); // flush oldest entries } if (hash == null) { - heap.wordContainers(null, false).next(); + indexContainer ic = heap.wordContainers(null, false).next(); + if (ic != null) hash = ic.getWordHash(); } return hash; } catch (Exception e) { @@ -198,6 +200,23 @@ public final class indexRAMRI implements indexRI, indexRIReader { return null; } + public synchronized ArrayList bestFlushContainers(int count) { + ArrayList containerList = new ArrayList(); + String hash; + indexContainer container; + for (int i = 0; i < count; i++) { + hash = bestFlushWordHash(); + if (hash == null) return containerList; + container = heap.delete(hash); + assert (container != null); + if (container == null) return containerList; + hashScore.deleteScore(hash); + hashDate.deleteScore(hash); + containerList.add(container); + } + return containerList; + } + private int intTime(long longTime) { return (int) Math.max(0, ((longTime - initTime) / 1000)); } diff --git a/source/de/anomic/index/indexRepositoryReference.java b/source/de/anomic/index/indexRepositoryReference.java index 3d74aa340..2b56e0dbf 100644 --- a/source/de/anomic/index/indexRepositoryReference.java +++ b/source/de/anomic/index/indexRepositoryReference.java @@ -151,11 +151,7 @@ public final class indexRepositoryReference { public synchronized boolean exists(String urlHash) { if (urlIndexFile == null) return false; // case may happen during shutdown - try { - return urlIndexFile.has(urlHash.getBytes()); - } catch (IOException e) { - return false; - } + return urlIndexFile.has(urlHash.getBytes()); } public kelondroCloneableIterator entries(boolean up, String firstHash) throws IOException { diff --git a/source/de/anomic/kelondro/kelondroBytesIntMap.java b/source/de/anomic/kelondro/kelondroBytesIntMap.java index d53d2b4db..0c9e28f3d 100644 --- a/source/de/anomic/kelondro/kelondroBytesIntMap.java +++ b/source/de/anomic/kelondro/kelondroBytesIntMap.java @@ -49,6 +49,11 @@ public class kelondroBytesIntMap { return index.row(); } + public synchronized boolean has(byte[] key) { + assert (key != null); + return index.has(key); + } + public synchronized int geti(byte[] key) throws IOException { assert (key != null); kelondroRow.Entry indexentry = index.get(key); diff --git a/source/de/anomic/kelondro/kelondroCache.java b/source/de/anomic/kelondro/kelondroCache.java index ca8a51a20..6b2ef94fb 100644 --- a/source/de/anomic/kelondro/kelondroCache.java +++ b/source/de/anomic/kelondro/kelondroCache.java @@ -191,7 +191,7 @@ public class kelondroCache implements kelondroIndex { readMissCache = null; } - public boolean has(byte[] key) throws IOException { + public boolean has(byte[] key) { // first look into the miss cache if (readMissCache != null) { if (readMissCache.get(key) != null) { diff --git a/source/de/anomic/kelondro/kelondroEcoTable.java b/source/de/anomic/kelondro/kelondroEcoTable.java index c23bd4f60..cd7374e05 100644 --- a/source/de/anomic/kelondro/kelondroEcoTable.java +++ b/source/de/anomic/kelondro/kelondroEcoTable.java @@ -345,10 +345,15 @@ public class kelondroEcoTable implements kelondroIndex { return rowdef.newEntry(b); } - public synchronized boolean has(byte[] key) throws IOException { - assert file.size() == index.size() + fail : "file.size() = " + file.size() + ", index.size() = " + index.size(); + public synchronized boolean has(byte[] key) { + try { + assert file.size() == index.size() + fail : "file.size() = " + file.size() + ", index.size() = " + index.size(); + } catch (IOException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } assert ((table == null) || (table.size() == index.size())); - return index.geti(key) >= 0; + return index.has(key); } public synchronized kelondroCloneableIterator keys(boolean up, byte[] firstKey) throws IOException { diff --git a/source/de/anomic/kelondro/kelondroFlexTable.java b/source/de/anomic/kelondro/kelondroFlexTable.java index f949dd958..53ad9d18e 100644 --- a/source/de/anomic/kelondro/kelondroFlexTable.java +++ b/source/de/anomic/kelondro/kelondroFlexTable.java @@ -147,12 +147,12 @@ public class kelondroFlexTable extends kelondroFlexWidthArray implements kelondr return RAMIndex; } - public synchronized boolean has(byte[] key) throws IOException { + public synchronized boolean has(byte[] key) { // it is not recommended to implement or use a has predicate unless // it can be ensured that it causes no IO if ((kelondroAbstractRecords.debugmode) && (RAMIndex != true)) serverLog.logWarning("kelondroFlexTable", "RAM index warning in file " + super.tablename); assert this.size() == index.size() : "content.size() = " + this.size() + ", index.size() = " + index.size(); - return index.geti(key) >= 0; + return index.has(key); } private kelondroBytesIntMap initializeRamIndex(int initialSpace) { diff --git a/source/de/anomic/kelondro/kelondroIndex.java b/source/de/anomic/kelondro/kelondroIndex.java index 4dae69e41..2c04d975f 100644 --- a/source/de/anomic/kelondro/kelondroIndex.java +++ b/source/de/anomic/kelondro/kelondroIndex.java @@ -61,7 +61,7 @@ public interface kelondroIndex { public int size(); public kelondroProfile profile(); public kelondroRow row(); - public boolean has(byte[] key) throws IOException; // use this only if there is no get in case that has returns true + public boolean has(byte[] key); // use this only if there is no get in case that has returns true public kelondroRow.Entry get(byte[] key) throws IOException; public kelondroRow.Entry put(kelondroRow.Entry row) throws IOException; public kelondroRow.Entry put(kelondroRow.Entry row, Date entryDate) throws IOException; diff --git a/source/de/anomic/kelondro/kelondroRowSet.java b/source/de/anomic/kelondro/kelondroRowSet.java index 17b8b4617..46efd9303 100644 --- a/source/de/anomic/kelondro/kelondroRowSet.java +++ b/source/de/anomic/kelondro/kelondroRowSet.java @@ -109,7 +109,10 @@ public class kelondroRowSet extends kelondroRowCollection implements kelondroInd } public synchronized boolean has(byte[] key) { - return (get(key) != null); + long handle = profile.startRead(); + int index = find(key, 0, key.length); + profile.stopRead(handle); + return index >= 0; } public synchronized kelondroRow.Entry get(byte[] key) { diff --git a/source/de/anomic/kelondro/kelondroSQLTable.java b/source/de/anomic/kelondro/kelondroSQLTable.java index 7559c086c..be0c537a2 100644 --- a/source/de/anomic/kelondro/kelondroSQLTable.java +++ b/source/de/anomic/kelondro/kelondroSQLTable.java @@ -135,8 +135,12 @@ public class kelondroSQLTable implements kelondroIndex { return this.rowdef; } - public boolean has(byte[] key) throws IOException { - return (get(key) != null); + public boolean has(byte[] key) { + try { + return (get(key) != null); + } catch (IOException e) { + return false; + } } public ArrayList removeDoubles() { diff --git a/source/de/anomic/kelondro/kelondroSplitTable.java b/source/de/anomic/kelondro/kelondroSplitTable.java index dbd4a25c9..72d5c33a3 100644 --- a/source/de/anomic/kelondro/kelondroSplitTable.java +++ b/source/de/anomic/kelondro/kelondroSplitTable.java @@ -139,7 +139,7 @@ public class kelondroSplitTable implements kelondroIndex { if (f.isDirectory()) { // this is a kelonodroFlex table serverLog.logInfo("kelondroSplitTable", "opening partial flex table " + path); - table = new kelondroCache(new kelondroFlexTable(path, maxf, rowdef, 0, resetOnFail)); + table = new kelondroFlexTable(path, maxf, rowdef, 0, resetOnFail); } else { serverLog.logInfo("kelondroSplitTable", "opening partial eco table " + f); table = new kelondroEcoTable(f, rowdef, kelondroEcoTable.tailCacheUsageAuto, EcoFSBufferSize, 0); @@ -209,7 +209,7 @@ public class kelondroSplitTable implements kelondroIndex { return this.rowdef; } - public boolean has(byte[] key) throws IOException { + public boolean has(byte[] key) { return keeperOf(key) != null; } @@ -276,19 +276,13 @@ public class kelondroSplitTable implements kelondroIndex { try { cs.submit(new Callable() { public kelondroIndex call() { - try { - if (table.has(key)) return table; else return dummyIndex; - } catch (IOException e) { - return dummyIndex; - } + if (table.has(key)) return table; else return dummyIndex; } }); } catch (RejectedExecutionException e) { // the executor is either shutting down or the blocking queue is full // execute the search direct here without concurrency - try { - if (table.has(key)) return table; - } catch (IOException ee) {} + if (table.has(key)) return table; rejected++; } } diff --git a/source/de/anomic/kelondro/kelondroTree.java b/source/de/anomic/kelondro/kelondroTree.java index 3d69bef0e..84607011e 100644 --- a/source/de/anomic/kelondro/kelondroTree.java +++ b/source/de/anomic/kelondro/kelondroTree.java @@ -170,7 +170,7 @@ public class kelondroTree extends kelondroCachedRecords implements kelondroIndex n.commit(); } - public boolean has(byte[] key) throws IOException { + public boolean has(byte[] key) { throw new UnsupportedOperationException("has should not be used with kelondroTree."); } diff --git a/source/de/anomic/plasma/plasmaCondenser.java b/source/de/anomic/plasma/plasmaCondenser.java index 6f9f7cff0..6f2f7d284 100644 --- a/source/de/anomic/plasma/plasmaCondenser.java +++ b/source/de/anomic/plasma/plasmaCondenser.java @@ -358,7 +358,7 @@ public final class plasmaCondenser { k = it.next(); wsp = words.get(k); wsp.check(idx); - words.put(k, wsp); + words.put(k, wsp); // is that necessary? } } sentence = new StringBuffer(100); diff --git a/source/de/anomic/plasma/plasmaWordIndex.java b/source/de/anomic/plasma/plasmaWordIndex.java index a8e3e3595..7941910ae 100644 --- a/source/de/anomic/plasma/plasmaWordIndex.java +++ b/source/de/anomic/plasma/plasmaWordIndex.java @@ -502,19 +502,8 @@ public final class plasmaWordIndex implements indexRI { } } count = count - containerList.size(); - for (int i = 0; i < count; i++) { // possible position of outOfMemoryError ? - synchronized (ram) { - if (ram.size() == 0) break; - if (serverMemory.available() < collections.minMem()) break; // protect memory during flush - - // select one word to flush - wordHash = ram.bestFlushWordHash(); - - // move one container from ram to flush list - if (wordHash == null) c = null; else c = ram.deleteContainer(wordHash); - } - if (c != null) containerList.add(c); - } + containerList.addAll(ram.bestFlushContainers(count)); + // flush the containers for (indexContainer container : containerList) collections.addEntries(container); //System.out.println("DEBUG-Finished flush of " + count + " entries from RAM to DB in " + (System.currentTimeMillis() - start) + " milliseconds");