yacy_search_server/source/de/anomic/plasma/plasmaCrawlProfile.java
orbiter 66964dc015 removed high/med/low from kelondroRecords cache control.
This was done because testing showed that cache-delete operations
slowed down record access the most, even more than actual IO operations.
Cache-delete operations appeared when entries were shifted from low-priority
positions to high-priority positions. During a fill of x entries into a database,
about x/2 of these shift situations happen, each causing two or more delete operations.
Removing the cache control means that these delete operations are no longer
necessary, but it becomes more difficult to decide which cache elements
shall be removed when the cache is full. There is not yet a stable
solution for this case, but the advantage of a faster cache outweighs
the flush problem.
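
To make the removed mechanism concrete, here is a minimal, hypothetical sketch (invented class and method names, not actual kelondro code): in a priority-tiered cache, every promotion of an entry costs a delete on the old tier plus an insert on the new one, while a flat cache serves the same access with a single lookup.

import java.util.HashMap;
import java.util.Map;

class TieredCacheSketch {
    private final Map low = new HashMap();  // low-priority tier
    private final Map high = new HashMap(); // high-priority tier
    // an access promotes the entry: one delete plus one insert
    Object get(Object key) {
        Object v = high.get(key);
        if (v != null) return v;
        v = low.remove(key);             // the costly cache-delete operation
        if (v != null) high.put(key, v); // re-insert at high priority
        return v;
    }
}

class FlatCacheSketch {
    private final Map cache = new HashMap(); // single tier: no promotions, no deletes on access
    Object get(Object key) {
        return cache.get(key);
    }
}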

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@2244 6c8d7289-2bf4-0310-a012-ef5d649a1542
2006-06-25 10:31:38 +00:00

474 lines
19 KiB
Java

// plasmaCrawlProfile.java
// ------------------------
// part of YaCy
// (C) by Michael Peter Christen; mc@anomic.de
// first published on http://www.anomic.de
// Frankfurt, Germany, 2004
// last major change: 25.02.2004
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
//
// Using this software in any meaning (reading, learning, copying, compiling,
// running) means that you agree that the Author(s) is (are) not responsible
// for cost, loss of data or any harm that may be caused directly or indirectly
// by usage of this software or this documentation. The usage of this software
// is at your own risk. The installation and usage (starting/running) of this
// software may allow other people or applications to access your computer and
// any attached devices and is highly dependent on the configuration of the
// software which must be done by the user of the software; the author(s) is
// (are) also not responsible for proper configuration and usage of the
// software, even if provoked by documentation provided together with
// the software.
//
// Any changes to this file according to the GPL as documented in the file
// gpl.txt aside this file in the shipment you received can be done to the
// lines that follow this copyright notice here, but changes must not be
// done inside the copyright notice above. A re-distribution must contain
// the intact and unchanged copyright notice.
// Contributions and changes to the program code must be marked as such.
package de.anomic.plasma;
import java.io.File;
import java.io.IOException;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Map;
import de.anomic.index.indexURL;
import de.anomic.kelondro.kelondroBase64Order;
import de.anomic.kelondro.kelondroDyn;
import de.anomic.kelondro.kelondroException;
import de.anomic.kelondro.kelondroMap;
import de.anomic.server.serverCodings;
public class plasmaCrawlProfile {
private kelondroMap profileTable;
private HashMap domsCache;
private File profileTableFile;
private int bufferkb;
public plasmaCrawlProfile(File file, int bufferkb) {
this.profileTableFile = file;
this.bufferkb = bufferkb; // keep the buffer size so resetDatabase() can re-create the table with the same cache size
kelondroDyn dyn = null;
if (profileTableFile.exists()) try {
dyn = new kelondroDyn(file, bufferkb * 1024, '#');
} catch (IOException e) {
profileTableFile.delete();
dyn = new kelondroDyn(file, bufferkb * 1024, indexURL.urlCrawlProfileHandleLength, 2000, '#', true);
} else {
profileTableFile.getParentFile().mkdirs();
dyn = new kelondroDyn(file, bufferkb * 1024, indexURL.urlCrawlProfileHandleLength, 2000, '#', true);
}
profileTable = new kelondroMap(dyn);
domsCache = new HashMap();
}
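// pass-through accessors for the kelondro cache statistics of the profile table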
public int dbCacheNodeChunkSize() {
return profileTable.cacheNodeChunkSize();
}
public int[] dbCacheNodeStatus() {
return profileTable.cacheNodeStatus();
}
public String[] dbCacheObjectStatus() {
return profileTable.cacheObjectStatus();
}
private void resetDatabase() {
// deletes the profile database and creates a new one
if (profileTable != null) try { profileTable.close(); } catch (IOException e) {}
if (!(profileTableFile.delete())) throw new RuntimeException("cannot delete crawl profile database");
profileTableFile.getParentFile().mkdirs();
profileTable = new kelondroMap(new kelondroDyn(profileTableFile, bufferkb * 1024, indexURL.urlCrawlProfileHandleLength, 2000, '#', true));
}
public void close() {
try {
profileTable.close();
} catch (IOException e) {}
}
public int size() {
return profileTable.size();
}
public Iterator profiles(boolean up) {
// enumerates profile entries
try {
return new profileIterator(up);
} catch (IOException e) {
return new HashSet().iterator();
}
}
public class profileIterator implements Iterator {
// the iterator returns the profile handles as String objects
kelondroDyn.dynKeyIterator handleIterator;
String lastkey;
public profileIterator(boolean up) throws IOException {
handleIterator = profileTable.keys(up, false);
lastkey = null;
}
public boolean hasNext() {
try {
return handleIterator.hasNext();
} catch (kelondroException e) {
resetDatabase();
return false;
}
}
public Object next() {
try {
lastkey = (String) handleIterator.next();
return getEntry(lastkey);
} catch (kelondroException e) {
resetDatabase();
return null;
}
}
public void remove() {
if (lastkey != null) try {
removeEntry(lastkey);
} catch (kelondroException e) {
resetDatabase();
}
}
}
public void removeEntry(String handle) {
try {
profileTable.remove(handle);
} catch (IOException e) {}
}
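// creates and persists a new entry; if the write fails, the database
// is reset once and the write is retried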
public entry newEntry(Map mem) {
entry ne = new entry(mem);
try {
profileTable.set(ne.handle(), ne.map());
} catch (kelondroException e) {
resetDatabase();
try {
profileTable.set(ne.handle(), ne.map());
} catch (IOException ee) {
e.printStackTrace();
System.exit(0);
}
} catch (IOException e) {
resetDatabase();
try {
profileTable.set(ne.handle(), ne.map());
} catch (IOException ee) {
e.printStackTrace();
System.exit(0);
}
}
return ne;
}
public entry newEntry(String name, String startURL, String generalFilter, String specificFilter,
int generalDepth, int specificDepth,
int recrawlIfOlder /*minutes*/, int domFilterDepth, int domMaxPages,
boolean crawlingQ,
boolean storeHTCache, boolean storeTXCache,
boolean localIndexing, boolean remoteIndexing,
boolean xsstopw, boolean xdstopw, boolean xpstopw) {
entry ne = new entry(name, startURL, generalFilter, specificFilter,
generalDepth, specificDepth,
recrawlIfOlder, domFilterDepth, domMaxPages,
crawlingQ, storeHTCache, storeTXCache, localIndexing, remoteIndexing,
xsstopw, xdstopw, xpstopw);
try {
profileTable.set(ne.handle(), ne.map());
} catch (kelondroException e) {
resetDatabase();
try {
profileTable.set(ne.handle(), ne.map());
} catch (IOException ee) {
e.printStackTrace();
System.exit(0);
}
} catch (IOException e) {
resetDatabase();
try {
profileTable.set(ne.handle(), ne.map());
} catch (IOException ee) {
e.printStackTrace();
System.exit(0);
}
}
return ne;
}
public entry getEntry(String handle) {
try {
Map m = profileTable.get(handle);
if (m == null) return null;
return new entry(m);
} catch (IOException e) {
return null;
}
}
public class DomProfile {
public String referrer;
public int depth, count;
public DomProfile(String ref, int d) {
this.referrer = ref;
this.depth = d;
this.count = 1;
}
public void inc() {
this.count++;
}
}
public class entry {
// this is a simple record structure that holds all properties of a single crawl start
private Map mem;
private Map doms;
public entry(String name, String startURL, String generalFilter, String specificFilter,
int generalDepth, int specificDepth,
int recrawlIfOlder /*minutes*/, int domFilterDepth, int domMaxPages,
boolean crawlingQ,
boolean storeHTCache, boolean storeTXCache,
boolean localIndexing, boolean remoteIndexing,
boolean xsstopw, boolean xdstopw, boolean xpstopw) {
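// derive a quasi-unique handle: MD5 of the current time, base64-encoded and cut to the configured handle length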
String handle = kelondroBase64Order.enhancedCoder.encode(serverCodings.encodeMD5Raw(Long.toString(System.currentTimeMillis()))).substring(0, indexURL.urlCrawlProfileHandleLength);
mem = new HashMap();
mem.put("handle", handle);
mem.put("name", name);
mem.put("startURL", startURL);
mem.put("generalFilter", generalFilter);
mem.put("specificFilter", specificFilter);
mem.put("generalDepth", Integer.toString(generalDepth));
mem.put("specificDepth", Integer.toString(specificDepth));
mem.put("recrawlIfOlder", Integer.toString(recrawlIfOlder));
mem.put("domFilterDepth", Integer.toString(domFilterDepth));
mem.put("domMaxPages", Integer.toString(domMaxPages));
mem.put("crawlingQ", (crawlingQ) ? "true" : "false"); // crawling of urls with '?'
mem.put("storeHTCache", (storeHTCache) ? "true" : "false");
mem.put("storeTXCache", (storeTXCache) ? "true" : "false");
mem.put("localIndexing", (localIndexing) ? "true" : "false");
mem.put("remoteIndexing", (remoteIndexing) ? "true" : "false");
mem.put("xsstopw", (xsstopw) ? "true" : "false"); // exclude static stop-words
mem.put("xdstopw", (xdstopw) ? "true" : "false"); // exclude dynamic stop-word
mem.put("xpstopw", (xpstopw) ? "true" : "false"); // exclude parent stop-words
doms = new HashMap();
}
public String toString() {
StringBuffer str = new StringBuffer();
if (this.mem != null) {
str.append(this.mem.toString());
}
return str.toString();
}
public entry(Map mem) {
this.mem = mem;
this.doms = (HashMap) domsCache.get(this.mem.get("handle"));
if (this.doms == null) this.doms = new HashMap();
}
public Map map() {
return mem;
}
public String handle() {
String r = (String) mem.get("handle");
if (r == null) return null; else return r;
}
public String name() {
String r = (String) mem.get("name");
if (r == null) return ""; else return r;
}
public String startURL() {
String r = (String) mem.get("startURL");
if (r == null) return null; else return r;
}
public String generalFilter() {
String r = (String) mem.get("generalFilter");
if (r == null) return ".*"; else return r;
}
public String specificFilter() {
String r = (String) mem.get("specificFilter");
if (r == null) return ".*"; else return r;
}
public int generalDepth() {
String r = (String) mem.get("generalDepth");
if (r == null) return 0; else try {
return Integer.parseInt(r);
} catch (NumberFormatException e) {
return 0;
}
}
public int specificDepth() {
String r = (String) mem.get("specificDepth");
if (r == null) return 0; else try {
return Integer.parseInt(r);
} catch (NumberFormatException e) {
return 0;
}
}
public long recrawlIfOlder() {
// returns a long (millis) that is the minimum age that
// an entry must have to be re-crawled
String r = (String) mem.get("recrawlIfOlder");
if (r == null) return Long.MAX_VALUE; else try {
long l = Long.parseLong(r) * ((long) 60000);
if (l < 0) return Long.MAX_VALUE; else return l;
} catch (NumberFormatException e) {
return 0;
}
}
public int domFilterDepth() {
// if the crawl depth is less than or equal to this value,
// then the current url feeds the crawl filter with its domain
// if this is -1, all domains are fed
String r = (String) mem.get("domFilterDepth");
if (r == null) return Integer.MAX_VALUE; else try {
int i = Integer.parseInt(r);
if (i < 0) return Integer.MAX_VALUE;
return i;
} catch (NumberFormatException e) {
return Integer.MAX_VALUE;
}
}
public int domMaxPages() {
// this is the maximum number of pages that are crawled for a single domain
// if -1, this means no limit
String r = (String) mem.get("domMaxPages");
if (r == null) return Integer.MAX_VALUE; else try {
int i = Integer.parseInt(r);
if (i < 0) return Integer.MAX_VALUE;
return i;
} catch (NumberFormatException e) {
return Integer.MAX_VALUE;
}
}
public boolean crawlingQ() {
String r = (String) mem.get("crawlingQ");
if (r == null) return false; else return (r.equals("true"));
}
public boolean storeHTCache() {
String r = (String) mem.get("storeHTCache");
if (r == null) return false; else return (r.equals("true"));
}
public boolean storeTXCache() {
String r = (String) mem.get("storeTXCache");
if (r == null) return false; else return (r.equals("true"));
}
public boolean localIndexing() {
String r = (String) mem.get("localIndexing");
if (r == null) return false; else return (r.equals("true"));
}
public boolean remoteIndexing() {
String r = (String) mem.get("remoteIndexing");
if (r == null) return false; else return (r.equals("true"));
}
public boolean excludeStaticStopwords() {
String r = (String) mem.get("xsstopw");
if (r == null) return false; else return (r.equals("true"));
}
public boolean excludeDynamicStopwords() {
String r = (String) mem.get("xdstopw");
if (r == null) return false; else return (r.equals("true"));
}
public boolean excludeParentStopwords() {
String r = (String) mem.get("xpstopw");
if (r == null) return false; else return (r.equals("true"));
}
public void changeEntry(String propName, String newValue) throws IOException {
mem.put(propName, newValue);
profileTable.set(handle(), mem);
}
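// counts an appearance of a domain within this crawl; the per-domain statistics
// feed the grantedDomAppearance/grantedDomCount checks below. Synchronizing on
// the interned domain string serves as a cheap per-domain lock.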
public void domInc(String domain, String referrer, int depth) {
synchronized (domain.intern()) {
DomProfile dp = (DomProfile) doms.get(domain);
if (dp == null) {
// new domain
doms.put(domain, new DomProfile(referrer, depth));
} else {
// increase counter
dp.inc();
doms.put(domain, dp);
}
}
domsCache.put(this.mem.get("handle"), doms);
}
public boolean grantedDomAppearance(String domain) {
int max = domFilterDepth();
if (max == Integer.MAX_VALUE) return true;
synchronized (domain.intern()) {
DomProfile dp = (DomProfile) doms.get(domain);
if (dp == null) {
return 0 < max;
} else {
return dp.depth <= max;
}
}
}
public boolean grantedDomCount(String domain) {
int max = domMaxPages();
if (max == Integer.MAX_VALUE) return true;
synchronized (domain.intern()) {
DomProfile dp = (DomProfile) doms.get(domain);
if (dp == null) {
return 0 < max;
} else {
return dp.count <= max;
}
}
}
public int domSize() {
return doms.size();
}
public boolean domExists(String domain) {
if (domFilterDepth() == Integer.MAX_VALUE) return true;
return doms.containsKey(domain);
}
public String domNames(boolean attr, int maxlength) {
// assembles a "<br>"-separated list of the crawled domains, optionally with
// referrer/depth/count attributes, truncated to maxlength characters
Iterator domnamesi = doms.entrySet().iterator();
StringBuffer domnames = new StringBuffer();
Map.Entry ey;
DomProfile dp;
while (domnamesi.hasNext()) {
ey = (Map.Entry) domnamesi.next();
dp = (DomProfile) ey.getValue();
domnames.append((String) ey.getKey()).append((attr) ? ("/r=" + dp.referrer + ", d=" + dp.depth + ", c=" + dp.count + " ") : " ").append("<br>");
if ((maxlength > 0) && (domnames.length() >= maxlength)) {
return domnames.substring(0, maxlength - 3) + "...";
}
}
return domnames.toString();
}
}
}
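
For orientation, a hypothetical usage sketch follows; the database path and all parameter values are invented for illustration, only the plasmaCrawlProfile API above is real.

import java.io.File;
import de.anomic.plasma.plasmaCrawlProfile;

public class CrawlProfileDemo {
    public static void main(String[] args) {
        // open (or create) the profile database with an assumed 8 kB node cache
        plasmaCrawlProfile profiles = new plasmaCrawlProfile(new File("DATA/PLASMADB/crawlProfiles0.db"), 8);
        plasmaCrawlProfile.entry p = profiles.newEntry(
                "testprofile",          // name
                "http://www.anomic.de", // start URL
                ".*", ".*",             // general and specific filter
                3, 1,                   // general and specific depth
                60 * 24,                // recrawl pages older than one day (minutes)
                -1, -1,                 // domain filter and per-domain page limit disabled
                false,                  // do not follow URLs containing '?'
                true, false,            // store HTCache, do not store TXCache
                true, false,            // local indexing on, remote indexing off
                false, false, false);   // static/dynamic/parent stop-word exclusion off
        System.out.println("created crawl profile " + p.name() + " with handle " + p.handle());
        profiles.close();
    }
}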