// plasmaCrawlLURL.java
// -----------------------
// part of YaCy
// (C) by Michael Peter Christen; mc@anomic.de
// first published on http://www.anomic.de
// Frankfurt, Germany, 2004
//
// $LastChangedDate$
// $LastChangedRevision$
// $LastChangedBy$
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
//
// Using this software in any meaning (reading, learning, copying, compiling,
// running) means that you agree that the Author(s) is (are) not responsible
// for cost, loss of data or any harm that may be caused directly or indirectly
// by usage of this software or this documentation. The usage of this software
// is on your own risk. The installation and usage (starting/running) of this
// software may allow other people or application to access your computer and
// any attached devices and is highly dependent on the configuration of the
// software which must be done by the user of the software; the author(s) is
// (are) also not responsible for proper configuration and usage of the
// software, even if provoked by documentation provided together with
// the software.
// // Any changes to this file according to the GPL as documented in the file // gpl.txt aside this file in the shipment you received can be done to the // lines that follows this copyright notice here, but changes must not be // done inside the copyright notive above. A re-distribution must contain // the intact and unchanged copyright notice. // Contributions and changes to the program code must be marked as such. /* This class provides storage functions for the plasma search engine. - the url-specific properties, including condenser results - the text content of the url Both entities are accessed with a hash, which is based on the MD5 algorithm. The MD5 is not encoded as a hex value, but a b64 value. */ package de.anomic.plasma; import java.io.File; import java.io.IOException; import java.net.MalformedURLException; import java.text.SimpleDateFormat; import java.util.Date; import java.util.HashSet; import java.util.Iterator; import java.util.LinkedList; import java.util.Locale; import de.anomic.http.httpc; import de.anomic.http.httpc.response; import de.anomic.index.indexEntry; import de.anomic.index.indexURL; import de.anomic.kelondro.kelondroBufferedIndex; import de.anomic.kelondro.kelondroCachedIndex; import de.anomic.kelondro.kelondroFlexSplitTable; import de.anomic.kelondro.kelondroBase64Order; import de.anomic.kelondro.kelondroRow; import de.anomic.kelondro.kelondroTree; import de.anomic.net.URL; import de.anomic.plasma.urlPattern.plasmaURLPattern; import de.anomic.server.serverCodings; import de.anomic.server.serverObjects; import de.anomic.server.logging.serverLog; import de.anomic.tools.bitfield; import de.anomic.tools.nxTools; import de.anomic.yacy.yacyCore; import de.anomic.yacy.yacySeed; public final class plasmaCrawlLURL extends indexURL { // result stacks; // these have all entries of form // strings: urlHash + initiatorHash + ExecutorHash private final LinkedList externResultStack; // 1 - remote index: retrieved by other peer private final 
LinkedList searchResultStack; // 2 - partly remote/local index: result of search queries private final LinkedList transfResultStack; // 3 - partly remote/local index: result of index transfer private final LinkedList proxyResultStack; // 4 - local index: result of proxy fetch/prefetch private final LinkedList lcrawlResultStack; // 5 - local index: result of local crawling private final LinkedList gcrawlResultStack; // 6 - local index: triggered external private boolean newdb; public plasmaCrawlLURL(File plasmaPath, File indexPath, int bufferkb, long preloadTime, boolean newdb) { super(); this.newdb = newdb; try { if (newdb) { urlIndexFile = new kelondroBufferedIndex(new kelondroCachedIndex(new kelondroFlexSplitTable(new File(indexPath, "PUBLIC/TEXT"), "urls", bufferkb / 2 * 0x400, preloadTime, plasmaCrawlLURLNewEntry.rowdef, kelondroBase64Order.enhancedCoder), bufferkb / 2 * 0x400)); } else { File oldLURLDB = new File(plasmaPath, "urlHash.db"); oldLURLDB.getParentFile().mkdirs(); urlIndexFile = new kelondroBufferedIndex(new kelondroCachedIndex(new kelondroTree(oldLURLDB, bufferkb / 2 * 0x400, preloadTime, plasmaCrawlLURLOldEntry.rowdef), bufferkb / 2 * 0x400)); } } catch (IOException e) { e.printStackTrace(); System.exit(-1); } // init result stacks externResultStack = new LinkedList(); searchResultStack = new LinkedList(); transfResultStack = new LinkedList(); proxyResultStack = new LinkedList(); lcrawlResultStack = new LinkedList(); gcrawlResultStack = new LinkedList(); } public synchronized void stack(plasmaCrawlLURLEntry e, String initiatorHash, String executorHash, int stackType) { if (e == null) { return; } try { if (initiatorHash == null) { initiatorHash = dummyHash; } if (executorHash == null) { executorHash = dummyHash; } switch (stackType) { case 0: break; case 1: externResultStack.add(e.hash() + initiatorHash + executorHash); break; case 2: searchResultStack.add(e.hash() + initiatorHash + executorHash); break; case 3: transfResultStack.add(e.hash() + 
initiatorHash + executorHash); break; case 4: proxyResultStack.add(e.hash() + initiatorHash + executorHash); break; case 5: lcrawlResultStack.add(e.hash() + initiatorHash + executorHash); break; case 6: gcrawlResultStack.add(e.hash() + initiatorHash + executorHash); break; } return; } catch (Exception ex) { System.out.println("INTERNAL ERROR in newEntry/2: " + ex.toString()); return; } } public synchronized void notifyGCrawl(String urlHash, String initiatorHash, String executorHash) { gcrawlResultStack.add(urlHash + initiatorHash + executorHash); } public synchronized void flushCacheSome() { try { ((kelondroBufferedIndex) urlIndexFile).flushSome(); } catch (IOException e) {} } public synchronized int writeCacheSize() { return ((kelondroBufferedIndex) urlIndexFile).writeBufferSize(); } public synchronized plasmaCrawlLURLEntry load(String urlHash, indexEntry searchedWord) { // generates an plasmaLURLEntry using the url hash // to speed up the access, the url-hashes are buffered // in the hash cache. 
// we have two options to find the url: // - look into the hash cache // - look into the filed properties // if the url cannot be found, this returns null try { kelondroRow.Entry entry = urlIndexFile.get(urlHash.getBytes()); if (entry == null) return null; if (newdb) return new plasmaCrawlLURLNewEntry(entry, searchedWord); else return new plasmaCrawlLURLOldEntry(entry, searchedWord); } catch (IOException e) { return null; } } public synchronized void store(plasmaCrawlLURLEntry entry) throws IOException { // Check if there is a more recent Entry already in the DB plasmaCrawlLURLEntry oldEntry; try { if (exists(entry.hash())) { oldEntry = load(entry.hash(), null); } else { oldEntry = null; } } catch (Exception e) { oldEntry = null; } if ((oldEntry != null) && (entry.isOlder(oldEntry))) { // the fetched oldEntry is better, so return its properties instead of the new ones // this.urlHash = oldEntry.urlHash; // unnecessary, should be the same // this.url = oldEntry.url; // unnecessary, should be the same entry = oldEntry; return; // this did not need to be stored, but is updated } urlIndexFile.put(entry.toRowEntry(), entry.loaddate()); } public synchronized plasmaCrawlLURLEntry newEntry(String propStr) { if (propStr.startsWith("{") && propStr.endsWith("}")) { if (newdb) return new plasmaCrawlLURLNewEntry(serverCodings.s2p(propStr.substring(1, propStr.length() - 1))); else return new plasmaCrawlLURLOldEntry(serverCodings.s2p(propStr.substring(1, propStr.length() - 1))); } else { return null; } } public synchronized plasmaCrawlLURLEntry newEntry( URL url, String descr, String author, String tags, String ETag, Date mod, Date load, Date fresh, String referrer, byte[] md5, long size, int wc, char dt, bitfield flags, String lang, int llocal, int lother, int laudio, int limage, int lvideo, int lapp) { if (newdb) return new plasmaCrawlLURLNewEntry(url, descr, author, tags, ETag, mod, load, fresh, referrer, md5, size, wc, dt, flags, lang, llocal, lother, laudio, limage, lvideo, 
lapp); else return new plasmaCrawlLURLOldEntry(url, descr, author, tags, ETag, mod, load, fresh, referrer, md5, size, wc, dt, flags, lang, llocal, lother, laudio, limage, lvideo, lapp); } public synchronized int getStackSize(int stack) { switch (stack) { case 1: return externResultStack.size(); case 2: return searchResultStack.size(); case 3: return transfResultStack.size(); case 4: return proxyResultStack.size(); case 5: return lcrawlResultStack.size(); case 6: return gcrawlResultStack.size(); } return -1; } public synchronized String getUrlHash(int stack, int pos) { switch (stack) { case 1: return ((String) externResultStack.get(pos)).substring(0, urlHashLength); case 2: return ((String) searchResultStack.get(pos)).substring(0, urlHashLength); case 3: return ((String) transfResultStack.get(pos)).substring(0, urlHashLength); case 4: return ((String) proxyResultStack.get(pos)).substring(0, urlHashLength); case 5: return ((String) lcrawlResultStack.get(pos)).substring(0, urlHashLength); case 6: return ((String) gcrawlResultStack.get(pos)).substring(0, urlHashLength); } return null; } public synchronized String getInitiatorHash(int stack, int pos) { switch (stack) { case 1: return ((String) externResultStack.get(pos)).substring(urlHashLength, urlHashLength * 2); case 2: return ((String) searchResultStack.get(pos)).substring(urlHashLength, urlHashLength * 2); case 3: return ((String) transfResultStack.get(pos)).substring(urlHashLength, urlHashLength * 2); case 4: return ((String) proxyResultStack.get(pos)).substring(urlHashLength, urlHashLength * 2); case 5: return ((String) lcrawlResultStack.get(pos)).substring(urlHashLength, urlHashLength * 2); case 6: return ((String) gcrawlResultStack.get(pos)).substring(urlHashLength, urlHashLength * 2); } return null; } public synchronized String getExecutorHash(int stack, int pos) { switch (stack) { case 1: return ((String) externResultStack.get(pos)).substring(urlHashLength * 2, urlHashLength * 3); case 2: return ((String) 
searchResultStack.get(pos)).substring(urlHashLength * 2, urlHashLength * 3); case 3: return ((String) transfResultStack.get(pos)).substring(urlHashLength * 2, urlHashLength * 3); case 4: return ((String) proxyResultStack.get(pos)).substring(urlHashLength * 2, urlHashLength * 3); case 5: return ((String) lcrawlResultStack.get(pos)).substring(urlHashLength * 2, urlHashLength * 3); case 6: return ((String) gcrawlResultStack.get(pos)).substring(urlHashLength * 2, urlHashLength * 3); } return null; } public synchronized boolean removeStack(int stack, int pos) { Object prevElement = null; switch (stack) { case 1: prevElement = externResultStack.remove(pos); break; case 2: prevElement = searchResultStack.remove(pos); break; case 3: prevElement = transfResultStack.remove(pos); break; case 4: prevElement = proxyResultStack.remove(pos); break; case 5: prevElement = lcrawlResultStack.remove(pos); break; case 6: prevElement = gcrawlResultStack.remove(pos); break; } return prevElement != null; } public synchronized void clearStack(int stack) { switch (stack) { case 1: externResultStack.clear(); break; case 2: searchResultStack.clear(); break; case 3: transfResultStack.clear(); break; case 4: proxyResultStack.clear(); break; case 5: lcrawlResultStack.clear(); break; case 6: gcrawlResultStack.clear(); break; } } public synchronized boolean remove(String urlHash) { if (urlHash == null) return false; try { kelondroRow.Entry r = urlIndexFile.remove(urlHash.getBytes()); if (r == null) return false; for (int stack = 1; stack <= 6; stack++) { for (int i = getStackSize(stack) - 1; i >= 0; i--) { if (getUrlHash(stack, i).equals(urlHash)) { removeStack(stack, i); return true; } } } return true; } catch (IOException e) { return false; } } public synchronized boolean exists(String urlHash) { try { return (urlIndexFile.get(urlHash.getBytes()) != null); } catch (IOException e) { return false; } } private static SimpleDateFormat dayFormatter = new SimpleDateFormat("yyyy/MM/dd", Locale.US); 
/** Formats a date as "yyyy/MM/dd"; null dates render as the empty string. */
    private static String daydate(Date date) {
        if (date == null) {
            return "";
        } else {
            return dayFormatter.format(date);
        }
    }

    /**
     * Builds the template properties ("table_...") for rendering one result
     * stack as an HTML table, iterating the newest {@code lines} entries of
     * stack {@code tabletype} from top to bottom.
     *
     * @param tabletype    result stack number (1..6)
     * @param lines        maximum number of rows to emit (clamped to stack size)
     * @param showInit     whether to show the initiator peer column
     * @param showExec     whether to show the executor peer column
     * @param dfltInit     name shown when the initiator peer is not connected
     * @param dfltExec     name shown when the executor peer is not connected
     * @param feedbackpage page name put into the table for delete/feedback links
     * @param makeLink     whether the url cell should be rendered as a link
     * @return serverObjects property map consumed by the servlet template engine
     */
    public serverObjects genTableProps(int tabletype, int lines, boolean showInit, boolean showExec, String dfltInit, String dfltExec, String feedbackpage, boolean makeLink) {
        /* serverLog.logFinest("PLASMA", "plasmaCrawlLURL/genTableProps tabletype=" + tabletype + " lines=" + lines + " showInit=" + showInit + " showExec=" + showExec + " dfltInit=" + dfltInit + " dfltExec=" + dfltExec + " feedbackpage=" + feedbackpage + " makeLink=" + makeLink); */
        final serverObjects prop = new serverObjects();
        // empty stack: emit "table=0" and nothing else
        if (getStackSize(tabletype) == 0) {
            prop.put("table", 0);
            return prop;
        }
        prop.put("table", 1);
        if (lines > getStackSize(tabletype)) lines = getStackSize(tabletype);
        // "table_size=1" signals a truncated view (fewer rows than stack entries)
        if (lines == getStackSize(tabletype)) {
            prop.put("table_size", 0);
        } else {
            prop.put("table_size", 1);
            prop.put("table_size_count", lines);
        }
        prop.put("table_size_all", getStackSize(tabletype));
        prop.put("table_feedbackpage", feedbackpage);
        prop.put("table_tabletype", tabletype);
        prop.put("table_showInit", (showInit) ? 1 : 0);
        prop.put("table_showExec", (showExec) ? 1 : 0);
        boolean dark = true; // alternating row shading flag
        String urlHash, initiatorHash, executorHash;
        String cachepath, urlstr, urltxt;
        yacySeed initiatorSeed, executorSeed;
        plasmaCrawlLURLEntry urle;
        // needed for getCachePath(url)
        final plasmaSwitchboard switchboard = plasmaSwitchboard.getSwitchboard();
        final plasmaHTCache cacheManager = switchboard.getCacheManager();
        int i, cnt = 0;
        // walk the stack from its newest entry downwards
        for (i = getStackSize(tabletype) - 1; i >= (getStackSize(tabletype) - lines); i--) {
            initiatorHash = getInitiatorHash(tabletype, i);
            executorHash = getExecutorHash(tabletype, i);
            // serverLog.logFinest("PLASMA", "plasmaCrawlLURL/genTableProps initiatorHash=" + initiatorHash + " executorHash=" + executorHash);
            urlHash = getUrlHash(tabletype, i);
            // serverLog.logFinest("PLASMA", "plasmaCrawlLURL/genTableProps urlHash=" + urlHash);
            try {
                // NOTE(review): load() may return null for a vanished hash; the
                // resulting NPE is swallowed by the catch below and the row skipped
                urle = load(urlHash, null);
                plasmaCrawlLURLEntry.Components comp = urle.comp();
                // serverLog.logFinest("PLASMA", "plasmaCrawlLURL/genTableProps urle=" + urle.toString());
                initiatorSeed = yacyCore.seedDB.getConnected(initiatorHash);
                executorSeed = yacyCore.seedDB.getConnected(executorHash);
                urlstr = comp.url().toNormalform();
                urltxt = nxTools.shortenURLString(urlstr, 72); // shorten the string text like a URL
                // path of the cached copy, relative to the cache root, with '/' separators
                cachepath = cacheManager.getCachePath(new URL(urlstr)).toString().replace('\\', '/').substring(cacheManager.cachePath.toString().length() + 1);
                prop.put("table_indexed_" + cnt + "_dark", (dark) ? 1 : 0);
                prop.put("table_indexed_" + cnt + "_feedbackpage", feedbackpage);
                prop.put("table_indexed_" + cnt + "_tabletype", tabletype);
                prop.put("table_indexed_" + cnt + "_urlhash", urlHash);
                prop.put("table_indexed_" + cnt + "_showInit", (showInit) ? 1 : 0);
                prop.put("table_indexed_" + cnt + "_showInit_initiatorSeed", (initiatorSeed == null) ? dfltInit : initiatorSeed.getName());
                prop.put("table_indexed_" + cnt + "_showExec", (showExec) ? 1 : 0);
                prop.put("table_indexed_" + cnt + "_showExec_executorSeed", (executorSeed == null) ? dfltExec : executorSeed.getName());
                prop.put("table_indexed_" + cnt + "_moddate", daydate(urle.moddate()));
                prop.put("table_indexed_" + cnt + "_wordcount", urle.wordCount());
                prop.put("table_indexed_" + cnt + "_urldescr", comp.descr());
                // NOTE(review): the makeLink branch concatenates empty strings around
                // urltxt - anchor markup appears to have been lost here; compare with VCS.
                // Also, cachepath is never null at this point (toString() result).
                prop.put("table_indexed_" + cnt + "_url", (cachepath == null) ? "-not-cached-" : ((makeLink) ? ("" + urltxt + "") : urlstr));
                dark = !dark;
                cnt++;
            } catch (Exception e) {
                serverLog.logSevere("PLASMA", "genTableProps", e);
            }
        }
        prop.put("table_indexed", cnt);
        return prop;
    }

    /**
     * Iterator over all stored url entries, in hash order.
     * next() may throw a RuntimeException carrying the offending hash when an
     * entry cannot be decoded (urldbcleanup relies on that message format).
     */
    public class kiter implements Iterator {
        // enumerates entry elements
        Iterator i;          // underlying kelondro row iterator
        boolean error = false;

        public kiter(boolean up, boolean rotating, String firstHash) throws IOException {
            i = urlIndexFile.rows(up, rotating, (firstHash == null) ? null : firstHash.getBytes());
            error = false;
        }

        public boolean hasNext() {
            if (error) return false;
            return i.hasNext();
        }

        public Object next() throws RuntimeException {
            kelondroRow.Entry e = (kelondroRow.Entry) i.next();
            if (e == null) return null;
            try {
                // wrap the raw row in the entry class matching the db layout
                if (newdb)
                    return new plasmaCrawlLURLNewEntry(e, null);
                else
                    return new plasmaCrawlLURLOldEntry(e, null);
            } catch (IOException ex) {
                // the hash is appended so callers can identify the damaged entry
                throw new RuntimeException("error '" + ex.getMessage() + "' for hash " + e.getColString(0, null));
            }
        }

        public void remove() {
            i.remove();
        }
    }

    /** @return a kiter over all entries, starting at firstHash (or the first entry if null) */
    public Iterator entries(boolean up, boolean rotating, String firstHash) throws IOException {
        // enumerates entry elements
        return new kiter(up, rotating, firstHash);
    }

    /**
     * Uses an Iteration over urlHash.db to detect malformed URL-Entries.
     * Damaged URL-Entries will be marked in a HashSet and removed at the end of the function.
     *
     * @param homePath Root-Path where all information is to be found.
*/ public void urldbcleanup() { serverLog log = new serverLog("URLDBCLEANUP"); HashSet damagedURLS = new HashSet(); try { Iterator eiter = entries(true, false, null); int iteratorCount = 0; while (eiter.hasNext()) try { eiter.next(); iteratorCount++; } catch (RuntimeException e) { if(e.getMessage() != null) { String m = e.getMessage(); damagedURLS.add(m.substring(m.length() - 12)); } else { log.logSevere("RuntimeException:", e); } } try { Thread.sleep(1000); } catch (InterruptedException e) { } log.logInfo("URLs vorher: " + size() + " Entries loaded during Iteratorloop: " + iteratorCount + " kaputte URLs: " + damagedURLS.size()); Iterator eiter2 = damagedURLS.iterator(); String urlHash; while (eiter2.hasNext()) { urlHash = (String) eiter2.next(); // trying to fix the invalid URL httpc theHttpc = null; String oldUrlStr = null; try { // getting the url data as byte array kelondroRow.Entry entry = urlIndexFile.get(urlHash.getBytes()); // getting the wrong url string oldUrlStr = entry.getColString(1, null).trim(); int pos = -1; if ((pos = oldUrlStr.indexOf("://")) != -1) { // trying to correct the url String newUrlStr = "http://" + oldUrlStr.substring(pos + 3); URL newUrl = new URL(newUrlStr); // doing a http head request to test if the url is correct theHttpc = httpc.getInstance(newUrl.getHost(), newUrl.getHost(), newUrl.getPort(), 30000, false); response res = theHttpc.HEAD(newUrl.getPath(), null); if (res.statusCode == 200) { entry.setCol(1, newUrl.toString().getBytes()); urlIndexFile.put(entry); log.logInfo("UrlDB-Entry with urlHash '" + urlHash + "' corrected\n\tURL: " + oldUrlStr + " -> " + newUrlStr); } else { remove(urlHash); log.logInfo("UrlDB-Entry with urlHash '" + urlHash + "' removed\n\tURL: " + oldUrlStr + "\n\tConnection Status: " + res.status); } } } catch (Exception e) { remove(urlHash); log.logInfo("UrlDB-Entry with urlHash '" + urlHash + "' removed\n\tURL: " + oldUrlStr + "\n\tExecption: " + e.getMessage()); } finally { if (theHttpc != null) try { 
theHttpc.close(); httpc.returnInstance(theHttpc); } catch (Exception e) { } } } log.logInfo("URLs nachher: " + size() + " kaputte URLs: " + damagedURLS.size()); } catch (IOException e) { log.logSevere("IOException", e); } } // The Cleaner class was provided as "UrldbCleaner" by Hydrox // see http://www.yacy-forum.de/viewtopic.php?p=18093#18093 public Cleaner makeCleaner() { return new Cleaner(); } public class Cleaner extends Thread { private boolean run = true; private boolean pause = false; public int blacklistedUrls = 0; public int totalSearchedUrls = 1; public String lastBlacklistedUrl = ""; public String lastBlacklistedHash = ""; public String lastUrl = ""; public String lastHash = ""; public Cleaner() { } public void run() { try { serverLog.logInfo("URLDBCLEANER", "UrldbCleaner-Thread startet"); Iterator eiter = entries(true, false, null); while (eiter.hasNext() && run) { synchronized(this) { if (this.pause) { try { this.wait(); } catch (InterruptedException e) { serverLog.logWarning("URLDBCLEANER", "InterruptedException", e); this.run = false; return; } } } plasmaCrawlLURLEntry entry = (plasmaCrawlLURLEntry) eiter.next(); plasmaCrawlLURLEntry.Components comp = entry.comp(); totalSearchedUrls++; if (plasmaSwitchboard.urlBlacklist.isListed(plasmaURLPattern.BLACKLIST_CRAWLER, comp.url()) || plasmaSwitchboard.urlBlacklist.isListed(plasmaURLPattern.BLACKLIST_DHT, comp.url())) { lastBlacklistedUrl = comp.url().toNormalform(); lastBlacklistedHash = entry.hash(); serverLog.logFine("URLDBCLEANER", ++blacklistedUrls + " blacklisted (" + ((double)blacklistedUrls/totalSearchedUrls)*100 + "%): " + entry.hash() + " " + comp.url().toNormalform()); remove(entry.hash()); if (blacklistedUrls % 100 == 0) { serverLog.logInfo("URLDBCLEANER", "Deleted " + blacklistedUrls + " URLs until now. 
Last deleted URL-Hash: " + lastBlacklistedUrl); } } lastUrl = comp.url().toNormalform(); lastHash = entry.hash(); } } catch (RuntimeException e) { if (e.getMessage() != null && e.getMessage().indexOf("not found in LURL") != -1) { serverLog.logWarning("URLDBCLEANER", "urlHash not found in LURL", e); } else { serverLog.logWarning("URLDBCLEANER", "RuntimeException", e); run = false; } } catch (IOException e) { e.printStackTrace(); run = false; } serverLog.logInfo("URLDBCLEANER", "UrldbCleaner-Thread stopped"); } public void abort() { synchronized(this) { run = false; this.notifyAll(); } } public void pause() { synchronized(this) { if (!pause) { pause = true; serverLog.logInfo("URLDBCLEANER", "UrldbCleaner-Thread paused"); } } } public void endPause() { synchronized(this) { if (pause) { pause = false; this.notifyAll(); serverLog.logInfo("URLDBCLEANER", "UrldbCleaner-Thread resumed"); } } } } public static void main(String[] args) { // test-generation of url hashes for debugging // one argument requires, will be treated as url // returns url-hash if (args[0].equals("-h")) try { // arg 1 is url System.out.println("HASH: " + urlHash(new URL(args[1]))); } catch (MalformedURLException e) {} if (args[0].equals("-l")) try { // arg 1 is path to URLCache final plasmaCrawlLURL urls = new plasmaCrawlLURL(new File(args[1]), new File(args[2]), 1, 0, false); final Iterator enu = urls.entries(true, false, null); while (enu.hasNext()) { System.out.println(((plasmaCrawlLURLEntry) enu.next()).toString()); } } catch (Exception e) { e.printStackTrace(); } } }