// IndexControl_p.java // ----------------------- // part of the AnomicHTTPD caching proxy // (C) by Michael Peter Christen; mc@anomic.de // first published on http://www.anomic.de // Frankfurt, Germany, 2004 // last change: 02.05.2004 // // This program is free software; you can redistribute it and/or modify // it under the terms of the GNU General Public License as published by // the Free Software Foundation; either version 2 of the License, or // (at your option) any later version. // // This program is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU General Public License for more details. // // You should have received a copy of the GNU General Public License // along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA // // Using this software in any meaning (reading, learning, copying, compiling, // running) means that you agree that the Author(s) is (are) not responsible // for cost, loss of data or any harm that may be caused directly or indirectly // by usage of this softare or this documentation. The usage of this software // is on your own risk. The installation and usage (starting/running) of this // software may allow other people or application to access your computer and // any attached devices and is highly dependent on the configuration of the // software which must be done by the user of the software; the author(s) is // (are) also not responsible for proper configuration and usage of the // software, even if provoked by documentation provided together with // the software. // // Any changes to this file according to the GPL as documented in the file // gpl.txt aside this file in the shipment you received can be done to the // lines that follows this copyright notice here, but changes must not be // done inside the copyright notive above. A re-distribution must contain // the intact and unchanged copyright notice. // Contributions and changes to the program code must be marked as such. // You must compile this file with // javac -classpath .:../Classes IndexControl_p.java // if the shell's current path is HTROOT import java.io.IOException; import java.net.MalformedURLException; import java.net.URL; import java.util.Enumeration; import java.util.HashSet; import java.util.HashMap; import java.util.Iterator; import java.util.NoSuchElementException; import java.util.Set; import java.util.TreeMap; import de.anomic.htmlFilter.htmlFilterContentScraper; import de.anomic.http.httpHeader; import de.anomic.plasma.plasmaCrawlLURL; import de.anomic.plasma.plasmaSwitchboard; import de.anomic.plasma.plasmaURL; import de.anomic.plasma.plasmaWordIndexEntity; import de.anomic.plasma.plasmaWordIndexEntry; import de.anomic.server.serverObjects; import de.anomic.server.serverSwitch; import de.anomic.yacy.yacyClient; import de.anomic.yacy.yacyCore; import de.anomic.yacy.yacySeed; public class IndexControl_p { public static serverObjects respond(httpHeader header, serverObjects post, serverSwitch env) { // return variable that accumulates replacements plasmaSwitchboard switchboard = (plasmaSwitchboard) env; serverObjects prop = new serverObjects(); if ((post == null) || (env == null)) { prop.put("keystring", ""); prop.put("keyhash", ""); prop.put("urlstring", ""); prop.put("urlhash", ""); prop.put("result", ""); prop.put("wcount", Integer.toString(switchboard.wordIndex.size())); prop.put("ucount", Integer.toString(switchboard.urlPool.loadedURL.size())); prop.put("otherHosts", ""); prop.put("indexDistributeChecked", (switchboard.getConfig("allowDistributeIndex", "true").equals("true")) ? "checked" : ""); prop.put("indexDistributeWhileCrawling", (switchboard.getConfig("allowDistributeIndexWhileCrawling", "true").equals("true")) ? "checked" : ""); prop.put("indexReceiveChecked", (switchboard.getConfig("allowReceiveIndex", "true").equals("true")) ? "checked" : ""); prop.put("indexReceiveBlockBlacklistChecked", (switchboard.getConfig("indexReceiveBlockBlacklist", "true").equals("true")) ? "checked" : ""); return prop; // be save } // default values String keystring = ((String) post.get("keystring")).trim(); String keyhash = ((String) post.get("keyhash")).trim(); String urlstring = ((String) post.get("urlstring")).trim(); String urlhash = ((String) post.get("urlhash")).trim(); if ((!(urlstring.startsWith("http://"))) && (!(urlstring.startsWith("https://")))) urlstring = "http://" + urlstring; prop.put("keystring", keystring); prop.put("keyhash", keyhash); prop.put("urlstring", urlstring); prop.put("urlhash", urlhash); prop.put("result", ""); // read values from checkboxes String[] urlx = post.getAll("urlhx.*"); boolean delurl = post.containsKey("delurl"); boolean delurlref = post.containsKey("delurlref"); //System.out.println("DEBUG CHECK: " + ((delurl) ? "delurl" : "") + " " + ((delurlref) ? "delurlref" : "")); if (post.containsKey("setIndexTransmission")) { boolean allowDistributeIndex = ((String) post.get("indexDistribute", "")).equals("on"); switchboard.setConfig("allowDistributeIndex", (allowDistributeIndex) ? "true" : "false"); if (allowDistributeIndex) switchboard.indexDistribution.enable(); else switchboard.indexDistribution.disable(); boolean allowDistributeIndexWhileCrawling = post.containsKey("indexDistributeWhileCrawling"); switchboard.setConfig("allowDistributeIndexWhileCrawling", (allowDistributeIndexWhileCrawling) ? "true" : "false"); if (allowDistributeIndexWhileCrawling) switchboard.indexDistribution.enableWhileCrawling(); else switchboard.indexDistribution.disableWhileCrawling(); boolean allowReceiveIndex = ((String) post.get("indexReceive", "")).equals("on"); switchboard.setConfig("allowReceiveIndex", (allowReceiveIndex) ? "true" : "false"); yacyCore.seedDB.mySeed.setFlagAcceptRemoteIndex(allowReceiveIndex); boolean indexReceiveBlockBlacklist = ((String) post.get("indexReceiveBlockBlacklist", "")).equals("on"); switchboard.setConfig("indexReceiveBlockBlacklist", (indexReceiveBlockBlacklist) ? "true" : "false"); } if (post.containsKey("keyhashdeleteall")) { if ((delurl) || (delurlref)) { // generate an urlx array try { HashSet keyhashes = new HashSet(); keyhashes.add(keyhash); plasmaWordIndexEntity index = switchboard.searchManager.searchHashes(keyhashes, 10000); Enumeration en = index.elements(true); int i = 0; urlx = new String[index.size()]; while (en.hasMoreElements()) urlx[i++] = ((plasmaWordIndexEntry) en.nextElement()).getUrlHash(); index.close(); } catch (IOException e) { urlx = new String[0]; } } if (delurlref) for (int i = 0; i < urlx.length; i++) switchboard.removeAllUrlReferences(urlx[i], true); if ((delurl) || (delurlref)) for (int i = 0; i < urlx.length; i++) switchboard.urlPool.loadedURL.remove(urlx[i]); switchboard.wordIndex.deleteIndex(keyhash); post.remove("keyhashdeleteall"); if ((keystring.length() > 0) && (plasmaWordIndexEntry.word2hash(keystring).equals(keyhash))) post.put("keystringsearch", "generated"); else post.put("keyhashsearch", "generated"); } if (post.containsKey("keyhashdelete")) { if (delurlref) for (int i = 0; i < urlx.length; i++) switchboard.removeAllUrlReferences(urlx[i], true); if ((delurl) || (delurlref)) for (int i = 0; i < urlx.length; i++) switchboard.urlPool.loadedURL.remove(urlx[i]); switchboard.wordIndex.removeEntries(keyhash, urlx, true); // this shall lead to a presentation of the list; so handle that the remaining program // thinks that it was called for a list presentation post.remove("keyhashdelete"); if ((keystring.length() > 0) && (plasmaWordIndexEntry.word2hash(keystring).equals(keyhash))) post.put("keystringsearch", "generated"); else post.put("keyhashsearch", "generated"); //prop.put("result", "Delete of relation of url hashes " + result + " to key hash " + keyhash); } if (post.containsKey("urlhashdeleteall")) { int i = switchboard.removeAllUrlReferences(urlhash, true); prop.put("result", "Deleted URL and " + i + " references from " + i + " word indexes."); } if (post.containsKey("urlhashdelete")) { plasmaCrawlLURL.Entry entry = switchboard.urlPool.loadedURL.getEntry(urlhash); URL url = entry.url(); if (url == null) { prop.put("result", "No Entry for URL hash " + urlhash + "; nothing deleted."); } else { urlstring = htmlFilterContentScraper.urlNormalform(url); prop.put("urlstring", ""); switchboard.urlPool.loadedURL.remove(urlhash); prop.put("result", "Removed URL " + urlstring); } } if (post.containsKey("keystringsearch")) { keyhash = plasmaWordIndexEntry.word2hash(keystring); prop.put("keyhash", keyhash); prop.put("urlstring", ""); prop.put("urlhash", ""); prop.put("result", genUrlList(switchboard, keyhash, keystring)); } if (post.containsKey("keyhashsearch")) { if ((keystring.length() == 0) || (!(plasmaWordIndexEntry.word2hash(keystring).equals(keyhash)))) prop.put("keystring", ""); prop.put("urlstring", ""); prop.put("urlhash", ""); prop.put("result", genUrlList(switchboard, keyhash, "")); } if (post.containsKey("keyhashtransfer")) { if ((keystring.length() == 0) || (!(plasmaWordIndexEntry.word2hash(keystring).equals(keyhash)))) prop.put("keystring", ""); prop.put("urlstring", ""); prop.put("urlhash", ""); plasmaWordIndexEntity[] indexes = new plasmaWordIndexEntity[1]; String result; long starttime = System.currentTimeMillis(); indexes[0] = switchboard.wordIndex.getEntity(keyhash, true); // built urlCache Enumeration urlEnum = indexes[0].elements(true); HashMap knownURLs = new HashMap(); HashSet unknownURLEntries = new HashSet(); plasmaWordIndexEntry indexEntry; plasmaCrawlLURL.Entry lurl; while (urlEnum.hasMoreElements()) { indexEntry = (plasmaWordIndexEntry) urlEnum.nextElement(); lurl = switchboard.urlPool.loadedURL.getEntry(indexEntry.getUrlHash()); if (lurl == null) { unknownURLEntries.add(indexEntry.getUrlHash()); } else { if (lurl.toString() == null) { switchboard.urlPool.loadedURL.remove(indexEntry.getUrlHash()); unknownURLEntries.add(indexEntry.getUrlHash()); } else { knownURLs.put(indexEntry.getUrlHash(), lurl); } } } // now delete all entries that have no url entry Iterator hashIter = unknownURLEntries.iterator(); while (hashIter.hasNext()) try { indexes[0].removeEntry((String) hashIter.next(), false); } catch (IOException e) {} // use whats remaining result = yacyClient.transferIndex(yacyCore.seedDB.getConnected(post.get("hostHash", "")), indexes, knownURLs); prop.put("result", (result == null) ? ("Successfully transferred " + indexes[0].size() + " words in " + ((System.currentTimeMillis() - starttime) / 1000) + " seconds") : result); try {indexes[0].close();} catch (IOException e) {} } if (post.containsKey("keyhashsimilar")) { Iterator hashIt = switchboard.wordIndex.wordHashes(keyhash, true, true); String result = "Sequential List of Word-Hashes:
"; String hash; int i = 0; while ((hashIt.hasNext()) && (i < 256)) { hash = (String) hashIt.next(); result += "" + hash + " " + (((i + 1) % 8 == 0) ? "
" : ""); i++; } prop.put("result", result); } if (post.containsKey("urlstringsearch")) { try { URL url = new URL(urlstring); urlhash = plasmaURL.urlHash(url); prop.put("urlhash", urlhash); plasmaCrawlLURL.Entry entry = switchboard.urlPool.loadedURL.getEntry(urlhash); prop.put("result", genUrlProfile(switchboard, entry, urlhash)); } catch (MalformedURLException e) { prop.put("urlstring", "wrong url: " + urlstring); prop.put("urlhash", ""); } } if (post.containsKey("urlhashsearch")) { plasmaCrawlLURL.Entry entry = switchboard.urlPool.loadedURL.getEntry(urlhash); URL url = entry.url(); if (url == null) { prop.put("result", "No Entry for URL hash " + urlhash); } else { urlstring = url.toString(); prop.put("urlstring", urlstring); prop.put("result", genUrlProfile(switchboard, entry, urlhash)); } } if (post.containsKey("urlhashsimilar")) { try { Iterator hashIt = switchboard.urlPool.loadedURL.urlHashes(urlhash, true); String result = "Sequential List of URL-Hashes:
"; String hash; int i = 0; while ((hashIt.hasNext()) && (i < 256)) { hash = (String) hashIt.next(); result += "" + hash + " " + (((i + 1) % 8 == 0) ? "
" : ""); i++; } prop.put("result", result); } catch (IOException e) { prop.put("result", "Error: " + e.getMessage()); } } //List known hosts yacySeed seed; int hc = 0; if ((yacyCore.seedDB != null) && (yacyCore.seedDB.sizeConnected() > 0)) { Enumeration e = yacyCore.dhtAgent.getAcceptRemoteIndexSeeds(keyhash); TreeMap hostList = new TreeMap(); while (e.hasMoreElements()) { seed = (yacySeed) e.nextElement(); if (seed != null) hostList.put(seed.get("Name", "nameless"),seed.hash); } String hostName = null; try { while ((hostName = (String) hostList.firstKey()) != null) { prop.put("hosts_" + hc + "_hosthash", hostList.get(hostName)); prop.put("hosts_" + hc + "_hostname", /*seed.hash + " " +*/ hostName); hc++; hostList.remove(hostName); } } catch (NoSuchElementException ex) {} prop.put("hosts", Integer.toString(hc)); } else { prop.put("hosts", "0"); } // insert constants prop.put("wcount", Integer.toString(switchboard.wordIndex.size())); prop.put("ucount", Integer.toString(switchboard.urlPool.loadedURL.size())); prop.put("indexDistributeChecked", (switchboard.getConfig("allowDistributeIndex", "true").equals("true")) ? "checked" : ""); prop.put("indexDistributeWhileCrawling", (switchboard.getConfig("allowDistributeIndexWhileCrawling", "true").equals("true")) ? "checked" : ""); prop.put("indexReceiveChecked", (switchboard.getConfig("allowReceiveIndex", "true").equals("true")) ? "checked" : ""); prop.put("indexReceiveBlockBlacklistChecked", (switchboard.getConfig("indexReceiveBlockBlacklist", "true").equals("true")) ? "checked" : ""); // return rewrite properties return prop; } public static String genUrlProfile(plasmaSwitchboard switchboard, plasmaCrawlLURL.Entry entry, String urlhash) { if (entry == null) return "No entry found for URL-hash " + urlhash; URL url = entry.url(); if (url == null) return "No entry found for URL-hash " + urlhash; String result = "" + "" + "" + "" + "" + "" + "" + "" + "" + "" + "" + "" + "" + "" + "
URL String" + htmlFilterContentScraper.urlNormalform(url) + "
Hash" + urlhash + "
Description" + entry.descr() + "
Modified-Date" + entry.moddate() + "
Loaded-Date" + entry.loaddate() + "
Referrer" + switchboard.urlPool.loadedURL.getEntry(entry.referrerHash()).url() + "
Doctype" + entry.doctype() + "
Copy-Count" + entry.copyCount() + "
Local-Flag" + entry.local() + "
Quality" + entry.quality() + "
Language" + entry.language() + "
Size" + entry.size() + "
Words" + entry.wordCount() + "

"; result += "
" + "" + "" + "" + "" + "
" + " this may produce unresolved references at other word indexes but they do not harm

" + "
" + " delete the reference to this url at every other word where the reference exists (very extensive, but prevents unresolved references)
" + "
"; return result; } public static String genUrlList(plasmaSwitchboard switchboard, String keyhash, String keystring) { // search for a word hash and generate a list of url links try { HashSet keyhashes = new HashSet(); keyhashes.add(keyhash); plasmaWordIndexEntity index = switchboard.searchManager.searchHashes(keyhashes, 10000); String result = ""; if (index.size() == 0) { result = "No URL entries related to this word hash " + keyhash + "."; } else { Enumeration en = index.elements(true); plasmaWordIndexEntry ie; result = "URL entries related to this word hash " + keyhash + ":
"; result += "
"; String us, uh; int i = 0; while (en.hasMoreElements()) { ie = (plasmaWordIndexEntry) en.nextElement(); uh = ie.getUrlHash(); result += ""; if (switchboard.urlPool.loadedURL.exists(uh)) { us = switchboard.urlPool.loadedURL.getEntry(uh).url().toString(); result += "" + uh + " " + us + "
"; } else { result += "" + uh + " <unresolved URL Hash>
"; } } result += "" + "" + "" + "" + "
Reference Deletion


" + "

" + "
  (= delete Word)
" + "
" + "

" + "delete also the referenced URL itself (reasonable and recommended, may produce unresolved references at other word indexes but they do not harm)" + "
" + "

" + "for every resolveable and deleted URL reference, delete the same reference at every other word where the reference exists (very extensive, but prevents further unresolved references)" + "
"; } index.close(); return result; } catch (IOException e) { return ""; } } }