yacy_search_server/htroot/CrawlProfileEditor_p.java
orbiter 3a807e10cf - added a cache for active crawl profiles to the crawl switchboard
- moved the domain cache for domain counter from the crawl switchboard to the crawl profiles. the crawl domain counter is now therefore relative for each crawl start, not for the whole crawler.

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@8018 6c8d7289-2bf4-0310-a012-ef5d649a1542
2011-11-08 15:38:08 +00:00

283 lines
14 KiB
Java

// CrawlProfileEditor_p.java
// (C) 2005, by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany
// first published 04.07.2005 on http://yacy.net
//
// This is a part of YaCy, a peer-to-peer based web search engine
//
// $LastChangedDate$
// $LastChangedRevision$
// $LastChangedBy$
//
// LICENSE
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
import java.text.DateFormat;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.TreeMap;
import java.util.regex.Pattern;
import net.yacy.cora.protocol.RequestHeader;
import net.yacy.kelondro.index.RowSpaceExceededException;
import net.yacy.kelondro.logging.Log;
import net.yacy.search.Switchboard;
import de.anomic.crawler.CrawlProfile;
import de.anomic.crawler.CrawlStacker;
import de.anomic.crawler.CrawlSwitchboard;
import de.anomic.server.serverObjects;
import de.anomic.server.serverSwitch;
import de.anomic.server.servletProperties;
public class CrawlProfileEditor_p {
private final static String CRAWL_PROFILE_PREFIX = "crawlProfiles_";
private static final String EDIT_ENTRIES_PREFIX = "edit_entries_";
private static final Set<String> ignoreNames = new HashSet<String>();
static {
ignoreNames.add(CrawlSwitchboard.CRAWL_PROFILE_PROXY);
ignoreNames.add(CrawlSwitchboard.CRAWL_PROFILE_REMOTE);
ignoreNames.add(CrawlSwitchboard.CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA);
ignoreNames.add(CrawlSwitchboard.CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT);
ignoreNames.add(CrawlSwitchboard.CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA);
ignoreNames.add(CrawlSwitchboard.CRAWL_PROFILE_SNIPPET_LOCAL_TEXT);
ignoreNames.add(CrawlSwitchboard.CRAWL_PROFILE_SURROGATE);
ignoreNames.add(CrawlSwitchboard.DBFILE_ACTIVE_CRAWL_PROFILES);
ignoreNames.add(CrawlSwitchboard.DBFILE_PASSIVE_CRAWL_PROFILES);
}
public static class eentry {
public static final int BOOLEAN = 0;
public static final int INTEGER = 1;
public static final int STRING = 2;
public final String name;
public final String label;
public final boolean readonly;
public final int type;
public eentry(final String name, final String label, final boolean readonly, final int type) {
this.name = name;
this.label = label;
this.readonly = readonly;
this.type = type;
}
}
private static final List <eentry> labels = new ArrayList<eentry>();
static {
labels.add(new eentry(CrawlProfile.NAME, "Name", true, eentry.STRING));
labels.add(new eentry(CrawlProfile.START_URL, "Start URL", true, eentry.STRING));
labels.add(new eentry(CrawlProfile.FILTER_URL_MUSTMATCH, "Must-Match Filter", false, eentry.STRING));
labels.add(new eentry(CrawlProfile.FILTER_URL_MUSTNOTMATCH, "Must-Not-Match Filter", false, eentry.STRING));
labels.add(new eentry(CrawlProfile.DEPTH, "Crawl Depth", false, eentry.INTEGER));
labels.add(new eentry(CrawlProfile.RECRAWL_IF_OLDER, "Recrawl If Older", false, eentry.INTEGER));
labels.add(new eentry(CrawlProfile.DOM_MAX_PAGES, "Domain Max. Pages", false, eentry.INTEGER));
labels.add(new eentry(CrawlProfile.CRAWLING_Q, "CrawlingQ / '?'-URLs", false, eentry.BOOLEAN));
labels.add(new eentry(CrawlProfile.INDEX_TEXT, "Index Text", false, eentry.BOOLEAN));
labels.add(new eentry(CrawlProfile.INDEX_MEDIA, "Index Media", false, eentry.BOOLEAN));
labels.add(new eentry(CrawlProfile.STORE_HTCACHE, "Store in HTCache", false, eentry.BOOLEAN));
labels.add(new eentry(CrawlProfile.REMOTE_INDEXING, "Remote Indexing", false, eentry.BOOLEAN));
labels.add(new eentry(CrawlProfile.XSSTOPW, "Static stop-words", false, eentry.BOOLEAN));
labels.add(new eentry(CrawlProfile.XDSTOPW, "Dynamic stop-words", false, eentry.BOOLEAN));
labels.add(new eentry(CrawlProfile.XPSTOPW, "Parent stop-words", false, eentry.BOOLEAN));
}
public static serverObjects respond(
final RequestHeader header,
final serverObjects post,
final serverSwitch env) {
final servletProperties prop = new servletProperties();
final Switchboard sb = (Switchboard)env;
// read post for handle
final String handle = (post == null) ? "" : post.get("handle", "");
if (post != null) {
if (post.containsKey("terminate")) try {
// termination of a crawl: shift the crawl from active to passive
final CrawlProfile p = sb.crawler.getActive(handle.getBytes());
if (p != null) sb.crawler.putPassive(handle.getBytes(), p);
// delete all entries from the crawl queue that are deleted here
sb.crawler.removeActive(handle.getBytes());
sb.crawlQueues.noticeURL.removeByProfileHandle(handle, 10000);
} catch (final RowSpaceExceededException e) {
Log.logException(e);
}
if (post.containsKey("delete")) {
// deletion of a terminated crawl profile
sb.crawler.removePassive(handle.getBytes());
}
if (post.containsKey("deleteTerminatedProfiles")) {
for (final byte[] h: sb.crawler.getPassive()) {
sb.crawler.removePassive(h);
}
}
}
// generate handle list: first sort by handle name
CrawlProfile selentry;
final Map<String, String> orderdHandles = new TreeMap<String, String>();
for (final byte[] h : sb.crawler.getActive()) {
selentry = sb.crawler.getActive(h);
if (selentry != null && !ignoreNames.contains(selentry.name())) {
orderdHandles.put(selentry.name(), selentry.handle());
}
}
// then write into pop-up menu list
int count = 0;
for (final Map.Entry<String, String> NameHandle: orderdHandles.entrySet()) {
prop.put("profiles_" + count + "_name", NameHandle.getKey());
prop.put("profiles_" + count + "_handle", NameHandle.getValue());
if (handle.equals(NameHandle.getValue())) {
prop.put("profiles_" + count + "_selected", "1");
}
count++;
}
prop.put("profiles", count);
selentry = sb.crawler.getActive(handle.getBytes());
assert selentry == null || selentry.handle() != null;
// read post for change submit
if ((post != null) && (selentry != null)) {
if (post.containsKey("submit")) {
try {
Pattern.compile(post.get(CrawlProfile.FILTER_URL_MUSTMATCH, CrawlProfile.MATCH_ALL_STRING));
Pattern.compile(post.get(CrawlProfile.FILTER_URL_MUSTNOTMATCH, CrawlProfile.MATCH_NEVER_STRING));
final Iterator<eentry> lit = labels.iterator();
eentry tee;
while (lit.hasNext()) {
tee = lit.next();
final String cval = selentry.get(tee.name);
final String val = (tee.type == eentry.BOOLEAN) ? Boolean.toString(post.containsKey(tee.name)) : post.get(tee.name, cval);
if (!cval.equals(val)) {
selentry.put(tee.name, val);
sb.crawler.putActive(selentry.handle().getBytes(), selentry);
}
}
} catch (final Exception ex) {
Log.logException(ex);
prop.put("error", "1");
prop.putHTML("error_message", ex.getMessage());
}
}
}
// generate crawl profile table
count = 0;
boolean dark = true;
final int domlistlength = (post == null) ? 160 : post.getInt("domlistlength", 160);
CrawlProfile profile;
// put active crawls into list
for (final byte[] h: sb.crawler.getActive()) {
profile = sb.crawler.getActive(h);
putProfileEntry(prop, sb.crawlStacker, profile, true, dark, count, domlistlength);
dark = !dark;
count++;
}
// put passive crawls into list
boolean existPassiveCrawls = false;
for (final byte[] h: sb.crawler.getPassive()) {
profile = sb.crawler.getPassive(h);
putProfileEntry(prop, sb.crawlStacker, profile, false, dark, count, domlistlength);
dark = !dark;
count++;
existPassiveCrawls = true;
}
prop.put("crawlProfiles", count);
prop.put("existPassiveCrawls", existPassiveCrawls ? "1" : "0");
// generate edit field
if (selentry == null) {
prop.put("edit", "0");
} else {
prop.put("edit", "1");
prop.put("edit_name", selentry.name());
prop.put("edit_handle", selentry.handle());
final Iterator<eentry> lit = labels.iterator();
count = 0;
while (lit.hasNext()) {
final eentry ee = lit.next();
final String val = selentry.get(ee.name);
prop.put(EDIT_ENTRIES_PREFIX + count + "_readonly", ee.readonly ? "1" : "0");
prop.put(EDIT_ENTRIES_PREFIX + count + "_readonly_name", ee.name);
prop.put(EDIT_ENTRIES_PREFIX + count + "_readonly_label", ee.label);
prop.put(EDIT_ENTRIES_PREFIX + count + "_readonly_type", ee.type);
if (ee.type == eentry.BOOLEAN) {
prop.put(EDIT_ENTRIES_PREFIX + count + "_readonly_type_checked",
Boolean.parseBoolean(val) ? "1" : "0");
} else {
prop.put(EDIT_ENTRIES_PREFIX + count + "_readonly_type_value", val);
}
count++;
}
prop.put("edit_entries", count);
}
return prop;
}
private static void putProfileEntry(
final servletProperties prop,
final CrawlStacker crawlStacker,
final CrawlProfile profile,
final boolean active,
final boolean dark,
final int count,
final int domlistlength) {
prop.put(CRAWL_PROFILE_PREFIX + count + "_dark", dark ? "1" : "0");
prop.put(CRAWL_PROFILE_PREFIX + count + "_name", profile.name());
prop.put(CRAWL_PROFILE_PREFIX + count + "_terminateButton", (!active || ignoreNames.contains(profile.name())) ? "0" : "1");
prop.put(CRAWL_PROFILE_PREFIX + count + "_terminateButton_handle", profile.handle());
prop.put(CRAWL_PROFILE_PREFIX + count + "_deleteButton", (active) ? "0" : "1");
prop.put(CRAWL_PROFILE_PREFIX + count + "_deleteButton_handle", profile.handle());
prop.putXML(CRAWL_PROFILE_PREFIX + count + "_startURL", profile.startURL());
prop.put(CRAWL_PROFILE_PREFIX + count + "_handle", profile.handle());
prop.put(CRAWL_PROFILE_PREFIX + count + "_depth", profile.depth());
prop.put(CRAWL_PROFILE_PREFIX + count + "_mustmatch", profile.urlMustMatchPattern().toString());
prop.put(CRAWL_PROFILE_PREFIX + count + "_mustnotmatch", profile.urlMustNotMatchPattern().toString());
prop.put(CRAWL_PROFILE_PREFIX + count + "_crawlingIfOlder", (profile.recrawlIfOlder() == 0L) ? "no re-crawl" : DateFormat.getDateTimeInstance().format(profile.recrawlIfOlder()));
prop.put(CRAWL_PROFILE_PREFIX + count + "_crawlingDomFilterDepth", "inactive");
int i = 0;
if (active && profile.domMaxPages() > 0
&& profile.domMaxPages() != Integer.MAX_VALUE) {
String item;
while (i <= domlistlength && !(item = profile.domName(true, i)).isEmpty()){
if (i == domlistlength) {
item += " ...";
}
prop.putHTML(CRAWL_PROFILE_PREFIX + count + "_crawlingDomFilterContent_" + i + "_item", item);
i++;
}
}
prop.put(CRAWL_PROFILE_PREFIX+count+"_crawlingDomFilterContent", i);
prop.put(CRAWL_PROFILE_PREFIX + count + "_crawlingDomMaxPages", (profile.domMaxPages() == Integer.MAX_VALUE) ? "unlimited" : Integer.toString(profile.domMaxPages()));
prop.put(CRAWL_PROFILE_PREFIX + count + "_withQuery", (profile.crawlingQ()) ? "1" : "0");
prop.put(CRAWL_PROFILE_PREFIX + count + "_storeCache", (profile.storeHTCache()) ? "1" : "0");
prop.put(CRAWL_PROFILE_PREFIX + count + "_indexText", (profile.indexText()) ? "1" : "0");
prop.put(CRAWL_PROFILE_PREFIX + count + "_indexMedia", (profile.indexMedia()) ? "1" : "0");
prop.put(CRAWL_PROFILE_PREFIX + count + "_remoteIndexing", (profile.remoteIndexing()) ? "1" : "0");
}
}