- added default profile for surrogate indexing

- integrated surrogate indexing into indexing queue process

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@5810 6c8d7289-2bf4-0310-a012-ef5d649a1542
This commit is contained in:
orbiter 2009-04-16 08:01:38 +00:00
parent ad78e3a59f
commit bd5f4c78d8
8 changed files with 98 additions and 4 deletions

View File

@ -221,6 +221,12 @@ proxyCache = DATA/HTCACHE
proxyCacheSize = 100
proxyCacheSize__pro = 1024
# path to the surrogate input directory; *.xml files placed here are picked up by the indexing queue
surrogates.in = DATA/SURROGATES/in
# path to the surrogate output directory; surrogate files are moved here after they have been processed
surrogates.out = DATA/SURROGATES/out
# storage place for new releases
releases = DATA/RELEASE

View File

@ -114,7 +114,8 @@ public class CrawlProfileEditor_p {
entry selentry;
while (it.hasNext()) {
selentry = it.next();
if (selentry.name().equals(plasmaWordIndex.CRAWL_PROFILE_PROXY) ||
if (selentry.name().equals(plasmaWordIndex.CRAWL_PROFILE_SURROGATE) ||
selentry.name().equals(plasmaWordIndex.CRAWL_PROFILE_PROXY) ||
selentry.name().equals(plasmaWordIndex.CRAWL_PROFILE_REMOTE) /*||
selentry.name().equals(plasmaSwitchboard.CRAWL_PROFILE_SNIPPET_TEXT) ||
selentry.name().equals(plasmaSwitchboard.CRAWL_PROFILE_SNIPPET_MEDIA)*/)

View File

@ -100,7 +100,8 @@ public class IndexCreateWWWLocalQueue_p {
name.equals(plasmaWordIndex.CRAWL_PROFILE_SNIPPET_LOCAL_TEXT) ||
name.equals(plasmaWordIndex.CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT) ||
name.equals(plasmaWordIndex.CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA) ||
name.equals(plasmaWordIndex.CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA))
name.equals(plasmaWordIndex.CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA) ||
name.equals(plasmaWordIndex.CRAWL_PROFILE_SURROGATE))
continue;
if (compiledPattern.matcher(name).find()) {
sb.webIndex.profilesActiveCrawls.removeEntry(entry.handle());

View File

@ -41,7 +41,8 @@ public class WatchWebStructure_p {
e.name().equals(plasmaWordIndex.CRAWL_PROFILE_SNIPPET_LOCAL_TEXT) ||
e.name().equals(plasmaWordIndex.CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT) ||
e.name().equals(plasmaWordIndex.CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA) ||
e.name().equals(plasmaWordIndex.CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA))
e.name().equals(plasmaWordIndex.CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA) ||
e.name().equals(plasmaWordIndex.CRAWL_PROFILE_SURROGATE))
continue;
host = e.name();
break; // take the first one

View File

@ -85,6 +85,7 @@
package de.anomic.plasma;
import java.io.BufferedInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
@ -120,8 +121,10 @@ import de.anomic.crawler.ResourceObserver;
import de.anomic.crawler.ResultImages;
import de.anomic.crawler.ResultURLs;
import de.anomic.crawler.RobotsTxt;
import de.anomic.crawler.Surrogate;
import de.anomic.crawler.ZURL;
import de.anomic.crawler.CrawlProfile.entry;
import de.anomic.crawler.IndexingStack.QueueEntry;
import de.anomic.data.Blacklist;
import de.anomic.data.URLLicense;
import de.anomic.data.blogBoard;
@ -165,6 +168,7 @@ import de.anomic.server.serverSemaphore;
import de.anomic.server.serverSwitch;
import de.anomic.server.serverThread;
import de.anomic.tools.crypt;
import de.anomic.xml.SurrogateReader;
import de.anomic.yacy.yacyClient;
import de.anomic.yacy.yacyCore;
import de.anomic.yacy.yacyNewsPool;
@ -200,6 +204,8 @@ public final class plasmaSwitchboard extends serverAbstractSwitch<IndexingStack.
public File rankingPath;
public File workPath;
public File releasePath;
public File surrogatesInPath;
public File surrogatesOutPath;
public Map<String, String> rankingPermissions;
public plasmaWordIndex webIndex;
public CrawlQueues crawlQueues;
@ -435,6 +441,14 @@ public final class plasmaSwitchboard extends serverAbstractSwitch<IndexingStack.
final long maxCacheSize = 1024 * 1024 * Long.parseLong(getConfig(plasmaSwitchboardConstants.PROXY_CACHE_SIZE, "2")); // this is megabyte
plasmaHTCache.init(htCachePath, webIndex.peers().mySeed().hash, maxCacheSize);
// create the surrogates directories
surrogatesInPath = getConfigPath(plasmaSwitchboardConstants.SURROGATES_IN_PATH, plasmaSwitchboardConstants.SURROGATES_IN_PATH_DEFAULT);
this.log.logInfo("surrogates.in Path = " + surrogatesInPath.getAbsolutePath());
surrogatesInPath.mkdirs();
surrogatesOutPath = getConfigPath(plasmaSwitchboardConstants.SURROGATES_OUT_PATH, plasmaSwitchboardConstants.SURROGATES_OUT_PATH_DEFAULT);
this.log.logInfo("surrogates.out Path = " + surrogatesOutPath.getAbsolutePath());
surrogatesOutPath.mkdirs();
// create the release download directory
releasePath = getConfigPath(plasmaSwitchboardConstants.RELEASE_PATH, plasmaSwitchboardConstants.RELEASE_PATH_DEFAULT);
releasePath.mkdirs();
@ -1166,6 +1180,38 @@ public final class plasmaSwitchboard extends serverAbstractSwitch<IndexingStack.
}
}
/**
 * Reads one surrogate file from the surrogate input directory and feeds every
 * document found in it into the indexing condensement processor.
 * <p>
 * The file is parsed by a {@link SurrogateReader} running in its own thread while
 * this method consumes the produced surrogates. When processing ends (normally,
 * by I/O error, or by interruption) the input stream is closed and the file is
 * moved to the surrogate output directory.
 *
 * @param s name of the surrogate file, relative to {@code surrogatesInPath};
 *          the same name is used for the target inside {@code surrogatesOutPath}
 */
public void processSurrogate(String s) {
    final File surrogateFile = new File(this.surrogatesInPath, s);
    final File outfile = new File(this.surrogatesOutPath, s);
    BufferedInputStream in = null;
    try {
        in = new BufferedInputStream(new FileInputStream(surrogateFile));
        final SurrogateReader reader = new SurrogateReader(in);
        final Thread readerThread = new Thread(reader);
        readerThread.start();
        while (reader.hasNext()) {
            final Surrogate surrogate = reader.next();
            final plasmaParserDocument document = surrogate.document();
            // newEntry arguments: url, referrer, ifModifiedSince, requestWithCookie,
            // initiator, depth, profilehandle, anchorName
            final QueueEntry stackEntry = this.webIndex.queuePreStack.newEntry(
                    surrogate.url(), null, null, false, null, 0,
                    this.webIndex.defaultSurrogateProfile.handle(), null);
            final indexingQueueEntry indexingEntry = new indexingQueueEntry(stackEntry, document, null);
            try {
                indexingCondensementProcessor.enQueue(indexingEntry);
            } catch (InterruptedException e) {
                // restore the interrupt flag so that callers can still observe the interruption
                Thread.currentThread().interrupt();
                this.log.logInfo("surrogate processing of " + surrogateFile.getAbsolutePath() + " was interrupted");
                break;
            }
        }
    } catch (IOException e) {
        e.printStackTrace();
    } finally {
        // release the file handle before moving the file; an open stream would leak
        // and can prevent the rename on platforms that lock open files
        if (in != null) {
            try {
                in.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
        if (!surrogateFile.renameTo(outfile)) {
            // the file stays in the input directory and will be found again by the next scan
            this.log.logInfo("surrogate file " + surrogateFile.getAbsolutePath() + " could not be moved to " + outfile.getAbsolutePath());
        }
    }
}
public boolean deQueueProcess() {
try {
// work off fresh entries from the proxy or from the crawler
@ -1178,6 +1224,19 @@ public final class plasmaSwitchboard extends serverAbstractSwitch<IndexingStack.
// check for interruption
checkInterruption();
// check surrogates
// File.list() returns null if the directory does not exist or cannot be read,
// so guard against null instead of dereferencing the result directly
final String[] surrogatelist = this.surrogatesInPath.list();
if (surrogatelist != null) {
    // look whether there is any xml file inside
    for (final String surrogateName : surrogatelist) {
        if (surrogateName.endsWith(".xml")) {
            // read the surrogate file and store its entries in the index;
            // only one file is processed per call
            processSurrogate(surrogateName);
            return true;
        }
    }
}
// getting the next entry from the indexing queue
if (webIndex.queuePreStack.size() == 0) {
@ -1297,6 +1356,9 @@ public final class plasmaSwitchboard extends serverAbstractSwitch<IndexingStack.
if (selentry.name().equals(plasmaWordIndex.CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA))
webIndex.profilesActiveCrawls.changeEntry(selentry, CrawlProfile.entry.RECRAWL_IF_OLDER,
Long.toString(webIndex.profilesActiveCrawls.getRecrawlDate(plasmaWordIndex.CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA_RECRAWL_CYCLE)));
if (selentry.name().equals(plasmaWordIndex.CRAWL_PROFILE_SURROGATE))
webIndex.profilesActiveCrawls.changeEntry(selentry, CrawlProfile.entry.RECRAWL_IF_OLDER,
Long.toString(webIndex.profilesActiveCrawls.getRecrawlDate(plasmaWordIndex.CRAWL_PROFILE_SURROGATE_RECRAWL_CYCLE)));
}
} catch (final IOException e) {};

View File

@ -339,6 +339,12 @@ public final class plasmaSwitchboardConstants {
public static final String HTCACHE_PATH_DEFAULT = "DATA/HTCACHE";
public static final String RELEASE_PATH = "releases";
public static final String RELEASE_PATH_DEFAULT = "DATA/RELEASE";
public static final String SURROGATES_IN_PATH = "surrogates.in";
public static final String SURROGATES_IN_PATH_DEFAULT = "DATA/SURROGATES/in";
public static final String SURROGATES_OUT_PATH = "surrogates.out";
public static final String SURROGATES_OUT_PATH_DEFAULT = "DATA/SURROGATES/out";
/**
* <p><code>public static final String <strong>HTDOCS_PATH</strong> = "htDocsPath"</code></p>
* <p>Name of the setting specifying the folder beginning from the YaCy-installation's top-folder, where all

View File

@ -86,6 +86,8 @@ public final class plasmaWordIndex {
public static final String CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT = "snippetGlobalText";
public static final String CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA = "snippetLocalMedia";
public static final String CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA = "snippetGlobalMedia";
public static final String CRAWL_PROFILE_SURROGATE = "surrogates";
public static final String DBFILE_ACTIVE_CRAWL_PROFILES = "crawlProfilesActive.heap";
public static final String DBFILE_PASSIVE_CRAWL_PROFILES = "crawlProfilesPassive.heap";
@ -94,6 +96,7 @@ public final class plasmaWordIndex {
public static final long CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT_RECRAWL_CYCLE = 60L * 24L * 30L;
public static final long CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA_RECRAWL_CYCLE = 60L * 24L * 30L;
public static final long CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA_RECRAWL_CYCLE = 60L * 24L * 30L;
public static final long CRAWL_PROFILE_SURROGATE_RECRAWL_CYCLE = 60L * 24L * 30L;
public static final ByteOrder wordOrder = Base64Order.enhancedCoder;
@ -108,6 +111,7 @@ public final class plasmaWordIndex {
public CrawlProfile.entry defaultRemoteProfile;
public CrawlProfile.entry defaultTextSnippetLocalProfile, defaultTextSnippetGlobalProfile;
public CrawlProfile.entry defaultMediaSnippetLocalProfile, defaultMediaSnippetGlobalProfile;
public CrawlProfile.entry defaultSurrogateProfile;
private final File queuesRoot;
private IODispatcher<WordReference> merger;
@ -297,6 +301,7 @@ public final class plasmaWordIndex {
this.defaultTextSnippetGlobalProfile = null;
this.defaultMediaSnippetLocalProfile = null;
this.defaultMediaSnippetGlobalProfile = null;
this.defaultSurrogateProfile = null;
final Iterator<CrawlProfile.entry> i = this.profilesActiveCrawls.profiles(true);
CrawlProfile.entry profile;
String name;
@ -310,6 +315,7 @@ public final class plasmaWordIndex {
if (name.equals(CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT)) this.defaultTextSnippetGlobalProfile = profile;
if (name.equals(CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA)) this.defaultMediaSnippetLocalProfile = profile;
if (name.equals(CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA)) this.defaultMediaSnippetGlobalProfile = profile;
if (name.equals(CRAWL_PROFILE_SURROGATE)) this.defaultSurrogateProfile = profile;
}
} catch (final Exception e) {
this.profilesActiveCrawls.clear();
@ -319,6 +325,7 @@ public final class plasmaWordIndex {
this.defaultTextSnippetGlobalProfile = null;
this.defaultMediaSnippetLocalProfile = null;
this.defaultMediaSnippetGlobalProfile = null;
this.defaultSurrogateProfile = null;
}
if (this.defaultProxyProfile == null) {
@ -356,6 +363,11 @@ public final class plasmaWordIndex {
defaultMediaSnippetGlobalProfile = this.profilesActiveCrawls.newEntry(CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA, null, CrawlProfile.KEYWORDS_SNIPPET, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, 0,
this.profilesActiveCrawls.getRecrawlDate(CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA_RECRAWL_CYCLE), -1, -1, true, false, true, true, true, false, true, true, false);
}
if (this.defaultSurrogateProfile == null) {
// generate new default entry for surrogate parsing
defaultSurrogateProfile = this.profilesActiveCrawls.newEntry(CRAWL_PROFILE_SURROGATE, null, CrawlProfile.KEYWORDS_SNIPPET, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, 0,
this.profilesActiveCrawls.getRecrawlDate(CRAWL_PROFILE_SURROGATE_RECRAWL_CYCLE), -1, -1, true, true, false, false, false, false, true, true, false);
}
}
private void resetProfiles() {
@ -387,7 +399,8 @@ public final class plasmaWordIndex {
(entry.name().equals(CRAWL_PROFILE_SNIPPET_LOCAL_TEXT)) ||
(entry.name().equals(CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT)) ||
(entry.name().equals(CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA)) ||
(entry.name().equals(CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA)))) {
(entry.name().equals(CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA)) ||
(entry.name().equals(CRAWL_PROFILE_SURROGATE)))) {
profilesPassiveCrawls.newEntry(entry.map());
iter.remove();
hasDoneSomething = true;

View File

@ -101,7 +101,9 @@ public class SurrogateReader extends DefaultHandler implements Runnable, Iterato
public void endElement(final String uri, final String name, final String tag) {
if (tag == null) return;
if ("document".equals(tag)) {
//System.out.println("A Title: " + this.surrogate.title());
this.surrogates.add(this.surrogate);
//System.out.println("B Title: " + this.surrogate.title());
this.surrogate = null;
this.buffer.setLength(0);
this.parsingValue = false;
@ -150,6 +152,7 @@ public class SurrogateReader extends DefaultHandler implements Runnable, Iterato
Thread t = new Thread(sr);
t.start();
Surrogate s;
System.out.println("1");
while (sr.hasNext()) {
s = sr.next();
System.out.println("Title: " + s.title());
@ -159,6 +162,7 @@ public class SurrogateReader extends DefaultHandler implements Runnable, Iterato
System.out.println("Body: " + s.body());
System.out.println("Categories: " + s.categories());
}
System.out.println("2");
} catch (IOException e) {
e.printStackTrace();
}