mirror of
https://github.com/yacy/yacy_search_server.git
synced 2024-09-19 00:01:41 +02:00
- added default profile for surrogate indexing
- integrated surrogate indexing into indexing queue process git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@5810 6c8d7289-2bf4-0310-a012-ef5d649a1542
This commit is contained in:
parent
ad78e3a59f
commit
bd5f4c78d8
|
@ -221,6 +221,12 @@ proxyCache = DATA/HTCACHE
|
|||
proxyCacheSize = 100
|
||||
proxyCacheSize__pro = 1024
|
||||
|
||||
# a path to the surrogate input directory
|
||||
surrogates.in = DATA/SURROGATES/in
|
||||
|
||||
# a path to the surrogate output directory
|
||||
surrogates.out = DATA/SURROGATES/out
|
||||
|
||||
# storage place for new releases
|
||||
releases = DATA/RELEASE
|
||||
|
||||
|
|
|
@ -114,7 +114,8 @@ public class CrawlProfileEditor_p {
|
|||
entry selentry;
|
||||
while (it.hasNext()) {
|
||||
selentry = it.next();
|
||||
if (selentry.name().equals(plasmaWordIndex.CRAWL_PROFILE_PROXY) ||
|
||||
if (selentry.name().equals(plasmaWordIndex.CRAWL_PROFILE_SURROGATE) ||
|
||||
selentry.name().equals(plasmaWordIndex.CRAWL_PROFILE_PROXY) ||
|
||||
selentry.name().equals(plasmaWordIndex.CRAWL_PROFILE_REMOTE) /*||
|
||||
selentry.name().equals(plasmaSwitchboard.CRAWL_PROFILE_SNIPPET_TEXT) ||
|
||||
selentry.name().equals(plasmaSwitchboard.CRAWL_PROFILE_SNIPPET_MEDIA)*/)
|
||||
|
|
|
@ -100,7 +100,8 @@ public class IndexCreateWWWLocalQueue_p {
|
|||
name.equals(plasmaWordIndex.CRAWL_PROFILE_SNIPPET_LOCAL_TEXT) ||
|
||||
name.equals(plasmaWordIndex.CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT) ||
|
||||
name.equals(plasmaWordIndex.CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA) ||
|
||||
name.equals(plasmaWordIndex.CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA))
|
||||
name.equals(plasmaWordIndex.CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA) ||
|
||||
name.equals(plasmaWordIndex.CRAWL_PROFILE_SURROGATE))
|
||||
continue;
|
||||
if (compiledPattern.matcher(name).find()) {
|
||||
sb.webIndex.profilesActiveCrawls.removeEntry(entry.handle());
|
||||
|
|
|
@ -41,7 +41,8 @@ public class WatchWebStructure_p {
|
|||
e.name().equals(plasmaWordIndex.CRAWL_PROFILE_SNIPPET_LOCAL_TEXT) ||
|
||||
e.name().equals(plasmaWordIndex.CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT) ||
|
||||
e.name().equals(plasmaWordIndex.CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA) ||
|
||||
e.name().equals(plasmaWordIndex.CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA))
|
||||
e.name().equals(plasmaWordIndex.CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA) ||
|
||||
e.name().equals(plasmaWordIndex.CRAWL_PROFILE_SURROGATE))
|
||||
continue;
|
||||
host = e.name();
|
||||
break; // take the first one
|
||||
|
|
|
@ -85,6 +85,7 @@
|
|||
|
||||
package de.anomic.plasma;
|
||||
|
||||
import java.io.BufferedInputStream;
|
||||
import java.io.File;
|
||||
import java.io.FileInputStream;
|
||||
import java.io.IOException;
|
||||
|
@ -120,8 +121,10 @@ import de.anomic.crawler.ResourceObserver;
|
|||
import de.anomic.crawler.ResultImages;
|
||||
import de.anomic.crawler.ResultURLs;
|
||||
import de.anomic.crawler.RobotsTxt;
|
||||
import de.anomic.crawler.Surrogate;
|
||||
import de.anomic.crawler.ZURL;
|
||||
import de.anomic.crawler.CrawlProfile.entry;
|
||||
import de.anomic.crawler.IndexingStack.QueueEntry;
|
||||
import de.anomic.data.Blacklist;
|
||||
import de.anomic.data.URLLicense;
|
||||
import de.anomic.data.blogBoard;
|
||||
|
@ -165,6 +168,7 @@ import de.anomic.server.serverSemaphore;
|
|||
import de.anomic.server.serverSwitch;
|
||||
import de.anomic.server.serverThread;
|
||||
import de.anomic.tools.crypt;
|
||||
import de.anomic.xml.SurrogateReader;
|
||||
import de.anomic.yacy.yacyClient;
|
||||
import de.anomic.yacy.yacyCore;
|
||||
import de.anomic.yacy.yacyNewsPool;
|
||||
|
@ -200,6 +204,8 @@ public final class plasmaSwitchboard extends serverAbstractSwitch<IndexingStack.
|
|||
public File rankingPath;
|
||||
public File workPath;
|
||||
public File releasePath;
|
||||
public File surrogatesInPath;
|
||||
public File surrogatesOutPath;
|
||||
public Map<String, String> rankingPermissions;
|
||||
public plasmaWordIndex webIndex;
|
||||
public CrawlQueues crawlQueues;
|
||||
|
@ -435,6 +441,14 @@ public final class plasmaSwitchboard extends serverAbstractSwitch<IndexingStack.
|
|||
final long maxCacheSize = 1024 * 1024 * Long.parseLong(getConfig(plasmaSwitchboardConstants.PROXY_CACHE_SIZE, "2")); // this is megabyte
|
||||
plasmaHTCache.init(htCachePath, webIndex.peers().mySeed().hash, maxCacheSize);
|
||||
|
||||
// create the surrogates directories
|
||||
surrogatesInPath = getConfigPath(plasmaSwitchboardConstants.SURROGATES_IN_PATH, plasmaSwitchboardConstants.SURROGATES_IN_PATH_DEFAULT);
|
||||
this.log.logInfo("surrogates.in Path = " + surrogatesInPath.getAbsolutePath());
|
||||
surrogatesInPath.mkdirs();
|
||||
surrogatesOutPath = getConfigPath(plasmaSwitchboardConstants.SURROGATES_OUT_PATH, plasmaSwitchboardConstants.SURROGATES_OUT_PATH_DEFAULT);
|
||||
this.log.logInfo("surrogates.out Path = " + surrogatesOutPath.getAbsolutePath());
|
||||
surrogatesOutPath.mkdirs();
|
||||
|
||||
// create the release download directory
|
||||
releasePath = getConfigPath(plasmaSwitchboardConstants.RELEASE_PATH, plasmaSwitchboardConstants.RELEASE_PATH_DEFAULT);
|
||||
releasePath.mkdirs();
|
||||
|
@ -1166,6 +1180,38 @@ public final class plasmaSwitchboard extends serverAbstractSwitch<IndexingStack.
|
|||
}
|
||||
}
|
||||
|
||||
public void processSurrogate(String s) {
|
||||
File surrogateFile = new File(this.surrogatesInPath, s);
|
||||
File outfile = new File(this.surrogatesOutPath, s);
|
||||
try {
|
||||
SurrogateReader reader = new SurrogateReader(new BufferedInputStream(new FileInputStream(surrogateFile)));
|
||||
Thread readerThread = new Thread(reader);
|
||||
readerThread.start();
|
||||
Surrogate surrogate;
|
||||
QueueEntry queueentry;
|
||||
while (reader.hasNext()) {
|
||||
surrogate = reader.next();
|
||||
plasmaParserDocument document = surrogate.document();
|
||||
queueentry = this.webIndex.queuePreStack.newEntry(surrogate.url(), null, null, false, null, 0, this.webIndex.defaultSurrogateProfile.handle(), null);
|
||||
/*
|
||||
* public QueueEntry newEntry(final yacyURL url, final String referrer, final Date ifModifiedSince, final boolean requestWithCookie,
|
||||
final String initiator, final int depth, final String profilehandle, final String anchorName)
|
||||
*/
|
||||
indexingQueueEntry queueEntry = new indexingQueueEntry(queueentry, document, null);
|
||||
try {
|
||||
indexingCondensementProcessor.enQueue(queueEntry);
|
||||
} catch (InterruptedException e) {
|
||||
e.printStackTrace();
|
||||
break;
|
||||
}
|
||||
}
|
||||
} catch (IOException e) {
|
||||
e.printStackTrace();
|
||||
} finally {
|
||||
surrogateFile.renameTo(outfile);
|
||||
}
|
||||
}
|
||||
|
||||
public boolean deQueueProcess() {
|
||||
try {
|
||||
// work off fresh entries from the proxy or from the crawler
|
||||
|
@ -1178,6 +1224,19 @@ public final class plasmaSwitchboard extends serverAbstractSwitch<IndexingStack.
|
|||
|
||||
// check for interruption
|
||||
checkInterruption();
|
||||
|
||||
// check surrogates
|
||||
String[] surrogatelist = this.surrogatesInPath.list();
|
||||
if (surrogatelist.length > 0) {
|
||||
// look if there is any xml inside
|
||||
for (int i = 0; i < surrogatelist.length; i++) {
|
||||
if (surrogatelist[i].endsWith(".xml")) {
|
||||
// read the surrogate file and store entry in index
|
||||
processSurrogate(surrogatelist[i]);
|
||||
return true;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// getting the next entry from the indexing queue
|
||||
if (webIndex.queuePreStack.size() == 0) {
|
||||
|
@ -1297,6 +1356,9 @@ public final class plasmaSwitchboard extends serverAbstractSwitch<IndexingStack.
|
|||
if (selentry.name().equals(plasmaWordIndex.CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA))
|
||||
webIndex.profilesActiveCrawls.changeEntry(selentry, CrawlProfile.entry.RECRAWL_IF_OLDER,
|
||||
Long.toString(webIndex.profilesActiveCrawls.getRecrawlDate(plasmaWordIndex.CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA_RECRAWL_CYCLE)));
|
||||
if (selentry.name().equals(plasmaWordIndex.CRAWL_PROFILE_SURROGATE))
|
||||
webIndex.profilesActiveCrawls.changeEntry(selentry, CrawlProfile.entry.RECRAWL_IF_OLDER,
|
||||
Long.toString(webIndex.profilesActiveCrawls.getRecrawlDate(plasmaWordIndex.CRAWL_PROFILE_SURROGATE_RECRAWL_CYCLE)));
|
||||
}
|
||||
} catch (final IOException e) {};
|
||||
|
||||
|
|
|
@ -339,6 +339,12 @@ public final class plasmaSwitchboardConstants {
|
|||
public static final String HTCACHE_PATH_DEFAULT = "DATA/HTCACHE";
|
||||
public static final String RELEASE_PATH = "releases";
|
||||
public static final String RELEASE_PATH_DEFAULT = "DATA/RELEASE";
|
||||
|
||||
public static final String SURROGATES_IN_PATH = "surrogates.in";
|
||||
public static final String SURROGATES_IN_PATH_DEFAULT = "DATA/SURROGATES/in";
|
||||
public static final String SURROGATES_OUT_PATH = "surrogates.out";
|
||||
public static final String SURROGATES_OUT_PATH_DEFAULT = "DATA/SURROGATES/out";
|
||||
|
||||
/**
|
||||
* <p><code>public static final String <strong>HTDOCS_PATH</strong> = "htDocsPath"</code></p>
|
||||
* <p>Name of the setting specifying the folder beginning from the YaCy-installation's top-folder, where all
|
||||
|
|
|
@ -86,6 +86,8 @@ public final class plasmaWordIndex {
|
|||
public static final String CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT = "snippetGlobalText";
|
||||
public static final String CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA = "snippetLocalMedia";
|
||||
public static final String CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA = "snippetGlobalMedia";
|
||||
public static final String CRAWL_PROFILE_SURROGATE = "surrogates";
|
||||
|
||||
public static final String DBFILE_ACTIVE_CRAWL_PROFILES = "crawlProfilesActive.heap";
|
||||
public static final String DBFILE_PASSIVE_CRAWL_PROFILES = "crawlProfilesPassive.heap";
|
||||
|
||||
|
@ -94,6 +96,7 @@ public final class plasmaWordIndex {
|
|||
public static final long CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT_RECRAWL_CYCLE = 60L * 24L * 30L;
|
||||
public static final long CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA_RECRAWL_CYCLE = 60L * 24L * 30L;
|
||||
public static final long CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA_RECRAWL_CYCLE = 60L * 24L * 30L;
|
||||
public static final long CRAWL_PROFILE_SURROGATE_RECRAWL_CYCLE = 60L * 24L * 30L;
|
||||
|
||||
public static final ByteOrder wordOrder = Base64Order.enhancedCoder;
|
||||
|
||||
|
@ -108,6 +111,7 @@ public final class plasmaWordIndex {
|
|||
public CrawlProfile.entry defaultRemoteProfile;
|
||||
public CrawlProfile.entry defaultTextSnippetLocalProfile, defaultTextSnippetGlobalProfile;
|
||||
public CrawlProfile.entry defaultMediaSnippetLocalProfile, defaultMediaSnippetGlobalProfile;
|
||||
public CrawlProfile.entry defaultSurrogateProfile;
|
||||
private final File queuesRoot;
|
||||
private IODispatcher<WordReference> merger;
|
||||
|
||||
|
@ -297,6 +301,7 @@ public final class plasmaWordIndex {
|
|||
this.defaultTextSnippetGlobalProfile = null;
|
||||
this.defaultMediaSnippetLocalProfile = null;
|
||||
this.defaultMediaSnippetGlobalProfile = null;
|
||||
this.defaultSurrogateProfile = null;
|
||||
final Iterator<CrawlProfile.entry> i = this.profilesActiveCrawls.profiles(true);
|
||||
CrawlProfile.entry profile;
|
||||
String name;
|
||||
|
@ -310,6 +315,7 @@ public final class plasmaWordIndex {
|
|||
if (name.equals(CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT)) this.defaultTextSnippetGlobalProfile = profile;
|
||||
if (name.equals(CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA)) this.defaultMediaSnippetLocalProfile = profile;
|
||||
if (name.equals(CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA)) this.defaultMediaSnippetGlobalProfile = profile;
|
||||
if (name.equals(CRAWL_PROFILE_SURROGATE)) this.defaultSurrogateProfile = profile;
|
||||
}
|
||||
} catch (final Exception e) {
|
||||
this.profilesActiveCrawls.clear();
|
||||
|
@ -319,6 +325,7 @@ public final class plasmaWordIndex {
|
|||
this.defaultTextSnippetGlobalProfile = null;
|
||||
this.defaultMediaSnippetLocalProfile = null;
|
||||
this.defaultMediaSnippetGlobalProfile = null;
|
||||
this.defaultSurrogateProfile = null;
|
||||
}
|
||||
|
||||
if (this.defaultProxyProfile == null) {
|
||||
|
@ -356,6 +363,11 @@ public final class plasmaWordIndex {
|
|||
defaultMediaSnippetGlobalProfile = this.profilesActiveCrawls.newEntry(CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA, null, CrawlProfile.KEYWORDS_SNIPPET, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, 0,
|
||||
this.profilesActiveCrawls.getRecrawlDate(CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA_RECRAWL_CYCLE), -1, -1, true, false, true, true, true, false, true, true, false);
|
||||
}
|
||||
if (this.defaultSurrogateProfile == null) {
|
||||
// generate new default entry for surrogate parsing
|
||||
defaultSurrogateProfile = this.profilesActiveCrawls.newEntry(CRAWL_PROFILE_SURROGATE, null, CrawlProfile.KEYWORDS_SNIPPET, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, 0,
|
||||
this.profilesActiveCrawls.getRecrawlDate(CRAWL_PROFILE_SURROGATE_RECRAWL_CYCLE), -1, -1, true, true, false, false, false, false, true, true, false);
|
||||
}
|
||||
}
|
||||
|
||||
private void resetProfiles() {
|
||||
|
@ -387,7 +399,8 @@ public final class plasmaWordIndex {
|
|||
(entry.name().equals(CRAWL_PROFILE_SNIPPET_LOCAL_TEXT)) ||
|
||||
(entry.name().equals(CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT)) ||
|
||||
(entry.name().equals(CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA)) ||
|
||||
(entry.name().equals(CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA)))) {
|
||||
(entry.name().equals(CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA)) ||
|
||||
(entry.name().equals(CRAWL_PROFILE_SURROGATE)))) {
|
||||
profilesPassiveCrawls.newEntry(entry.map());
|
||||
iter.remove();
|
||||
hasDoneSomething = true;
|
||||
|
|
|
@ -101,7 +101,9 @@ public class SurrogateReader extends DefaultHandler implements Runnable, Iterato
|
|||
public void endElement(final String uri, final String name, final String tag) {
|
||||
if (tag == null) return;
|
||||
if ("document".equals(tag)) {
|
||||
//System.out.println("A Title: " + this.surrogate.title());
|
||||
this.surrogates.add(this.surrogate);
|
||||
//System.out.println("B Title: " + this.surrogate.title());
|
||||
this.surrogate = null;
|
||||
this.buffer.setLength(0);
|
||||
this.parsingValue = false;
|
||||
|
@ -150,6 +152,7 @@ public class SurrogateReader extends DefaultHandler implements Runnable, Iterato
|
|||
Thread t = new Thread(sr);
|
||||
t.start();
|
||||
Surrogate s;
|
||||
System.out.println("1");
|
||||
while (sr.hasNext()) {
|
||||
s = sr.next();
|
||||
System.out.println("Title: " + s.title());
|
||||
|
@ -159,6 +162,7 @@ public class SurrogateReader extends DefaultHandler implements Runnable, Iterato
|
|||
System.out.println("Body: " + s.body());
|
||||
System.out.println("Categories: " + s.categories());
|
||||
}
|
||||
System.out.println("2");
|
||||
} catch (IOException e) {
|
||||
e.printStackTrace();
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue
Block a user