yacy_search_server/source/de/anomic/crawler/NoticeURLImporter.java
orbiter 99bf0b8e41 refactoring of plasmaWordIndex:
divided that class into three parts:
- the peers object is now hosted by the plasmaSwitchboard
- the crawler elements are now in a new class, crawler.CrawlerSwitchboard
- the index elements are core of the new segment data structure, which is a bundle of different indexes for the full text and (in the future) navigation indexes and the metadata store. The new class is now in kelondro.text.Segment

The refactoring is inspired by the roadmap to create index segments, the option to host different indexes on one peer.

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@5990 6c8d7289-2bf4-0310-a012-ef5d649a1542
2009-05-28 14:26:05 +00:00

226 lines
11 KiB
Java

package de.anomic.crawler;
import java.io.File;
import java.io.IOException;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import de.anomic.kelondro.util.FileUtils;
import de.anomic.crawler.CrawlSwitchboard;
/**
 * Imports a legacy notice-URL database (urlNotice1.db) plus its crawl-profile
 * database from a source crawler directory into the live crawl queues.
 * <p>
 * The import walks the CORE, LIMIT and REMOTE stacks of the source
 * {@link NoticedURL}, copies any crawl profiles that the destination does not
 * yet know, pushes each URL entry onto the corresponding destination stack,
 * and removes the entry from the source database.
 */
public class NoticeURLImporter extends AbstractImporter implements Importer {

    /** Source directory containing the crawler database files to import from. */
    private File plasmaPath = null;

    /** Profile handles already confirmed to exist in the destination, to avoid repeated lookups. */
    private final HashSet<String> importProfileHandleCache = new HashSet<String>();

    /** Source profile DB; may be {@code null} if it could not be opened (see constructor). */
    private CrawlProfile importProfileDB;

    /** Source notice-URL DB being imported. */
    private final NoticedURL importNurlDB;

    /** Entry count of the source DB at construction time; basis for progress/time estimates. */
    private final int importStartSize;

    /** Number of URL entries processed so far. */
    private int urlCount = 0;

    /** Number of crawl profiles copied to the destination so far. */
    private int profileCount = 0;

    /** Destination crawl queues that imported URLs are pushed into. */
    private final CrawlQueues crawlQueues;

    /** Destination (active) crawl-profile DB. */
    private final CrawlProfile activeCrawls;

    /** Manager that collects finished import jobs (notified in run()'s finally block). */
    private final ImporterManager dbImportManager;

    /**
     * Opens the source notice-URL and profile databases located in {@code crawlerPath}.
     *
     * @param crawlerPath     directory containing {@code urlNotice1.db} and the
     *                        active-crawl-profiles DB file
     * @param crawlQueues     destination queues to import URLs into
     * @param activeCrawls    destination profile DB to import profiles into
     * @param dbImportManager manager to report job completion to
     * @throws IllegalArgumentException if the path or either database file is
     *         missing or lacks the required access permissions
     */
    public NoticeURLImporter(final File crawlerPath, final CrawlQueues crawlQueues, final CrawlProfile activeCrawls, final ImporterManager dbImportManager) {
        super("NURL");
        this.crawlQueues = crawlQueues;
        this.activeCrawls = activeCrawls;
        this.dbImportManager = dbImportManager;

        // TODO: we need more error handling here
        this.plasmaPath = crawlerPath;
        final File noticeUrlDbFile = new File(plasmaPath, "urlNotice1.db");
        final File profileDbFile = new File(plasmaPath, CrawlSwitchboard.DBFILE_ACTIVE_CRAWL_PROFILES);

        // Validate path and files up front so we fail fast with a precise message.
        String errorMsg = null;
        if (!plasmaPath.exists())
            errorMsg = "The import path '" + plasmaPath + "' does not exist.";
        else if (!plasmaPath.isDirectory())
            errorMsg = "The import path '" + plasmaPath + "' is not a directory.";
        else if (!plasmaPath.canRead())
            errorMsg = "The import path '" + plasmaPath + "' is not readable.";
        else if (!plasmaPath.canWrite())
            errorMsg = "The import path '" + plasmaPath + "' is not writeable.";

        else if (!noticeUrlDbFile.exists())
            errorMsg = "The noticeUrlDB file '" + noticeUrlDbFile + "' does not exist.";
        else if (noticeUrlDbFile.isDirectory())
            errorMsg = "The noticeUrlDB file '" + noticeUrlDbFile + "' is not a file.";
        else if (!noticeUrlDbFile.canRead())
            errorMsg = "The noticeUrlDB file '" + noticeUrlDbFile + "' is not readable.";
        else if (!noticeUrlDbFile.canWrite())
            errorMsg = "The noticeUrlDB file '" + noticeUrlDbFile + "' is not writeable.";

        else if (!profileDbFile.exists())
            errorMsg = "The profileDB file '" + profileDbFile + "' does not exist.";
        else if (profileDbFile.isDirectory())
            errorMsg = "The profileDB file '" + profileDbFile + "' is not a file.";
        else if (!profileDbFile.canRead())
            errorMsg = "The profileDB file '" + profileDbFile + "' is not readable.";
        // write access to the profile DB is deliberately not required (read-only source):
        // else if (!profileDbFile.canWrite())
        //     errorMsg = "The profileDB file '" + profileDbFile + "' is not writeable.";

        if (errorMsg != null) {
            this.log.logSevere(errorMsg);
            throw new IllegalArgumentException(errorMsg);
        }

        // init noticeUrlDB
        this.log.logInfo("Initializing the source noticeUrlDB");
        this.importNurlDB = new NoticedURL(plasmaPath);
        this.importStartSize = this.importNurlDB.size();
        //int stackSize = this.importNurlDB.stackSize();

        // init profile DB; on a corrupt file, delete and retry once.
        this.log.logInfo("Initializing the source profileDB");
        try {
            this.importProfileDB = new CrawlProfile(profileDbFile);
        } catch (IOException e) {
            FileUtils.deletedelete(profileDbFile);
            try {
                this.importProfileDB = new CrawlProfile(profileDbFile);
            } catch (IOException e1) {
                // give up on the profile DB; run() handles the null case by
                // logging a warning for every URL whose profile is unknown.
                this.log.logSevere("Unable to initialize the source profileDB", e1);
                this.importProfileDB = null;
            }
        }
    }

    /**
     * Estimated remaining time, extrapolated linearly from the elapsed time
     * and the fraction of entries processed so far. Returns 0 before the
     * first entry has been processed.
     */
    public long getEstimatedTime() {
        return (this.urlCount == 0) ? 0 : ((this.importStartSize * getElapsedTime()) / (this.urlCount)) - getElapsedTime();
    }

    /** @return the source path as a human-readable job name */
    public String getJobName() {
        return this.plasmaPath.toString();
    }

    /**
     * Progress in percent; for very small imports (&lt;100 entries) the
     * divisor is clamped to 1 to avoid division by zero.
     */
    public int getProcessingStatusPercent() {
        return (this.urlCount) / ((this.importStartSize < 100) ? 1 : (this.importStartSize) / 100);
    }

    /** @return a multi-line status string with URL and profile counters */
    public String getStatus() {
        final StringBuilder theStatus = new StringBuilder();
        theStatus.append("#URLs=").append(this.urlCount).append("\n");
        theStatus.append("#Profiles=").append(this.profileCount);
        return theStatus.toString();
    }

    public void run() {
        try {
            // waiting on init thread to finish
            //this.importNurlDB.waitOnInitThread();

            // the stack types we want to import; the trailing -1 marks a final
            // pass over entries not available in any stack
            final int[] stackTypes = new int[] {
                    NoticedURL.STACK_TYPE_CORE,
                    NoticedURL.STACK_TYPE_LIMIT,
                    NoticedURL.STACK_TYPE_REMOTE,
                    -1};

            // looping through the various stacks
            for (int stackType = 0; stackType < stackTypes.length; stackType++) {
                if (stackTypes[stackType] != -1) {
                    this.log.logInfo("Starting to import stacktype '" + stackTypes[stackType] + "' containing '" + this.importNurlDB.stackSize(stackTypes[stackType]) + "' entries.");
                } else {
                    this.log.logInfo("Starting to import '" + this.importNurlDB.size() + "' entries not available in any stack.");
                }

                // getting an iterator and loop through the URL entries
                // NOTE(review): this passes the loop index (3 at this point), not a
                // NoticedURL.STACK_TYPE_* constant — confirm NoticedURL.iterator()
                // semantics before changing; preserved as-is here.
                final Iterator<CrawlEntry> entryIter = (stackTypes[stackType] == -1) ? this.importNurlDB.iterator(stackType) : null;
                while (true) {

                    String nextHash = null;
                    CrawlEntry nextEntry = null;

                    try {
                        if (stackTypes[stackType] != -1) {
                            if (this.importNurlDB.stackSize(stackTypes[stackType]) == 0) break;

                            this.urlCount++;
                            nextEntry = this.importNurlDB.pop(stackTypes[stackType], false, null);
                            nextHash = nextEntry.url().hash();
                        } else {
                            if (!entryIter.hasNext()) break;

                            this.urlCount++;
                            nextEntry = entryIter.next();
                            nextHash = nextEntry.url().hash();
                        }
                    } catch (final IOException e) {
                        this.log.logWarning("Unable to import entry: " + e.toString());

                        if ((stackTypes[stackType] != -1) && (this.importNurlDB.stackSize(stackTypes[stackType]) == 0)) break;
                        continue;
                    }

                    // getting a handler to the crawling profile the url belongs to
                    try {
                        final String profileHandle = nextEntry.profileHandle();
                        if (profileHandle == null) {
                            this.log.logWarning("Profile handle of url entry '" + nextHash + "' unknown.");
                            continue;
                        }

                        // if we haven't imported the profile yet we need to do it now
                        if (!this.importProfileHandleCache.contains(profileHandle)) {

                            // testing if the profile is already known
                            final CrawlProfile.entry profileEntry = this.activeCrawls.getEntry(profileHandle);

                            // if not we need to import it
                            if (profileEntry == null) {
                                // copy and store the source profile entry into the destination db
                                // (guard against a null importProfileDB: the constructor may have
                                // failed to open the source profile database)
                                final CrawlProfile.entry sourceEntry =
                                        (this.importProfileDB == null) ? null : this.importProfileDB.getEntry(profileHandle);
                                if (sourceEntry != null) {
                                    this.profileCount++;
                                    this.importProfileHandleCache.add(profileHandle);
                                    final HashMap<String, String> mapclone = new HashMap<String, String>();
                                    mapclone.putAll(sourceEntry.map());
                                    this.activeCrawls.newEntry(mapclone);
                                } else {
                                    this.log.logWarning("Profile '" + profileHandle + "' of url entry '" + nextHash + "' unknown.");
                                    continue;
                                }
                            } else {
                                // profile already present in the destination: cache the handle so
                                // subsequent URLs of the same profile skip the getEntry() lookup
                                this.importProfileHandleCache.add(profileHandle);
                            }
                        }

                        // if the url does not already exist in the destination stack we insert it now
                        if (!this.crawlQueues.noticeURL.existsInStack(nextHash)) {
                            this.crawlQueues.noticeURL.push((stackTypes[stackType] != -1) ? stackTypes[stackType] : NoticedURL.STACK_TYPE_CORE, nextEntry);
                        }

                        // removing hash from the import db
                    } finally {
                        this.importNurlDB.removeByURLHash(nextHash);
                    }

                    if (this.urlCount % 100 == 0) {
                        if (this.log.isFine()) this.log.logFine(this.urlCount + " URLs and '" + this.profileCount + "' profile entries processed so far.");
                    }
                    if (this.isAborted()) break;
                }
                this.log.logInfo("Finished to import stacktype '" + stackTypes[stackType] + "'");
            }

            //int size = this.importNurlDB.size();
            //int stackSize = this.importNurlDB.stackSize();

            // TODO: what to do with nurlDB entries that do not exist in any stack?

        } catch (final Exception e) {
            this.error = e.toString();
            this.log.logSevere("Import process had detected an error", e);
        } finally {
            this.log.logInfo("Import process finished.");
            this.globalEnd = System.currentTimeMillis();
            this.dbImportManager.finishedJobs.add(this);
            this.importNurlDB.close();
            // importProfileDB may be null if the constructor failed to open it
            if (this.importProfileDB != null) this.importProfileDB.close();
        }
    }
}