yacy_search_server/source/de/anomic/crawler/FTPLoader.java
orbiter 826ca79735 refactoring and new architecture to store the files of the web cache:
- files are not stored any more as individual files
- a new database structure using BLOBHeap files stores many cache entries in common files
- all file-writing procedures had been migrated to generate byte[] objects which are written with the new database methods

this is only an intermediate step to the final architecture, where cached files are written together with their metadata in one single database structure.

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@5276 6c8d7289-2bf4-0310-a012-ef5d649a1542
2008-10-16 21:24:09 +00:00

307 lines
11 KiB
Java

// FTPLoader.java
// -------------------------------------
// part of YACY
// (C) by Michael Peter Christen; mc@yacy.net
// first published on http://www.anomic.de
// Frankfurt, Germany, 2006
//
// This file was contributed by Martin Thelian
//
// $LastChangedDate: 2006-02-20 23:57:42 +0100 (Mo, 20 Feb 2006) $
// $LastChangedRevision: 1715 $
// $LastChangedBy: theli $
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
package de.anomic.crawler;
import java.io.ByteArrayOutputStream;
import java.io.PrintStream;
import java.util.Date;
import de.anomic.http.JakartaCommonsHttpClient;
import de.anomic.http.httpRequestHeader;
import de.anomic.http.httpResponseHeader;
import de.anomic.http.httpdProxyCacheEntry;
import de.anomic.index.indexDocumentMetadata;
import de.anomic.net.ftpc;
import de.anomic.plasma.plasmaHTCache;
import de.anomic.plasma.plasmaParser;
import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.server.logging.serverLog;
import de.anomic.yacy.yacyURL;
public class FTPLoader {
/** switchboard used for configuration values, the crawl queues and the web index */
private final plasmaSwitchboard sb;
/** logger that receives the reject and error messages of this loader */
private final serverLog log;
/** value of "crawler.ftp.maxFileSize"; -1 switches the size check in getFile() off */
private final int maxFileSize;

/**
 * Creates a new loader for ftp resources.
 *
 * @param sb  the switchboard; the maximum file size is read from its configuration
 *            key "crawler.ftp.maxFileSize" (default -1)
 * @param log the logger to write messages to
 */
public FTPLoader(final plasmaSwitchboard sb, final serverLog log) {
    this.sb = sb;
    this.log = log;
    // The limit is configured as a long but kept as an int. A plain narrowing cast
    // silently wraps values outside the int range into an unrelated limit, so clamp first.
    final long configuredLimit = sb.getConfigLong("crawler.ftp.maxFileSize", -1L);
    this.maxFileSize = (int) Math.max(Integer.MIN_VALUE, Math.min(Integer.MAX_VALUE, configuredLimit));
}
/**
 * Creates the cache entry for a resource fetched via ftp and stores its metadata.
 * <p>
 * Request and response headers are synthesized: the request header carries the
 * referrer (if one is known), the response header carries the given date as
 * Last-Modified and the given mime type as Content-Type. The entry is created with
 * the status text "OK" and passed to {@link plasmaHTCache#storeMetadata} together
 * with the response header.
 *
 * @param entry    the crawl entry that is being loaded
 * @param mimeType the content type to record for the resource
 * @param fileDate the date to record as last modification date
 * @return the newly created cache entry
 */
protected indexDocumentMetadata createCacheEntry(final CrawlEntry entry, final String mimeType, final Date fileDate) {
    final httpRequestHeader requestHeader = new httpRequestHeader();
    // The referrer is optional information: only set the header when a referrer hash
    // exists and resolves to a URL, instead of dereferencing the lookup result blindly.
    // NOTE(review): assumes sb.getURL() may return null for an unknown hash - confirm.
    final yacyURL referrer = (entry.referrerhash() == null) ? null : sb.getURL(entry.referrerhash());
    if (referrer != null) {
        requestHeader.put(httpRequestHeader.REFERER, referrer.toNormalform(true, false));
    }

    final httpResponseHeader responseHeader = new httpResponseHeader();
    responseHeader.put(httpResponseHeader.LAST_MODIFIED, JakartaCommonsHttpClient.date2String(fileDate));
    responseHeader.put(httpResponseHeader.CONTENT_TYPE, mimeType);

    final indexDocumentMetadata metadata = new httpdProxyCacheEntry(entry.depth(), entry.url(), entry.name(), "OK",
            requestHeader, responseHeader,
            entry.initiator(), sb.webIndex.profilesActiveCrawls.getEntry(entry.profileHandle()));
    plasmaHTCache.storeMetadata(responseHeader, metadata);
    return metadata;
}
/**
 * Loads the resource addressed by the given crawl entry from a ftp server.
 * <p>
 * If the url path ends with "/" or names a folder on the server, a html directory
 * listing is generated; otherwise the file is downloaded. When the ftp client wrote
 * anything to its error stream, or no cache entry could be produced, a warning is
 * logged and the entry is registered in the error-url list with
 * {@code ErrorURL.DENIED_SERVER_DOWNLOAD_ERROR}.
 *
 * @param entry the crawl entry whose url addresses the ftp resource
 * @return the cache entry carrying the loaded content; <code>null</code> if nothing
 *         could be loaded
 */
public indexDocumentMetadata load(final CrawlEntry entry) {
    final yacyURL entryUrl = entry.url();
    final String fullPath = getPath(entryUrl);

    // the return value
    indexDocumentMetadata htCache = null;

    // determine filename and path
    String file, path;
    if (fullPath.endsWith("/")) {
        file = "";
        path = fullPath;
    } else {
        final int pos = fullPath.lastIndexOf("/");
        if (pos == -1) {
            file = fullPath;
            path = "/";
        } else {
            path = fullPath.substring(0, pos + 1);
            file = fullPath.substring(pos + 1);
        }
    }
    assert path.endsWith("/") : "FTPLoader: path is not a path: '" + path + "'";

    // stream for ftp-client errors
    final ByteArrayOutputStream berr = new ByteArrayOutputStream();
    final ftpc ftpClient = createFTPClient(berr);

    if (openConnection(ftpClient, entryUrl)) {
        try {
            // a path without trailing slash may still name a directory: ask the server
            if (file.length() > 0) {
                ftpClient.exec("cd \"" + path + "\"", false);
                if (ftpClient.isFolder(file)) {
                    path = fullPath + "/";
                    file = "";
                }
            }

            if (file.length() == 0) {
                // directory -> the content of the cache entry is the html listing
                htCache = createCacheEntry(entry, "text/html", new Date());
                final byte[] dirList = generateDirlist(ftpClient, entry, path);
                if (dirList == null) {
                    htCache = null;
                } else {
                    // attach the listing to the entry, as getFile() does for downloaded
                    // files; without this the generated listing is simply discarded
                    htCache.setCacheArray(dirList);
                }
            } else {
                // file -> download
                try {
                    htCache = getFile(ftpClient, entry);
                } catch (final Exception e) {
                    // add the failure to the error log; fall back to the exception's
                    // string form so a missing message does not show up as "null"
                    final String message = e.getMessage();
                    (new PrintStream(berr)).print(message != null ? message : e.toString());
                }
            }
        } finally {
            closeConnection(ftpClient);
        }
    }

    // report failures: anything in the client's error stream or a missing result
    // NOTE(review): getFile() already registers its rejects in errorURL before it
    // throws, so such an entry gets registered a second time here - confirm intended.
    if (berr.size() > 0 || htCache == null) {
        final String detail = (berr.size() > 0) ? "\n    Errorlog: " + berr.toString() : "";
        log.logWarning("Unable to download URL " + entry.url().toString() + detail);
        sb.crawlQueues.errorURL.newEntry(entry, sb.webIndex.seedDB.mySeed().hash, new Date(), 1, ErrorURL.DENIED_SERVER_DOWNLOAD_ERROR);
    }

    return htCache;
}
/**
 * Ends the ftp session by issuing the client commands "close" and "exit", in that order.
 *
 * @param ftpClient the client whose session is terminated
 */
private void closeConnection(final ftpc ftpClient) {
    final String[] shutdownCommands = { "close", "exit" };
    for (final String command : shutdownCommands) {
        ftpClient.exec(command, false);
    }
}
/**
 * Establishes a connection to the ftp server (open, login, set transfer mode).
 * <p>
 * Credentials are taken from the user-info part of the url when it has the form
 * "user:password"; otherwise "anonymous" is used for both user name and password.
 * If the connection could be opened but the login fails, the connection is closed
 * again before this method returns.
 *
 * @param ftpClient the client to connect with
 * @param entryUrl  the url providing host, port (-1 selects the client's default)
 *                  and optional user-info
 * @return <code>true</code> if the client is connected, logged in and switched to
 *         binary transfer mode; <code>false</code> otherwise
 */
private boolean openConnection(final ftpc ftpClient, final yacyURL entryUrl) {
    // get username and password
    String userName = "anonymous", userPwd = "anonymous";
    final String userInfo = entryUrl.getUserInfo();
    if (userInfo != null) {
        final int pos = userInfo.indexOf(":");
        if (pos != -1) {
            userName = userInfo.substring(0, pos);
            userPwd = userInfo.substring(pos + 1);
        }
    }

    // get server name and port
    final String host = entryUrl.getHost();
    final int port = entryUrl.getPort();

    // open a connection to the ftp server
    if (port == -1) {
        ftpClient.exec("open " + host, false);
    } else {
        ftpClient.exec("open " + host + " " + port, false);
    }
    if (ftpClient.notConnected()) {
        return false;
    }

    // login to the server
    ftpClient.exec("user " + userName + " " + userPwd, false);
    if (!ftpClient.isLoggedIn()) {
        // The connection is open at this point, but load() only closes connections
        // that were reported as successfully opened - so release it here.
        closeConnection(ftpClient);
        return false;
    }

    // change transfer mode to binary
    ftpClient.exec("binary", false);
    return true;
}
/**
 * Downloads the file addressed by the given crawl entry and wraps it in a cache entry.
 * <p>
 * The mime type is derived from the file extension of the url. A download is only
 * started if the parser supports the content and the remote file size does not exceed
 * the configured maximum (a maximum of -1 disables the size check). Rejected entries
 * are logged and registered in the error-url list before the exception is thrown.
 *
 * @param ftpClient the client, expected to be connected and logged in
 * @param entry     the crawl entry to load
 * @return the cache entry with the downloaded bytes attached
 * @throws Exception if the content type is not supported or the file is too big;
 *                   presumably also for failures raised by the ftp client itself
 */
private indexDocumentMetadata getFile(final ftpc ftpClient, final CrawlEntry entry) throws Exception {
    final yacyURL url = entry.url();
    final String mimeType = plasmaParser.getMimeTypeByFileExt(plasmaParser.getFileExt(url));
    final String remotePath = getPath(url);

    // guard 1: the parser must be able to handle this mime type / extension
    if (!plasmaParser.supportedContent(plasmaParser.PARSER_MODE_CRAWLER, url, mimeType)) {
        log.logInfo("REJECTED WRONG MIME/EXT TYPE " + mimeType + " for URL " + entry.url().toString());
        sb.crawlQueues.errorURL.newEntry(entry, sb.webIndex.seedDB.mySeed().hash, new Date(), 1, ErrorURL.DENIED_WRONG_MIMETYPE_OR_EXT);
        throw new Exception("response has not the right file type -> rejected");
    }

    // guard 2: the file must fit into the configured limit, unless the limit is disabled
    final int size = ftpClient.fileSize(remotePath);
    final boolean limitEnabled = maxFileSize != -1;
    if (limitEnabled && size > maxFileSize) {
        log.logInfo("REJECTED TOO BIG FILE with size " + size + " Bytes for URL " + entry.url().toString());
        sb.crawlQueues.errorURL.newEntry(entry, sb.webIndex.seedDB.mySeed().hash, new Date(), 1,
                ErrorURL.DENIED_FILESIZE_LIMIT_EXCEEDED);
        throw new Exception("file size exceeds limit");
    }

    // accepted: adapt the data timeout to the file size, then fetch date and content
    ftpClient.setDataTimeoutByMaxFilesize(size);
    final Date fileDate = ftpClient.entryDate(remotePath);
    final indexDocumentMetadata cacheEntry = createCacheEntry(entry, mimeType, fileDate);
    cacheEntry.setCacheArray(ftpClient.get(remotePath));
    return cacheEntry;
}
/**
 * Returns the path of the url in a form suitable for ftp client commands:
 * url-decoded, with every double-quote doubled.
 *
 * @param entryUrl the url to take the path from
 * @return the unescaped path with escaped double-quotes
 */
private String getPath(final yacyURL entryUrl) {
    final String decodedPath = yacyURL.unescape(entryUrl.getPath());
    // paths are embedded into quoted client commands, so quotes must be doubled
    return decodedPath.replace("\"", "\"\"");
}
/**
 * Generates the html directory listing for the given remote path.
 * <p>
 * The listing is encoded as UTF-8 so that the cached bytes do not depend on the
 * default charset of the machine the crawler runs on.
 *
 * @param ftpClient the client, expected to be connected and logged in
 * @param entry     the crawl entry being loaded (used for log messages only)
 * @param path      the remote directory to list
 * @return the listing as bytes, or <code>null</code> if the client delivered no
 *         (or an empty) listing or the listing could not be encoded
 */
private byte[] generateDirlist(final ftpc ftpClient, final CrawlEntry entry, final String path) {
    final yacyURL entryUrl = entry.url();

    // generate the dirlist
    final StringBuilder dirList = ftpClient.dirhtml(path);
    if (dirList == null || dirList.length() == 0) {
        return null;
    }

    try {
        // explicit charset: getBytes() without argument uses the platform default
        return dirList.toString().getBytes("UTF-8");
    } catch (final java.io.UnsupportedEncodingException e) {
        log.logInfo("Unable to write dirlist for URL " + entryUrl.toString());
        return null;
    }
}
/**
 * Creates a new ftp client that reports its errors into the given buffer.
 * <p>
 * The client is constructed with <code>System.in</code> as input, no regular output
 * stream and a print stream wrapping <code>berr</code> as error stream; its data
 * timeout is initialised from the configured maximum file size.
 *
 * @param berr the buffer that collects the client's error output
 * @return the configured, not yet connected client
 */
private ftpc createFTPClient(final ByteArrayOutputStream berr) {
    final ftpc client = new ftpc(System.in, null, new PrintStream(berr));
    client.setDataTimeoutByMaxFilesize(maxFileSize);
    return client;
}
}