// FTPLoader.java // ------------------------------------- // part of YACY // (C) by Michael Peter Christen; mc@yacy.net // first published on http://www.anomic.de // Frankfurt, Germany, 2006 // // This file ist contributed by Martin Thelian // // $LastChangedDate: 2006-02-20 23:57:42 +0100 (Mo, 20 Feb 2006) $ // $LastChangedRevision: 1715 $ // $LastChangedBy: theli $ // // This program is free software; you can redistribute it and/or modify // it under the terms of the GNU General Public License as published by // the Free Software Foundation; either version 2 of the License, or // (at your option) any later version. // // This program is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU General Public License for more details. // // You should have received a copy of the GNU General Public License // along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA package de.anomic.crawler.retrieval; import java.io.ByteArrayOutputStream; import java.io.IOException; import java.io.PrintStream; import java.util.Date; import net.yacy.cora.document.MultiProtocolURI; import net.yacy.document.TextParser; import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.kelondro.logging.Log; import net.yacy.kelondro.util.DateFormatter; import de.anomic.crawler.Latency; import de.anomic.http.server.HeaderFramework; import de.anomic.http.server.RequestHeader; import de.anomic.http.server.ResponseHeader; import de.anomic.net.ftpc; import de.anomic.search.Segments; import de.anomic.search.Switchboard; public class FTPLoader { private final Switchboard sb; private final Log log; private final int maxFileSize; public FTPLoader(final Switchboard sb, final Log log) { this.sb = sb; this.log = log; this.maxFileSize = (int) sb.getConfigLong("crawler.ftp.maxFileSize", -1l); } /** * Loads the entry from a ftp-server * * @param request * @return */ public Response load(final Request request, boolean acceptOnlyParseable) throws IOException { long start = System.currentTimeMillis(); final DigestURI entryUrl = request.url(); final String fullPath = getPath(entryUrl); // the return value Response response = null; // determine filename and path String file, path; if (fullPath.endsWith("/")) { file = ""; path = fullPath; } else { final int pos = fullPath.lastIndexOf("/"); if (pos == -1) { file = fullPath; path = "/"; } else { path = fullPath.substring(0, pos + 1); file = fullPath.substring(pos + 1); } } assert path.endsWith("/") : "FTPLoader: path is not a path: '" + path + "'"; // stream for ftp-client errors final ByteArrayOutputStream berr = new ByteArrayOutputStream(); // create new ftp client final PrintStream err = new PrintStream(berr); final ftpc ftpClient = new ftpc(System.in, null, err); ftpClient.setDataTimeoutByMaxFilesize(maxFileSize); // get a connection if (openConnection(ftpClient, entryUrl)) { // test if the specified file is a directory if (file.length() > 0) { ftpClient.exec("cd \"" + path + "\"", false); final boolean isFolder = ftpClient.isFolder(file); if (isFolder) { path = fullPath + "/"; file = ""; } } if (file.length() == 0) { // directory -> get list of files RequestHeader requestHeader = new RequestHeader(); if (request.referrerhash() != null) { DigestURI u = sb.getURL(Segments.Process.LOCALCRAWLING, request.referrerhash()); if (u != null) requestHeader.put(RequestHeader.REFERER, u.toNormalform(true, false)); } StringBuilder dirList = ftpClient.dirhtml(path); if (dirList == null) { response = null; } else { ResponseHeader responseHeader = new ResponseHeader(); responseHeader.put(HeaderFramework.LAST_MODIFIED, DateFormatter.formatRFC1123(new Date())); responseHeader.put(HeaderFramework.CONTENT_TYPE, "text/html"); response = new Response( request, requestHeader, responseHeader, "200", sb.crawler.profilesActiveCrawls.getEntry(request.profileHandle()), dirList.toString().getBytes()); } } else { // file -> download try { response = getFile(ftpClient, request, acceptOnlyParseable); } catch (final Exception e) { // add message to errorLog (new PrintStream(berr)).print(e.getMessage()); } } closeConnection(ftpClient); } // pass the downloaded resource to the cache manager if (berr.size() > 0 || response == null) { // some error logging final String detail = (berr.size() > 0) ? "\n Errorlog: " + berr.toString() : ""; sb.crawlQueues.errorURL.push(request, sb.peers.mySeed().hash.getBytes(), new Date(), 1, "server download" + detail); throw new IOException("FTPLoader: Unable to download URL " + request.url().toString() + detail); } Latency.update(new String(request.url().hash()).substring(6), request.url().getHost(), System.currentTimeMillis() - start); return response; } /** * @param ftpClient */ private void closeConnection(final ftpc ftpClient) { // closing connection ftpClient.exec("close", false); ftpClient.exec("exit", false); } /** * establish a connection to the ftp server (open, login, set transfer mode) */ private boolean openConnection(final ftpc ftpClient, final DigestURI entryUrl) { // get username and password final String userInfo = entryUrl.getUserInfo(); String userName = "anonymous", userPwd = "anonymous"; if (userInfo != null) { final int pos = userInfo.indexOf(":"); if (pos != -1) { userName = userInfo.substring(0, pos); userPwd = userInfo.substring(pos + 1); } } // get server name and port final String host = entryUrl.getHost(); final int port = entryUrl.getPort(); // open a connection to the ftp server if (port == -1) { ftpClient.exec("open " + host, false); } else { ftpClient.exec("open " + host + " " + port, false); } if (ftpClient.notConnected()) { return false; } // login to the server ftpClient.exec("user " + userName + " " + userPwd, false); if (ftpClient.isLoggedIn()) { // change transfer mode to binary ftpClient.exec("binary", false); } else { return false; } return true; } private Response getFile(final ftpc ftpClient, final Request request, boolean acceptOnlyParseable) throws Exception { // determine the mimetype of the resource final DigestURI url = request.url(); final String mime = TextParser.mimeOf(url); final String path = getPath(url); // determine the file date final Date fileDate = ftpClient.entryDate(path); // create response header RequestHeader requestHeader = new RequestHeader(); if (request.referrerhash() != null) requestHeader.put(RequestHeader.REFERER, sb.getURL(Segments.Process.LOCALCRAWLING, request.referrerhash()).toNormalform(true, false)); ResponseHeader responseHeader = new ResponseHeader(); responseHeader.put(HeaderFramework.LAST_MODIFIED, DateFormatter.formatRFC1123(fileDate)); responseHeader.put(HeaderFramework.CONTENT_TYPE, mime); // if the mimetype and file extension is supported we start to download the file final int size = ftpClient.fileSize(path); String parserError = null; if ((acceptOnlyParseable && (parserError = TextParser.supports(url, mime)) != null) || (size > maxFileSize && maxFileSize >= 0)) { // we know that we cannot process that file before loading // only the metadata is returned if (parserError != null) { log.logInfo("No parser available in FTP crawler: '" + parserError + "' for URL " + request.url().toString() + ": parsing only metadata"); } else { log.logInfo("Too big file in FTP crawler with size = " + size + " Bytes for URL " + request.url().toString() + ": parsing only metadata"); } // create response with metadata only responseHeader.put(HeaderFramework.CONTENT_TYPE, "text/plain"); Response response = new Response( request, requestHeader, responseHeader, "200", sb.crawler.profilesActiveCrawls.getEntry(request.profileHandle()), url.toNormalform(true, true).getBytes()); return response; } // timeout for download ftpClient.setDataTimeoutByMaxFilesize(size); // download the remote file byte[] b = ftpClient.get(path); // create a response Response response = new Response( request, requestHeader, responseHeader, "200", sb.crawler.profilesActiveCrawls.getEntry(request.profileHandle()), b); return response; } /** * gets path suitable for FTP (url-decoded, double-quotes escaped) * * @param entryUrl * @return */ private String getPath(final MultiProtocolURI entryUrl) { return MultiProtocolURI.unescape(entryUrl.getPath()).replace("\"", "\"\""); } }