// FTPLoader.java
// -------------------------------------
// part of YACY
// (C) by Michael Peter Christen; mc@yacy.net
// first published on http://www.anomic.de
// Frankfurt, Germany, 2006
//
// This file is contributed by Martin Thelian
//
// $LastChangedDate$
// $LastChangedRevision$
// $LastChangedBy$
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
2009-07-15 23:07:46 +02:00
package de.anomic.crawler.retrieval ;
2006-09-04 16:38:29 +02:00
import java.io.ByteArrayOutputStream ;
2008-11-22 01:40:18 +01:00
import java.io.IOException ;
2006-09-04 16:38:29 +02:00
import java.io.PrintStream ;
2006-09-07 06:12:52 +02:00
import java.util.Date ;
2006-09-04 16:38:29 +02:00
2010-05-25 14:54:57 +02:00
import net.yacy.cora.document.MultiProtocolURI ;
2010-08-23 14:32:02 +02:00
import net.yacy.cora.protocol.HeaderFramework ;
import net.yacy.cora.protocol.RequestHeader ;
import net.yacy.cora.protocol.ResponseHeader ;
2010-08-23 00:32:39 +02:00
import net.yacy.cora.protocol.ftp.FTPClient ;
2009-10-20 00:34:44 +02:00
import net.yacy.document.TextParser ;
2009-10-11 02:12:19 +02:00
import net.yacy.kelondro.data.meta.DigestURI ;
2009-10-10 01:13:30 +02:00
import net.yacy.kelondro.logging.Log ;
2010-08-31 17:47:47 +02:00
import de.anomic.crawler.CrawlProfile ;
2009-07-15 23:07:46 +02:00
import de.anomic.crawler.Latency ;
2011-05-26 12:57:02 +02:00
import de.anomic.crawler.ZURL.FailCategory ;
2009-10-11 02:12:19 +02:00
import de.anomic.search.Segments ;
2009-07-19 22:37:44 +02:00
import de.anomic.search.Switchboard ;
2006-09-04 16:38:29 +02:00
2008-05-06 02:32:41 +02:00
public class FTPLoader {
2006-09-04 16:38:29 +02:00
2010-12-11 01:31:57 +01:00
public static final long DEFAULT_MAXFILESIZE = 1024 * 1024 * 10 ;
2009-07-19 22:37:44 +02:00
private final Switchboard sb ;
2009-01-31 00:33:47 +01:00
private final Log log ;
2010-12-11 01:31:57 +01:00
private final long maxFileSize ;
2008-03-14 13:35:53 +01:00
2009-07-19 22:37:44 +02:00
public FTPLoader ( final Switchboard sb , final Log log ) {
2007-10-29 02:43:20 +01:00
this . sb = sb ;
this . log = log ;
2010-12-11 01:31:57 +01:00
this . maxFileSize = sb . getConfigLong ( " crawler.ftp.maxFileSize " , - 1l ) ;
2006-09-04 16:38:29 +02:00
}
2008-03-14 17:28:27 +01:00
/ * *
* Loads the entry from a ftp - server
*
2009-07-19 23:59:29 +02:00
* @param request
2008-03-14 17:28:27 +01:00
* @return
* /
2010-03-11 16:43:06 +01:00
public Response load ( final Request request , boolean acceptOnlyParseable ) throws IOException {
2009-03-20 11:21:23 +01:00
long start = System . currentTimeMillis ( ) ;
2009-10-11 02:12:19 +02:00
final DigestURI entryUrl = request . url ( ) ;
2008-03-15 22:57:55 +01:00
final String fullPath = getPath ( entryUrl ) ;
// the return value
2009-07-19 23:59:29 +02:00
Response response = null ;
2008-03-14 17:28:27 +01:00
// determine filename and path
String file , path ;
if ( fullPath . endsWith ( " / " ) ) {
file = " " ;
path = fullPath ;
} else {
final int pos = fullPath . lastIndexOf ( " / " ) ;
if ( pos = = - 1 ) {
file = fullPath ;
path = " / " ;
} else {
path = fullPath . substring ( 0 , pos + 1 ) ;
file = fullPath . substring ( pos + 1 ) ;
2006-09-07 06:12:52 +02:00
}
2008-03-14 13:35:53 +01:00
}
2008-03-14 17:28:27 +01:00
assert path . endsWith ( " / " ) : " FTPLoader: path is not a path: ' " + path + " ' " ;
2008-03-14 13:35:53 +01:00
2008-03-14 17:28:27 +01:00
// stream for ftp-client errors
final ByteArrayOutputStream berr = new ByteArrayOutputStream ( ) ;
2008-03-14 13:35:53 +01:00
2010-03-11 16:43:06 +01:00
// create new ftp client
2010-08-23 00:51:31 +02:00
final FTPClient ftpClient = new FTPClient ( ) ;
2010-03-11 16:43:06 +01:00
// get a connection
2008-03-15 22:57:55 +01:00
if ( openConnection ( ftpClient , entryUrl ) ) {
2010-03-11 16:43:06 +01:00
// test if the specified file is a directory
if ( file . length ( ) > 0 ) {
ftpClient . exec ( " cd \" " + path + " \" " , false ) ;
final boolean isFolder = ftpClient . isFolder ( file ) ;
if ( isFolder ) {
path = fullPath + " / " ;
file = " " ;
2006-09-07 07:22:35 +02:00
}
2010-03-11 16:43:06 +01:00
}
2006-09-07 07:22:35 +02:00
2010-03-11 16:43:06 +01:00
if ( file . length ( ) = = 0 ) {
// directory -> get list of files
RequestHeader requestHeader = new RequestHeader ( ) ;
if ( request . referrerhash ( ) ! = null ) {
DigestURI u = sb . getURL ( Segments . Process . LOCALCRAWLING , request . referrerhash ( ) ) ;
if ( u ! = null ) requestHeader . put ( RequestHeader . REFERER , u . toNormalform ( true , false ) ) ;
}
StringBuilder dirList = ftpClient . dirhtml ( path ) ;
2009-07-23 23:31:51 +02:00
2010-03-11 16:43:06 +01:00
if ( dirList = = null ) {
response = null ;
2008-03-15 22:57:55 +01:00
} else {
2010-03-11 16:43:06 +01:00
ResponseHeader responseHeader = new ResponseHeader ( ) ;
2010-08-23 14:32:02 +02:00
responseHeader . put ( HeaderFramework . LAST_MODIFIED , HeaderFramework . formatRFC1123 ( new Date ( ) ) ) ;
2010-03-11 16:43:06 +01:00
responseHeader . put ( HeaderFramework . CONTENT_TYPE , " text/html " ) ;
2011-02-12 01:01:40 +01:00
final CrawlProfile profile = sb . crawler . getActive ( request . profileHandle ( ) . getBytes ( ) ) ;
2010-03-11 16:43:06 +01:00
response = new Response (
request ,
requestHeader ,
responseHeader ,
" 200 " ,
2011-02-12 01:01:40 +01:00
profile ,
2010-03-11 16:43:06 +01:00
dirList . toString ( ) . getBytes ( ) ) ;
}
} else {
// file -> download
try {
response = getFile ( ftpClient , request , acceptOnlyParseable ) ;
} catch ( final Exception e ) {
// add message to errorLog
2011-05-26 18:34:35 +02:00
Log . logException ( e ) ;
2010-03-11 16:43:06 +01:00
( new PrintStream ( berr ) ) . print ( e . getMessage ( ) ) ;
2006-09-07 07:22:35 +02:00
}
2010-03-11 16:43:06 +01:00
}
2008-11-11 22:33:40 +01:00
closeConnection ( ftpClient ) ;
2008-03-15 22:57:55 +01:00
}
2006-09-07 07:22:35 +02:00
2008-03-15 22:57:55 +01:00
// pass the downloaded resource to the cache manager
2009-07-19 23:59:29 +02:00
if ( berr . size ( ) > 0 | | response = = null ) {
2008-03-15 22:57:55 +01:00
// some error logging
2010-12-02 12:05:04 +01:00
final String detail = ( berr . size ( ) > 0 ) ? " Errorlog: " + berr . toString ( ) : " " ;
2011-05-26 12:57:02 +02:00
sb . crawlQueues . errorURL . push ( request , sb . peers . mySeed ( ) . hash . getBytes ( ) , new Date ( ) , 1 , FailCategory . TEMPORARY_NETWORK_FAILURE , " ftp server download, " + detail , - 1 ) ;
2010-12-02 12:05:04 +01:00
throw new IOException ( " FTPLoader: Unable to download URL ' " + request . url ( ) . toString ( ) + " ': " + detail ) ;
2008-03-15 22:57:55 +01:00
}
2009-03-20 11:21:23 +01:00
2010-05-26 02:01:16 +02:00
Latency . update ( request . url ( ) , System . currentTimeMillis ( ) - start ) ;
2009-07-19 23:59:29 +02:00
return response ;
2008-03-15 22:57:55 +01:00
}
2008-03-14 17:28:27 +01:00
/ * *
* @param ftpClient
* /
2010-08-23 00:32:39 +02:00
private void closeConnection ( final FTPClient ftpClient ) {
2008-03-14 17:28:27 +01:00
// closing connection
ftpClient . exec ( " close " , false ) ;
ftpClient . exec ( " exit " , false ) ;
}
/ * *
* establish a connection to the ftp server ( open , login , set transfer mode )
* /
2010-08-23 00:32:39 +02:00
private boolean openConnection ( final FTPClient ftpClient , final DigestURI entryUrl ) {
2008-03-14 17:28:27 +01:00
// get username and password
final String userInfo = entryUrl . getUserInfo ( ) ;
String userName = " anonymous " , userPwd = " anonymous " ;
if ( userInfo ! = null ) {
final int pos = userInfo . indexOf ( " : " ) ;
if ( pos ! = - 1 ) {
userName = userInfo . substring ( 0 , pos ) ;
userPwd = userInfo . substring ( pos + 1 ) ;
}
}
// get server name and port
final String host = entryUrl . getHost ( ) ;
final int port = entryUrl . getPort ( ) ;
// open a connection to the ftp server
if ( port = = - 1 ) {
ftpClient . exec ( " open " + host , false ) ;
} else {
ftpClient . exec ( " open " + host + " " + port , false ) ;
2008-03-14 13:35:53 +01:00
}
2008-03-15 22:57:55 +01:00
if ( ftpClient . notConnected ( ) ) {
return false ;
}
2008-03-14 17:28:27 +01:00
// login to the server
ftpClient . exec ( " user " + userName + " " + userPwd , false ) ;
2008-03-15 22:57:55 +01:00
if ( ftpClient . isLoggedIn ( ) ) {
// change transfer mode to binary
ftpClient . exec ( " binary " , false ) ;
} else {
return false ;
}
return true ;
2008-03-14 17:28:27 +01:00
}
2010-12-02 12:05:04 +01:00
private Response getFile ( final FTPClient ftpClient , final Request request , boolean acceptOnlyParseable ) throws IOException {
2008-03-14 17:28:27 +01:00
// determine the mimetype of the resource
2010-03-11 16:43:06 +01:00
final DigestURI url = request . url ( ) ;
final String mime = TextParser . mimeOf ( url ) ;
final String path = getPath ( url ) ;
2008-03-14 17:28:27 +01:00
2010-03-11 16:43:06 +01:00
// determine the file date
final Date fileDate = ftpClient . entryDate ( path ) ;
// create response header
RequestHeader requestHeader = new RequestHeader ( ) ;
2010-12-02 12:05:04 +01:00
if ( request . referrerhash ( ) ! = null ) {
DigestURI refurl = sb . getURL ( Segments . Process . LOCALCRAWLING , request . referrerhash ( ) ) ;
if ( refurl ! = null ) requestHeader . put ( RequestHeader . REFERER , refurl . toNormalform ( true , false ) ) ;
}
2010-03-11 16:43:06 +01:00
ResponseHeader responseHeader = new ResponseHeader ( ) ;
2010-08-23 14:32:02 +02:00
responseHeader . put ( HeaderFramework . LAST_MODIFIED , HeaderFramework . formatRFC1123 ( fileDate ) ) ;
2010-03-11 16:43:06 +01:00
responseHeader . put ( HeaderFramework . CONTENT_TYPE , mime ) ;
// if the mimetype and file extension is supported we start to download the file
2010-12-11 01:31:57 +01:00
final long size = ftpClient . fileSize ( path ) ;
2010-12-28 03:15:22 +01:00
responseHeader . put ( HeaderFramework . CONTENT_LENGTH , String . valueOf ( size ) ) ;
2010-03-11 16:43:06 +01:00
String parserError = null ;
if ( ( acceptOnlyParseable & & ( parserError = TextParser . supports ( url , mime ) ) ! = null ) | |
( size > maxFileSize & & maxFileSize > = 0 ) ) {
// we know that we cannot process that file before loading
// only the metadata is returned
if ( parserError ! = null ) {
log . logInfo ( " No parser available in FTP crawler: ' " + parserError + " ' for URL " + request . url ( ) . toString ( ) + " : parsing only metadata " ) ;
2008-03-14 17:28:27 +01:00
} else {
2010-03-11 16:43:06 +01:00
log . logInfo ( " Too big file in FTP crawler with size = " + size + " Bytes for URL " + request . url ( ) . toString ( ) + " : parsing only metadata " ) ;
2008-03-14 17:28:27 +01:00
}
2010-03-11 16:43:06 +01:00
// create response with metadata only
responseHeader . put ( HeaderFramework . CONTENT_TYPE , " text/plain " ) ;
2011-02-12 01:01:40 +01:00
final CrawlProfile profile = sb . crawler . getActive ( request . profileHandle ( ) . getBytes ( ) ) ;
2010-03-11 16:43:06 +01:00
Response response = new Response (
request ,
requestHeader ,
responseHeader ,
" 200 " ,
2011-02-12 01:01:40 +01:00
profile ,
2010-12-28 03:15:22 +01:00
null ) ;
2010-03-11 16:43:06 +01:00
return response ;
2008-03-14 17:28:27 +01:00
}
2010-03-11 16:43:06 +01:00
// download the remote file
byte [ ] b = ftpClient . get ( path ) ;
// create a response
2011-02-12 01:01:40 +01:00
final CrawlProfile profile = sb . crawler . getActive ( request . profileHandle ( ) . getBytes ( ) ) ;
2010-03-11 16:43:06 +01:00
Response response = new Response (
request ,
requestHeader ,
responseHeader ,
" 200 " ,
2011-02-12 01:01:40 +01:00
profile ,
2010-03-11 16:43:06 +01:00
b ) ;
2009-07-19 23:59:29 +02:00
return response ;
2008-03-14 17:28:27 +01:00
}
2008-03-15 22:57:55 +01:00
/ * *
* gets path suitable for FTP ( url - decoded , double - quotes escaped )
*
* @param entryUrl
* @return
* /
2010-05-25 14:54:57 +02:00
private String getPath ( final MultiProtocolURI entryUrl ) {
2010-12-18 11:22:54 +01:00
return MultiProtocolURI . unescape ( entryUrl . getPath ( ) ) . replace ( " \" " , " \" \" " ) ;
2008-03-15 22:57:55 +01:00
}
2006-09-04 16:38:29 +02:00
}