// FTPLoader.java
// -------------------------------------
// part of YACY
// (C) by Michael Peter Christen; mc@yacy.net
// first published on http://www.anomic.de
// Frankfurt, Germany, 2006
//
// This file is contributed by Martin Thelian
//
// $LastChangedDate$
// $LastChangedRevision$
// $LastChangedBy$
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
package net.yacy.crawler.retrieval;

import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.PrintStream;
import java.util.Date;

import net.yacy.cora.document.ASCII;
import net.yacy.cora.document.MultiProtocolURI;
import net.yacy.cora.document.UTF8;
import net.yacy.cora.protocol.HeaderFramework;
import net.yacy.cora.protocol.RequestHeader;
import net.yacy.cora.protocol.ResponseHeader;
import net.yacy.cora.protocol.ftp.FTPClient;
import net.yacy.cora.util.ConcurrentLog;
import net.yacy.crawler.data.CrawlProfile;
import net.yacy.crawler.data.Latency;
import net.yacy.crawler.data.ZURL.FailCategory;
import net.yacy.document.TextParser;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.search.Switchboard;
public class FTPLoader {
2006-09-04 16:38:29 +02:00
2010-12-11 01:31:57 +01:00
public static final long DEFAULT_MAXFILESIZE = 1024 * 1024 * 10 ;
2011-11-25 12:23:52 +01:00
2009-07-19 22:37:44 +02:00
private final Switchboard sb ;
2013-07-09 14:28:25 +02:00
private final ConcurrentLog log ;
2010-12-11 01:31:57 +01:00
private final long maxFileSize ;
2008-03-14 13:35:53 +01:00
2013-07-09 14:28:25 +02:00
public FTPLoader ( final Switchboard sb , final ConcurrentLog log ) {
2007-10-29 02:43:20 +01:00
this . sb = sb ;
this . log = log ;
2010-12-11 01:31:57 +01:00
this . maxFileSize = sb . getConfigLong ( " crawler.ftp.maxFileSize " , - 1l ) ;
2006-09-04 16:38:29 +02:00
}
2008-03-14 17:28:27 +01:00
/ * *
* Loads the entry from a ftp - server
2011-11-25 12:23:52 +01:00
*
2009-07-19 23:59:29 +02:00
* @param request
2008-03-14 17:28:27 +01:00
* @return
* /
2011-11-25 12:23:52 +01:00
public Response load ( final Request request , final boolean acceptOnlyParseable ) throws IOException {
2012-12-07 15:49:23 +01:00
Latency . updateBeforeLoad ( request . url ( ) ) ;
2011-11-25 12:23:52 +01:00
final long start = System . currentTimeMillis ( ) ;
2009-10-11 02:12:19 +02:00
final DigestURI entryUrl = request . url ( ) ;
2008-03-15 22:57:55 +01:00
final String fullPath = getPath ( entryUrl ) ;
// the return value
2009-07-19 23:59:29 +02:00
Response response = null ;
2008-03-14 17:28:27 +01:00
// determine filename and path
String file , path ;
if ( fullPath . endsWith ( " / " ) ) {
file = " " ;
path = fullPath ;
} else {
final int pos = fullPath . lastIndexOf ( " / " ) ;
if ( pos = = - 1 ) {
file = fullPath ;
path = " / " ;
} else {
path = fullPath . substring ( 0 , pos + 1 ) ;
file = fullPath . substring ( pos + 1 ) ;
2006-09-07 06:12:52 +02:00
}
2008-03-14 13:35:53 +01:00
}
2008-03-14 17:28:27 +01:00
assert path . endsWith ( " / " ) : " FTPLoader: path is not a path: ' " + path + " ' " ;
2008-03-14 13:35:53 +01:00
2008-03-14 17:28:27 +01:00
// stream for ftp-client errors
final ByteArrayOutputStream berr = new ByteArrayOutputStream ( ) ;
2008-03-14 13:35:53 +01:00
2010-03-11 16:43:06 +01:00
// create new ftp client
2010-08-23 00:51:31 +02:00
final FTPClient ftpClient = new FTPClient ( ) ;
2011-11-25 12:23:52 +01:00
2010-03-11 16:43:06 +01:00
// get a connection
2008-03-15 22:57:55 +01:00
if ( openConnection ( ftpClient , entryUrl ) ) {
2010-03-11 16:43:06 +01:00
// test if the specified file is a directory
if ( file . length ( ) > 0 ) {
ftpClient . exec ( " cd \" " + path + " \" " , false ) ;
final boolean isFolder = ftpClient . isFolder ( file ) ;
if ( isFolder ) {
path = fullPath + " / " ;
file = " " ;
2006-09-07 07:22:35 +02:00
}
2010-03-11 16:43:06 +01:00
}
2006-09-07 07:22:35 +02:00
2012-07-10 22:59:03 +02:00
if ( file . isEmpty ( ) ) {
2010-03-11 16:43:06 +01:00
// directory -> get list of files
2011-11-25 12:23:52 +01:00
final RequestHeader requestHeader = new RequestHeader ( ) ;
2010-03-11 16:43:06 +01:00
if ( request . referrerhash ( ) ! = null ) {
2012-06-28 14:27:29 +02:00
final DigestURI u = this . sb . getURL ( request . referrerhash ( ) ) ;
2012-10-10 11:46:22 +02:00
if ( u ! = null ) requestHeader . put ( RequestHeader . REFERER , u . toNormalform ( true ) ) ;
2010-03-11 16:43:06 +01:00
}
2011-11-25 12:23:52 +01:00
final StringBuilder dirList = ftpClient . dirhtml ( path ) ;
2009-07-23 23:31:51 +02:00
2010-03-11 16:43:06 +01:00
if ( dirList = = null ) {
response = null ;
2008-03-15 22:57:55 +01:00
} else {
2012-06-25 18:17:31 +02:00
final ResponseHeader responseHeader = new ResponseHeader ( 200 ) ;
2010-08-23 14:32:02 +02:00
responseHeader . put ( HeaderFramework . LAST_MODIFIED , HeaderFramework . formatRFC1123 ( new Date ( ) ) ) ;
2010-03-11 16:43:06 +01:00
responseHeader . put ( HeaderFramework . CONTENT_TYPE , " text/html " ) ;
2012-10-09 12:14:28 +02:00
final CrawlProfile profile = this . sb . crawler . getActive ( ASCII . getBytes ( request . profileHandle ( ) ) ) ;
2010-03-11 16:43:06 +01:00
response = new Response (
2011-11-25 12:23:52 +01:00
request ,
2010-03-11 16:43:06 +01:00
requestHeader ,
responseHeader ,
2011-02-12 01:01:40 +01:00
profile ,
2012-05-21 03:03:47 +02:00
false ,
2012-10-09 12:14:28 +02:00
UTF8 . getBytes ( dirList . toString ( ) ) ) ;
2010-03-11 16:43:06 +01:00
}
} else {
// file -> download
try {
response = getFile ( ftpClient , request , acceptOnlyParseable ) ;
} catch ( final Exception e ) {
// add message to errorLog
2013-07-09 14:28:25 +02:00
ConcurrentLog . logException ( e ) ;
2010-03-11 16:43:06 +01:00
( new PrintStream ( berr ) ) . print ( e . getMessage ( ) ) ;
2006-09-07 07:22:35 +02:00
}
2010-03-11 16:43:06 +01:00
}
2008-11-11 22:33:40 +01:00
closeConnection ( ftpClient ) ;
2008-03-15 22:57:55 +01:00
}
2006-09-07 07:22:35 +02:00
2008-03-15 22:57:55 +01:00
// pass the downloaded resource to the cache manager
2009-07-19 23:59:29 +02:00
if ( berr . size ( ) > 0 | | response = = null ) {
2008-03-15 22:57:55 +01:00
// some error logging
2010-12-02 12:05:04 +01:00
final String detail = ( berr . size ( ) > 0 ) ? " Errorlog: " + berr . toString ( ) : " " ;
2012-10-09 12:14:28 +02:00
this . sb . crawlQueues . errorURL . push ( request , ASCII . getBytes ( this . sb . peers . mySeed ( ) . hash ) , new Date ( ) , 1 , FailCategory . TEMPORARY_NETWORK_FAILURE , " ftp server download, " + detail , - 1 ) ;
2010-12-02 12:05:04 +01:00
throw new IOException ( " FTPLoader: Unable to download URL ' " + request . url ( ) . toString ( ) + " ': " + detail ) ;
2008-03-15 22:57:55 +01:00
}
2011-11-25 12:23:52 +01:00
2012-10-28 13:24:49 +01:00
Latency . updateAfterLoad ( request . url ( ) , System . currentTimeMillis ( ) - start ) ;
2009-07-19 23:59:29 +02:00
return response ;
2008-03-15 22:57:55 +01:00
}
2008-03-14 17:28:27 +01:00
/ * *
* @param ftpClient
* /
2010-08-23 00:32:39 +02:00
private void closeConnection ( final FTPClient ftpClient ) {
2008-03-14 17:28:27 +01:00
// closing connection
ftpClient . exec ( " close " , false ) ;
ftpClient . exec ( " exit " , false ) ;
}
/ * *
* establish a connection to the ftp server ( open , login , set transfer mode )
* /
2010-08-23 00:32:39 +02:00
private boolean openConnection ( final FTPClient ftpClient , final DigestURI entryUrl ) {
2008-03-14 17:28:27 +01:00
// get username and password
final String userInfo = entryUrl . getUserInfo ( ) ;
String userName = " anonymous " , userPwd = " anonymous " ;
if ( userInfo ! = null ) {
2011-11-25 12:23:52 +01:00
final int pos = userInfo . indexOf ( ':' , 0 ) ;
2008-03-14 17:28:27 +01:00
if ( pos ! = - 1 ) {
userName = userInfo . substring ( 0 , pos ) ;
userPwd = userInfo . substring ( pos + 1 ) ;
}
}
// get server name and port
final String host = entryUrl . getHost ( ) ;
final int port = entryUrl . getPort ( ) ;
// open a connection to the ftp server
if ( port = = - 1 ) {
ftpClient . exec ( " open " + host , false ) ;
} else {
ftpClient . exec ( " open " + host + " " + port , false ) ;
2008-03-14 13:35:53 +01:00
}
2008-03-15 22:57:55 +01:00
if ( ftpClient . notConnected ( ) ) {
return false ;
}
2008-03-14 17:28:27 +01:00
// login to the server
ftpClient . exec ( " user " + userName + " " + userPwd , false ) ;
2008-03-15 22:57:55 +01:00
if ( ftpClient . isLoggedIn ( ) ) {
// change transfer mode to binary
ftpClient . exec ( " binary " , false ) ;
} else {
return false ;
}
return true ;
2008-03-14 17:28:27 +01:00
}
2011-11-25 12:23:52 +01:00
private Response getFile ( final FTPClient ftpClient , final Request request , final boolean acceptOnlyParseable ) throws IOException {
2008-03-14 17:28:27 +01:00
// determine the mimetype of the resource
2010-03-11 16:43:06 +01:00
final DigestURI url = request . url ( ) ;
final String mime = TextParser . mimeOf ( url ) ;
final String path = getPath ( url ) ;
2008-03-14 17:28:27 +01:00
2010-03-11 16:43:06 +01:00
// determine the file date
final Date fileDate = ftpClient . entryDate ( path ) ;
2011-11-25 12:23:52 +01:00
2010-03-11 16:43:06 +01:00
// create response header
2011-11-25 12:23:52 +01:00
final RequestHeader requestHeader = new RequestHeader ( ) ;
2010-12-02 12:05:04 +01:00
if ( request . referrerhash ( ) ! = null ) {
2012-06-28 14:27:29 +02:00
final DigestURI refurl = this . sb . getURL ( request . referrerhash ( ) ) ;
2012-10-10 11:46:22 +02:00
if ( refurl ! = null ) requestHeader . put ( RequestHeader . REFERER , refurl . toNormalform ( true ) ) ;
2010-12-02 12:05:04 +01:00
}
2012-06-25 18:17:31 +02:00
final ResponseHeader responseHeader = new ResponseHeader ( 200 ) ;
2010-08-23 14:32:02 +02:00
responseHeader . put ( HeaderFramework . LAST_MODIFIED , HeaderFramework . formatRFC1123 ( fileDate ) ) ;
2010-03-11 16:43:06 +01:00
responseHeader . put ( HeaderFramework . CONTENT_TYPE , mime ) ;
2011-11-25 12:23:52 +01:00
2010-03-11 16:43:06 +01:00
// if the mimetype and file extension is supported we start to download the file
2010-12-11 01:31:57 +01:00
final long size = ftpClient . fileSize ( path ) ;
2010-12-28 03:15:22 +01:00
responseHeader . put ( HeaderFramework . CONTENT_LENGTH , String . valueOf ( size ) ) ;
2010-03-11 16:43:06 +01:00
String parserError = null ;
if ( ( acceptOnlyParseable & & ( parserError = TextParser . supports ( url , mime ) ) ! = null ) | |
2011-11-25 12:23:52 +01:00
( size > this . maxFileSize & & this . maxFileSize > = 0 ) ) {
2010-03-11 16:43:06 +01:00
// we know that we cannot process that file before loading
// only the metadata is returned
2011-11-25 12:23:52 +01:00
2010-03-11 16:43:06 +01:00
if ( parserError ! = null ) {
2013-07-09 14:28:25 +02:00
this . log . info ( " No parser available in FTP crawler: ' " + parserError + " ' for URL " + request . url ( ) . toString ( ) + " : parsing only metadata " ) ;
2008-03-14 17:28:27 +01:00
} else {
2013-07-09 14:28:25 +02:00
this . log . info ( " Too big file in FTP crawler with size = " + size + " Bytes for URL " + request . url ( ) . toString ( ) + " : parsing only metadata " ) ;
2008-03-14 17:28:27 +01:00
}
2011-11-25 12:23:52 +01:00
2010-03-11 16:43:06 +01:00
// create response with metadata only
responseHeader . put ( HeaderFramework . CONTENT_TYPE , " text/plain " ) ;
2012-10-09 12:14:28 +02:00
final CrawlProfile profile = this . sb . crawler . getActive ( ASCII . getBytes ( request . profileHandle ( ) ) ) ;
2011-11-25 12:23:52 +01:00
final Response response = new Response (
request ,
2010-03-11 16:43:06 +01:00
requestHeader ,
responseHeader ,
2011-02-12 01:01:40 +01:00
profile ,
2012-05-21 03:03:47 +02:00
false ,
2010-12-28 03:15:22 +01:00
null ) ;
2010-03-11 16:43:06 +01:00
return response ;
2008-03-14 17:28:27 +01:00
}
2011-11-25 12:23:52 +01:00
2010-03-11 16:43:06 +01:00
// download the remote file
2011-11-25 12:23:52 +01:00
final byte [ ] b = ftpClient . get ( path ) ;
2010-03-11 16:43:06 +01:00
// create a response
2012-10-09 12:14:28 +02:00
final CrawlProfile profile = this . sb . crawler . getActive ( ASCII . getBytes ( request . profileHandle ( ) ) ) ;
2011-11-25 12:23:52 +01:00
final Response response = new Response (
request ,
2010-03-11 16:43:06 +01:00
requestHeader ,
responseHeader ,
2011-02-12 01:01:40 +01:00
profile ,
2012-05-21 03:03:47 +02:00
false ,
2010-03-11 16:43:06 +01:00
b ) ;
2009-07-19 23:59:29 +02:00
return response ;
2008-03-14 17:28:27 +01:00
}
2008-03-15 22:57:55 +01:00
/ * *
* gets path suitable for FTP ( url - decoded , double - quotes escaped )
2011-11-25 12:23:52 +01:00
*
2008-03-15 22:57:55 +01:00
* @param entryUrl
* @return
* /
2010-05-25 14:54:57 +02:00
private String getPath ( final MultiProtocolURI entryUrl ) {
2010-12-18 11:22:54 +01:00
return MultiProtocolURI . unescape ( entryUrl . getPath ( ) ) . replace ( " \" " , " \" \" " ) ;
2008-03-15 22:57:55 +01:00
}
2006-09-04 16:38:29 +02:00
}