2006-03-21 21:55:59 +01:00
// httpdProxyHandler.java
2008-05-06 01:13:47 +02:00
// (C) 2004 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany
// first published 2004 on http://yacy.net
2006-03-21 21:55:59 +01:00
//
// $LastChangedDate$
// $LastChangedRevision$
// $LastChangedBy$
2005-04-07 21:19:42 +02:00
//
2008-05-06 01:13:47 +02:00
// LICENSE
//
2005-04-07 21:19:42 +02:00
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
// Contributions:
// [AS] Alexander Schier: Blacklist (404 response for AGIS hosts)
// [TL] Timo Leise: url-wildcards for blacklists
/ *
Class documentation :
This class is a servlet to the httpd daemon. It is accessed each time
a URL in a GET, HEAD or POST command contains the whole host information
or a host is given in the header host field of an HTTP/1.0 or HTTP/1.1
command.
Transparency is maintained whenever appropriate. We change header
2008-05-06 01:13:47 +02:00
attributes if necessary for the indexing mechanism; i.e. we do not
2005-04-07 21:19:42 +02:00
support gzip-ed encoding. We also do not support unrealistic
'expires' values that would force a cache to be flushed immediately.
Pragma no-cache attributes are supported.
* /
package de.anomic.http ;
2005-11-22 23:07:29 +01:00
import java.io.BufferedReader ;
2008-04-08 23:17:40 +02:00
import java.io.ByteArrayInputStream ;
2008-04-05 15:17:16 +02:00
import java.io.ByteArrayOutputStream ;
2005-05-05 07:32:19 +02:00
import java.io.File ;
import java.io.IOException ;
import java.io.InputStream ;
2005-11-22 23:07:29 +01:00
import java.io.InputStreamReader ;
2005-05-05 07:32:19 +02:00
import java.io.OutputStream ;
2005-11-22 23:07:29 +01:00
import java.io.PrintWriter ;
2005-06-23 13:00:26 +02:00
import java.net.BindException ;
import java.net.ConnectException ;
2006-02-14 10:55:09 +01:00
import java.net.InetAddress ;
2005-05-05 07:32:19 +02:00
import java.net.MalformedURLException ;
2005-06-23 13:00:26 +02:00
import java.net.NoRouteToHostException ;
2005-05-05 07:32:19 +02:00
import java.net.Socket ;
2008-12-23 12:30:24 +01:00
import java.net.SocketException ;
2005-11-25 01:40:35 +01:00
import java.net.SocketTimeoutException ;
2005-06-23 13:00:26 +02:00
import java.net.UnknownHostException ;
2006-02-14 10:55:09 +01:00
import java.util.Arrays ;
2005-05-05 07:32:19 +02:00
import java.util.Date ;
import java.util.HashSet ;
2006-02-14 10:55:09 +01:00
import java.util.Iterator ;
2005-05-05 07:32:19 +02:00
import java.util.Properties ;
2005-06-09 12:22:05 +02:00
import java.util.logging.FileHandler ;
import java.util.logging.Level ;
2005-08-29 13:31:58 +02:00
import java.util.logging.LogManager ;
2005-06-09 12:22:05 +02:00
import java.util.logging.Logger ;
import java.util.zip.GZIPOutputStream ;
2006-09-18 12:12:11 +02:00
2008-07-04 13:03:03 +02:00
import de.anomic.crawler.HTTPLoader ;
2005-05-05 07:32:19 +02:00
import de.anomic.htmlFilter.htmlFilterContentTransformer ;
import de.anomic.htmlFilter.htmlFilterTransformer ;
2008-08-25 20:11:47 +02:00
import de.anomic.index.indexDocumentMetadata ;
2008-03-26 16:37:49 +01:00
import de.anomic.index.indexReferenceBlacklist ;
2005-05-05 07:32:19 +02:00
import de.anomic.plasma.plasmaHTCache ;
2005-05-17 10:25:04 +02:00
import de.anomic.plasma.plasmaParser ;
2005-05-05 07:32:19 +02:00
import de.anomic.plasma.plasmaSwitchboard ;
2008-08-02 15:57:00 +02:00
import de.anomic.plasma.plasmaSwitchboardConstants ;
2005-05-05 07:32:19 +02:00
import de.anomic.server.serverCore ;
2008-12-03 16:38:29 +01:00
import de.anomic.server.serverDate ;
2007-07-24 02:46:17 +02:00
import de.anomic.server.serverDomains ;
2005-05-05 07:32:19 +02:00
import de.anomic.server.serverFileUtils ;
2006-02-14 10:55:09 +01:00
import de.anomic.server.serverObjects ;
2005-06-09 12:22:05 +02:00
import de.anomic.server.logging.serverLog ;
import de.anomic.server.logging.serverMiniLogFormatter ;
2007-09-05 11:01:35 +02:00
import de.anomic.yacy.yacyURL ;
2005-04-07 21:19:42 +02:00
2007-08-09 23:58:38 +02:00
public final class httpdProxyHandler {
2005-04-07 21:19:42 +02:00
// static variables
// Shared, class-wide proxy state. Most of these fields are assigned once by the
// static initializer of this class.
2008-10-19 20:10:42 +02:00
// The switchboard; fetched via plasmaSwitchboard.getSwitchboard() in the static
// initializer, which explicitly handles the case that it is null.
private static plasmaSwitchboard sb = null ;
2008-08-06 21:43:12 +02:00
// Yellow-list: loaded from the file named by the "proxyYellowList" config key,
// an empty set if that key is unset, and null if no switchboard is available.
private static final HashSet < String > yellowList ;
2005-04-07 21:19:42 +02:00
// Client timeout; overwritten from the "proxy.clientTimeout" config key in the
// static initializer. Presumably milliseconds - TODO confirm at the point of use.
private static int timeout = 30000 ;
// When true, doGet calls yacyCore.triggerOnlineAction() for each request.
private static boolean yacyTrigger = true ;
2005-05-09 00:36:26 +02:00
// Read from the "isTransparentProxy" config key in the static initializer.
public static boolean isTransparentProxy = false ;
2008-05-03 11:06:00 +02:00
// External redirector: the child process started from the "externalRedirector"
// config key, a writer to its standard input and a reader on its standard output.
// redirectorEnabled is set only after the process and both streams were created.
private static Process redirectorProcess = null ;
private static boolean redirectorEnabled = false ;
private static PrintWriter redirectorWriter = null ;
private static BufferedReader redirectorReader = null ;
2005-09-05 00:03:44 +02:00
2005-04-07 21:19:42 +02:00
// Content transformer; created and initialised with the blue-list file
// (plasmaSwitchboardConstants.LIST_BLUE) in the static initializer.
private static htmlFilterTransformer transformer = null ;
2007-08-09 23:58:38 +02:00
// Directory of the system pages ("htRootPath" config key, default "htroot");
// the static initializer tries to create it if it does not exist.
private static File htRootPath = null ;
2005-09-05 00:03:44 +02:00
2007-08-09 23:58:38 +02:00
//private Properties connectionProperties = null;
2008-05-03 11:06:00 +02:00
// creating a logger
// General-purpose logger of the proxy handler.
private static final serverLog theLogger = new serverLog ( " PROXY " ) ;
2007-08-09 23:58:38 +02:00
2005-06-09 12:22:05 +02:00
// Set to true by the static initializer once the access-log FileHandler has
// been attached; stays false if access logging is disabled or set-up failed.
private static boolean doAccessLogging = false ;
2005-11-22 23:07:29 +01:00
/**
 * One-time class initialization, executed when the class is loaded.
 * <ol>
 * <li>Configures the optional proxy access log: if the {@link LogManager} property
 *     {@code de.anomic.http.httpdProxyHandler.logging.enabled} is {@code "true"}, a rotating
 *     {@link FileHandler} is attached to the {@code PROXY.access} logger.</li>
 * <li>If a switchboard is available, reads the proxy settings from it (transparent-proxy flag,
 *     client timeout, htroot path, blue-list transformer, yellow-list) and starts the optional
 *     external redirector process.</li>
 * </ol>
 * A broken configuration value must never escape from this block: an exception thrown by a static
 * initializer makes the whole class unusable. Invalid numbers are therefore reported and replaced
 * by their defaults.
 */
static {
    // ---- part 1: proxy access logging --------------------------------------------------------
    try {
        theLogger.logInfo("Configuring proxy access logging ...");

        // getting the logging manager
        final LogManager manager = LogManager.getLogManager();
        final String className = httpdProxyHandler.class.getName();

        // determining if proxy access logging is enabled
        final String enabled = manager.getProperty("de.anomic.http.httpdProxyHandler.logging.enabled");
        if ("true".equalsIgnoreCase(enabled)) {
            // reading out some needed configuration properties; these are the defaults
            int limit = 1024 * 1024;
            int count = 20;

            String pattern = manager.getProperty(className + ".logging.FileHandler.pattern");
            if (pattern == null) {
                pattern = "DATA/LOG/proxyAccess%u%g.log";
            }

            final String limitStr = manager.getProperty(className + ".logging.FileHandler.limit");
            if (limitStr != null) {
                try {
                    limit = Integer.parseInt(limitStr);
                } catch (final NumberFormatException e) {
                    // keep the default, but tell the operator instead of failing silently
                    theLogger.logSevere("Invalid proxy access log limit '" + limitStr + "', using " + limit);
                }
            }

            final String countStr = manager.getProperty(className + ".logging.FileHandler.count");
            if (countStr != null) {
                try {
                    count = Integer.parseInt(countStr);
                } catch (final NumberFormatException e) {
                    theLogger.logSevere("Invalid proxy access log count '" + countStr + "', using " + count);
                }
            }

            // creating the proxy access logger
            final Logger proxyLogger = Logger.getLogger("PROXY.access");
            proxyLogger.setUseParentHandlers(false);
            proxyLogger.setLevel(Level.FINEST);

            final FileHandler txtLog = new FileHandler(pattern, limit, count, true);
            txtLog.setFormatter(new serverMiniLogFormatter());
            txtLog.setLevel(Level.FINEST);
            proxyLogger.addHandler(txtLog);

            doAccessLogging = true;
            // report the values that are really in effect, not the raw (possibly null) properties
            theLogger.logInfo("Proxy access logging configuration done." +
                    "\n\tFilename: " + pattern +
                    "\n\tLimit: " + limit +
                    "\n\tCount: " + count);
        } else {
            theLogger.logInfo("Proxy access logging is deactivated.");
        }
    } catch (final Exception e) {
        theLogger.logSevere("Unable to configure proxy access logging.", e);
    }

    // ---- part 2: settings taken from the switchboard -----------------------------------------
    sb = plasmaSwitchboard.getSwitchboard();
    if (sb != null) {

        isTransparentProxy = Boolean.valueOf(sb.getConfig("isTransparentProxy", "false")).booleanValue();

        // set timeout; a malformed value falls back to the configuration default
        final String timeoutStr = sb.getConfig("proxy.clientTimeout", "10000");
        try {
            timeout = Integer.parseInt(timeoutStr);
        } catch (final NumberFormatException e) {
            timeout = 10000;
            theLogger.logSevere("Invalid value '" + timeoutStr + "' for proxy.clientTimeout, using " + timeout);
        }

        // create a htRootPath: system pages
        htRootPath = new File(sb.getRootPath(), sb.getConfig("htRootPath", "htroot"));
        if (!htRootPath.exists()) {
            if (!htRootPath.mkdir()) {
                serverLog.logSevere("PROXY", "could not create htRoot " + htRootPath);
            }
        }

        // load a transformer
        transformer = new htmlFilterContentTransformer();
        transformer.init(new File(sb.getRootPath(), sb.getConfig(plasmaSwitchboardConstants.LIST_BLUE, "")).toString());

        // load the yellow-list
        final String f = sb.getConfig("proxyYellowList", null);
        if (f != null) {
            yellowList = serverFileUtils.loadList(new File(f));
            theLogger.logConfig("loaded yellow-list from file " + f + ", " + yellowList.size() + " entries");
        } else {
            yellowList = new HashSet<String>();
        }

        // start the external redirector, if one is configured
        final String redirectorPath = sb.getConfig("externalRedirector", "");
        if (redirectorPath.length() > 0 && !redirectorEnabled) {
            try {
                redirectorProcess = Runtime.getRuntime().exec(redirectorPath);
                redirectorWriter = new PrintWriter(redirectorProcess.getOutputStream());
                redirectorReader = new BufferedReader(new InputStreamReader(redirectorProcess.getInputStream()));
                // only enabled once the process and both streams exist
                redirectorEnabled = true;
            } catch (final IOException e) {
                theLogger.logSevere("redirector not found: " + redirectorPath, e);
            }
        }
    } else {
        yellowList = null;
    }
}
2005-06-09 12:22:05 +02:00
2007-08-09 23:58:38 +02:00
/ * *
* Special logger instance for proxy access logging much similar
* to the squid access . log file
* /
// Uses the same logger name that the static initializer looks up via
// Logger.getLogger(...) when it attaches the access-log FileHandler.
private static final serverLog proxyLog = new serverLog ( " PROXY.access " ) ;
/ * *
2008-04-12 13:39:48 +02:00
* Reusable { @link StringBuilder } for logging
2007-08-09 23:58:38 +02:00
* /
2008-04-12 13:39:48 +02:00
// NOTE(review): one static, mutable builder shared by all callers. StringBuilder is
// not synchronized, so its users must serialize access - they are outside this
// section of the file; confirm.
private static final StringBuilder logMessage = new StringBuilder ( ) ;
2007-08-09 23:58:38 +02:00
/ * *
2008-12-04 13:54:16 +01:00
* Reusable { @link StringBuilder } to generate the useragent string
2007-08-09 23:58:38 +02:00
* /
2008-12-04 13:54:16 +01:00
// NOTE(review): same sharing concern as for logMessage - confirm how access is
// serialized where the user-agent string is built.
private static final StringBuilder userAgentStr = new StringBuilder ( ) ;
2007-08-09 23:58:38 +02:00
2008-08-25 20:11:47 +02:00
public static void handleOutgoingCookies ( final httpRequestHeader requestHeader , final String targethost , final String clienthost ) {
2005-04-07 21:19:42 +02:00
/ *
2005-06-09 12:22:05 +02:00
The syntax for the header is :
cookie = " Cookie: " cookie - version
1 * ( ( " ; " | " , " ) cookie - value )
cookie - value = NAME " = " VALUE [ " ; " path ] [ " ; " domain ]
cookie - version = " $Version " " = " value
NAME = attr
VALUE = value
path = " $Path " " = " value
domain = " $Domain " " = " value
* /
2008-10-19 20:10:42 +02:00
if ( sb . getConfigBool ( " proxy.monitorCookies " , false ) ) {
2008-08-25 20:11:47 +02:00
if ( requestHeader . containsKey ( httpRequestHeader . COOKIE ) ) {
final Object [ ] entry = new Object [ ] { new Date ( ) , clienthost , requestHeader . getMultiple ( httpRequestHeader . COOKIE ) } ;
2008-10-19 20:10:42 +02:00
synchronized ( sb . outgoingCookies ) {
sb . outgoingCookies . put ( targethost , entry ) ;
2008-07-30 23:19:06 +02:00
}
2008-04-12 13:39:48 +02:00
}
2005-04-07 21:19:42 +02:00
}
}
2008-08-25 20:11:47 +02:00
public static void handleIncomingCookies ( final httpResponseHeader respondHeader , final String serverhost , final String targetclient ) {
2005-04-07 21:19:42 +02:00
/ *
2005-06-09 12:22:05 +02:00
The syntax for the Set - Cookie response header is
set - cookie = " Set-Cookie: " cookies
cookies = 1 # cookie
cookie = NAME " = " VALUE * ( " ; " cookie - av )
NAME = attr
VALUE = value
cookie - av = " Comment " " = " value
| " Domain " " = " value
| " Max-Age " " = " value
| " Path " " = " value
| " Secure "
| " Version " " = " 1 * DIGIT
* /
2008-10-19 20:10:42 +02:00
if ( sb . getConfigBool ( " proxy.monitorCookies " , false ) ) {
2008-08-25 20:11:47 +02:00
if ( respondHeader . containsKey ( httpResponseHeader . SET_COOKIE ) ) {
final Object [ ] entry = new Object [ ] { new Date ( ) , targetclient , respondHeader . getMultiple ( httpResponseHeader . SET_COOKIE ) } ;
2008-10-19 20:10:42 +02:00
synchronized ( sb . incomingCookies ) {
sb . incomingCookies . put ( serverhost , entry ) ;
2008-07-30 23:19:06 +02:00
}
2008-04-12 13:39:48 +02:00
}
2005-04-07 21:19:42 +02:00
}
}
2005-06-09 12:22:05 +02:00
/ * *
* @param conProp a collection of properties about the connection , like URL
* @param requestHeader The header lines of the connection from the request
* @param respond the OutputStream to the client
* @see de . anomic . http . httpdHandler # doGet ( java . util . Properties , de . anomic . http . httpHeader , java . io . OutputStream )
* /
2008-08-25 20:11:47 +02:00
public static void doGet ( final Properties conProp , final httpRequestHeader requestHeader , final OutputStream respond ) {
2008-07-19 17:10:00 +02:00
httpdByteCountOutputStream countedRespond = null ;
2005-06-09 12:22:05 +02:00
try {
2008-07-19 17:10:00 +02:00
final int reqID = requestHeader . hashCode ( ) ;
2005-06-09 12:22:05 +02:00
// remembering the starting time of the request
2005-11-23 13:07:07 +01:00
final Date requestDate = new Date ( ) ; // remember the time...
2008-08-06 21:43:12 +02:00
conProp . put ( httpHeader . CONNECTION_PROP_REQUEST_START , Long . valueOf ( requestDate . getTime ( ) ) ) ;
2005-06-09 12:22:05 +02:00
if ( yacyTrigger ) de . anomic . yacy . yacyCore . triggerOnlineAction ( ) ;
2008-10-19 20:10:42 +02:00
sb . proxyLastAccess = System . currentTimeMillis ( ) ;
2006-03-21 21:55:59 +01:00
2005-06-09 12:22:05 +02:00
// using an ByteCount OutputStream to count the send bytes (needed for the logfile)
2008-07-19 17:10:00 +02:00
countedRespond = new httpdByteCountOutputStream ( respond , conProp . getProperty ( httpHeader . CONNECTION_PROP_REQUESTLINE ) . length ( ) + 2 , " PROXY " ) ;
2006-03-21 21:55:59 +01:00
String host = conProp . getProperty ( httpHeader . CONNECTION_PROP_HOST ) ;
String path = conProp . getProperty ( httpHeader . CONNECTION_PROP_PATH ) ; // always starts with leading '/'
final String args = conProp . getProperty ( httpHeader . CONNECTION_PROP_ARGS ) ; // may be null if no args were given
final String ip = conProp . getProperty ( httpHeader . CONNECTION_PROP_CLIENTIP ) ; // the ip from the connecting peer
2005-11-23 15:05:25 +01:00
int pos = 0 ;
int port = 0 ;
2006-03-21 21:55:59 +01:00
2007-09-05 11:01:35 +02:00
yacyURL url = null ;
2005-04-14 01:00:20 +02:00
try {
2005-09-20 23:49:47 +02:00
url = httpHeader . getRequestURL ( conProp ) ;
2008-09-03 02:30:21 +02:00
if ( theLogger . isFine ( ) ) theLogger . logFine ( reqID + " GET " + url ) ;
if ( theLogger . isFinest ( ) ) theLogger . logFinest ( reqID + " header: " + requestHeader ) ;
2006-03-21 21:55:59 +01:00
2005-11-22 23:07:29 +01:00
//redirector
if ( redirectorEnabled ) {
2005-11-23 08:37:15 +01:00
synchronized ( redirectorProcess ) {
2007-07-19 17:32:10 +02:00
redirectorWriter . println ( url . toNormalform ( false , true ) ) ;
2005-11-23 08:37:15 +01:00
redirectorWriter . flush ( ) ;
}
2008-08-02 14:12:04 +02:00
final String newUrl = redirectorReader . readLine ( ) ;
2007-09-05 11:01:35 +02:00
if ( ! newUrl . equals ( " " ) ) {
try {
url = new yacyURL ( newUrl , null ) ;
2008-08-02 14:12:04 +02:00
} catch ( final MalformedURLException e ) { } //just keep the old one
2006-01-04 18:40:18 +01:00
}
2008-09-03 02:30:21 +02:00
if ( theLogger . isFinest ( ) ) theLogger . logFinest ( reqID + " using redirector to " + url ) ;
2005-11-23 08:37:15 +01:00
conProp . setProperty ( httpHeader . CONNECTION_PROP_HOST , url . getHost ( ) + " : " + url . getPort ( ) ) ;
2005-11-22 23:07:29 +01:00
conProp . setProperty ( httpHeader . CONNECTION_PROP_PATH , url . getPath ( ) ) ;
2005-11-23 08:37:15 +01:00
requestHeader . put ( httpHeader . HOST , url . getHost ( ) + " : " + url . getPort ( ) ) ;
2005-11-22 23:07:29 +01:00
requestHeader . put ( httpHeader . CONNECTION_PROP_PATH , url . getPath ( ) ) ;
}
2008-08-02 14:12:04 +02:00
} catch ( final MalformedURLException e ) {
final String errorMsg = " ERROR: internal error with url generation: host= " +
2005-06-09 12:22:05 +02:00
host + " , port= " + port + " , path= " + path + " , args= " + args ;
2008-07-19 17:10:00 +02:00
theLogger . logSevere ( errorMsg ) ;
httpd . sendRespondError ( conProp , countedRespond , 4 , 501 , null , errorMsg , e ) ;
2005-06-09 12:22:05 +02:00
return ;
}
2005-11-23 15:05:25 +01:00
if ( ( pos = host . indexOf ( " : " ) ) < 0 ) {
port = 80 ;
} else {
port = Integer . parseInt ( host . substring ( pos + 1 ) ) ;
host = host . substring ( 0 , pos ) ;
}
2006-03-21 21:55:59 +01:00
2005-11-23 08:52:36 +01:00
String ext ;
if ( ( pos = path . lastIndexOf ( '.' ) ) < 0 ) {
ext = " " ;
} else {
ext = path . substring ( pos + 1 ) . toLowerCase ( ) ;
}
2005-11-23 13:07:07 +01:00
2005-06-09 12:22:05 +02:00
// check the blacklist
// blacklist idea inspired by [AS]:
// respond a 404 for all AGIS ("all you get is shit") servers
2008-08-02 14:12:04 +02:00
final String hostlow = host . toLowerCase ( ) ;
2006-03-21 21:55:59 +01:00
if ( args ! = null ) { path = path + " ? " + args ; }
2008-03-26 16:37:49 +01:00
if ( plasmaSwitchboard . urlBlacklist . isListed ( indexReferenceBlacklist . BLACKLIST_PROXY , hostlow , path ) ) {
2007-08-09 23:58:38 +02:00
theLogger . logInfo ( " AGIS blocking of host ' " + hostlow + " ' " ) ;
2008-07-19 17:10:00 +02:00
httpd . sendRespondError ( conProp , countedRespond , 4 , 403 , null ,
" URL ' " + hostlow + " ' blocked by yacy proxy (blacklisted) " , null ) ;
2005-06-09 12:22:05 +02:00
return ;
}
2006-03-21 21:55:59 +01:00
2005-06-09 12:22:05 +02:00
// handle outgoing cookies
handleOutgoingCookies ( requestHeader , host , ip ) ;
2006-03-21 21:55:59 +01:00
2008-07-19 17:10:00 +02:00
prepareRequestHeader ( conProp , requestHeader , hostlow ) ;
2005-09-05 10:01:54 +02:00
2008-08-25 20:11:47 +02:00
httpResponseHeader cachedResponseHeader = plasmaHTCache . loadResponseHeader ( url ) ;
2005-06-09 12:22:05 +02:00
// why are files unzipped upon arrival? why not zip all files in cache?
// This follows from the following premises
2008-04-11 00:47:05 +02:00
// (a) no file shall be unzip-ed more than once to prevent unnecessary computing time
// (b) old cache entries shall be comparable with refill-entries to detect/distinguish case 3+4
2005-06-09 12:22:05 +02:00
// (c) the indexing mechanism needs files unzip-ed, a schedule could do that later
// case b and c contradicts, if we use a scheduler, because files in a stale cache would be unzipped
// and the newly arrival would be zipped and would have to be unzipped upon load. But then the
// scheduler is superfluous. Therefore the only reminding case is
// (d) cached files shall be either all zipped or unzipped
// case d contradicts with a, because files need to be unzipped for indexing. Therefore
// the only remaining case is to unzip files right upon load. Thats what we do here.
// finally use existing cache if appropriate
// here we must decide weather or not to save the data
// to a cache
// we distinguish four CACHE STATE cases:
// 1. cache fill
// 2. cache fresh - no refill
// 3. cache stale - refill - necessary
// 4. cache stale - refill - superfluous
// in two of these cases we trigger a scheduler to handle newly arrived files:
// case 1 and case 3
2008-09-16 23:56:23 +02:00
if ( cachedResponseHeader = = null ) {
if ( theLogger . isFinest ( ) ) theLogger . logFinest ( reqID + " page not in cache: fulfill request from web " ) ;
2008-10-16 23:24:09 +02:00
fulfillRequestFromWeb ( conProp , url , ext , requestHeader , cachedResponseHeader , countedRespond ) ;
2008-09-16 23:56:23 +02:00
} else {
final indexDocumentMetadata cacheEntry = new httpdProxyCacheEntry (
0 , // crawling depth
url , // url
" " , // name of the url is unknown
//requestHeader, // request headers
" 200 OK " , // request status
requestHeader ,
cachedResponseHeader ,
null , // initiator
2008-10-19 20:10:42 +02:00
sb . webIndex . defaultProxyProfile // profile
2008-09-16 23:56:23 +02:00
) ;
plasmaHTCache . storeMetadata ( cachedResponseHeader , cacheEntry ) ; // TODO: check if this storeMetadata is necessary
2008-10-16 23:24:09 +02:00
byte [ ] cacheContent = plasmaHTCache . getResourceContent ( url ) ;
if ( cacheContent ! = null & & cacheEntry . shallUseCacheForProxy ( ) ) {
2008-09-16 23:56:23 +02:00
if ( theLogger . isFinest ( ) ) theLogger . logFinest ( reqID + " fulfill request from cache " ) ;
2008-10-16 23:24:09 +02:00
fulfillRequestFromCache ( conProp , url , ext , requestHeader , cachedResponseHeader , cacheContent , countedRespond ) ;
} else {
2008-09-16 23:56:23 +02:00
if ( theLogger . isFinest ( ) ) theLogger . logFinest ( reqID + " fulfill request from web " ) ;
2008-10-16 23:24:09 +02:00
fulfillRequestFromWeb ( conProp , url , ext , requestHeader , cachedResponseHeader , countedRespond ) ;
2008-09-16 23:56:23 +02:00
}
2005-06-09 12:22:05 +02:00
}
2008-09-16 23:56:23 +02:00
2005-06-09 12:22:05 +02:00
2008-08-02 14:12:04 +02:00
} catch ( final Exception e ) {
2005-06-23 13:00:26 +02:00
try {
2008-08-02 14:12:04 +02:00
final String exTxt = e . getMessage ( ) ;
2005-06-23 13:00:26 +02:00
if ( ( exTxt ! = null ) & & ( exTxt . startsWith ( " Socket closed " ) ) ) {
2007-08-09 23:58:38 +02:00
forceConnectionClose ( conProp ) ;
2005-09-20 23:49:47 +02:00
} else if ( ! conProp . containsKey ( httpHeader . CONNECTION_PROP_PROXY_RESPOND_HEADER ) ) {
2008-08-02 14:12:04 +02:00
final String errorMsg = " Unexpected Error. " + e . getClass ( ) . getName ( ) + " : " + e . getMessage ( ) ;
2008-07-19 17:10:00 +02:00
httpd . sendRespondError ( conProp , countedRespond , 4 , 501 , null , errorMsg , e ) ;
2007-08-09 23:58:38 +02:00
theLogger . logSevere ( errorMsg ) ;
2005-06-23 13:00:26 +02:00
} else {
2007-08-09 23:58:38 +02:00
forceConnectionClose ( conProp ) ;
2005-06-23 13:00:26 +02:00
}
2008-08-02 14:12:04 +02:00
} catch ( final Exception ee ) {
2007-08-09 23:58:38 +02:00
forceConnectionClose ( conProp ) ;
2005-06-23 13:00:26 +02:00
}
2005-06-09 12:22:05 +02:00
} finally {
2008-08-02 14:12:04 +02:00
try { if ( countedRespond ! = null ) countedRespond . flush ( ) ; else if ( respond ! = null ) respond . flush ( ) ; } catch ( final Exception e ) { }
2008-07-19 17:10:00 +02:00
if ( countedRespond ! = null ) countedRespond . finish ( ) ;
2005-06-09 12:22:05 +02:00
2008-08-06 21:43:12 +02:00
conProp . put ( httpHeader . CONNECTION_PROP_REQUEST_END , Long . valueOf ( System . currentTimeMillis ( ) ) ) ;
conProp . put ( httpHeader . CONNECTION_PROP_PROXY_RESPOND_SIZE , ( countedRespond ! = null ) ? Long . valueOf ( countedRespond . getCount ( ) ) : - 1L ) ;
2007-08-09 23:58:38 +02:00
logProxyAccess ( conProp ) ;
2005-04-14 01:00:20 +02:00
}
2005-06-09 12:22:05 +02:00
}
2008-10-16 23:24:09 +02:00
private static void fulfillRequestFromWeb ( final Properties conProp , final yacyURL url , final String ext , final httpRequestHeader requestHeader , final httpResponseHeader cachedResponseHeader , final OutputStream respond ) {
2005-04-14 01:00:20 +02:00
2008-08-02 14:12:04 +02:00
final GZIPOutputStream gzippedOut = null ;
2008-12-23 20:14:54 +01:00
2008-04-11 00:47:05 +02:00
JakartaCommonsHttpResponse res = null ;
2005-04-14 01:00:20 +02:00
try {
2008-07-19 17:10:00 +02:00
final int reqID = requestHeader . hashCode ( ) ;
2005-06-09 12:22:05 +02:00
2005-09-20 23:49:47 +02:00
String host = conProp . getProperty ( httpHeader . CONNECTION_PROP_HOST ) ;
2008-08-02 14:12:04 +02:00
final String path = conProp . getProperty ( httpHeader . CONNECTION_PROP_PATH ) ; // always starts with leading '/'
final String args = conProp . getProperty ( httpHeader . CONNECTION_PROP_ARGS ) ; // may be null if no args were given
final String ip = conProp . getProperty ( httpHeader . CONNECTION_PROP_CLIENTIP ) ; // the ip from the connecting peer
final String httpVer = conProp . getProperty ( httpHeader . CONNECTION_PROP_HTTP_VER ) ; // the ip from the connecting peer
2005-06-09 12:22:05 +02:00
int port , pos ;
if ( ( pos = host . indexOf ( " : " ) ) < 0 ) {
port = 80 ;
2005-04-14 01:00:20 +02:00
} else {
2005-06-09 12:22:05 +02:00
port = Integer . parseInt ( host . substring ( pos + 1 ) ) ;
host = host . substring ( 0 , pos ) ;
}
// resolve yacy and yacyh domains
2008-10-27 23:16:56 +01:00
String yAddress = resolveYacyDomains ( host ) ;
2005-06-09 12:22:05 +02:00
// re-calc the url path
String remotePath = ( args = = null ) ? path : ( path + " ? " + args ) ; // with leading '/'
2008-10-27 23:16:56 +01:00
// remove yacy-subdomain-path, when accessing /env
if ( ( yAddress ! = null )
& & ( remotePath . startsWith ( " /env " ) )
& & ( ( pos = yAddress . indexOf ( '/' ) ) ! = - 1 )
) yAddress = yAddress . substring ( 0 , yAddress . indexOf ( '/' ) ) ;
2005-06-09 12:22:05 +02:00
2008-07-19 17:10:00 +02:00
modifyProxyHeaders ( requestHeader , httpVer ) ;
2006-02-16 10:20:57 +01:00
2008-04-08 23:17:40 +02:00
final String connectHost = hostPart ( host , port , yAddress ) ;
final String getUrl = " http:// " + connectHost + remotePath ;
2008-07-19 17:10:00 +02:00
final JakartaCommonsHttpClient client = setupHttpClient ( requestHeader , connectHost ) ;
2008-04-08 23:17:40 +02:00
2005-04-14 01:00:20 +02:00
// send request
2008-04-05 15:17:16 +02:00
try {
2009-01-03 09:24:08 +01:00
res = client . GET ( getUrl ) ;
if ( theLogger . isFinest ( ) ) theLogger . logFinest ( reqID + " response status: " + res . getStatusLine ( ) ) ;
conProp . put ( httpHeader . CONNECTION_PROP_CLIENT_REQUEST_HEADER , requestHeader ) ;
2005-09-01 13:18:41 +02:00
2009-01-03 09:24:08 +01:00
final httpResponseHeader responseHeader = res . getResponseHeader ( ) ;
// determine if it's an internal error of the httpc
if ( responseHeader . size ( ) = = 0 ) {
throw new Exception ( res . getStatusLine ( ) ) ;
2008-10-16 23:24:09 +02:00
}
2009-01-03 09:24:08 +01:00
final httpChunkedOutputStream chunkedOut = setTransferEncoding ( conProp , responseHeader , res . getStatusCode ( ) , respond ) ;
// the cache does either not exist or is (supposed to be) stale
long sizeBeforeDelete = - 1 ;
if ( cachedResponseHeader ! = null ) {
// delete the cache
sizeBeforeDelete = plasmaHTCache . getResourceContentLength ( url ) ;
plasmaHTCache . deleteFromCache ( url ) ;
conProp . setProperty ( httpHeader . CONNECTION_PROP_PROXY_RESPOND_CODE , " TCP_REFRESH_MISS " ) ;
2005-04-14 01:00:20 +02:00
}
2006-03-21 21:55:59 +01:00
2009-01-03 09:24:08 +01:00
// reserver cache entry
final indexDocumentMetadata cacheEntry = new httpdProxyCacheEntry (
0 ,
url ,
" " ,
res . getStatusLine ( ) ,
requestHeader ,
responseHeader ,
null ,
sb . webIndex . defaultProxyProfile
) ;
plasmaHTCache . storeMetadata ( responseHeader , cacheEntry ) ;
2008-12-23 20:14:54 +01:00
2009-01-03 09:24:08 +01:00
// handle incoming cookies
handleIncomingCookies ( responseHeader , host , ip ) ;
prepareResponseHeader ( responseHeader , res . getHttpVer ( ) ) ;
// sending the respond header back to the client
if ( chunkedOut ! = null ) {
responseHeader . put ( httpResponseHeader . TRANSFER_ENCODING , " chunked " ) ;
}
if ( theLogger . isFinest ( ) ) theLogger . logFinest ( reqID + " sending response header: " + responseHeader ) ;
httpd . sendRespondHeader (
conProp ,
respond ,
httpVer ,
res . getStatusCode ( ) ,
res . getStatusLine ( ) . substring ( 4 ) , // status text
responseHeader ) ;
if ( hasBody ( res . getStatusCode ( ) ) ) {
final OutputStream outStream = ( gzippedOut ! = null ) ? gzippedOut : ( ( chunkedOut ! = null ) ? chunkedOut : respond ) ;
final String storeError = cacheEntry . shallStoreCacheForProxy ( ) ;
final boolean storeHTCache = cacheEntry . profile ( ) . storeHTCache ( ) ;
final boolean isSupportedContent = plasmaParser . supportedContent ( plasmaParser . PARSER_MODE_PROXY , cacheEntry . url ( ) , cacheEntry . getMimeType ( ) ) ;
if (
/ *
* Now we store the response into the htcache directory if
* a ) the response is cacheable AND
* /
( storeError = = null ) & &
/ *
* b ) the user has configured to use the htcache OR
* c ) the content should be indexed
* /
( ( storeHTCache ) | | ( isSupportedContent ) )
) {
// we don't write actually into a file, only to RAM, and schedule writing the file.
int l = res . getResponseHeader ( ) . size ( ) ;
final ByteArrayOutputStream byteStream = new ByteArrayOutputStream ( ( l < 32 ) ? 32 : l ) ;
final OutputStream toClientAndMemory = new MultiOutputStream ( new OutputStream [ ] { outStream , byteStream } ) ;
serverFileUtils . copy ( res . getDataAsStream ( ) , toClientAndMemory ) ;
// cached bytes
byte [ ] cacheArray ;
if ( byteStream . size ( ) > 0 ) {
cacheArray = byteStream . toByteArray ( ) ;
} else {
cacheArray = null ;
}
if ( theLogger . isFine ( ) ) theLogger . logFine ( reqID + " writeContent of " + url + " produced cacheArray = " + ( ( cacheArray = = null ) ? " null " : ( " size= " + cacheArray . length ) ) ) ;
if ( sizeBeforeDelete = = - 1 ) {
// totally fresh file
//cacheEntry.status = plasmaHTCache.CACHE_FILL; // it's an insert
cacheEntry . setCacheArray ( cacheArray ) ;
sb . htEntryStoreProcess ( cacheEntry ) ;
conProp . setProperty ( httpHeader . CONNECTION_PROP_PROXY_RESPOND_CODE , " TCP_MISS " ) ;
} else if ( cacheArray ! = null & & sizeBeforeDelete = = cacheArray . length ) {
// before we came here we deleted a cache entry
cacheArray = null ;
//cacheEntry.status = plasmaHTCache.CACHE_STALE_RELOAD_BAD;
//cacheManager.push(cacheEntry); // unnecessary update
conProp . setProperty ( httpHeader . CONNECTION_PROP_PROXY_RESPOND_CODE , " TCP_REF_FAIL_HIT " ) ;
} else {
// before we came here we deleted a cache entry
//cacheEntry.status = plasmaHTCache.CACHE_STALE_RELOAD_GOOD;
cacheEntry . setCacheArray ( cacheArray ) ;
sb . htEntryStoreProcess ( cacheEntry ) ;
conProp . setProperty ( httpHeader . CONNECTION_PROP_PROXY_RESPOND_CODE , " TCP_REFRESH_MISS " ) ;
}
} else {
// no caching
if ( theLogger . isFine ( ) ) theLogger . logFine ( reqID + " " + url . toString ( ) + " not cached. " +
" StoreError= " + ( ( storeError = = null ) ? " None " : storeError ) +
" StoreHTCache= " + storeHTCache +
" SupportetContent= " + isSupportedContent ) ;
serverFileUtils . copy ( res . getDataAsStream ( ) , outStream ) ;
conProp . setProperty ( httpHeader . CONNECTION_PROP_PROXY_RESPOND_CODE , " TCP_MISS " ) ;
}
if ( gzippedOut ! = null ) {
gzippedOut . finish ( ) ;
}
if ( chunkedOut ! = null ) {
chunkedOut . finish ( ) ;
chunkedOut . flush ( ) ;
}
} // end hasBody
2008-12-23 12:30:24 +01:00
} catch ( SocketException se ) {
2009-01-03 09:24:08 +01:00
// if opened ...
if ( res ! = null ) {
// client cut proxy connection, abort download
res . abort ( ) ;
}
handleProxyException ( se , conProp , respond , url ) ;
2008-04-05 15:17:16 +02:00
} finally {
// if opened ...
if ( res ! = null ) {
// ... close connection
res . closeStream ( ) ;
}
}
2008-08-02 14:12:04 +02:00
} catch ( final Exception e ) {
2008-04-05 15:17:16 +02:00
handleProxyException ( e , conProp , respond , url ) ;
2007-09-23 22:49:52 +02:00
}
2005-04-07 21:19:42 +02:00
}
2008-09-10 13:06:22 +02:00
/ * *
* determines if the response should have a body
*
* @param statusCode
* @param responseHeader
* @return
* /
private static boolean hasBody ( final int statusCode ) {
// "All 1xx (informational), 204 (no content), and 304 (not modified) responses MUST NOT
// include a message-body."
// [RFC 2616 HTTP/1.1, Sect. 4.3] and like [RFC 1945 HTTP/1.0, Sect. 7.2]
if ( ( statusCode > = 100 & & statusCode < 200 ) | | statusCode = = 204 | | statusCode = = 304 ) {
return false ;
}
return true ;
}
2007-08-09 23:58:38 +02:00
private static void fulfillRequestFromCache (
2008-08-02 14:12:04 +02:00
final Properties conProp ,
final yacyURL url ,
final String ext ,
2008-08-25 20:11:47 +02:00
final httpRequestHeader requestHeader ,
final httpResponseHeader cachedResponseHeader ,
2008-10-16 23:24:09 +02:00
final byte [ ] cacheEntry ,
2008-08-02 14:12:04 +02:00
final OutputStream respond
2005-06-09 12:22:05 +02:00
) throws IOException {
2008-08-02 14:12:04 +02:00
final String httpVer = conProp . getProperty ( httpHeader . CONNECTION_PROP_HTTP_VER ) ;
2005-06-09 12:22:05 +02:00
2008-08-02 14:12:04 +02:00
final httpChunkedOutputStream chunkedOut = null ;
final GZIPOutputStream gzippedOut = null ;
2005-06-09 12:22:05 +02:00
// we respond on the request by using the cache, the cache is fresh
2005-04-07 21:19:42 +02:00
try {
2008-12-23 00:04:00 +01:00
prepareResponseHeader ( cachedResponseHeader , httpVer ) ;
2005-06-09 12:22:05 +02:00
// replace date field in old header by actual date, this is according to RFC
2008-12-03 16:38:29 +01:00
cachedResponseHeader . put ( httpHeader . DATE , serverDate . formatRFC1123 ( new Date ( ) ) ) ;
2005-04-07 21:19:42 +02:00
2005-06-09 12:22:05 +02:00
// if (((String)requestHeader.get(httpHeader.ACCEPT_ENCODING,"")).indexOf("gzip") != -1) {
// chunked = new httpChunkedOutputStream(respond);
// zipped = new GZIPOutputStream(chunked);
// cachedResponseHeader.put(httpHeader.TRANSFER_ENCODING, "chunked");
// cachedResponseHeader.put(httpHeader.CONTENT_ENCODING, "gzip");
// } else {
// maybe the content length is missing
// if (!(cachedResponseHeader.containsKey(httpHeader.CONTENT_LENGTH)))
// cachedResponseHeader.put(httpHeader.CONTENT_LENGTH, Long.toString(cacheFile.length()));
// }
2005-04-07 21:19:42 +02:00
2005-06-09 12:22:05 +02:00
// check if we can send a 304 instead the complete content
2008-08-25 20:11:47 +02:00
if ( requestHeader . containsKey ( httpRequestHeader . IF_MODIFIED_SINCE ) ) {
2005-06-09 12:22:05 +02:00
// conditional request: freshness of cache for that condition was already
// checked within shallUseCache(). Now send only a 304 response
2008-10-16 23:24:09 +02:00
theLogger . logInfo ( " CACHE HIT/304 " + url . toString ( ) ) ;
2005-09-20 23:49:47 +02:00
conProp . setProperty ( httpHeader . CONNECTION_PROP_PROXY_RESPOND_CODE , " TCP_REFRESH_HIT " ) ;
2005-06-09 12:22:05 +02:00
// setting the content length header to 0
2008-08-25 20:11:47 +02:00
cachedResponseHeader . put ( httpResponseHeader . CONTENT_LENGTH , Integer . toString ( 0 ) ) ;
2005-06-09 12:22:05 +02:00
// send cached header with replaced date and added length
httpd . sendRespondHeader ( conProp , respond , httpVer , 304 , cachedResponseHeader ) ;
//respondHeader(respond, "304 OK", cachedResponseHeader); // respond with 'not modified'
} else {
// unconditional request: send content of cache
2008-10-16 23:24:09 +02:00
theLogger . logInfo ( " CACHE HIT/203 " + url . toString ( ) ) ;
2005-09-20 23:49:47 +02:00
conProp . setProperty ( httpHeader . CONNECTION_PROP_PROXY_RESPOND_CODE , " TCP_HIT " ) ;
2005-06-09 12:22:05 +02:00
// setting the content header to the proper length
2008-10-16 23:24:09 +02:00
cachedResponseHeader . put ( httpResponseHeader . CONTENT_LENGTH , Long . toString ( cacheEntry . length ) ) ;
2005-06-09 12:22:05 +02:00
// send cached header with replaced date and added length
httpd . sendRespondHeader ( conProp , respond , httpVer , 203 , cachedResponseHeader ) ;
//respondHeader(respond, "203 OK", cachedResponseHeader); // respond with 'non-authoritative'
2008-04-12 13:39:48 +02:00
final OutputStream outStream = ( gzippedOut ! = null ) ? gzippedOut : ( ( chunkedOut ! = null ) ? chunkedOut : respond ) ;
2008-12-23 20:14:54 +01:00
2005-06-09 12:22:05 +02:00
// send also the complete body now from the cache
// simply read the file and transfer to out socket
2008-10-16 23:24:09 +02:00
serverFileUtils . copy ( cacheEntry , outStream ) ;
2005-06-09 12:22:05 +02:00
if ( gzippedOut ! = null ) gzippedOut . finish ( ) ;
if ( chunkedOut ! = null ) chunkedOut . finish ( ) ;
}
// that's it!
2008-08-02 14:12:04 +02:00
} catch ( final Exception e ) {
2005-06-09 12:22:05 +02:00
// this happens if the client stops loading the file
// we do nothing here
2005-09-20 23:49:47 +02:00
if ( conProp . containsKey ( httpHeader . CONNECTION_PROP_PROXY_RESPOND_HEADER ) ) {
2007-08-09 23:58:38 +02:00
theLogger . logWarning ( " Error while trying to send cached message body. " ) ;
2005-09-20 23:49:47 +02:00
conProp . setProperty ( httpHeader . CONNECTION_PROP_PERSISTENT , " close " ) ;
2005-06-09 12:22:05 +02:00
} else {
httpd . sendRespondError ( conProp , respond , 4 , 503 , " socket error: " + e . getMessage ( ) , " socket error: " + e . getMessage ( ) , e ) ;
}
2005-05-08 09:24:33 +02:00
} finally {
2008-08-02 14:12:04 +02:00
try { respond . flush ( ) ; } catch ( final Exception e ) { }
2005-04-07 21:19:42 +02:00
}
2005-06-09 12:22:05 +02:00
return ;
2005-04-07 21:19:42 +02:00
}
2008-08-25 20:11:47 +02:00
public static void doHead ( final Properties conProp , final httpRequestHeader requestHeader , OutputStream respond ) {
2005-06-09 12:22:05 +02:00
2008-04-11 00:47:05 +02:00
JakartaCommonsHttpResponse res = null ;
2007-09-05 11:01:35 +02:00
yacyURL url = null ;
2005-06-09 12:22:05 +02:00
try {
2008-07-19 17:10:00 +02:00
final int reqID = requestHeader . hashCode ( ) ;
2005-09-05 12:10:00 +02:00
// remembering the starting time of the request
2008-08-02 14:12:04 +02:00
final Date requestDate = new Date ( ) ; // remember the time...
2008-08-06 21:43:12 +02:00
conProp . put ( httpHeader . CONNECTION_PROP_REQUEST_START , Long . valueOf ( requestDate . getTime ( ) ) ) ;
2005-09-05 12:10:00 +02:00
if ( yacyTrigger ) de . anomic . yacy . yacyCore . triggerOnlineAction ( ) ;
2008-10-19 20:10:42 +02:00
sb . proxyLastAccess = System . currentTimeMillis ( ) ;
2005-09-05 12:10:00 +02:00
// using an ByteCount OutputStream to count the send bytes
2007-03-16 14:52:48 +01:00
respond = new httpdByteCountOutputStream ( respond , conProp . getProperty ( httpHeader . CONNECTION_PROP_REQUESTLINE ) . length ( ) + 2 , " PROXY " ) ;
2005-09-05 12:10:00 +02:00
2005-09-20 23:49:47 +02:00
String host = conProp . getProperty ( httpHeader . CONNECTION_PROP_HOST ) ;
2008-08-02 14:12:04 +02:00
final String path = conProp . getProperty ( httpHeader . CONNECTION_PROP_PATH ) ;
final String args = conProp . getProperty ( httpHeader . CONNECTION_PROP_ARGS ) ;
final String httpVer = conProp . getProperty ( httpHeader . CONNECTION_PROP_HTTP_VER ) ;
2005-09-05 12:10:00 +02:00
int port , pos ;
if ( ( pos = host . indexOf ( " : " ) ) < 0 ) {
port = 80 ;
} else {
port = Integer . parseInt ( host . substring ( pos + 1 ) ) ;
host = host . substring ( 0 , pos ) ;
}
try {
2007-09-05 11:01:35 +02:00
url = new yacyURL ( " http " , host , port , ( args = = null ) ? path : path + " ? " + args ) ;
2008-08-02 14:12:04 +02:00
} catch ( final MalformedURLException e ) {
final String errorMsg = " ERROR: internal error with url generation: host= " +
2005-09-05 12:10:00 +02:00
host + " , port= " + port + " , path= " + path + " , args= " + args ;
2008-07-19 17:10:00 +02:00
theLogger . logSevere ( errorMsg ) ;
2005-09-05 12:10:00 +02:00
httpd . sendRespondError ( conProp , respond , 4 , 501 , null , errorMsg , e ) ;
return ;
}
2008-09-03 02:30:21 +02:00
if ( theLogger . isFine ( ) ) theLogger . logFine ( reqID + " HEAD " + url ) ;
if ( theLogger . isFinest ( ) ) theLogger . logFinest ( reqID + " header: " + requestHeader ) ;
2005-09-05 12:10:00 +02:00
// check the blacklist, inspired by [AS]: respond a 404 for all AGIS (all you get is shit) servers
2008-08-02 14:12:04 +02:00
final String hostlow = host . toLowerCase ( ) ;
2006-03-21 21:55:59 +01:00
// re-calc the url path
String remotePath = ( args = = null ) ? path : ( path + " ? " + args ) ;
2008-03-26 16:37:49 +01:00
if ( plasmaSwitchboard . urlBlacklist . isListed ( indexReferenceBlacklist . BLACKLIST_PROXY , hostlow , remotePath ) ) {
2005-09-05 12:10:00 +02:00
httpd . sendRespondError ( conProp , respond , 4 , 403 , null ,
" URL ' " + hostlow + " ' blocked by yacy proxy (blacklisted) " , null ) ;
2007-08-09 23:58:38 +02:00
theLogger . logInfo ( " AGIS blocking of host ' " + hostlow + " ' " ) ;
2005-09-05 12:10:00 +02:00
return ;
}
2008-07-19 17:10:00 +02:00
prepareRequestHeader ( conProp , requestHeader , hostlow ) ;
2005-09-05 12:10:00 +02:00
// resolve yacy and yacyh domains
2008-10-27 23:16:56 +01:00
String yAddress = resolveYacyDomains ( host ) ;
2005-09-05 12:10:00 +02:00
2008-10-27 23:16:56 +01:00
// remove yacy-subdomain-path, when accessing /env
if ( ( yAddress ! = null )
& & ( remotePath . startsWith ( " /env " ) )
& & ( ( pos = yAddress . indexOf ( '/' ) ) ! = - 1 )
) yAddress = yAddress . substring ( 0 , yAddress . indexOf ( '/' ) ) ;
2005-09-05 12:10:00 +02:00
2008-07-19 17:10:00 +02:00
modifyProxyHeaders ( requestHeader , httpVer ) ;
2005-06-23 13:00:26 +02:00
2008-04-08 23:17:40 +02:00
// generate request-url
final String connectHost = hostPart ( host , port , yAddress ) ;
2008-04-05 15:17:16 +02:00
final String getUrl = " http:// " + connectHost + remotePath ;
2008-09-03 02:30:21 +02:00
if ( theLogger . isFinest ( ) ) theLogger . logFinest ( reqID + " using url: " + getUrl ) ;
2008-07-19 17:10:00 +02:00
final JakartaCommonsHttpClient client = setupHttpClient ( requestHeader , connectHost ) ;
2008-04-05 15:17:16 +02:00
2008-04-08 23:17:40 +02:00
// send request
2008-04-05 15:17:16 +02:00
try {
res = client . HEAD ( getUrl ) ;
2008-09-03 02:30:21 +02:00
if ( theLogger . isFinest ( ) ) theLogger . logFinest ( reqID + " response status: " + res . getStatusLine ( ) ) ;
2005-06-23 13:00:26 +02:00
2005-09-05 10:01:54 +02:00
// determine if it's an internal error of the httpc
2008-08-25 20:11:47 +02:00
final httpResponseHeader responseHeader = res . getResponseHeader ( ) ;
2008-04-08 11:34:20 +02:00
if ( responseHeader . size ( ) = = 0 ) {
2008-04-05 15:17:16 +02:00
throw new Exception ( res . getStatusLine ( ) ) ;
2005-09-05 10:01:54 +02:00
}
2008-04-08 11:34:20 +02:00
prepareResponseHeader ( responseHeader , res . getHttpVer ( ) ) ;
2005-06-23 13:00:26 +02:00
// sending the server respond back to the client
2008-09-03 02:30:21 +02:00
if ( theLogger . isFinest ( ) ) theLogger . logFinest ( reqID + " sending response header: " + responseHeader ) ;
2008-04-08 11:34:20 +02:00
httpd . sendRespondHeader ( conProp , respond , httpVer , res . getStatusCode ( ) , res . getStatusLine ( ) . substring ( 4 ) , responseHeader ) ;
2007-09-25 23:36:08 +02:00
respond . flush ( ) ;
2008-04-05 15:17:16 +02:00
} finally {
if ( res ! = null ) {
// ... close connection
res . closeStream ( ) ;
}
}
2008-08-02 14:12:04 +02:00
} catch ( final Exception e ) {
2008-04-05 15:17:16 +02:00
handleProxyException ( e , conProp , respond , url ) ;
2005-06-09 12:22:05 +02:00
}
2005-04-19 08:55:57 +02:00
}
2008-04-08 23:17:40 +02:00
2008-08-25 20:11:47 +02:00
public static void doPost ( final Properties conProp , final httpRequestHeader requestHeader , final OutputStream respond , InputStream body ) throws IOException {
2008-04-08 23:17:40 +02:00
assert conProp ! = null : " precondition violated: conProp != null " ;
assert requestHeader ! = null : " precondition violated: requestHeader != null " ;
assert body ! = null : " precondition violated: body != null " ;
2007-09-05 11:01:35 +02:00
yacyURL url = null ;
2008-07-19 17:10:00 +02:00
httpdByteCountOutputStream countedRespond = null ;
2005-06-09 12:22:05 +02:00
try {
2008-07-19 17:10:00 +02:00
final int reqID = requestHeader . hashCode ( ) ;
2005-06-09 12:22:05 +02:00
// remembering the starting time of the request
2008-08-02 14:12:04 +02:00
final Date requestDate = new Date ( ) ; // remember the time...
2008-08-06 21:43:12 +02:00
conProp . put ( httpHeader . CONNECTION_PROP_REQUEST_START , Long . valueOf ( requestDate . getTime ( ) ) ) ;
2005-09-05 12:10:00 +02:00
if ( yacyTrigger ) de . anomic . yacy . yacyCore . triggerOnlineAction ( ) ;
2008-10-19 20:10:42 +02:00
sb . proxyLastAccess = System . currentTimeMillis ( ) ;
2005-06-09 12:22:05 +02:00
// using an ByteCount OutputStream to count the send bytes
2008-07-19 17:10:00 +02:00
countedRespond = new httpdByteCountOutputStream ( respond , conProp . getProperty ( httpHeader . CONNECTION_PROP_REQUESTLINE ) . length ( ) + 2 , " PROXY " ) ;
2005-06-09 12:22:05 +02:00
2005-09-20 23:49:47 +02:00
String host = conProp . getProperty ( httpHeader . CONNECTION_PROP_HOST ) ;
2008-08-02 14:12:04 +02:00
final String path = conProp . getProperty ( httpHeader . CONNECTION_PROP_PATH ) ;
final String args = conProp . getProperty ( httpHeader . CONNECTION_PROP_ARGS ) ; // may be null if no args were given
final String httpVer = conProp . getProperty ( httpHeader . CONNECTION_PROP_HTTP_VER ) ;
2006-03-21 21:55:59 +01:00
2005-06-09 12:22:05 +02:00
int port , pos ;
if ( ( pos = host . indexOf ( " : " ) ) < 0 ) {
port = 80 ;
2005-04-07 21:19:42 +02:00
} else {
2005-06-09 12:22:05 +02:00
port = Integer . parseInt ( host . substring ( pos + 1 ) ) ;
host = host . substring ( 0 , pos ) ;
2005-04-07 21:19:42 +02:00
}
2005-06-09 12:22:05 +02:00
2005-09-05 12:10:00 +02:00
try {
2007-09-05 11:01:35 +02:00
url = new yacyURL ( " http " , host , port , ( args = = null ) ? path : path + " ? " + args ) ;
2008-08-02 14:12:04 +02:00
} catch ( final MalformedURLException e ) {
final String errorMsg = " ERROR: internal error with url generation: host= " +
2005-09-05 12:10:00 +02:00
host + " , port= " + port + " , path= " + path + " , args= " + args ;
2008-07-19 17:10:00 +02:00
theLogger . logSevere ( errorMsg ) ;
httpd . sendRespondError ( conProp , countedRespond , 4 , 501 , null , errorMsg , e ) ;
2005-09-05 12:10:00 +02:00
return ;
}
2008-09-03 02:30:21 +02:00
if ( theLogger . isFine ( ) ) theLogger . logFine ( reqID + " POST " + url ) ;
if ( theLogger . isFinest ( ) ) theLogger . logFinest ( reqID + " header: " + requestHeader ) ;
2005-09-05 12:10:00 +02:00
2008-07-19 17:10:00 +02:00
prepareRequestHeader ( conProp , requestHeader , host . toLowerCase ( ) ) ;
2005-06-09 12:22:05 +02:00
2008-10-27 23:16:56 +01:00
String yAddress = resolveYacyDomains ( host ) ;
2005-06-09 12:22:05 +02:00
// re-calc the url path
String remotePath = ( args = = null ) ? path : ( path + " ? " + args ) ;
2008-10-27 23:16:56 +01:00
// remove yacy-subdomain-path, when accessing /env
if ( ( yAddress ! = null )
& & ( remotePath . startsWith ( " /env " ) )
& & ( ( pos = yAddress . indexOf ( '/' ) ) ! = - 1 )
) yAddress = yAddress . substring ( 0 , yAddress . indexOf ( '/' ) ) ;
2008-07-19 17:10:00 +02:00
modifyProxyHeaders ( requestHeader , httpVer ) ;
2008-04-05 15:17:16 +02:00
2008-04-08 23:17:40 +02:00
final String connectHost = hostPart ( host , port , yAddress ) ;
2008-04-05 15:17:16 +02:00
final String getUrl = " http:// " + connectHost + remotePath ;
2008-09-03 02:30:21 +02:00
if ( theLogger . isFinest ( ) ) theLogger . logFinest ( reqID + " using url: " + getUrl ) ;
2008-07-19 17:10:00 +02:00
final JakartaCommonsHttpClient client = setupHttpClient ( requestHeader , connectHost ) ;
2008-04-08 23:17:40 +02:00
// check input
if ( body = = null ) {
theLogger . logSevere ( " no body to POST! " ) ;
}
// from old httpc:
// "if there is a body to the call, we would have a CONTENT-LENGTH tag in the requestHeader"
// it seems that it is a HTTP/1.1 connection which stays open (the inputStream) and endlessly waits for
// input so we have to end it to do the request
2008-10-20 16:07:09 +02:00
final int contentLength = requestHeader . getContentLength ( ) ;
if ( contentLength > - 1 ) {
2008-08-23 01:46:32 +02:00
final byte [ ] bodyData ;
2008-10-20 16:07:09 +02:00
if ( contentLength = = 0 ) {
2008-08-23 01:46:32 +02:00
// no body
bodyData = new byte [ 0 ] ;
} else {
// read content-length bytes into memory
2008-10-20 16:07:09 +02:00
bodyData = new byte [ contentLength ] ;
2008-12-23 00:04:00 +01:00
int bytes_read = 0 ;
while ( bytes_read < contentLength ) {
bytes_read + = body . read ( bodyData , bytes_read , contentLength - bytes_read ) ;
}
2008-08-23 01:46:32 +02:00
}
body = new ByteArrayInputStream ( bodyData ) ;
2008-04-08 23:17:40 +02:00
}
2008-04-11 00:47:05 +02:00
JakartaCommonsHttpResponse res = null ;
2008-04-05 15:17:16 +02:00
try {
2005-09-05 12:10:00 +02:00
// sending the request
2008-04-05 15:17:16 +02:00
res = client . POST ( getUrl , body ) ;
2008-09-03 02:30:21 +02:00
if ( theLogger . isFinest ( ) ) theLogger . logFinest ( reqID + " response status: " + res . getStatusLine ( ) ) ;
2005-09-05 12:10:00 +02:00
2008-08-25 20:11:47 +02:00
final httpResponseHeader responseHeader = res . getResponseHeader ( ) ;
2008-04-12 13:39:48 +02:00
// determine if it's an internal error of the httpc
2008-04-08 11:34:20 +02:00
if ( responseHeader . size ( ) = = 0 ) {
2008-04-05 15:17:16 +02:00
throw new Exception ( res . getStatusLine ( ) ) ;
2005-09-05 12:10:00 +02:00
}
2008-07-19 17:10:00 +02:00
final httpChunkedOutputStream chunked = setTransferEncoding ( conProp , responseHeader , res . getStatusCode ( ) , countedRespond ) ;
2005-09-05 12:10:00 +02:00
2008-12-23 00:04:00 +01:00
prepareResponseHeader ( responseHeader , res . getHttpVer ( ) ) ;
2006-02-16 10:20:57 +01:00
2005-09-05 12:10:00 +02:00
// sending the respond header back to the client
if ( chunked ! = null ) {
2008-08-25 20:11:47 +02:00
responseHeader . put ( httpResponseHeader . TRANSFER_ENCODING , " chunked " ) ;
2008-04-08 11:34:20 +02:00
}
2005-09-05 12:10:00 +02:00
// sending response headers
2008-09-03 02:30:21 +02:00
if ( theLogger . isFinest ( ) ) theLogger . logFinest ( reqID + " sending response header: " + responseHeader ) ;
2005-09-05 12:10:00 +02:00
httpd . sendRespondHeader ( conProp ,
2008-07-19 17:10:00 +02:00
countedRespond ,
2005-09-05 12:10:00 +02:00
httpVer ,
2008-04-05 15:17:16 +02:00
res . getStatusCode ( ) ,
res . getStatusLine ( ) . substring ( 4 ) , // status text
2008-04-08 11:34:20 +02:00
responseHeader ) ;
2005-09-05 12:10:00 +02:00
// respondHeader(respond, res.status, res.responseHeader);
2008-04-11 00:47:05 +02:00
// Saver.writeContent(res, (chunked != null) ? new BufferedOutputStream(chunked) : new BufferedOutputStream(respond));
/ *
2008-04-12 10:12:51 +02:00
// *** (Uebernommen aus Saver-Klasse: warum ist dies hier die einzige Methode, die einen OutputStream statt einen Writer benutzt?)
2008-04-11 00:47:05 +02:00
try {
serverFileUtils . copyToStream ( new BufferedInputStream ( res . getDataAsStream ( ) ) , ( chunked ! = null ) ? new BufferedOutputStream ( chunked ) : new BufferedOutputStream ( respond ) ) ;
} finally {
res . closeStream ( ) ;
}
2005-09-05 12:10:00 +02:00
if ( chunked ! = null ) chunked . finish ( ) ;
2008-04-11 00:47:05 +02:00
* /
2008-08-27 10:07:18 +02:00
final OutputStream outStream = ( chunked ! = null ) ? chunked : countedRespond ;
2008-12-23 20:14:54 +01:00
serverFileUtils . copy ( res . getDataAsStream ( ) , outStream ) ;
2005-09-05 12:10:00 +02:00
2008-08-27 22:46:34 +02:00
if ( chunked ! = null ) {
chunked . finish ( ) ;
}
outStream . flush ( ) ;
2008-12-23 12:30:24 +01:00
} catch ( SocketException se ) {
// connection closed by client, abort download
res . abort ( ) ;
2008-04-05 15:17:16 +02:00
} finally {
// if opened ...
if ( res ! = null ) {
// ... close connection
res . closeStream ( ) ;
}
}
2008-08-02 14:12:04 +02:00
} catch ( final Exception e ) {
2008-07-19 17:10:00 +02:00
handleProxyException ( e , conProp , countedRespond , url ) ;
2005-06-09 12:22:05 +02:00
} finally {
2008-07-19 17:10:00 +02:00
if ( countedRespond ! = null ) {
countedRespond . flush ( ) ;
countedRespond . finish ( ) ;
}
if ( respond ! = null ) {
respond . flush ( ) ;
}
2005-06-09 12:22:05 +02:00
2008-08-06 21:43:12 +02:00
conProp . put ( httpHeader . CONNECTION_PROP_REQUEST_END , Long . valueOf ( System . currentTimeMillis ( ) ) ) ;
conProp . put ( httpHeader . CONNECTION_PROP_PROXY_RESPOND_SIZE , ( countedRespond ! = null ) ? Long . valueOf ( countedRespond . getCount ( ) ) : - 1L ) ;
2007-08-09 23:58:38 +02:00
logProxyAccess ( conProp ) ;
2005-06-09 12:22:05 +02:00
}
2005-04-07 21:19:42 +02:00
}
2008-04-05 15:17:16 +02:00
2008-07-19 17:10:00 +02:00
/ * *
* resolve yacy and yacyh domains
*
* @param host
* @return
* /
2008-08-02 14:12:04 +02:00
private static String resolveYacyDomains ( final String host ) {
2008-08-17 12:16:32 +02:00
return ( httpd . getAlternativeResolver ( ) = = null ) ? null : httpd . getAlternativeResolver ( ) . resolve ( host ) ;
2008-07-19 17:10:00 +02:00
}
/ * *
* @param host
* @param port
* @param yAddress
* @return
* /
2008-08-02 14:12:04 +02:00
private static String hostPart ( final String host , final int port , final String yAddress ) {
2008-07-19 17:10:00 +02:00
final String connectHost = ( yAddress = = null ) ? host + " : " + port : yAddress ;
return connectHost ;
}
/ * *
* @param conProp
* @param requestHeader
* @param hostlow
* /
2008-08-25 20:11:47 +02:00
private static void prepareRequestHeader ( final Properties conProp , final httpRequestHeader requestHeader , final String hostlow ) {
2008-07-19 17:10:00 +02:00
// set another userAgent, if not yellow-listed
if ( ( yellowList ! = null ) & & ( ! ( yellowList . contains ( domain ( hostlow ) ) ) ) ) {
// change the User-Agent
requestHeader . put ( httpHeader . USER_AGENT , generateUserAgent ( requestHeader ) ) ;
}
2008-09-09 16:04:52 +02:00
// only gzip-encoding is supported, remove other encodings (e. g. deflate)
if ( ( ( String ) requestHeader . get ( httpRequestHeader . ACCEPT_ENCODING , " " ) ) . indexOf ( " gzip " ) ! = - 1 ) {
requestHeader . put ( httpRequestHeader . ACCEPT_ENCODING , " gzip " ) ;
} else {
requestHeader . put ( httpRequestHeader . ACCEPT_ENCODING , " " ) ;
}
2008-07-19 17:10:00 +02:00
addXForwardedForHeader ( conProp , requestHeader ) ;
}
2008-08-02 14:12:04 +02:00
private static String domain ( final String host ) {
2008-07-19 17:10:00 +02:00
String domain = host ;
int pos = domain . lastIndexOf ( " . " ) ;
if ( pos > = 0 ) {
// truncate from last part
domain = domain . substring ( 0 , pos ) ;
pos = domain . lastIndexOf ( " . " ) ;
if ( pos > = 0 ) {
// truncate from first part
domain = domain . substring ( pos + 1 ) ;
}
}
return domain ;
}
/ * *
* creates a new HttpClient and sets parameters according to proxy needs
*
* @param requestHeader
2008-08-02 15:57:00 +02:00
* @param connectHost may be ' host : port ' or ' host : port / path '
2008-07-19 17:10:00 +02:00
* @return
* /
2008-08-25 20:11:47 +02:00
private static JakartaCommonsHttpClient setupHttpClient ( final httpRequestHeader requestHeader , final String connectHost ) {
2008-07-19 17:10:00 +02:00
// setup HTTP-client
2008-08-17 12:16:32 +02:00
final JakartaCommonsHttpClient client = new JakartaCommonsHttpClient ( timeout , requestHeader ) ;
2008-07-19 17:10:00 +02:00
client . setFollowRedirects ( false ) ;
// cookies are handled by the user's browser
client . setIgnoreCookies ( true ) ;
2008-08-02 15:57:00 +02:00
client . setProxy ( httpRemoteProxyConfig . getProxyConfigForURI ( connectHost ) ) ;
2008-07-19 17:10:00 +02:00
return client ;
}
/ * *
* determines in which form the response should be send and sets header accordingly
* if the content length is not set we need to use chunked content encoding
* Implemented :
* if ! content - length
* switch httpVer
* case 0 . 9 :
* case 1 . 0 :
* close connection after transfer
* break ;
* default :
* new ChunkedStream around respond
* end if
*
* @param conProp
* @param responseHeader
* @param statusCode
* @param respond
* @return
* /
2008-08-25 20:11:47 +02:00
private static httpChunkedOutputStream setTransferEncoding (
final Properties conProp , final httpResponseHeader responseHeader ,
2008-08-02 14:12:04 +02:00
final int statusCode , final OutputStream respond ) {
2008-07-19 17:10:00 +02:00
final String httpVer = conProp . getProperty ( httpHeader . CONNECTION_PROP_HTTP_VER ) ;
httpChunkedOutputStream chunkedOut = null ;
// gzipped response is ungzipped an therefor the length is unknown
2008-08-25 20:11:47 +02:00
if ( responseHeader . gzip ( ) | | responseHeader . getContentLength ( ) < 0 ) {
2008-07-19 17:10:00 +02:00
// according to http://www.w3.org/Protocols/rfc2616/rfc2616-sec10.html
// a 204,304 message must not contain a message body.
// Therefore we need to set the content-length to 0.
if ( statusCode = = 204 | | statusCode = = 304 ) {
responseHeader . put ( httpHeader . CONTENT_LENGTH , " 0 " ) ;
} else {
if ( httpVer . equals ( httpHeader . HTTP_VERSION_0_9 ) | | httpVer . equals ( httpHeader . HTTP_VERSION_1_0 ) ) {
forceConnectionClose ( conProp ) ;
} else {
chunkedOut = new httpChunkedOutputStream ( respond ) ;
}
responseHeader . remove ( httpHeader . CONTENT_LENGTH ) ;
}
}
return chunkedOut ;
}
2008-04-08 11:34:20 +02:00
/ * *
* @param res
* @param responseHeader
* /
2008-08-25 20:11:47 +02:00
private static void prepareResponseHeader ( final httpResponseHeader responseHeader , final String httpVer ) {
2008-07-19 17:10:00 +02:00
modifyProxyHeaders ( responseHeader , httpVer ) ;
2008-04-08 11:34:20 +02:00
correctContentEncoding ( responseHeader ) ;
}
/ * *
* @param responseHeader
* /
2008-08-25 20:11:47 +02:00
private static void correctContentEncoding ( final httpResponseHeader responseHeader ) {
2008-04-08 11:34:20 +02:00
// TODO gzip again? set "correct" encoding?
if ( responseHeader . gzip ( ) ) {
2008-08-25 20:11:47 +02:00
responseHeader . remove ( httpResponseHeader . CONTENT_ENCODING ) ;
2008-04-12 13:39:48 +02:00
responseHeader . remove ( httpHeader . CONTENT_LENGTH ) ; // remove gziped length
2008-04-08 11:34:20 +02:00
}
}
2008-04-05 15:17:16 +02:00
/ * *
* adds the client - IP of conProp to the requestHeader
*
* @param conProp
* @param requestHeader
* /
2008-08-25 20:11:47 +02:00
private static void addXForwardedForHeader ( final Properties conProp , final httpRequestHeader requestHeader ) {
2008-04-05 15:17:16 +02:00
// setting the X-Forwarded-For Header
2008-10-19 20:10:42 +02:00
if ( sb . getConfigBool ( " proxy.sendXForwardedForHeader " , true ) ) {
2008-04-08 23:17:40 +02:00
requestHeader . put ( httpHeader . X_FORWARDED_FOR , conProp . getProperty ( httpHeader . CONNECTION_PROP_CLIENTIP ) ) ;
2008-04-05 15:17:16 +02:00
}
}
/ * *
* removing hop by hop headers and adding additional headers
*
* @param requestHeader
* @param httpVer
* /
2008-07-19 17:10:00 +02:00
private static void modifyProxyHeaders ( final httpHeader requestHeader , final String httpVer ) {
2008-04-05 15:17:16 +02:00
removeHopByHopHeaders ( requestHeader ) ;
setViaHeader ( requestHeader , httpVer ) ;
}
2008-07-19 17:10:00 +02:00
2008-08-02 14:12:04 +02:00
private static void removeHopByHopHeaders ( final httpHeader headers ) {
2008-07-19 17:10:00 +02:00
/ *
- Trailers
* /
2008-08-25 20:11:47 +02:00
headers . remove ( httpRequestHeader . CONNECTION ) ;
headers . remove ( httpRequestHeader . KEEP_ALIVE ) ;
headers . remove ( httpRequestHeader . UPGRADE ) ;
headers . remove ( httpRequestHeader . TE ) ;
headers . remove ( httpRequestHeader . PROXY_CONNECTION ) ;
headers . remove ( httpRequestHeader . PROXY_AUTHENTICATE ) ;
headers . remove ( httpRequestHeader . PROXY_AUTHORIZATION ) ;
2008-07-19 17:10:00 +02:00
// special headers inserted by squid
2008-08-25 20:11:47 +02:00
headers . remove ( httpRequestHeader . X_CACHE ) ;
headers . remove ( httpRequestHeader . X_CACHE_LOOKUP ) ;
2008-07-19 17:10:00 +02:00
// remove transfer encoding header
2008-08-25 20:11:47 +02:00
headers . remove ( httpResponseHeader . TRANSFER_ENCODING ) ;
2008-07-19 17:10:00 +02:00
//removing yacy status headers
2008-08-25 20:11:47 +02:00
headers . remove ( httpResponseHeader . X_YACY_KEEP_ALIVE_REQUEST_COUNT ) ;
headers . remove ( httpResponseHeader . X_YACY_ORIGINAL_REQUEST_LINE ) ;
2008-07-19 17:10:00 +02:00
}
2008-08-02 14:12:04 +02:00
private static void setViaHeader ( final httpHeader header , final String httpVer ) {
2008-10-19 20:10:42 +02:00
if ( ! sb . getConfigBool ( " proxy.sendViaHeader " , true ) ) return ;
2008-08-17 12:16:32 +02:00
final String myAddress = ( httpd . getAlternativeResolver ( ) = = null ) ? null : httpd . getAlternativeResolver ( ) . myAlternativeAddress ( ) ;
2008-07-19 17:10:00 +02:00
if ( myAddress ! = null ) {
2005-06-09 12:22:05 +02:00
2008-07-19 17:10:00 +02:00
// getting header set by other proxies in the chain
2008-12-04 13:54:16 +01:00
final StringBuilder viaValue = new StringBuilder ( ) ;
2008-07-19 17:10:00 +02:00
if ( header . containsKey ( httpHeader . VIA ) ) viaValue . append ( header . get ( httpHeader . VIA ) ) ;
if ( viaValue . length ( ) > 0 ) viaValue . append ( " , " ) ;
// appending info about this peer
viaValue
. append ( httpVer ) . append ( " " )
. append ( myAddress ) . append ( " " )
2008-10-19 20:10:42 +02:00
. append ( " (YaCy " ) . append ( sb . getConfig ( " vString " , " 0.0 " ) ) . append ( " ) " ) ;
2008-07-19 17:10:00 +02:00
// storing header back
header . put ( httpHeader . VIA , new String ( viaValue ) ) ;
}
}
2008-08-25 20:11:47 +02:00
public static void doConnect ( final Properties conProp , final httpRequestHeader requestHeader , final InputStream clientIn , final OutputStream clientOut ) throws IOException {
2007-08-09 23:58:38 +02:00
2008-10-19 20:10:42 +02:00
sb . proxyLastAccess = System . currentTimeMillis ( ) ;
2008-07-19 17:10:00 +02:00
2005-09-20 23:49:47 +02:00
String host = conProp . getProperty ( httpHeader . CONNECTION_PROP_HOST ) ;
2008-08-02 14:12:04 +02:00
final String httpVersion = conProp . getProperty ( httpHeader . CONNECTION_PROP_HTTP_VER ) ;
2006-03-21 21:55:59 +01:00
String path = conProp . getProperty ( httpHeader . CONNECTION_PROP_PATH ) ;
final String args = conProp . getProperty ( httpHeader . CONNECTION_PROP_ARGS ) ;
if ( args ! = null ) { path = path + " ? " + args ; }
2008-07-19 17:10:00 +02:00
2005-09-08 16:48:32 +02:00
int port , pos ;
if ( ( pos = host . indexOf ( " : " ) ) < 0 ) {
port = 80 ;
} else {
port = Integer . parseInt ( host . substring ( pos + 1 ) ) ;
host = host . substring ( 0 , pos ) ;
2006-03-21 21:55:59 +01:00
}
2008-07-19 17:10:00 +02:00
2005-09-07 13:17:21 +02:00
// check the blacklist
// blacklist idea inspired by [AS]:
// respond a 404 for all AGIS ("all you get is shit") servers
2006-03-21 21:55:59 +01:00
final String hostlow = host . toLowerCase ( ) ;
2008-03-26 16:37:49 +01:00
if ( plasmaSwitchboard . urlBlacklist . isListed ( indexReferenceBlacklist . BLACKLIST_PROXY , hostlow , path ) ) {
2005-09-07 13:17:21 +02:00
httpd . sendRespondError ( conProp , clientOut , 4 , 403 , null ,
" URL ' " + hostlow + " ' blocked by yacy proxy (blacklisted) " , null ) ;
2007-08-09 23:58:38 +02:00
theLogger . logInfo ( " AGIS blocking of host ' " + hostlow + " ' " ) ;
forceConnectionClose ( conProp ) ;
2005-09-07 13:17:21 +02:00
return ;
}
2008-07-19 17:10:00 +02:00
2005-06-09 12:22:05 +02:00
// possibly branch into PROXY-PROXY connection
2008-08-02 15:57:00 +02:00
final httpRemoteProxyConfig proxyConfig = httpRemoteProxyConfig . getRemoteProxyConfig ( ) ;
2005-10-23 10:59:11 +02:00
if (
2008-04-05 15:17:16 +02:00
( proxyConfig ! = null ) & &
( proxyConfig . useProxy ( ) ) & &
( proxyConfig . useProxy4SSL ( ) )
2005-10-23 10:59:11 +02:00
) {
2008-08-02 14:12:04 +02:00
final JakartaCommonsHttpClient remoteProxy = new JakartaCommonsHttpClient ( timeout , requestHeader , proxyConfig ) ;
2008-04-12 13:39:48 +02:00
remoteProxy . setFollowRedirects ( false ) ; // should not be needed, but safe is safe
2008-07-19 17:10:00 +02:00
2008-04-11 00:47:05 +02:00
JakartaCommonsHttpResponse response = null ;
2008-04-05 15:17:16 +02:00
try {
response = remoteProxy . CONNECT ( host , port ) ;
// outputs a logline to the serverlog with the current status
2008-07-19 17:10:00 +02:00
theLogger . logInfo ( " CONNECT-RESPONSE: status= " + response . getStatusLine ( ) + " , header= " + response . getResponseHeader ( ) . toString ( ) ) ;
2008-04-05 15:17:16 +02:00
// (response.getStatusLine().charAt(0) == '2') || (response.getStatusLine().charAt(0) == '3')
final boolean success = response . getStatusCode ( ) > = 200 & & response . getStatusCode ( ) < = 399 ;
if ( success ) {
2005-06-09 12:22:05 +02:00
// replace connection details
2008-04-05 15:17:16 +02:00
host = proxyConfig . getProxyHost ( ) ;
port = proxyConfig . getProxyPort ( ) ;
2005-06-09 12:22:05 +02:00
// go on (see below)
2005-04-19 08:55:57 +02:00
} else {
2005-06-09 12:22:05 +02:00
// pass error response back to client
2008-04-05 15:17:16 +02:00
httpd . sendRespondHeader ( conProp , clientOut , httpVersion , response . getStatusCode ( ) , response . getStatusLine ( ) . substring ( 4 ) , response . getResponseHeader ( ) ) ;
2005-06-09 12:22:05 +02:00
//respondHeader(clientOut, response.status, response.responseHeader);
2007-08-09 23:58:38 +02:00
forceConnectionClose ( conProp ) ;
2005-06-09 12:22:05 +02:00
return ;
2005-04-19 08:55:57 +02:00
}
2008-12-23 12:30:24 +01:00
} catch ( SocketException se ) {
// connection closed by client, abort download
response . abort ( ) ;
2008-08-02 14:12:04 +02:00
} catch ( final Exception e ) {
2005-04-19 08:55:57 +02:00
throw new IOException ( e . getMessage ( ) ) ;
2008-04-05 15:17:16 +02:00
} finally {
if ( response ! = null ) {
// release connection
response . closeStream ( ) ;
}
2005-04-19 08:55:57 +02:00
}
2006-03-21 21:55:59 +01:00
}
2008-07-19 17:10:00 +02:00
2005-06-09 12:22:05 +02:00
// try to establish connection to remote host
2008-08-02 14:12:04 +02:00
final Socket sslSocket = new Socket ( host , port ) ;
2005-06-09 12:22:05 +02:00
sslSocket . setSoTimeout ( timeout ) ; // waiting time for write
sslSocket . setSoLinger ( true , timeout ) ; // waiting time for read
2008-08-02 14:12:04 +02:00
final InputStream promiscuousIn = sslSocket . getInputStream ( ) ;
final OutputStream promiscuousOut = sslSocket . getOutputStream ( ) ;
2005-06-09 12:22:05 +02:00
// now then we can return a success message
2007-12-14 20:17:54 +01:00
clientOut . write ( ( httpVersion + " 200 Connection established " + serverCore . CRLF_STRING +
" Proxy-agent: YACY " + serverCore . CRLF_STRING +
serverCore . CRLF_STRING ) . getBytes ( ) ) ;
2005-06-09 12:22:05 +02:00
2007-08-09 23:58:38 +02:00
theLogger . logInfo ( " SSL connection to " + host + " : " + port + " established. " ) ;
2005-06-09 12:22:05 +02:00
// start stream passing with mediate processes
2008-08-02 14:12:04 +02:00
final Mediate cs = new Mediate ( sslSocket , clientIn , promiscuousOut ) ;
final Mediate sc = new Mediate ( sslSocket , promiscuousIn , clientOut ) ;
2005-12-07 00:51:29 +01:00
cs . start ( ) ;
sc . start ( ) ;
while ( ( sslSocket ! = null ) & &
( sslSocket . isBound ( ) ) & &
( ! ( sslSocket . isClosed ( ) ) ) & &
( sslSocket . isConnected ( ) ) & &
( ( cs . isAlive ( ) ) | | ( sc . isAlive ( ) ) ) ) {
// idle
2008-08-02 14:12:04 +02:00
try { Thread . sleep ( 1000 ) ; } catch ( final InterruptedException e ) { } // wait a while
2005-06-09 12:22:05 +02:00
}
2005-12-07 00:51:29 +01:00
// set stop mode
cs . pleaseTerminate ( ) ;
sc . pleaseTerminate ( ) ;
// wake up thread
cs . interrupt ( ) ;
sc . interrupt ( ) ;
// ...hope they have terminated...
2005-04-07 21:19:42 +02:00
}
2008-07-19 17:10:00 +02:00
2007-08-09 23:58:38 +02:00
public static class Mediate extends Thread {
2005-06-09 12:22:05 +02:00
boolean terminate ;
Socket socket ;
InputStream in ;
OutputStream out ;
2008-08-02 14:12:04 +02:00
public Mediate ( final Socket socket , final InputStream in , final OutputStream out ) {
2005-06-09 12:22:05 +02:00
this . terminate = false ;
this . in = in ;
this . out = out ;
this . socket = socket ;
}
public void run ( ) {
2008-08-02 14:12:04 +02:00
final byte [ ] buffer = new byte [ 512 ] ;
2005-06-09 12:22:05 +02:00
int len ;
try {
while ( ( socket ! = null ) & &
( socket . isBound ( ) ) & &
( ! ( socket . isClosed ( ) ) ) & &
( socket . isConnected ( ) ) & &
( ! ( terminate ) ) & &
( in ! = null ) & &
( out ! = null ) & &
( ( len = in . read ( buffer ) ) > = 0 )
) {
out . write ( buffer , 0 , len ) ;
}
2008-08-02 14:12:04 +02:00
} catch ( final IOException e ) { }
2005-06-09 12:22:05 +02:00
}
public void pleaseTerminate ( ) {
terminate = true ;
}
2005-04-07 21:19:42 +02:00
}
2005-06-09 12:22:05 +02:00
2008-08-02 14:12:04 +02:00
private static void handleProxyException ( final Exception e , final Properties conProp , final OutputStream respond , final yacyURL url ) {
2005-09-05 12:10:00 +02:00
// this may happen if
// - the targeted host does not exist
// - anything with the remote server was wrong.
// - the client unexpectedly closed the connection ...
try {
// doing some errorhandling ...
int httpStatusCode = 404 ;
String httpStatusText = null ;
String errorMessage = null ;
2005-11-20 16:37:23 +01:00
Exception errorExc = null ;
2005-09-05 12:10:00 +02:00
boolean unknownError = false ;
2006-02-14 10:55:09 +01:00
// for customized error messages
boolean detailedErrorMsg = false ;
String detailedErrorMsgFile = null ;
serverObjects detailedErrorMsgMap = null ;
2005-09-05 12:10:00 +02:00
if ( e instanceof ConnectException ) {
httpStatusCode = 403 ; httpStatusText = " Connection refused " ;
errorMessage = " Connection refused by destination host " ;
} else if ( e instanceof BindException ) {
errorMessage = " Unable to establish a connection to the destination host " ;
} else if ( e instanceof NoRouteToHostException ) {
errorMessage = " No route to destination host " ;
} else if ( e instanceof UnknownHostException ) {
2006-02-14 10:55:09 +01:00
//errorMessage = "IP address of the destination host could not be determined";
try {
detailedErrorMsgMap = unknownHostHandling ( conProp ) ;
httpStatusText = " Unknown Host " ;
detailedErrorMsg = true ;
detailedErrorMsgFile = " proxymsg/unknownHost.inc " ;
2008-08-02 14:12:04 +02:00
} catch ( final Exception e1 ) {
2006-02-14 10:55:09 +01:00
errorMessage = " IP address of the destination host could not be determined " ;
}
2005-11-25 01:40:35 +01:00
} else if ( e instanceof SocketTimeoutException ) {
errorMessage = " Unable to establish a connection to the destination host. Connect timed out. " ;
2005-09-05 12:10:00 +02:00
} else {
2008-08-02 14:12:04 +02:00
final String exceptionMsg = e . getMessage ( ) ;
2005-09-05 12:10:00 +02:00
if ( ( exceptionMsg ! = null ) & & ( exceptionMsg . indexOf ( " Corrupt GZIP trailer " ) > = 0 ) ) {
// just do nothing, we leave it this way
2008-09-03 02:30:21 +02:00
if ( theLogger . isFine ( ) ) theLogger . logFine ( " ignoring bad gzip trail for URL " + url + " ( " + e . getMessage ( ) + " ) " ) ;
2007-08-09 23:58:38 +02:00
forceConnectionClose ( conProp ) ;
2005-09-05 12:10:00 +02:00
} else if ( ( exceptionMsg ! = null ) & & ( exceptionMsg . indexOf ( " Connection reset " ) > = 0 ) ) {
errorMessage = " Connection reset " ;
2006-02-14 10:55:09 +01:00
} else if ( ( exceptionMsg ! = null ) & & ( exceptionMsg . indexOf ( " unknown host " ) > = 0 ) ) {
try {
detailedErrorMsgMap = unknownHostHandling ( conProp ) ;
httpStatusText = " Unknown Host " ;
detailedErrorMsg = true ;
detailedErrorMsgFile = " proxymsg/unknownHost.inc " ;
2008-08-02 14:12:04 +02:00
} catch ( final Exception e1 ) {
2006-02-14 10:55:09 +01:00
errorMessage = " IP address of the destination host could not be determined " ;
}
2005-09-05 12:45:56 +02:00
} else if ( ( exceptionMsg ! = null ) & &
(
( exceptionMsg . indexOf ( " socket write error " ) > = 0 ) | |
2005-10-10 13:31:46 +02:00
( exceptionMsg . indexOf ( " Read timed out " ) > = 0 ) | |
2006-10-15 11:18:51 +02:00
( exceptionMsg . indexOf ( " Broken pipe " ) > = 0 ) | |
( exceptionMsg . indexOf ( " server has closed connection " ) > = 0 )
2006-02-14 10:55:09 +01:00
) ) {
2005-09-05 12:10:00 +02:00
errorMessage = exceptionMsg ;
2008-04-12 13:39:48 +02:00
e . printStackTrace ( ) ;
2005-09-05 12:10:00 +02:00
} else {
errorMessage = " Unexpected Error. " + e . getClass ( ) . getName ( ) + " : " + e . getMessage ( ) ;
unknownError = true ;
2005-11-20 16:37:23 +01:00
errorExc = e ;
2005-09-05 12:10:00 +02:00
}
}
// sending back an error message to the client
2005-09-20 23:49:47 +02:00
if ( ! conProp . containsKey ( httpHeader . CONNECTION_PROP_PROXY_RESPOND_HEADER ) ) {
2006-02-14 10:55:09 +01:00
if ( detailedErrorMsg ) {
httpd . sendRespondError ( conProp , respond , httpStatusCode , httpStatusText , new File ( detailedErrorMsgFile ) , detailedErrorMsgMap , errorExc ) ;
} else {
httpd . sendRespondError ( conProp , respond , 4 , httpStatusCode , httpStatusText , errorMessage , errorExc ) ;
}
2005-09-05 12:10:00 +02:00
} else {
if ( unknownError ) {
2008-04-12 13:39:48 +02:00
theLogger . logSevere ( " Unknown Error while processing request ' " +
2005-09-20 23:49:47 +02:00
conProp . getProperty ( httpHeader . CONNECTION_PROP_REQUESTLINE , " unknown " ) + " ': " +
2005-09-05 12:10:00 +02:00
" \ n " + Thread . currentThread ( ) . getName ( ) +
" \ n " + errorMessage , e ) ;
} else {
2008-04-10 08:56:06 +02:00
theLogger . logWarning ( " Error while processing request ' " +
2005-09-20 23:49:47 +02:00
conProp . getProperty ( httpHeader . CONNECTION_PROP_REQUESTLINE , " unknown " ) + " ': " +
2005-09-05 12:10:00 +02:00
" \ n " + Thread . currentThread ( ) . getName ( ) +
" \ n " + errorMessage ) ;
}
2007-08-09 23:58:38 +02:00
forceConnectionClose ( conProp ) ;
2005-09-05 12:10:00 +02:00
}
2008-08-02 14:12:04 +02:00
} catch ( final Exception ee ) {
2007-08-09 23:58:38 +02:00
forceConnectionClose ( conProp ) ;
2005-09-05 12:10:00 +02:00
}
}
2008-08-02 14:12:04 +02:00
private static void forceConnectionClose ( final Properties conProp ) {
2008-07-19 17:10:00 +02:00
if ( conProp ! = null ) {
conProp . setProperty ( httpHeader . CONNECTION_PROP_PERSISTENT , " close " ) ;
}
}
2008-08-02 14:12:04 +02:00
private static serverObjects unknownHostHandling ( final Properties conProp ) throws Exception {
final serverObjects detailedErrorMsgMap = new serverObjects ( ) ;
2006-02-14 10:55:09 +01:00
// generic toplevel domains
2008-08-02 14:12:04 +02:00
final HashSet < String > topLevelDomains = new HashSet < String > ( Arrays . asList ( new String [ ] {
2006-02-14 10:55:09 +01:00
" aero " , // Fluggesellschaften/Luftfahrt
" arpa " , // Einrichtung des ARPANet
" biz " , // Business
" com " , // Commercial
" coop " , // genossenschaftliche Unternehmen
" edu " , // Education
" gov " , // Government
" info " , // Informationsangebote
" int " , // International
" jobs " , // Jobangebote von Unternemen
2006-02-17 22:23:45 +01:00
" mil " , // Military (US-Militaer)
2006-02-14 10:55:09 +01:00
// "museum", // Museen
" name " , // Privatpersonen
" nato " , // NATO (veraltet)
" net " , // Net (Netzwerkbetreiber)
" org " , // Organization (Nichtkommerzielle Organisation)
" pro " , // Professionals
" travel " , // Touristikindustrie
// some country tlds
" de " ,
" at " ,
" ch " ,
" it " ,
" uk "
} ) ) ;
// getting some connection properties
String orgHostPort = " 80 " ;
String orgHostName = conProp . getProperty ( httpHeader . CONNECTION_PROP_HOST , " unknown " ) . toLowerCase ( ) ;
int pos = orgHostName . indexOf ( " : " ) ;
if ( pos ! = - 1 ) {
orgHostPort = orgHostName . substring ( pos + 1 ) ;
orgHostName = orgHostName . substring ( 0 , pos ) ;
}
2008-08-02 14:12:04 +02:00
final String orgHostPath = conProp . getProperty ( httpHeader . CONNECTION_PROP_PATH , " " ) ;
2006-02-14 10:55:09 +01:00
String orgHostArgs = conProp . getProperty ( httpHeader . CONNECTION_PROP_ARGS , " " ) ;
if ( orgHostArgs . length ( ) > 0 ) orgHostArgs = " ? " + orgHostArgs ;
detailedErrorMsgMap . put ( " hostName " , orgHostName ) ;
// guessing hostnames
2008-08-02 14:12:04 +02:00
final HashSet < String > testHostNames = new HashSet < String > ( ) ;
2006-02-14 10:55:09 +01:00
String testHostName = null ;
if ( ! orgHostName . startsWith ( " www. " ) ) {
testHostName = " www. " + orgHostName ;
2008-08-02 14:12:04 +02:00
final InetAddress addr = serverDomains . dnsResolve ( testHostName ) ;
2006-02-14 10:55:09 +01:00
if ( addr ! = null ) testHostNames . add ( testHostName ) ;
} else if ( orgHostName . startsWith ( " www. " ) ) {
testHostName = orgHostName . substring ( 4 ) ;
2008-08-02 14:12:04 +02:00
final InetAddress addr = serverDomains . dnsResolve ( testHostName ) ;
2006-02-14 10:55:09 +01:00
if ( addr ! = null ) if ( addr ! = null ) testHostNames . add ( testHostName ) ;
}
if ( orgHostName . length ( ) > 4 & & orgHostName . startsWith ( " www " ) & & ( orgHostName . charAt ( 3 ) ! = '.' ) ) {
testHostName = orgHostName . substring ( 0 , 3 ) + " . " + orgHostName . substring ( 3 ) ;
2008-08-02 14:12:04 +02:00
final InetAddress addr = serverDomains . dnsResolve ( testHostName ) ;
2006-02-14 10:55:09 +01:00
if ( addr ! = null ) if ( addr ! = null ) testHostNames . add ( testHostName ) ;
}
pos = orgHostName . lastIndexOf ( " . " ) ;
if ( pos ! = - 1 ) {
2008-08-02 14:12:04 +02:00
final Iterator < String > iter = topLevelDomains . iterator ( ) ;
2006-02-14 10:55:09 +01:00
while ( iter . hasNext ( ) ) {
2008-08-02 14:12:04 +02:00
final String topLevelDomain = iter . next ( ) ;
2006-02-14 10:55:09 +01:00
testHostName = orgHostName . substring ( 0 , pos ) + " . " + topLevelDomain ;
2008-08-02 14:12:04 +02:00
final InetAddress addr = serverDomains . dnsResolve ( testHostName ) ;
2006-02-14 10:55:09 +01:00
if ( addr ! = null ) if ( addr ! = null ) testHostNames . add ( testHostName ) ;
}
}
int hostNameCount = 0 ;
2008-08-02 14:12:04 +02:00
final Iterator < String > iter = testHostNames . iterator ( ) ;
2006-02-14 10:55:09 +01:00
while ( iter . hasNext ( ) ) {
2008-01-28 19:21:08 +01:00
testHostName = iter . next ( ) ;
2006-02-14 10:55:09 +01:00
detailedErrorMsgMap . put ( " list_ " + hostNameCount + " _hostName " , testHostName ) ;
detailedErrorMsgMap . put ( " list_ " + hostNameCount + " _hostPort " , orgHostPort ) ;
detailedErrorMsgMap . put ( " list_ " + hostNameCount + " _hostPath " , orgHostPath ) ;
detailedErrorMsgMap . put ( " list_ " + hostNameCount + " _hostArgs " , orgHostArgs ) ;
hostNameCount + + ;
}
2008-01-02 00:03:02 +01:00
detailedErrorMsgMap . put ( " list " , hostNameCount ) ;
if ( hostNameCount ! = 0 ) {
detailedErrorMsgMap . put ( " showList " , 1 ) ;
} else {
detailedErrorMsgMap . put ( " showList " , 0 ) ;
}
2006-02-14 10:55:09 +01:00
return detailedErrorMsgMap ;
}
2008-08-02 14:12:04 +02:00
private static synchronized String generateUserAgent ( final httpHeader requestHeaders ) {
2007-08-09 23:58:38 +02:00
userAgentStr . setLength ( 0 ) ;
2005-08-30 15:41:47 +02:00
2008-08-02 14:12:04 +02:00
final String browserUserAgent = ( String ) requestHeaders . get ( httpHeader . USER_AGENT , HTTPLoader . yacyUserAgent ) ;
final int pos = browserUserAgent . lastIndexOf ( ')' ) ;
2005-08-30 15:41:47 +02:00
if ( pos > = 0 ) {
2007-08-09 23:58:38 +02:00
userAgentStr
2005-08-30 15:41:47 +02:00
. append ( browserUserAgent . substring ( 0 , pos ) )
. append ( " ; YaCy " )
2008-10-19 20:10:42 +02:00
. append ( sb . getConfig ( " vString " , " 0.1 " ) )
2005-08-30 15:41:47 +02:00
. append ( " ; yacy.net " )
. append ( browserUserAgent . substring ( pos ) ) ;
} else {
2007-08-09 23:58:38 +02:00
userAgentStr . append ( browserUserAgent ) ;
2005-08-30 15:41:47 +02:00
}
2007-08-09 23:58:38 +02:00
return new String ( userAgentStr ) ;
2005-08-30 15:41:47 +02:00
}
2005-06-09 12:22:05 +02:00
/ * *
* This function is used to generate a logging message according to the
* < a href = " http://www.squid-cache.org/Doc/FAQ/FAQ-6.html " > squid logging format < / a > . < p >
* e . g . < br >
* < code > 1117528623 . 857 178 192 . 168 . 1 . 201 TCP_MISS / 200 1069 GET http : //www.yacy.de/ - DIRECT/81.169.145.74 text/html</code>
* /
2008-08-02 14:12:04 +02:00
private final static synchronized void logProxyAccess ( final Properties conProp ) {
2005-06-09 12:22:05 +02:00
if ( ! doAccessLogging ) return ;
2007-08-09 23:58:38 +02:00
logMessage . setLength ( 0 ) ;
2005-06-09 12:22:05 +02:00
// Timestamp
2008-08-02 14:12:04 +02:00
final String currentTimestamp = Long . toString ( System . currentTimeMillis ( ) ) ;
final int offset = currentTimestamp . length ( ) - 3 ;
2005-06-09 12:22:05 +02:00
2007-08-09 23:58:38 +02:00
logMessage . append ( currentTimestamp . substring ( 0 , offset ) ) ;
logMessage . append ( '.' ) ;
logMessage . append ( currentTimestamp . substring ( offset ) ) ;
logMessage . append ( ' ' ) ;
2005-06-09 12:22:05 +02:00
// Elapsed time
2008-08-02 14:12:04 +02:00
final Long requestStart = ( Long ) conProp . get ( httpHeader . CONNECTION_PROP_REQUEST_START ) ;
final Long requestEnd = ( Long ) conProp . get ( httpHeader . CONNECTION_PROP_REQUEST_END ) ;
final String elapsed = Long . toString ( requestEnd . longValue ( ) - requestStart . longValue ( ) ) ;
2005-06-09 12:22:05 +02:00
2007-08-09 23:58:38 +02:00
for ( int i = 0 ; i < 6 - elapsed . length ( ) ; i + + ) logMessage . append ( ' ' ) ;
logMessage . append ( elapsed ) ;
logMessage . append ( ' ' ) ;
2005-06-09 12:22:05 +02:00
// Remote Host
2008-08-02 14:12:04 +02:00
final String clientIP = conProp . getProperty ( httpHeader . CONNECTION_PROP_CLIENTIP ) ;
2007-08-09 23:58:38 +02:00
logMessage . append ( clientIP ) ;
logMessage . append ( ' ' ) ;
2005-06-09 12:22:05 +02:00
// Code/Status
2008-08-02 14:12:04 +02:00
final String respondStatus = conProp . getProperty ( httpHeader . CONNECTION_PROP_PROXY_RESPOND_STATUS ) ;
final String respondCode = conProp . getProperty ( httpHeader . CONNECTION_PROP_PROXY_RESPOND_CODE , " UNKNOWN " ) ;
2007-08-09 23:58:38 +02:00
logMessage . append ( respondCode ) ;
logMessage . append ( " / " ) ;
logMessage . append ( respondStatus ) ;
logMessage . append ( ' ' ) ;
2005-06-09 12:22:05 +02:00
// Bytes
2008-08-02 14:12:04 +02:00
final Long bytes = ( Long ) conProp . get ( httpHeader . CONNECTION_PROP_PROXY_RESPOND_SIZE ) ;
2007-08-09 23:58:38 +02:00
logMessage . append ( bytes . toString ( ) ) ;
logMessage . append ( ' ' ) ;
2005-06-09 12:22:05 +02:00
// Method
2008-08-02 14:12:04 +02:00
final String requestMethod = conProp . getProperty ( httpHeader . CONNECTION_PROP_METHOD ) ;
2007-08-09 23:58:38 +02:00
logMessage . append ( requestMethod ) ;
logMessage . append ( ' ' ) ;
2005-06-09 12:22:05 +02:00
// URL
2008-08-02 14:12:04 +02:00
final String requestURL = conProp . getProperty ( httpHeader . CONNECTION_PROP_URL ) ;
final String requestArgs = conProp . getProperty ( httpHeader . CONNECTION_PROP_ARGS ) ;
2007-08-09 23:58:38 +02:00
logMessage . append ( requestURL ) ;
2005-08-30 13:37:54 +02:00
if ( requestArgs ! = null ) {
2007-08-09 23:58:38 +02:00
logMessage . append ( " ? " )
2005-08-30 13:37:54 +02:00
. append ( requestArgs ) ;
}
2007-08-09 23:58:38 +02:00
logMessage . append ( ' ' ) ;
2005-06-09 12:22:05 +02:00
// Rfc931
2007-08-09 23:58:38 +02:00
logMessage . append ( " - " ) ;
logMessage . append ( ' ' ) ;
2005-06-09 12:22:05 +02:00
// Peerstatus/Peerhost
2008-08-02 14:12:04 +02:00
final String host = conProp . getProperty ( httpHeader . CONNECTION_PROP_HOST ) ;
2007-08-09 23:58:38 +02:00
logMessage . append ( " DIRECT/ " ) ;
logMessage . append ( host ) ;
logMessage . append ( ' ' ) ;
2005-06-09 12:22:05 +02:00
// Type
String mime = " - " ;
2007-08-09 23:58:38 +02:00
if ( conProp . containsKey ( httpHeader . CONNECTION_PROP_PROXY_RESPOND_HEADER ) ) {
2008-08-02 14:12:04 +02:00
final httpHeader proxyRespondHeader = ( httpHeader ) conProp . get ( httpHeader . CONNECTION_PROP_PROXY_RESPOND_HEADER ) ;
2005-06-09 12:22:05 +02:00
mime = proxyRespondHeader . mime ( ) ;
if ( mime . indexOf ( " ; " ) ! = - 1 ) {
mime = mime . substring ( 0 , mime . indexOf ( " ; " ) ) ;
}
}
2007-08-09 23:58:38 +02:00
logMessage . append ( mime ) ;
2005-06-09 12:22:05 +02:00
// sending the logging message to the logger
2008-09-03 02:30:21 +02:00
if ( proxyLog . isFine ( ) ) proxyLog . logFine ( logMessage . toString ( ) ) ;
2005-04-07 21:19:42 +02:00
}
2005-06-09 12:22:05 +02:00
2005-04-07 21:19:42 +02:00
}
/*
 proxy test:
 http://www.chipchapin.com/WebTools/cookietest.php?
 http://xlists.aza.org/moderator/cookietest/cookietest1.php
 http://vancouver-webpages.com/proxy/cache-test.html
 */