2005-12-29 18:45:50 +01:00
import java.io.IOException ;
2007-06-01 16:44:46 +02:00
import java.io.Writer ;
2005-12-29 18:45:50 +01:00
import java.net.MalformedURLException ;
2008-09-20 00:19:11 +02:00
import java.util.Set ;
2005-12-29 18:45:50 +01:00
2008-07-04 13:03:03 +02:00
import de.anomic.crawler.HTTPLoader ;
2007-06-01 16:44:46 +02:00
import de.anomic.htmlFilter.htmlFilterContentScraper ;
import de.anomic.htmlFilter.htmlFilterWriter ;
2009-02-19 17:24:46 +01:00
import de.anomic.http.httpClient ;
2008-08-25 20:11:47 +02:00
import de.anomic.http.httpRequestHeader ;
2009-01-31 02:06:56 +01:00
import de.anomic.kelondro.util.FileUtils ;
2008-07-05 02:35:20 +02:00
import de.anomic.plasma.plasmaSwitchboard ;
2005-12-29 18:45:50 +01:00
import de.anomic.server.serverObjects ;
import de.anomic.server.serverSwitch ;
2007-09-05 11:01:35 +02:00
import de.anomic.yacy.yacyURL ;
2005-12-29 18:45:50 +01:00
2006-02-17 21:55:31 +01:00
public class getpageinfo_p {
2008-07-04 13:03:03 +02:00
2008-08-25 20:11:47 +02:00
public static serverObjects respond ( final httpRequestHeader header , final serverObjects post , final serverSwitch < ? > env ) {
2008-08-02 14:12:04 +02:00
final plasmaSwitchboard sb = ( plasmaSwitchboard ) env ;
final serverObjects prop = new serverObjects ( ) ;
2008-09-19 16:27:44 +02:00
// avoid UNRESOLVED PATTERN
prop . put ( " title " , " " ) ;
prop . put ( " desc " , " " ) ;
prop . put ( " lang " , " " ) ;
2007-10-24 23:38:19 +02:00
prop . put ( " robots-allowed " , " 3 " ) ; //unknown
2008-09-19 16:27:44 +02:00
prop . put ( " sitemap " , " " ) ;
prop . put ( " favicon " , " " ) ;
// default actions
String actions = " title,robots " ;
2005-12-29 18:45:50 +01:00
if ( post ! = null & & post . containsKey ( " url " ) ) {
2006-02-17 21:55:31 +01:00
if ( post . containsKey ( " actions " ) )
2008-06-06 18:01:27 +02:00
actions = post . get ( " actions " ) ;
String url = post . get ( " url " ) ;
2006-09-09 14:28:28 +02:00
if ( url . toLowerCase ( ) . startsWith ( " ftp:// " ) ) {
2007-10-24 23:38:19 +02:00
prop . put ( " robots-allowed " , " 1 " ) ;
2008-10-22 20:59:04 +02:00
prop . putXML ( " title " , " FTP: " + url ) ;
2006-09-09 14:34:24 +02:00
return prop ;
} else if ( ! ( url . toLowerCase ( ) . startsWith ( " http:// " ) | | url . toLowerCase ( ) . startsWith ( " https:// " ) ) ) {
2006-02-17 21:55:31 +01:00
url = " http:// " + url ;
}
if ( actions . indexOf ( " title " ) > = 0 ) {
try {
2008-08-02 14:12:04 +02:00
final yacyURL u = new yacyURL ( url , null ) ;
2008-08-25 20:11:47 +02:00
final httpRequestHeader reqHeader = new httpRequestHeader ( ) ;
reqHeader . put ( httpRequestHeader . USER_AGENT , HTTPLoader . yacyUserAgent ) ; // do not set the crawler user agent, because this page was loaded by manual entering of the url
2009-02-19 17:24:46 +01:00
final byte [ ] r = httpClient . wget ( u . toString ( ) , reqHeader , 5000 ) ;
2008-04-22 00:42:49 +02:00
if ( r = = null ) return prop ;
2008-08-02 14:12:04 +02:00
final String contentString = new String ( r ) ;
2007-06-01 16:44:46 +02:00
2008-08-02 14:12:04 +02:00
final htmlFilterContentScraper scraper = new htmlFilterContentScraper ( u ) ;
2007-06-01 16:44:46 +02:00
//OutputStream os = new htmlFilterOutputStream(null, scraper, null, false);
2008-08-02 14:12:04 +02:00
final Writer writer = new htmlFilterWriter ( null , null , scraper , null , false ) ;
2009-01-31 02:06:56 +01:00
FileUtils . copy ( contentString , writer ) ;
2007-06-01 16:44:46 +02:00
writer . close ( ) ;
2007-06-09 17:22:37 +02:00
// put the document title
2008-10-22 20:59:04 +02:00
prop . putXML ( " title " , scraper . getTitle ( ) ) ;
2007-06-09 17:22:37 +02:00
// put the favicon that belongs to the document
2007-10-24 23:38:19 +02:00
prop . put ( " favicon " , ( scraper . getFavicon ( ) = = null ) ? " " : scraper . getFavicon ( ) . toString ( ) ) ;
2007-06-09 17:22:37 +02:00
// put keywords
2008-08-02 14:12:04 +02:00
final String list [ ] = scraper . getKeywords ( ) ;
2008-05-18 14:48:57 +02:00
int count = 0 ;
2007-06-01 16:44:46 +02:00
for ( int i = 0 ; i < list . length ; i + + ) {
2008-05-18 14:48:57 +02:00
String tag = list [ i ] ;
2008-09-18 23:01:23 +02:00
if ( ! tag . equals ( " " ) ) {
2008-10-22 20:59:04 +02:00
prop . putXML ( " tags_ " + count + " _tag " , tag ) ;
2008-05-18 14:48:57 +02:00
count + + ;
}
2006-02-17 21:55:31 +01:00
}
2008-05-18 14:48:57 +02:00
prop . put ( " tags " , count ) ;
2008-09-18 23:01:23 +02:00
// put description
2008-10-22 20:59:04 +02:00
prop . putXML ( " desc " , scraper . getDescription ( ) ) ;
2008-09-20 00:19:11 +02:00
// put language
Set < String > languages = scraper . getContentLanguages ( ) ;
2008-10-22 20:59:04 +02:00
prop . putXML ( " lang " , ( languages = = null ) ? " unknown " : languages . iterator ( ) . next ( ) ) ;
2006-02-17 21:55:31 +01:00
2008-08-02 14:12:04 +02:00
} catch ( final MalformedURLException e ) { /* ignore this */
} catch ( final IOException e ) { /* ignore this */
2005-12-29 18:45:50 +01:00
}
2006-02-17 21:55:31 +01:00
}
if ( actions . indexOf ( " robots " ) > = 0 ) {
try {
2008-08-02 14:12:04 +02:00
final yacyURL theURL = new yacyURL ( url , null ) ;
2008-09-18 23:01:23 +02:00
2007-05-06 11:52:04 +02:00
// determine if crawling of the current URL is allowed
2008-07-05 02:35:20 +02:00
prop . put ( " robots-allowed " , sb . robots . isDisallowed ( theURL ) ? " 0 " : " 1 " ) ;
2007-05-06 11:52:04 +02:00
// get the sitemap URL of the domain
2008-08-02 14:12:04 +02:00
final yacyURL sitemapURL = sb . robots . getSitemapURL ( theURL ) ;
2008-10-22 20:59:04 +02:00
prop . putXML ( " sitemap " , ( sitemapURL = = null ) ? " " : sitemapURL . toString ( ) ) ;
2008-08-02 14:12:04 +02:00
} catch ( final MalformedURLException e ) { }
2006-02-17 21:55:31 +01:00
}
2005-12-29 18:45:50 +01:00
}
// return rewrite properties
return prop ;
}
}