2010-02-04 12:26:23 +01:00
// WorkTables.java
// (C) 2010 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany
// first published 04.02.2010 on http://yacy.net
//
// This is a part of YaCy, a peer-to-peer based web search engine
//
// $LastChangedDate: 2006-04-02 22:40:07 +0200 (So, 02 Apr 2006) $
// $LastChangedRevision: 6539 $
// $LastChangedBy: low012 $
//
// LICENSE
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
package de.anomic.data ;
import java.io.File ;
import java.io.IOException ;
2010-08-19 14:13:54 +02:00
import java.util.ArrayList ;
import java.util.Collection ;
2010-02-04 12:26:23 +01:00
import java.util.Date ;
2010-08-19 14:13:54 +02:00
import java.util.LinkedHashMap ;
import java.util.Map ;
2010-02-04 12:26:23 +01:00
2010-08-23 00:32:39 +02:00
import net.yacy.cora.protocol.http.HTTPClient ;
2010-02-04 12:26:23 +01:00
import net.yacy.kelondro.blob.Tables ;
2010-12-06 15:34:58 +01:00
import net.yacy.kelondro.data.meta.DigestURI ;
import net.yacy.kelondro.data.word.WordReference ;
import net.yacy.kelondro.index.HandleSet ;
2010-06-15 21:44:05 +02:00
import net.yacy.kelondro.index.RowSpaceExceededException ;
2010-02-04 12:26:23 +01:00
import net.yacy.kelondro.logging.Log ;
2010-12-06 15:34:58 +01:00
import net.yacy.kelondro.rwi.IndexCell ;
2010-02-04 12:26:23 +01:00
import net.yacy.kelondro.util.DateFormatter ;
import de.anomic.server.serverObjects ;
public class WorkTables extends Tables {
2010-10-18 23:09:41 +02:00
2010-02-04 12:26:23 +01:00
public final static String TABLE_API_NAME = " api " ;
public final static String TABLE_API_TYPE_STEERING = " steering " ;
public final static String TABLE_API_TYPE_CONFIGURATION = " configuration " ;
public final static String TABLE_API_TYPE_CRAWLER = " crawler " ;
public final static String TABLE_API_COL_TYPE = " type " ;
public final static String TABLE_API_COL_COMMENT = " comment " ;
2010-08-18 17:56:38 +02:00
public final static String TABLE_API_COL_DATE_RECORDING = " date_recording " ; // if not present default to old date field
public final static String TABLE_API_COL_DATE_LAST_EXEC = " date_last_exec " ; // if not present default to old date field
public final static String TABLE_API_COL_DATE_NEXT_EXEC = " date_next_exec " ; // if not present default to zero
public final static String TABLE_API_COL_DATE = " date " ; // old date; do not set in new records
2010-02-04 12:26:23 +01:00
public final static String TABLE_API_COL_URL = " url " ;
2010-08-18 17:56:38 +02:00
public final static String TABLE_API_COL_APICALL_PK = " apicall_pk " ; // the primary key for the table entry of that api call (not really a database field, only a name in the apicall)
public final static String TABLE_API_COL_APICALL_COUNT = " apicall_count " ; // counts how often the API was called (starts with 1)
public final static String TABLE_API_COL_APICALL_SCHEDULE_TIME = " apicall_schedule_time " ; // factor for SCHEULE_UNIT time units
public final static String TABLE_API_COL_APICALL_SCHEDULE_UNIT = " apicall_schedule_unit " ; // may be 'minutes', 'hours', 'days'
2010-08-31 17:47:47 +02:00
2010-03-04 12:58:07 +01:00
public final static String TABLE_ROBOTS_NAME = " robots " ;
2010-08-31 17:47:47 +02:00
public final static String TABLE_ACTIVECRAWLS_NAME = " crawljobsActive " ;
public final static String TABLE_PASSIVECRAWLS_NAME = " crawljobsPassive " ;
2010-12-06 15:34:58 +01:00
public final static String TABLE_SEARCH_FAILURE_NAME = " searchfl " ;
public final static String TABLE_SEARCH_FAILURE_COL_URL = " url " ;
public final static String TABLE_SEARCH_FAILURE_COL_DATE = " date " ;
public final static String TABLE_SEARCH_FAILURE_COL_WORDS = " words " ;
public final static String TABLE_SEARCH_FAILURE_COL_COMMENT = " comment " ;
2010-08-31 17:47:47 +02:00
2010-10-18 23:09:41 +02:00
public YMarkTables bookmarks ;
2010-02-04 12:26:23 +01:00
2010-04-13 03:16:09 +02:00
/**
 * Creates the work tables for the given path and attaches the bookmark tables to them.
 *
 * @param workPath the path that is handed over to the {@link Tables} super constructor
 */
public WorkTables(final File workPath) {
    // NOTE(review): the second argument is passed through to Tables unchanged;
    // presumably the maximum key length - confirm against Tables
    super(workPath, 12);
    bookmarks = new YMarkTables(this);
}
2010-10-21 21:18:17 +02:00
public void clear ( final String tablename ) throws IOException {
super . clear ( tablename ) ;
2010-11-03 07:47:02 +01:00
this . bookmarks . clearIndex ( tablename ) ;
2010-10-21 21:18:17 +02:00
}
2010-08-19 14:13:54 +02:00
/ * *
* recording of a api call . stores the call parameters into the API database table
* @param post the post arguments of the api call
* @param servletName the name of the servlet
* @param type name of the servlet category
* @param comment visual description of the process
2010-08-26 18:01:45 +02:00
* @return the pk of the new entry in the api table
2010-08-19 14:13:54 +02:00
* /
2010-08-26 18:01:45 +02:00
public byte [ ] recordAPICall ( final serverObjects post , final String servletName , final String type , final String comment ) {
2010-08-18 17:56:38 +02:00
// remove the apicall attributes from the post object
2010-08-26 18:01:45 +02:00
String pks = post . remove ( TABLE_API_COL_APICALL_PK ) ;
byte [ ] pk = pks = = null ? null : pks . getBytes ( ) ;
2010-08-18 17:56:38 +02:00
// generate the apicall url - without the apicall attributes
2010-04-13 03:16:09 +02:00
final String apiurl = /*"http://localhost:" + getConfig("port", "8080") +*/ " / " + servletName + " ? " + post . toString ( ) ;
2010-08-18 17:56:38 +02:00
// read old entry from the apicall table (if exists)
Row row = null ;
2010-02-04 12:26:23 +01:00
try {
2010-08-26 18:01:45 +02:00
row = ( pk = = null ) ? null : super . select ( TABLE_API_NAME , pk ) ;
2010-08-18 17:56:38 +02:00
} catch ( IOException e ) {
Log . logException ( e ) ;
} catch ( RowSpaceExceededException e ) {
Log . logException ( e ) ;
}
// insert or update entry
try {
2010-08-20 01:52:38 +02:00
if ( row = = null ) {
2010-08-18 17:56:38 +02:00
// create and insert new entry
Data data = new Data ( ) ;
data . put ( TABLE_API_COL_TYPE , type . getBytes ( ) ) ;
data . put ( TABLE_API_COL_COMMENT , comment . getBytes ( ) ) ;
byte [ ] date = DateFormatter . formatShortMilliSecond ( new Date ( ) ) . getBytes ( ) ;
data . put ( TABLE_API_COL_DATE_RECORDING , date ) ;
data . put ( TABLE_API_COL_DATE_LAST_EXEC , date ) ;
data . put ( TABLE_API_COL_URL , apiurl . getBytes ( ) ) ;
// insert APICALL attributes
2010-08-20 01:52:38 +02:00
data . put ( TABLE_API_COL_APICALL_COUNT , " 1 " ) ;
2010-08-26 18:01:45 +02:00
pk = super . insert ( TABLE_API_NAME , data ) ;
2010-08-20 01:52:38 +02:00
} else {
// modify and update existing entry
// modify date attributes and patch old values
row . put ( TABLE_API_COL_DATE_LAST_EXEC , DateFormatter . formatShortMilliSecond ( new Date ( ) ) . getBytes ( ) ) ;
if ( ! row . containsKey ( TABLE_API_COL_DATE_RECORDING ) ) row . put ( TABLE_API_COL_DATE_RECORDING , row . get ( TABLE_API_COL_DATE ) ) ;
row . remove ( TABLE_API_COL_DATE ) ;
// insert APICALL attributes
row . put ( TABLE_API_COL_APICALL_COUNT , row . get ( TABLE_API_COL_APICALL_COUNT , 1 ) + 1 ) ;
super . update ( TABLE_API_NAME , row ) ;
2010-08-26 18:01:45 +02:00
assert pk ! = null ;
2010-08-18 17:56:38 +02:00
}
2010-02-04 12:26:23 +01:00
} catch ( IOException e ) {
Log . logException ( e ) ;
2010-06-15 21:44:05 +02:00
} catch ( RowSpaceExceededException e ) {
Log . logException ( e ) ;
2010-02-04 12:26:23 +01:00
}
Log . logInfo ( " APICALL " , apiurl ) ;
2010-08-26 18:01:45 +02:00
return pk ;
2010-02-04 12:26:23 +01:00
}
2010-08-20 01:52:38 +02:00
/ * *
* store a API call and set attributes to schedule a re - call of that API call according to a given frequence
* This is the same as the previous method but it also computes a re - call time and stores that additionally
* @param post the post arguments of the api call
* @param servletName the name of the servlet
* @param type name of the servlet category
* @param comment visual description of the process
* @param time the time until next scheduled execution of this api call
* @param unit the time unit for the scheduled call
2010-08-26 18:01:45 +02:00
* @return the pk of the new entry in the api table
2010-08-20 01:52:38 +02:00
* /
2010-08-26 18:01:45 +02:00
public byte [ ] recordAPICall ( final serverObjects post , final String servletName , final String type , final String comment , int time , String unit ) {
2010-08-20 01:52:38 +02:00
if ( post . containsKey ( TABLE_API_COL_APICALL_PK ) ) {
// this api call has already been stored somewhere.
2010-08-26 18:01:45 +02:00
return recordAPICall ( post , servletName , type , comment ) ;
2010-08-20 01:52:38 +02:00
}
if ( time < 0 | | unit = = null | | unit . length ( ) = = 0 | | " minutes,hours,days " . indexOf ( unit ) < 0 ) {
time = 0 ; unit = " " ;
} else {
if ( unit . equals ( " minutes " ) & & time < 10 ) time = 10 ;
}
// generate the apicall url - without the apicall attributes
final String apiurl = /*"http://localhost:" + getConfig("port", "8080") +*/ " / " + servletName + " ? " + post . toString ( ) ;
2010-08-26 18:01:45 +02:00
byte [ ] pk = null ;
2010-08-20 01:52:38 +02:00
// insert entry
try {
// create and insert new entry
Data data = new Data ( ) ;
data . put ( TABLE_API_COL_TYPE , type . getBytes ( ) ) ;
data . put ( TABLE_API_COL_COMMENT , comment . getBytes ( ) ) ;
byte [ ] date = DateFormatter . formatShortMilliSecond ( new Date ( ) ) . getBytes ( ) ;
data . put ( TABLE_API_COL_DATE_RECORDING , date ) ;
data . put ( TABLE_API_COL_DATE_LAST_EXEC , date ) ;
data . put ( TABLE_API_COL_URL , apiurl . getBytes ( ) ) ;
// insert APICALL attributes
data . put ( TABLE_API_COL_APICALL_COUNT , " 1 " . getBytes ( ) ) ;
data . put ( TABLE_API_COL_APICALL_SCHEDULE_TIME , Integer . toString ( time ) . getBytes ( ) ) ;
data . put ( TABLE_API_COL_APICALL_SCHEDULE_UNIT , unit . getBytes ( ) ) ;
calculateAPIScheduler ( data , false ) ; // set next execution time
2010-08-26 18:01:45 +02:00
pk = super . insert ( TABLE_API_NAME , data ) ;
2010-08-20 01:52:38 +02:00
} catch ( IOException e ) {
Log . logException ( e ) ;
} catch ( RowSpaceExceededException e ) {
Log . logException ( e ) ;
}
Log . logInfo ( " APICALL " , apiurl ) ;
2010-08-26 18:01:45 +02:00
return pk ;
2010-08-20 01:52:38 +02:00
}
2010-08-19 14:13:54 +02:00
/ * *
* execute an API call using a api table row which contains all essentials
* to access the server also the host , port and the authentication realm must be given
* @param pks a collection of primary keys denoting the rows in the api table
* @param host the host where the api shall be called
* @param port the port on the host
* @param realm authentification realm
* @return a map of the called urls and the http status code of the api call or - 1 if any other IOException occurred
* /
2010-09-28 14:18:54 +02:00
public Map < String , Integer > execAPICalls ( String host , int port , String realm , Collection < String > pks ) {
2010-08-19 14:13:54 +02:00
// now call the api URLs and store the result status
2010-08-23 00:32:39 +02:00
final HTTPClient client = new HTTPClient ( ) ;
2010-08-19 14:13:54 +02:00
client . setRealm ( realm ) ;
client . setTimout ( 120000 ) ;
LinkedHashMap < String , Integer > l = new LinkedHashMap < String , Integer > ( ) ;
for ( String pk : pks ) {
Tables . Row row = null ;
try {
row = select ( WorkTables . TABLE_API_NAME , pk . getBytes ( ) ) ;
} catch ( IOException e ) {
Log . logException ( e ) ;
} catch ( RowSpaceExceededException e ) {
Log . logException ( e ) ;
}
if ( row = = null ) continue ;
String url = " http:// " + host + " : " + port + new String ( row . get ( WorkTables . TABLE_API_COL_URL ) ) ;
url + = " & " + WorkTables . TABLE_API_COL_APICALL_PK + " = " + new String ( row . getPK ( ) ) ;
try {
client . GETbytes ( url ) ;
l . put ( url , client . getStatusCode ( ) ) ;
} catch ( IOException e ) {
Log . logException ( e ) ;
l . put ( url , - 1 ) ;
}
}
return l ;
}
2010-09-28 14:18:54 +02:00
public static int execAPICall ( String host , int port , String realm , String path , byte [ ] pk ) {
// now call the api URLs and store the result status
final HTTPClient client = new HTTPClient ( ) ;
client . setRealm ( realm ) ;
client . setTimout ( 120000 ) ;
String url = " http:// " + host + " : " + port + path ;
if ( pk ! = null ) url + = " & " + WorkTables . TABLE_API_COL_APICALL_PK + " = " + new String ( pk ) ;
try {
client . GETbytes ( url ) ;
return client . getStatusCode ( ) ;
} catch ( IOException e ) {
Log . logException ( e ) ;
return - 1 ;
}
}
2010-08-19 14:13:54 +02:00
/ * *
* simplified call to execute a single entry in the api database table
* @param pk the primary key of the entry
* @param host the host where the api shall be called
* @param port the port on the host
* @param realm authentification realm
* @return the http status code of the api call or - 1 if any other IOException occurred
* /
public int execAPICall ( String pk , String host , int port , String realm ) {
ArrayList < String > pks = new ArrayList < String > ( ) ;
pks . add ( pk ) ;
2010-09-28 14:18:54 +02:00
Map < String , Integer > m = execAPICalls ( host , port , realm , pks ) ;
2010-08-19 14:13:54 +02:00
if ( m . isEmpty ( ) ) return - 1 ;
return m . values ( ) . iterator ( ) . next ( ) . intValue ( ) ;
}
/ * *
* calculate the execution time in a api call table based on given scheduling time and last execution time
* @param row the database row in the api table
2010-08-20 01:52:38 +02:00
* @param update if true then the next execution time is based on the latest computed execution time ; othervise it is based on the last execution time
2010-08-19 14:13:54 +02:00
* /
2010-08-20 01:52:38 +02:00
public static void calculateAPIScheduler ( Tables . Data row , boolean update ) {
2010-08-26 18:42:00 +02:00
Date date = row . containsKey ( WorkTables . TABLE_API_COL_DATE ) ? row . get ( WorkTables . TABLE_API_COL_DATE , ( Date ) null ) : null ;
2010-08-19 14:13:54 +02:00
date = update ? row . get ( WorkTables . TABLE_API_COL_DATE_NEXT_EXEC , date ) : row . get ( WorkTables . TABLE_API_COL_DATE_LAST_EXEC , date ) ;
int time = row . get ( WorkTables . TABLE_API_COL_APICALL_SCHEDULE_TIME , 1 ) ;
if ( time < = 0 ) {
2010-08-26 18:42:00 +02:00
row . put ( WorkTables . TABLE_API_COL_DATE_NEXT_EXEC , " " ) ;
2010-08-19 14:13:54 +02:00
return ;
}
String unit = row . get ( WorkTables . TABLE_API_COL_APICALL_SCHEDULE_UNIT , " days " ) ;
long d = date . getTime ( ) ;
2010-08-20 01:52:38 +02:00
if ( unit . equals ( " minutes " ) ) d + = 60000L * Math . max ( 10 , time ) ;
2010-08-19 14:13:54 +02:00
if ( unit . equals ( " hours " ) ) d + = 60000L * 60L * time ;
if ( unit . equals ( " days " ) ) d + = 60000L * 60L * 24L * time ;
if ( d < System . currentTimeMillis ( ) ) d = System . currentTimeMillis ( ) + 600000L ;
2010-08-20 01:52:38 +02:00
d - = d % 60000 ; // remove seconds
2010-08-19 14:13:54 +02:00
row . put ( WorkTables . TABLE_API_COL_DATE_NEXT_EXEC , new Date ( d ) ) ;
}
2010-12-06 15:34:58 +01:00
/**
 * Removes the references of a url from the index for all given query words and
 * records the url in the search failure table.
 * An IOException during the insert is logged and not propagated.
 *
 * @param indexCell the index from which the references are removed
 * @param url the url that failed
 * @param queryHashes the word hashes for which the url reference is removed
 * @param reason a comment that is stored with the entry
 */
public void failURLsRegisterMissingWord(IndexCell<WordReference> indexCell, final DigestURI url, HandleSet queryHashes, final String reason) {
    // take the url out of the index for every queried word
    for (final byte[] wordHash : queryHashes) {
        indexCell.removeDelayed(wordHash, url.hash());
    }

    // remember the url together with the words and the reason; the url hash is the key
    try {
        final Data entry = new Data();
        final byte[] now = DateFormatter.formatShortMilliSecond(new Date()).getBytes();
        entry.put(TABLE_SEARCH_FAILURE_COL_URL, url.toNormalform(true, false));
        entry.put(TABLE_SEARCH_FAILURE_COL_DATE, now);
        entry.put(TABLE_SEARCH_FAILURE_COL_WORDS, queryHashes.export());
        entry.put(TABLE_SEARCH_FAILURE_COL_COMMENT, reason.getBytes());
        super.insert(TABLE_SEARCH_FAILURE_NAME, url.hash(), entry);
    } catch (IOException e) {
        Log.logException(e);
    }
}
/**
 * Checks whether the search failure table has an entry for the given url hash.
 *
 * @param urlhash the hash of the url
 * @return true if an entry exists; false if not or if the lookup failed with an IOException
 */
public boolean failURLsContains(byte[] urlhash) {
    boolean known = false;
    try {
        known = super.has(TABLE_SEARCH_FAILURE_NAME, urlhash);
    } catch (IOException e) {
        Log.logException(e);
    }
    return known;
}
2010-02-04 12:26:23 +01:00
}