//AbstractParser.java
//------------------------
//part of YaCy
//(C) by Michael Peter Christen; mc@yacy.net
//first published on http://www.anomic.de
//Frankfurt, Germany, 2005
//
//this file was contributed by Martin Thelian
//last major change: $LastChangedDate$ by $LastChangedBy$
//Revision: $LastChangedRevision$
//
//This program is free software; you can redistribute it and/or modify
//it under the terms of the GNU General Public License as published by
//the Free Software Foundation; either version 2 of the License, or
//(at your option) any later version.
//
//This program is distributed in the hope that it will be useful,
//but WITHOUT ANY WARRANTY; without even the implied warranty of
//MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
//GNU General Public License for more details.
//
//You should have received a copy of the GNU General Public License
//along with this program; if not, write to the Free Software
//Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA

package net.yacy.document ;
2005-05-03 11:47:56 +02:00
import java.io.BufferedInputStream ;
import java.io.ByteArrayInputStream ;
import java.io.File ;
import java.io.FileInputStream ;
import java.io.FileNotFoundException ;
2006-09-20 14:25:07 +02:00
import java.io.IOException ;
2005-05-03 11:47:56 +02:00
import java.io.InputStream ;
2009-10-11 02:12:19 +02:00
import net.yacy.kelondro.data.meta.DigestURI ;
2009-10-10 01:13:30 +02:00
import net.yacy.kelondro.logging.Log ;
2009-10-11 02:12:19 +02:00
import net.yacy.kelondro.workflow.WorkflowThread ;
2009-10-10 01:13:30 +02:00
2005-05-03 11:47:56 +02:00
/ * *
2009-10-18 02:53:43 +02:00
* New classes implementing the { @link net . yacy . document . Idiom } interface
2005-05-03 11:47:56 +02:00
* can extend this class to inherit all functions already implemented in this class .
* @author Martin Thelian
* @version $LastChangedRevision$ / $LastChangedDate$
* /
2009-07-10 00:25:31 +02:00
public abstract class AbstractParser implements Idiom {
2006-09-30 11:31:53 +02:00
2005-06-13 15:49:17 +02:00
/ * *
* the logger class that should be used by the parser module for logging
* purposes .
* /
2009-07-08 23:48:08 +02:00
protected final Log theLogger = new Log ( " PARSER " ) ;
2005-11-29 08:27:58 +01:00
/ * *
* Parser name
* /
2009-07-10 16:22:17 +02:00
private String parserName ;
2005-06-13 15:49:17 +02:00
2006-09-30 11:31:53 +02:00
/ * *
* The source file file size in bytes if the source document was passed
* in as file
* /
2006-10-03 13:05:48 +02:00
protected long contentLength = - 1 ;
2006-09-30 11:31:53 +02:00
2005-05-03 11:47:56 +02:00
/ * *
* The Constructor of this class .
* /
2009-07-10 16:22:17 +02:00
public AbstractParser ( String name ) {
2009-07-10 17:02:34 +02:00
this . parserName = name ;
2005-05-03 11:47:56 +02:00
}
2006-10-03 13:05:48 +02:00
/ * *
* Set the content length of the source file .
* This value is needed by some parsers to decide
* if the parsed text could be hold in memory
* /
2008-08-02 14:12:04 +02:00
public void setContentLength ( final long length ) {
2006-10-03 13:05:48 +02:00
this . contentLength = length ;
}
2005-05-03 11:47:56 +02:00
2006-09-20 14:25:07 +02:00
/ * *
* Check if the parser was interrupted .
* @throws InterruptedException if the parser was interrupted
* /
2006-09-03 16:59:00 +02:00
public static final void checkInterruption ( ) throws InterruptedException {
2008-08-02 14:12:04 +02:00
final Thread currentThread = Thread . currentThread ( ) ;
2009-10-11 02:12:19 +02:00
if ( ( currentThread instanceof WorkflowThread ) & & ( ( WorkflowThread ) currentThread ) . shutdownInProgress ( ) ) throw new InterruptedException ( " Shutdown in progress ... " ) ;
2006-09-03 16:59:00 +02:00
if ( currentThread . isInterrupted ( ) ) throw new InterruptedException ( " Shutdown in progress ... " ) ;
}
2008-08-02 14:12:04 +02:00
public final File createTempFile ( final String name ) throws IOException {
2006-09-20 14:25:07 +02:00
String parserClassName = this . getClass ( ) . getName ( ) ;
int idx = parserClassName . lastIndexOf ( " . " ) ;
if ( idx ! = - 1 ) {
parserClassName = parserClassName . substring ( idx + 1 ) ;
}
// getting the file extension
idx = name . lastIndexOf ( " / " ) ;
2008-08-02 14:12:04 +02:00
final String fileName = ( idx ! = - 1 ) ? name . substring ( idx + 1 ) : name ;
2006-09-20 14:25:07 +02:00
idx = fileName . lastIndexOf ( " . " ) ;
2008-08-02 14:12:04 +02:00
final String fileExt = ( idx > - 1 ) ? fileName . substring ( idx + 1 ) : " " ;
2006-09-20 14:25:07 +02:00
// creates the temp file
2008-08-02 14:12:04 +02:00
final File tempFile = File . createTempFile ( parserClassName + " _ " + ( ( idx > - 1 ) ? fileName . substring ( 0 , idx ) : fileName ) , ( fileExt . length ( ) > 0 ) ? " . " + fileExt : fileExt ) ;
2006-09-20 14:25:07 +02:00
return tempFile ;
}
2009-10-11 02:12:19 +02:00
public int parseDir ( final DigestURI location , final String prefix , final File dir , final Document doc )
2007-05-19 01:13:44 +02:00
throws ParserException , InterruptedException , IOException {
if ( ! dir . isDirectory ( ) )
throw new ParserException ( " tried to parse ordinary file " + dir + " as directory " , location ) ;
2008-08-02 14:12:04 +02:00
final String [ ] files = dir . list ( ) ;
2007-05-19 01:13:44 +02:00
int result = 0 ;
for ( int i = 0 ; i < files . length ; i + + ) {
checkInterruption ( ) ;
2008-08-02 14:12:04 +02:00
final File file = new File ( dir , files [ i ] ) ;
2007-05-19 01:13:44 +02:00
this . theLogger . logFine ( " parsing file " + location + " # " + file + " in archive... " ) ;
if ( file . isDirectory ( ) ) {
result + = parseDir ( location , prefix , file , doc ) ;
} else try {
2009-10-11 02:12:19 +02:00
final DigestURI url = DigestURI . newURL ( location , " / " + prefix + " / "
2007-05-19 01:13:44 +02:00
// XXX: workaround for relative paths within document
+ file . getPath ( ) . substring ( file . getPath ( ) . indexOf ( File . separatorChar ) + 1 )
+ " / " + file . getName ( ) ) ;
2009-07-10 16:22:17 +02:00
final Document subdoc = Parser . parseSource ( url , Parser . mimeOf ( url ) , null , file ) ;
2007-05-19 01:13:44 +02:00
// TODO: change anchors back to use '#' after archive name
doc . addSubDocument ( subdoc ) ;
subdoc . close ( ) ;
result + + ;
2008-08-02 14:12:04 +02:00
} catch ( final ParserException e ) {
2007-05-19 01:13:44 +02:00
this . theLogger . logInfo ( " unable to parse file " + file + " in " + location + " , skipping " ) ;
}
}
return result ;
}
2005-05-03 11:47:56 +02:00
/ * *
* Parsing a document available as byte array .
* @param location the origin of the document
* @param mimeType the mimetype of the document
2006-09-15 14:52:46 +02:00
* @param charset the supposed charset of the document or < code > null < / code > if unkown
2005-05-03 11:47:56 +02:00
* @param source the content byte array
2009-07-08 23:48:08 +02:00
* @return a { @link Document } containing the extracted plain text of the document
2005-05-03 11:47:56 +02:00
* and some additional metadata .
* @throws ParserException if the content could not be parsed properly
*
2009-10-18 02:53:43 +02:00
* @see net . yacy . document . Idiom # parse ( de . anomic . net . URL , java . lang . String , byte [ ] )
2005-05-03 11:47:56 +02:00
* /
2009-07-08 23:48:08 +02:00
public Document parse (
2009-10-11 02:12:19 +02:00
final DigestURI location ,
2008-08-02 14:12:04 +02:00
final String mimeType ,
final String charset ,
final byte [ ] source
2006-09-03 16:59:00 +02:00
) throws ParserException , InterruptedException {
2005-06-17 09:58:02 +02:00
ByteArrayInputStream contentInputStream = null ;
try {
2006-09-20 14:25:07 +02:00
// convert the byte array into a stream
2005-06-17 09:58:02 +02:00
contentInputStream = new ByteArrayInputStream ( source ) ;
2006-09-20 14:25:07 +02:00
// parse the stream
2006-09-15 14:52:46 +02:00
return this . parse ( location , mimeType , charset , contentInputStream ) ;
2005-06-17 09:58:02 +02:00
} finally {
if ( contentInputStream ! = null ) {
try {
contentInputStream . close ( ) ;
contentInputStream = null ;
2008-08-02 14:12:04 +02:00
} catch ( final Exception e ) { /* ignore this */ }
2005-06-17 09:58:02 +02:00
}
}
2005-05-03 11:47:56 +02:00
}
/ * *
* Parsing a document stored in a { @link File }
* @param location the origin of the document
* @param mimeType the mimetype of the document
2006-09-15 14:52:46 +02:00
* @param charset the supposed charset of the document or < code > null < / code > if unkown
2005-05-03 11:47:56 +02:00
* @param sourceFile the file containing the content of the document
2009-07-08 23:48:08 +02:00
* @return a { @link Document } containing the extracted plain text of the document
2005-05-03 11:47:56 +02:00
* and some additional metadata .
* @throws ParserException if the content could not be parsed properly
*
2009-10-18 02:53:43 +02:00
* @see net . yacy . document . Idiom # parse ( de . anomic . net . URL , java . lang . String , java . io . File )
2005-05-03 11:47:56 +02:00
* /
2009-07-08 23:48:08 +02:00
public Document parse (
2009-10-11 02:12:19 +02:00
final DigestURI location ,
2008-08-02 14:12:04 +02:00
final String mimeType ,
final String charset ,
final File sourceFile
2006-09-15 14:52:46 +02:00
) throws ParserException , InterruptedException {
2005-05-03 11:47:56 +02:00
BufferedInputStream contentInputStream = null ;
try {
2006-09-30 11:31:53 +02:00
// getting the file size of the document
2006-10-03 13:05:48 +02:00
this . contentLength = sourceFile . length ( ) ;
2006-09-30 11:31:53 +02:00
2006-09-20 14:25:07 +02:00
// create a stream from the file
2005-05-03 11:47:56 +02:00
contentInputStream = new BufferedInputStream ( new FileInputStream ( sourceFile ) ) ;
2006-09-20 14:25:07 +02:00
// parse the stream
2006-09-15 14:52:46 +02:00
return this . parse ( location , mimeType , charset , contentInputStream ) ;
2008-08-02 14:12:04 +02:00
} catch ( final FileNotFoundException e ) {
2006-09-20 14:25:07 +02:00
throw new ParserException ( " Unexpected error while parsing file. " + e . getMessage ( ) , location ) ;
2005-06-16 10:34:52 +02:00
} finally {
2008-08-02 14:12:04 +02:00
if ( contentInputStream ! = null ) try { contentInputStream . close ( ) ; } catch ( final Exception e ) { /* ignore this */ }
2005-05-03 11:47:56 +02:00
}
}
/ * *
* Parsing a document available as { @link InputStream }
* @param location the origin of the document
* @param mimeType the mimetype of the document
2006-09-15 14:52:46 +02:00
* @param charset the supposed charset of the document or < code > null < / code > if unkown
2005-05-03 11:47:56 +02:00
* @param source the { @link InputStream } containing the document content
2009-07-08 23:48:08 +02:00
* @return a { @link Document } containing the extracted plain text of the document
2005-05-03 11:47:56 +02:00
* and some additional metadata .
* @throws ParserException if the content could not be parsed properly
*
2009-10-18 02:53:43 +02:00
* @see net . yacy . document . Idiom # parse ( de . anomic . net . URL , java . lang . String , java . io . InputStream )
2005-05-03 11:47:56 +02:00
* /
2009-10-11 02:12:19 +02:00
public abstract Document parse ( DigestURI location , String mimeType , String charset , InputStream source ) throws ParserException , InterruptedException ;
2005-05-17 10:25:04 +02:00
2005-11-29 08:27:58 +01:00
/ * *
* Return the name of the parser
* /
public String getName ( ) {
2006-09-20 14:25:07 +02:00
return this . parserName ;
2005-11-29 08:27:58 +01:00
}
2006-10-03 13:05:48 +02:00
public void reset ( ) {
this . contentLength = - 1 ;
}
2005-05-03 11:47:56 +02:00
}