2009-07-18 00:03:34 +02:00
package de.anomic.document ;
2011-09-07 12:08:57 +02:00
import static org.junit.Assert.assertThat ;
import static org.junit.matchers.JUnitMatchers.containsString ;
2009-07-18 00:03:34 +02:00
import java.io.File ;
import java.io.FileInputStream ;
2010-06-29 21:20:45 +02:00
import java.io.FileNotFoundException ;
import java.io.IOException ;
2009-07-18 00:03:34 +02:00
import java.io.InputStreamReader ;
2011-09-07 12:08:57 +02:00
import java.io.Reader ;
2010-06-29 21:20:45 +02:00
import java.io.UnsupportedEncodingException ;
import java.net.MalformedURLException ;
2013-05-20 01:50:09 +02:00
import net.yacy.document.AbstractParser ;
2009-07-18 00:03:34 +02:00
2011-09-07 12:08:57 +02:00
import net.yacy.document.Document ;
import net.yacy.document.Parser ;
2013-05-20 01:50:09 +02:00
import net.yacy.document.parser.docParser ;
import net.yacy.document.parser.odtParser ;
import net.yacy.document.parser.ooxmlParser ;
import net.yacy.document.parser.pdfParser ;
2011-09-07 12:08:57 +02:00
import net.yacy.kelondro.data.meta.DigestURI ;
import org.junit.Test ;
2009-07-18 00:03:34 +02:00
public class ParserTest {
2013-05-20 01:50:09 +02:00
@Test public void testooxmlParsers ( ) throws FileNotFoundException , Parser . Failure , MalformedURLException , UnsupportedEncodingException , IOException {
2011-09-07 12:08:57 +02:00
final String [ ] [ ] testFiles = new String [ ] [ ] {
// meaning: filename in test/parsertest, mimetype, title, creator, description,
2009-08-08 17:34:41 +02:00
new String [ ] { " umlaute_windows.docx " , " application/vnd.openxmlformats-officedocument.wordprocessingml.document " , " In München steht ein Hofbräuhaus, dort gibt es Bier in Maßkrügen " , " " , " " } ,
2009-10-16 11:33:18 +02:00
new String [ ] { " umlaute_windows.pptx " , " application/vnd.openxmlformats-officedocument.presentationml.presentation " , " Folie 1 " , " " , " " } ,
2013-05-20 01:50:09 +02:00
} ;
for ( final String [ ] testFile : testFiles ) {
try {
final String filename = " test/parsertest/ " + testFile [ 0 ] ;
final File file = new File ( filename ) ;
final String mimetype = testFile [ 1 ] ;
final DigestURI url = new DigestURI ( " http://localhost/ " + filename ) ;
AbstractParser p = new ooxmlParser ( ) ;
final Document [ ] docs = p . parse ( url , mimetype , null , new FileInputStream ( file ) ) ;
for ( final Document doc : docs ) {
final Reader content = new InputStreamReader ( doc . getTextStream ( ) , doc . getCharset ( ) ) ;
final StringBuilder str = new StringBuilder ( ) ;
int c ;
while ( ( c = content . read ( ) ) ! = - 1 )
str . append ( ( char ) c ) ;
System . out . println ( " Parsed " + filename + " : " + str ) ;
assertThat ( str . toString ( ) , containsString ( " In München steht ein Hofbräuhaus, dort gibt es Bier in Maßkrügen " ) ) ;
assertThat ( doc . dc_title ( ) , containsString ( testFile [ 2 ] ) ) ;
assertThat ( doc . dc_creator ( ) , containsString ( testFile [ 3 ] ) ) ;
2013-09-10 20:05:10 +02:00
assertThat ( doc . dc_description ( ) [ 0 ] , containsString ( testFile [ 4 ] ) ) ;
2013-05-20 01:50:09 +02:00
}
2013-07-17 18:31:30 +02:00
} catch ( final InterruptedException ex ) { }
2013-05-20 01:50:09 +02:00
}
}
@Test public void testodtParsers ( ) throws FileNotFoundException , Parser . Failure , MalformedURLException , UnsupportedEncodingException , IOException {
final String [ ] [ ] testFiles = new String [ ] [ ] {
// meaning: filename in test/parsertest, mimetype, title, creator, description,
2009-07-18 17:04:34 +02:00
new String [ ] { " umlaute_linux.odt " , " application/vnd.oasis.opendocument.text " , " Münchner Hofbräuhaus " , " " , " Kommentar zum Hofbräuhaus " } ,
new String [ ] { " umlaute_linux.ods " , " application/vnd.oasis.opendocument.spreadsheat " , " " , " " , " " } ,
new String [ ] { " umlaute_linux.odp " , " application/vnd.oasis.opendocument.presentation " , " " , " " , " " } ,
2013-05-20 01:50:09 +02:00
} ;
for ( final String [ ] testFile : testFiles ) {
try {
final String filename = " test/parsertest/ " + testFile [ 0 ] ;
final File file = new File ( filename ) ;
final String mimetype = testFile [ 1 ] ;
final DigestURI url = new DigestURI ( " http://localhost/ " + filename ) ;
AbstractParser p = new odtParser ( ) ;
final Document [ ] docs = p . parse ( url , mimetype , null , new FileInputStream ( file ) ) ;
for ( final Document doc : docs ) {
final Reader content = new InputStreamReader ( doc . getTextStream ( ) , doc . getCharset ( ) ) ;
final StringBuilder str = new StringBuilder ( ) ;
int c ;
while ( ( c = content . read ( ) ) ! = - 1 )
str . append ( ( char ) c ) ;
System . out . println ( " Parsed " + filename + " : " + str ) ;
assertThat ( str . toString ( ) , containsString ( " In München steht ein Hofbräuhaus, dort gibt es Bier in Maßkrügen " ) ) ;
// assertThat(doc.dc_title(), containsString(testFile[2]));
assertThat ( doc . dc_creator ( ) , containsString ( testFile [ 3 ] ) ) ;
2013-09-10 20:05:10 +02:00
assertThat ( doc . dc_description ( ) [ 0 ] , containsString ( testFile [ 4 ] ) ) ;
2013-05-20 01:50:09 +02:00
}
2013-07-17 18:31:30 +02:00
} catch ( final InterruptedException ex ) { }
2013-05-20 01:50:09 +02:00
}
}
@Test public void testpdfParsers ( ) throws FileNotFoundException , Parser . Failure , MalformedURLException , UnsupportedEncodingException , IOException {
final String [ ] [ ] testFiles = new String [ ] [ ] {
// meaning: filename in test/parsertest, mimetype, title, creator, description,
2009-07-18 17:04:34 +02:00
new String [ ] { " umlaute_linux.pdf " , " application/pdf " , " " , " " , " " } ,
2009-07-18 00:03:34 +02:00
} ;
2013-05-20 01:50:09 +02:00
for ( final String [ ] testFile : testFiles ) {
try {
final String filename = " test/parsertest/ " + testFile [ 0 ] ;
final File file = new File ( filename ) ;
final String mimetype = testFile [ 1 ] ;
final DigestURI url = new DigestURI ( " http://localhost/ " + filename ) ;
AbstractParser p = new pdfParser ( ) ;
final Document [ ] docs = p . parse ( url , mimetype , null , new FileInputStream ( file ) ) ;
for ( final Document doc : docs ) {
final Reader content = new InputStreamReader ( doc . getTextStream ( ) , doc . getCharset ( ) ) ;
final StringBuilder str = new StringBuilder ( ) ;
int c ;
while ( ( c = content . read ( ) ) ! = - 1 )
str . append ( ( char ) c ) ;
System . out . println ( " Parsed " + filename + " : " + str ) ;
assertThat ( str . toString ( ) , containsString ( " In München steht ein Hofbräuhaus, dort gibt es Bier in Maßkrügen " ) ) ;
assertThat ( doc . dc_title ( ) , containsString ( testFile [ 2 ] ) ) ;
assertThat ( doc . dc_creator ( ) , containsString ( testFile [ 3 ] ) ) ;
2013-09-10 20:05:10 +02:00
assertThat ( doc . dc_description ( ) [ 0 ] , containsString ( testFile [ 4 ] ) ) ;
2013-05-20 01:50:09 +02:00
}
2013-07-17 18:31:30 +02:00
} catch ( final InterruptedException ex ) { }
2013-05-20 01:50:09 +02:00
}
}
@Test public void testdocParsers ( ) throws FileNotFoundException , Parser . Failure , MalformedURLException , UnsupportedEncodingException , IOException {
final String [ ] [ ] testFiles = new String [ ] [ ] {
// meaning: filename in test/parsertest, mimetype, title, creator, description,
new String [ ] { " umlaute_windows.doc " , " application/msword " , " " , " " , " " } ,
} ;
2009-07-18 00:03:34 +02:00
2011-09-07 12:08:57 +02:00
for ( final String [ ] testFile : testFiles ) {
2013-05-20 01:50:09 +02:00
try {
final String filename = " test/parsertest/ " + testFile [ 0 ] ;
final File file = new File ( filename ) ;
final String mimetype = testFile [ 1 ] ;
final DigestURI url = new DigestURI ( " http://localhost/ " + filename ) ;
AbstractParser p = new docParser ( ) ;
final Document [ ] docs = p . parse ( url , mimetype , null , new FileInputStream ( file ) ) ;
for ( final Document doc : docs ) {
final Reader content = new InputStreamReader ( doc . getTextStream ( ) , doc . getCharset ( ) ) ;
final StringBuilder str = new StringBuilder ( ) ;
int c ;
while ( ( c = content . read ( ) ) ! = - 1 )
str . append ( ( char ) c ) ;
2009-07-18 00:03:34 +02:00
2013-05-20 01:50:09 +02:00
System . out . println ( " Parsed " + filename + " : " + str ) ;
assertThat ( str . toString ( ) , containsString ( " In München steht ein Hofbräuhaus, dort gibt es Bier in Maßkrügen " ) ) ;
assertThat ( doc . dc_title ( ) , containsString ( testFile [ 2 ] ) ) ;
assertThat ( doc . dc_creator ( ) , containsString ( testFile [ 3 ] ) ) ;
2013-09-10 20:05:10 +02:00
assertThat ( doc . dc_description ( ) [ 0 ] , containsString ( testFile [ 4 ] ) ) ;
2013-05-20 01:50:09 +02:00
}
2013-07-17 18:31:30 +02:00
} catch ( final InterruptedException ex ) { }
2013-05-20 01:50:09 +02:00
}
}
}