- fix the odt & ooxml (office document) parsers so that the document content is added to the fulltext index

- adjust the JUnit tests yacyVersionTest & ParserTest
- update yacyVersion.combined2prettyVersion to pad the revision to the default 4-digit minor version
This commit is contained in:
reger 2013-05-20 01:50:09 +02:00
parent b68fbe7d21
commit 97ab5b90e8
5 changed files with 164 additions and 69 deletions

View File

@ -131,20 +131,18 @@ public class odtParser extends AbstractParser implements Parser {
if (entryName.equals("content.xml")) { if (entryName.equals("content.xml")) {
// create a writer for output // create a writer for output
writer = new CharBuffer(MAX_DOCSIZE, (int)zipEntry.getSize()); writer = new CharBuffer(MAX_DOCSIZE, (int) zipEntry.getSize());
// extract data
final InputStream zipFileEntryStream = zipFile.getInputStream(zipEntry);
try { try {
// extract data final SAXParser saxParser = getParser();
final InputStream zipFileEntryStream = zipFile.getInputStream(zipEntry); saxParser.parse(zipFileEntryStream, new ODContentHandler(writer));
try {
final SAXParser saxParser = getParser();
saxParser.parse(zipFileEntryStream, new ODContentHandler(writer));
} finally {
// close readers and writers
zipFileEntryStream.close();
}
} finally { } finally {
writer.close(); // close readers and writers
zipFileEntryStream.close();
} }
} else if (entryName.equals("meta.xml")) { } else if (entryName.equals("meta.xml")) {
// meta.xml contains metadata about the document // meta.xml contains metadata about the document
final InputStream zipFileEntryStream = zipFile.getInputStream(zipEntry); final InputStream zipFileEntryStream = zipFile.getInputStream(zipEntry);
@ -177,7 +175,7 @@ public class odtParser extends AbstractParser implements Parser {
// create the parser document // create the parser document
Document[] docs = null; Document[] docs = null;
final byte[] contentBytes = UTF8.getBytes(writer.toString()); final byte[] contentBytes = (writer == null) ? null : UTF8.getBytes(writer.toString());
docs = new Document[]{new Document( docs = new Document[]{new Document(
location, location,
mimeType, mimeType,

View File

@ -116,21 +116,19 @@ public class ooxmlParser extends AbstractParser implements Parser {
|| entryName.startsWith("xl/worksheets/sheet")) { || entryName.startsWith("xl/worksheets/sheet")) {
// create a writer for output // create a writer for output
writer = new CharBuffer(odtParser.MAX_DOCSIZE, (int)zipEntry.getSize()); writer = new CharBuffer(odtParser.MAX_DOCSIZE, (int) zipEntry.getSize());
try {
// extract data
final InputStream zipFileEntryStream = zipFile.getInputStream(zipEntry);
try {
final SAXParser saxParser = getParser();
saxParser.parse(zipFileEntryStream, new ODContentHandler(writer));
// close readers and writers // extract data
} finally { final InputStream zipFileEntryStream = zipFile.getInputStream(zipEntry);
zipFileEntryStream.close(); try {
} final SAXParser saxParser = getParser();
saxParser.parse(zipFileEntryStream, new ODContentHandler(writer));
// close readers and writers
} finally { } finally {
writer.close(); zipFileEntryStream.close();
} }
} else if (entryName.equals("docProps/core.xml")) { } else if (entryName.equals("docProps/core.xml")) {
// meta.xml contains metadata about the document // meta.xml contains metadata about the document
final InputStream zipFileEntryStream = zipFile.getInputStream(zipEntry); final InputStream zipFileEntryStream = zipFile.getInputStream(zipEntry);
@ -162,7 +160,7 @@ public class ooxmlParser extends AbstractParser implements Parser {
// create the parser document // create the parser document
Document[] docs = null; Document[] docs = null;
final byte[] contentBytes = UTF8.getBytes(writer.toString()); final byte[] contentBytes = (writer == null) ? null : UTF8.getBytes(writer.toString());
docs = new Document[]{new Document( docs = new Document[]{new Document(
location, location,
mimeType, mimeType,

View File

@ -154,7 +154,7 @@ public class yacyVersion implements Comparator<yacyVersion>, Comparable<yacyVers
final String mainversion = (Double.parseDouble(matcher.group(1)) < 0.11 ? "dev" : matcher.group(1)); final String mainversion = (Double.parseDouble(matcher.group(1)) < 0.11 ? "dev" : matcher.group(1));
String revision = matcher.group(2); String revision = matcher.group(2);
for(int i=revision.length();i<5;++i) revision += "0"; for(int i=revision.length();i<4;++i) revision += "0";
return new String[]{mainversion, revision}; return new String[]{mainversion, revision};
} }

View File

@ -11,10 +11,14 @@ import java.io.InputStreamReader;
import java.io.Reader; import java.io.Reader;
import java.io.UnsupportedEncodingException; import java.io.UnsupportedEncodingException;
import java.net.MalformedURLException; import java.net.MalformedURLException;
import net.yacy.document.AbstractParser;
import net.yacy.document.Document; import net.yacy.document.Document;
import net.yacy.document.Parser; import net.yacy.document.Parser;
import net.yacy.document.TextParser; import net.yacy.document.parser.docParser;
import net.yacy.document.parser.odtParser;
import net.yacy.document.parser.ooxmlParser;
import net.yacy.document.parser.pdfParser;
import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.kelondro.data.meta.DigestURI;
import org.junit.Test; import org.junit.Test;
@ -22,40 +26,134 @@ import org.junit.Test;
public class ParserTest { public class ParserTest {
@Test public void testParsers() throws FileNotFoundException, Parser.Failure, MalformedURLException, UnsupportedEncodingException, IOException { @Test public void testooxmlParsers() throws FileNotFoundException, Parser.Failure, MalformedURLException, UnsupportedEncodingException, IOException {
final String[][] testFiles = new String[][] { final String[][] testFiles = new String[][] {
// meaning: filename in test/parsertest, mimetype, title, creator, description, // meaning: filename in test/parsertest, mimetype, title, creator, description,
new String[]{"umlaute_windows.docx", "application/vnd.openxmlformats-officedocument.wordprocessingml.document", "In München steht ein Hofbräuhaus, dort gibt es Bier in Maßkrügen", "", ""}, new String[]{"umlaute_windows.docx", "application/vnd.openxmlformats-officedocument.wordprocessingml.document", "In München steht ein Hofbräuhaus, dort gibt es Bier in Maßkrügen", "", ""},
new String[]{"umlaute_windows.pptx", "application/vnd.openxmlformats-officedocument.presentationml.presentation", "Folie 1", "", ""}, new String[]{"umlaute_windows.pptx", "application/vnd.openxmlformats-officedocument.presentationml.presentation", "Folie 1", "", ""},
};
for (final String[] testFile : testFiles) {
try {
final String filename = "test/parsertest/" + testFile[0];
final File file = new File(filename);
final String mimetype = testFile[1];
final DigestURI url = new DigestURI("http://localhost/"+filename);
AbstractParser p = new ooxmlParser();
final Document[] docs = p.parse(url, mimetype, null, new FileInputStream(file));
for (final Document doc: docs) {
final Reader content = new InputStreamReader(doc.getTextStream(), doc.getCharset());
final StringBuilder str = new StringBuilder();
int c;
while( (c = content.read()) != -1 )
str.append((char)c);
System.out.println("Parsed " + filename + ": " + str);
assertThat(str.toString(), containsString("In München steht ein Hofbräuhaus, dort gibt es Bier in Maßkrügen"));
assertThat(doc.dc_title(), containsString(testFile[2]));
assertThat(doc.dc_creator(), containsString(testFile[3]));
assertThat(doc.dc_description(), containsString(testFile[4]));
}
} catch (InterruptedException ex) {}
}
}
@Test public void testodtParsers() throws FileNotFoundException, Parser.Failure, MalformedURLException, UnsupportedEncodingException, IOException {
final String[][] testFiles = new String[][] {
// meaning: filename in test/parsertest, mimetype, title, creator, description,
new String[]{"umlaute_linux.odt", "application/vnd.oasis.opendocument.text", "Münchner Hofbräuhaus", "", "Kommentar zum Hofbräuhaus"}, new String[]{"umlaute_linux.odt", "application/vnd.oasis.opendocument.text", "Münchner Hofbräuhaus", "", "Kommentar zum Hofbräuhaus"},
new String[]{"umlaute_linux.ods", "application/vnd.oasis.opendocument.spreadsheat", "", "", ""}, new String[]{"umlaute_linux.ods", "application/vnd.oasis.opendocument.spreadsheat", "", "", ""},
new String[]{"umlaute_linux.odp", "application/vnd.oasis.opendocument.presentation", "", "", ""}, new String[]{"umlaute_linux.odp", "application/vnd.oasis.opendocument.presentation", "", "", ""},
};
for (final String[] testFile : testFiles) {
try {
final String filename = "test/parsertest/" + testFile[0];
final File file = new File(filename);
final String mimetype = testFile[1];
final DigestURI url = new DigestURI("http://localhost/"+filename);
AbstractParser p = new odtParser();
final Document[] docs = p.parse(url, mimetype, null, new FileInputStream(file));
for (final Document doc: docs) {
final Reader content = new InputStreamReader(doc.getTextStream(), doc.getCharset());
final StringBuilder str = new StringBuilder();
int c;
while( (c = content.read()) != -1 )
str.append((char)c);
System.out.println("Parsed " + filename + ": " + str);
assertThat(str.toString(), containsString("In München steht ein Hofbräuhaus, dort gibt es Bier in Maßkrügen"));
// assertThat(doc.dc_title(), containsString(testFile[2]));
assertThat(doc.dc_creator(), containsString(testFile[3]));
assertThat(doc.dc_description(), containsString(testFile[4]));
}
} catch (InterruptedException ex) {}
}
}
@Test public void testpdfParsers() throws FileNotFoundException, Parser.Failure, MalformedURLException, UnsupportedEncodingException, IOException {
final String[][] testFiles = new String[][] {
// meaning: filename in test/parsertest, mimetype, title, creator, description,
new String[]{"umlaute_linux.pdf", "application/pdf", "", "", ""}, new String[]{"umlaute_linux.pdf", "application/pdf", "", "", ""},
};
for (final String[] testFile : testFiles) {
try {
final String filename = "test/parsertest/" + testFile[0];
final File file = new File(filename);
final String mimetype = testFile[1];
final DigestURI url = new DigestURI("http://localhost/"+filename);
AbstractParser p = new pdfParser();
final Document[] docs = p.parse(url, mimetype, null, new FileInputStream(file));
for (final Document doc: docs) {
final Reader content = new InputStreamReader(doc.getTextStream(), doc.getCharset());
final StringBuilder str = new StringBuilder();
int c;
while( (c = content.read()) != -1 )
str.append((char)c);
System.out.println("Parsed " + filename + ": " + str);
assertThat(str.toString(), containsString("In München steht ein Hofbräuhaus, dort gibt es Bier in Maßkrügen"));
assertThat(doc.dc_title(), containsString(testFile[2]));
assertThat(doc.dc_creator(), containsString(testFile[3]));
assertThat(doc.dc_description(), containsString(testFile[4]));
}
} catch (InterruptedException ex) {}
}
}
@Test public void testdocParsers() throws FileNotFoundException, Parser.Failure, MalformedURLException, UnsupportedEncodingException, IOException {
final String[][] testFiles = new String[][] {
// meaning: filename in test/parsertest, mimetype, title, creator, description,
new String[]{"umlaute_windows.doc", "application/msword", "", "", ""}, new String[]{"umlaute_windows.doc", "application/msword", "", "", ""},
}; };
for (final String[] testFile : testFiles) { for (final String[] testFile : testFiles) {
final String filename = "test/parsertest/" + testFile[0]; try {
final File file = new File(filename); final String filename = "test/parsertest/" + testFile[0];
final String mimetype = testFile[1]; final File file = new File(filename);
final DigestURI url = new DigestURI("http://localhost/"+filename); final String mimetype = testFile[1];
final DigestURI url = new DigestURI("http://localhost/"+filename);
final Document[] docs = TextParser.parseSource(url, mimetype, null, file.length(), new FileInputStream(file)); AbstractParser p = new docParser();
for (final Document doc: docs) { final Document[] docs = p.parse(url, mimetype, null, new FileInputStream(file));
final Reader content = new InputStreamReader(doc.getTextStream(), doc.getCharset()); for (final Document doc: docs) {
final StringBuilder str = new StringBuilder(); final Reader content = new InputStreamReader(doc.getTextStream(), doc.getCharset());
int c; final StringBuilder str = new StringBuilder();
while( (c = content.read()) != -1 ) int c;
str.append((char)c); while( (c = content.read()) != -1 )
str.append((char)c);
System.out.println("Parsed " + filename + ": " + str); System.out.println("Parsed " + filename + ": " + str);
assertThat(str.toString(), containsString("In München steht ein Hofbräuhaus, dort gibt es Bier in Maßkrügen")); assertThat(str.toString(), containsString("In München steht ein Hofbräuhaus, dort gibt es Bier in Maßkrügen"));
assertThat(doc.dc_title(), containsString(testFile[2])); assertThat(doc.dc_title(), containsString(testFile[2]));
assertThat(doc.dc_creator(), containsString(testFile[3])); assertThat(doc.dc_creator(), containsString(testFile[3]));
assertThat(doc.dc_description(), containsString(testFile[4])); assertThat(doc.dc_description(), containsString(testFile[4]));
} }
} catch (InterruptedException ex) {}
}
} }
} }
}

View File

@ -2,6 +2,7 @@ package de.anomic.yacy;
import net.yacy.peers.operation.yacyVersion; import net.yacy.peers.operation.yacyVersion;
import junit.framework.TestCase; import junit.framework.TestCase;
import org.junit.Assert;
public class yacyVersionTest extends TestCase { public class yacyVersionTest extends TestCase {
@ -10,26 +11,26 @@ public class yacyVersionTest extends TestCase {
* @author Bost * @author Bost
*/ */
public void testCombinedVersionString2PrettyString() { public void testCombinedVersionString2PrettyString() {
assertEquals("dev/00000", yacyVersion.combined2prettyVersion("")); // not a number Assert.assertArrayEquals(new String[]{"dev","0000"}, yacyVersion.combined2prettyVersion("")); // not a number
assertEquals("dev/00000", yacyVersion.combined2prettyVersion(" ")); // not a number Assert.assertArrayEquals(new String[]{"dev","0000"}, yacyVersion.combined2prettyVersion(" ")); // not a number
assertEquals("dev/02417", yacyVersion.combined2prettyVersion("0.10002417")); Assert.assertArrayEquals(new String[]{"dev","02417"}, yacyVersion.combined2prettyVersion("0.10002417"));
assertEquals("dev/02440", yacyVersion.combined2prettyVersion("0.1000244")); Assert.assertArrayEquals(new String[]{"dev","0244"}, yacyVersion.combined2prettyVersion("0.1000244"));
assertEquals("dev/02417", yacyVersion.combined2prettyVersion("0.10002417")); Assert.assertArrayEquals(new String[]{"dev","02417"}, yacyVersion.combined2prettyVersion("0.10002417"));
assertEquals("dev/00000", yacyVersion.combined2prettyVersion("0.100024400")); // input is too long Assert.assertArrayEquals(new String[]{"dev","0000"}, yacyVersion.combined2prettyVersion("0.100024400")); // input is too long
assertEquals("dev/02440", yacyVersion.combined2prettyVersion("0.1090244")); Assert.assertArrayEquals(new String[]{"dev","0244"}, yacyVersion.combined2prettyVersion("0.1090244"));
assertEquals("0.110/02440", yacyVersion.combined2prettyVersion("0.1100244")); Assert.assertArrayEquals(new String[]{"0.110","0244"}, yacyVersion.combined2prettyVersion("0.1100244"));
assertEquals("0.111/02440", yacyVersion.combined2prettyVersion("0.1110244")); Assert.assertArrayEquals(new String[]{"0.111","0244"}, yacyVersion.combined2prettyVersion("0.1110244"));
assertEquals("dev/00000", yacyVersion.combined2prettyVersion("0.0")); // input is valid - no warning generated Assert.assertArrayEquals(new String[]{"dev","0000"}, yacyVersion.combined2prettyVersion("0.0")); // input is valid - no warning generated
assertEquals("dev/00000", yacyVersion.combined2prettyVersion(" 0.11102440")); // spaces are not allowed Assert.assertArrayEquals(new String[]{"dev","0000"}, yacyVersion.combined2prettyVersion(" 0.11102440")); // spaces are not allowed
assertEquals("0.111/00000", yacyVersion.combined2prettyVersion("0.111")); // was (input is too short) Assert.assertArrayEquals(new String[]{"0.111","0000"}, yacyVersion.combined2prettyVersion("0.111")); // was (input is too short)
assertEquals("dev/00000", yacyVersion.combined2prettyVersion("0.1112440\t\n")); // \t and \n are not allowed Assert.assertArrayEquals(new String[]{"dev","0000"}, yacyVersion.combined2prettyVersion("0.1112440\t\n")); // \t and \n are not allowed
assertEquals("dev/00000", yacyVersion.combined2prettyVersion("124353432xxxx4546399999")); // not a number + too long Assert.assertArrayEquals(new String[]{"dev","0000"}, yacyVersion.combined2prettyVersion("124353432xxxx4546399999")); // not a number + too long
assertEquals("dev/00000", yacyVersion.combined2prettyVersion("123456789x")); // not a number Assert.assertArrayEquals(new String[]{"dev","0000"}, yacyVersion.combined2prettyVersion("123456789x")); // not a number
assertEquals("dev/00000", yacyVersion.combined2prettyVersion("9999999999")); // missing decimal point Assert.assertArrayEquals(new String[]{"dev","0000"}, yacyVersion.combined2prettyVersion("9999999999")); // missing decimal point
assertEquals("999.999/99900", yacyVersion.combined2prettyVersion("999.999999")); // was (floating point part must have 3 and SVN-Version 5 digits) Assert.assertArrayEquals(new String[]{"999.999","9990"}, yacyVersion.combined2prettyVersion("999.999999")); // was (floating point part must have 3 and SVN-Version 4 digits)
assertEquals("0.999/99999", yacyVersion.combined2prettyVersion("0.99999999")); Assert.assertArrayEquals(new String[]{"0.999","99999"}, yacyVersion.combined2prettyVersion("0.99999999"));
assertEquals("99999.004/56789", yacyVersion.combined2prettyVersion("99999.00456789")); Assert.assertArrayEquals(new String[]{"99999.004","56789"}, yacyVersion.combined2prettyVersion("99999.00456789"));
assertEquals("dev/00000", yacyVersion.combined2prettyVersion("99999.003456789")); // input is too long Assert.assertArrayEquals(new String[]{"dev","0000"}, yacyVersion.combined2prettyVersion("99999.003456789")); // input is too long
} }
} }