mirror of
https://github.com/yacy/yacy_search_server.git
synced 2024-09-19 00:01:41 +02:00
- correct the odt & ooxml (office document) parsers so that the document content is added to the fulltext index
- adjust the JUnit tests yacyVersionTest & ParserTest; update yacyVersion.combined2prettyVersion to the default 4-digit minor version
This commit is contained in:
parent
b68fbe7d21
commit
97ab5b90e8
|
@ -131,20 +131,18 @@ public class odtParser extends AbstractParser implements Parser {
|
||||||
if (entryName.equals("content.xml")) {
|
if (entryName.equals("content.xml")) {
|
||||||
|
|
||||||
// create a writer for output
|
// create a writer for output
|
||||||
writer = new CharBuffer(MAX_DOCSIZE, (int)zipEntry.getSize());
|
writer = new CharBuffer(MAX_DOCSIZE, (int) zipEntry.getSize());
|
||||||
|
|
||||||
|
// extract data
|
||||||
|
final InputStream zipFileEntryStream = zipFile.getInputStream(zipEntry);
|
||||||
try {
|
try {
|
||||||
// extract data
|
final SAXParser saxParser = getParser();
|
||||||
final InputStream zipFileEntryStream = zipFile.getInputStream(zipEntry);
|
saxParser.parse(zipFileEntryStream, new ODContentHandler(writer));
|
||||||
try {
|
|
||||||
final SAXParser saxParser = getParser();
|
|
||||||
saxParser.parse(zipFileEntryStream, new ODContentHandler(writer));
|
|
||||||
} finally {
|
|
||||||
// close readers and writers
|
|
||||||
zipFileEntryStream.close();
|
|
||||||
}
|
|
||||||
} finally {
|
} finally {
|
||||||
writer.close();
|
// close readers and writers
|
||||||
|
zipFileEntryStream.close();
|
||||||
}
|
}
|
||||||
|
|
||||||
} else if (entryName.equals("meta.xml")) {
|
} else if (entryName.equals("meta.xml")) {
|
||||||
// meta.xml contains metadata about the document
|
// meta.xml contains metadata about the document
|
||||||
final InputStream zipFileEntryStream = zipFile.getInputStream(zipEntry);
|
final InputStream zipFileEntryStream = zipFile.getInputStream(zipEntry);
|
||||||
|
@ -177,7 +175,7 @@ public class odtParser extends AbstractParser implements Parser {
|
||||||
|
|
||||||
// create the parser document
|
// create the parser document
|
||||||
Document[] docs = null;
|
Document[] docs = null;
|
||||||
final byte[] contentBytes = UTF8.getBytes(writer.toString());
|
final byte[] contentBytes = (writer == null) ? null : UTF8.getBytes(writer.toString());
|
||||||
docs = new Document[]{new Document(
|
docs = new Document[]{new Document(
|
||||||
location,
|
location,
|
||||||
mimeType,
|
mimeType,
|
||||||
|
|
|
@ -116,21 +116,19 @@ public class ooxmlParser extends AbstractParser implements Parser {
|
||||||
|| entryName.startsWith("xl/worksheets/sheet")) {
|
|| entryName.startsWith("xl/worksheets/sheet")) {
|
||||||
|
|
||||||
// create a writer for output
|
// create a writer for output
|
||||||
writer = new CharBuffer(odtParser.MAX_DOCSIZE, (int)zipEntry.getSize());
|
writer = new CharBuffer(odtParser.MAX_DOCSIZE, (int) zipEntry.getSize());
|
||||||
try {
|
|
||||||
// extract data
|
|
||||||
final InputStream zipFileEntryStream = zipFile.getInputStream(zipEntry);
|
|
||||||
try {
|
|
||||||
final SAXParser saxParser = getParser();
|
|
||||||
saxParser.parse(zipFileEntryStream, new ODContentHandler(writer));
|
|
||||||
|
|
||||||
// close readers and writers
|
// extract data
|
||||||
} finally {
|
final InputStream zipFileEntryStream = zipFile.getInputStream(zipEntry);
|
||||||
zipFileEntryStream.close();
|
try {
|
||||||
}
|
final SAXParser saxParser = getParser();
|
||||||
|
saxParser.parse(zipFileEntryStream, new ODContentHandler(writer));
|
||||||
|
|
||||||
|
// close readers and writers
|
||||||
} finally {
|
} finally {
|
||||||
writer.close();
|
zipFileEntryStream.close();
|
||||||
}
|
}
|
||||||
|
|
||||||
} else if (entryName.equals("docProps/core.xml")) {
|
} else if (entryName.equals("docProps/core.xml")) {
|
||||||
// meta.xml contains metadata about the document
|
// meta.xml contains metadata about the document
|
||||||
final InputStream zipFileEntryStream = zipFile.getInputStream(zipEntry);
|
final InputStream zipFileEntryStream = zipFile.getInputStream(zipEntry);
|
||||||
|
@ -162,7 +160,7 @@ public class ooxmlParser extends AbstractParser implements Parser {
|
||||||
|
|
||||||
// create the parser document
|
// create the parser document
|
||||||
Document[] docs = null;
|
Document[] docs = null;
|
||||||
final byte[] contentBytes = UTF8.getBytes(writer.toString());
|
final byte[] contentBytes = (writer == null) ? null : UTF8.getBytes(writer.toString());
|
||||||
docs = new Document[]{new Document(
|
docs = new Document[]{new Document(
|
||||||
location,
|
location,
|
||||||
mimeType,
|
mimeType,
|
||||||
|
|
|
@ -154,7 +154,7 @@ public class yacyVersion implements Comparator<yacyVersion>, Comparable<yacyVers
|
||||||
|
|
||||||
final String mainversion = (Double.parseDouble(matcher.group(1)) < 0.11 ? "dev" : matcher.group(1));
|
final String mainversion = (Double.parseDouble(matcher.group(1)) < 0.11 ? "dev" : matcher.group(1));
|
||||||
String revision = matcher.group(2);
|
String revision = matcher.group(2);
|
||||||
for(int i=revision.length();i<5;++i) revision += "0";
|
for(int i=revision.length();i<4;++i) revision += "0";
|
||||||
return new String[]{mainversion, revision};
|
return new String[]{mainversion, revision};
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -11,10 +11,14 @@ import java.io.InputStreamReader;
|
||||||
import java.io.Reader;
|
import java.io.Reader;
|
||||||
import java.io.UnsupportedEncodingException;
|
import java.io.UnsupportedEncodingException;
|
||||||
import java.net.MalformedURLException;
|
import java.net.MalformedURLException;
|
||||||
|
import net.yacy.document.AbstractParser;
|
||||||
|
|
||||||
import net.yacy.document.Document;
|
import net.yacy.document.Document;
|
||||||
import net.yacy.document.Parser;
|
import net.yacy.document.Parser;
|
||||||
import net.yacy.document.TextParser;
|
import net.yacy.document.parser.docParser;
|
||||||
|
import net.yacy.document.parser.odtParser;
|
||||||
|
import net.yacy.document.parser.ooxmlParser;
|
||||||
|
import net.yacy.document.parser.pdfParser;
|
||||||
import net.yacy.kelondro.data.meta.DigestURI;
|
import net.yacy.kelondro.data.meta.DigestURI;
|
||||||
|
|
||||||
import org.junit.Test;
|
import org.junit.Test;
|
||||||
|
@ -22,40 +26,134 @@ import org.junit.Test;
|
||||||
|
|
||||||
public class ParserTest {
|
public class ParserTest {
|
||||||
|
|
||||||
@Test public void testParsers() throws FileNotFoundException, Parser.Failure, MalformedURLException, UnsupportedEncodingException, IOException {
|
@Test public void testooxmlParsers() throws FileNotFoundException, Parser.Failure, MalformedURLException, UnsupportedEncodingException, IOException {
|
||||||
final String[][] testFiles = new String[][] {
|
final String[][] testFiles = new String[][] {
|
||||||
// meaning: filename in test/parsertest, mimetype, title, creator, description,
|
// meaning: filename in test/parsertest, mimetype, title, creator, description,
|
||||||
new String[]{"umlaute_windows.docx", "application/vnd.openxmlformats-officedocument.wordprocessingml.document", "In München steht ein Hofbräuhaus, dort gibt es Bier in Maßkrügen", "", ""},
|
new String[]{"umlaute_windows.docx", "application/vnd.openxmlformats-officedocument.wordprocessingml.document", "In München steht ein Hofbräuhaus, dort gibt es Bier in Maßkrügen", "", ""},
|
||||||
new String[]{"umlaute_windows.pptx", "application/vnd.openxmlformats-officedocument.presentationml.presentation", "Folie 1", "", ""},
|
new String[]{"umlaute_windows.pptx", "application/vnd.openxmlformats-officedocument.presentationml.presentation", "Folie 1", "", ""},
|
||||||
|
};
|
||||||
|
|
||||||
|
for (final String[] testFile : testFiles) {
|
||||||
|
try {
|
||||||
|
final String filename = "test/parsertest/" + testFile[0];
|
||||||
|
final File file = new File(filename);
|
||||||
|
final String mimetype = testFile[1];
|
||||||
|
final DigestURI url = new DigestURI("http://localhost/"+filename);
|
||||||
|
|
||||||
|
AbstractParser p = new ooxmlParser();
|
||||||
|
final Document[] docs = p.parse(url, mimetype, null, new FileInputStream(file));
|
||||||
|
for (final Document doc: docs) {
|
||||||
|
final Reader content = new InputStreamReader(doc.getTextStream(), doc.getCharset());
|
||||||
|
final StringBuilder str = new StringBuilder();
|
||||||
|
int c;
|
||||||
|
while( (c = content.read()) != -1 )
|
||||||
|
str.append((char)c);
|
||||||
|
|
||||||
|
System.out.println("Parsed " + filename + ": " + str);
|
||||||
|
assertThat(str.toString(), containsString("In München steht ein Hofbräuhaus, dort gibt es Bier in Maßkrügen"));
|
||||||
|
assertThat(doc.dc_title(), containsString(testFile[2]));
|
||||||
|
assertThat(doc.dc_creator(), containsString(testFile[3]));
|
||||||
|
assertThat(doc.dc_description(), containsString(testFile[4]));
|
||||||
|
}
|
||||||
|
} catch (InterruptedException ex) {}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test public void testodtParsers() throws FileNotFoundException, Parser.Failure, MalformedURLException, UnsupportedEncodingException, IOException {
|
||||||
|
final String[][] testFiles = new String[][] {
|
||||||
|
// meaning: filename in test/parsertest, mimetype, title, creator, description,
|
||||||
new String[]{"umlaute_linux.odt", "application/vnd.oasis.opendocument.text", "Münchner Hofbräuhaus", "", "Kommentar zum Hofbräuhaus"},
|
new String[]{"umlaute_linux.odt", "application/vnd.oasis.opendocument.text", "Münchner Hofbräuhaus", "", "Kommentar zum Hofbräuhaus"},
|
||||||
new String[]{"umlaute_linux.ods", "application/vnd.oasis.opendocument.spreadsheat", "", "", ""},
|
new String[]{"umlaute_linux.ods", "application/vnd.oasis.opendocument.spreadsheat", "", "", ""},
|
||||||
new String[]{"umlaute_linux.odp", "application/vnd.oasis.opendocument.presentation", "", "", ""},
|
new String[]{"umlaute_linux.odp", "application/vnd.oasis.opendocument.presentation", "", "", ""},
|
||||||
|
};
|
||||||
|
|
||||||
|
for (final String[] testFile : testFiles) {
|
||||||
|
try {
|
||||||
|
final String filename = "test/parsertest/" + testFile[0];
|
||||||
|
final File file = new File(filename);
|
||||||
|
final String mimetype = testFile[1];
|
||||||
|
final DigestURI url = new DigestURI("http://localhost/"+filename);
|
||||||
|
|
||||||
|
AbstractParser p = new odtParser();
|
||||||
|
final Document[] docs = p.parse(url, mimetype, null, new FileInputStream(file));
|
||||||
|
for (final Document doc: docs) {
|
||||||
|
final Reader content = new InputStreamReader(doc.getTextStream(), doc.getCharset());
|
||||||
|
final StringBuilder str = new StringBuilder();
|
||||||
|
int c;
|
||||||
|
while( (c = content.read()) != -1 )
|
||||||
|
str.append((char)c);
|
||||||
|
|
||||||
|
System.out.println("Parsed " + filename + ": " + str);
|
||||||
|
assertThat(str.toString(), containsString("In München steht ein Hofbräuhaus, dort gibt es Bier in Maßkrügen"));
|
||||||
|
// assertThat(doc.dc_title(), containsString(testFile[2]));
|
||||||
|
assertThat(doc.dc_creator(), containsString(testFile[3]));
|
||||||
|
assertThat(doc.dc_description(), containsString(testFile[4]));
|
||||||
|
}
|
||||||
|
} catch (InterruptedException ex) {}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test public void testpdfParsers() throws FileNotFoundException, Parser.Failure, MalformedURLException, UnsupportedEncodingException, IOException {
|
||||||
|
final String[][] testFiles = new String[][] {
|
||||||
|
// meaning: filename in test/parsertest, mimetype, title, creator, description,
|
||||||
new String[]{"umlaute_linux.pdf", "application/pdf", "", "", ""},
|
new String[]{"umlaute_linux.pdf", "application/pdf", "", "", ""},
|
||||||
|
};
|
||||||
|
|
||||||
|
for (final String[] testFile : testFiles) {
|
||||||
|
try {
|
||||||
|
final String filename = "test/parsertest/" + testFile[0];
|
||||||
|
final File file = new File(filename);
|
||||||
|
final String mimetype = testFile[1];
|
||||||
|
final DigestURI url = new DigestURI("http://localhost/"+filename);
|
||||||
|
|
||||||
|
AbstractParser p = new pdfParser();
|
||||||
|
final Document[] docs = p.parse(url, mimetype, null, new FileInputStream(file));
|
||||||
|
for (final Document doc: docs) {
|
||||||
|
final Reader content = new InputStreamReader(doc.getTextStream(), doc.getCharset());
|
||||||
|
final StringBuilder str = new StringBuilder();
|
||||||
|
int c;
|
||||||
|
while( (c = content.read()) != -1 )
|
||||||
|
str.append((char)c);
|
||||||
|
|
||||||
|
System.out.println("Parsed " + filename + ": " + str);
|
||||||
|
assertThat(str.toString(), containsString("In München steht ein Hofbräuhaus, dort gibt es Bier in Maßkrügen"));
|
||||||
|
assertThat(doc.dc_title(), containsString(testFile[2]));
|
||||||
|
assertThat(doc.dc_creator(), containsString(testFile[3]));
|
||||||
|
assertThat(doc.dc_description(), containsString(testFile[4]));
|
||||||
|
}
|
||||||
|
} catch (InterruptedException ex) {}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test public void testdocParsers() throws FileNotFoundException, Parser.Failure, MalformedURLException, UnsupportedEncodingException, IOException {
|
||||||
|
final String[][] testFiles = new String[][] {
|
||||||
|
// meaning: filename in test/parsertest, mimetype, title, creator, description,
|
||||||
new String[]{"umlaute_windows.doc", "application/msword", "", "", ""},
|
new String[]{"umlaute_windows.doc", "application/msword", "", "", ""},
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
||||||
for (final String[] testFile : testFiles) {
|
for (final String[] testFile : testFiles) {
|
||||||
final String filename = "test/parsertest/" + testFile[0];
|
try {
|
||||||
final File file = new File(filename);
|
final String filename = "test/parsertest/" + testFile[0];
|
||||||
final String mimetype = testFile[1];
|
final File file = new File(filename);
|
||||||
final DigestURI url = new DigestURI("http://localhost/"+filename);
|
final String mimetype = testFile[1];
|
||||||
|
final DigestURI url = new DigestURI("http://localhost/"+filename);
|
||||||
|
|
||||||
final Document[] docs = TextParser.parseSource(url, mimetype, null, file.length(), new FileInputStream(file));
|
AbstractParser p = new docParser();
|
||||||
for (final Document doc: docs) {
|
final Document[] docs = p.parse(url, mimetype, null, new FileInputStream(file));
|
||||||
final Reader content = new InputStreamReader(doc.getTextStream(), doc.getCharset());
|
for (final Document doc: docs) {
|
||||||
final StringBuilder str = new StringBuilder();
|
final Reader content = new InputStreamReader(doc.getTextStream(), doc.getCharset());
|
||||||
int c;
|
final StringBuilder str = new StringBuilder();
|
||||||
while( (c = content.read()) != -1 )
|
int c;
|
||||||
str.append((char)c);
|
while( (c = content.read()) != -1 )
|
||||||
|
str.append((char)c);
|
||||||
|
|
||||||
System.out.println("Parsed " + filename + ": " + str);
|
System.out.println("Parsed " + filename + ": " + str);
|
||||||
assertThat(str.toString(), containsString("In München steht ein Hofbräuhaus, dort gibt es Bier in Maßkrügen"));
|
assertThat(str.toString(), containsString("In München steht ein Hofbräuhaus, dort gibt es Bier in Maßkrügen"));
|
||||||
assertThat(doc.dc_title(), containsString(testFile[2]));
|
assertThat(doc.dc_title(), containsString(testFile[2]));
|
||||||
assertThat(doc.dc_creator(), containsString(testFile[3]));
|
assertThat(doc.dc_creator(), containsString(testFile[3]));
|
||||||
assertThat(doc.dc_description(), containsString(testFile[4]));
|
assertThat(doc.dc_description(), containsString(testFile[4]));
|
||||||
}
|
}
|
||||||
|
} catch (InterruptedException ex) {}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
|
||||||
|
|
||||||
|
|
|
@ -2,6 +2,7 @@ package de.anomic.yacy;
|
||||||
|
|
||||||
import net.yacy.peers.operation.yacyVersion;
|
import net.yacy.peers.operation.yacyVersion;
|
||||||
import junit.framework.TestCase;
|
import junit.framework.TestCase;
|
||||||
|
import org.junit.Assert;
|
||||||
|
|
||||||
public class yacyVersionTest extends TestCase {
|
public class yacyVersionTest extends TestCase {
|
||||||
|
|
||||||
|
@ -10,26 +11,26 @@ public class yacyVersionTest extends TestCase {
|
||||||
* @author Bost
|
* @author Bost
|
||||||
*/
|
*/
|
||||||
public void testCombinedVersionString2PrettyString() {
|
public void testCombinedVersionString2PrettyString() {
|
||||||
assertEquals("dev/00000", yacyVersion.combined2prettyVersion("")); // not a number
|
Assert.assertArrayEquals(new String[]{"dev","0000"}, yacyVersion.combined2prettyVersion("")); // not a number
|
||||||
assertEquals("dev/00000", yacyVersion.combined2prettyVersion(" ")); // not a number
|
Assert.assertArrayEquals(new String[]{"dev","0000"}, yacyVersion.combined2prettyVersion(" ")); // not a number
|
||||||
assertEquals("dev/02417", yacyVersion.combined2prettyVersion("0.10002417"));
|
Assert.assertArrayEquals(new String[]{"dev","02417"}, yacyVersion.combined2prettyVersion("0.10002417"));
|
||||||
assertEquals("dev/02440", yacyVersion.combined2prettyVersion("0.1000244"));
|
Assert.assertArrayEquals(new String[]{"dev","0244"}, yacyVersion.combined2prettyVersion("0.1000244"));
|
||||||
assertEquals("dev/02417", yacyVersion.combined2prettyVersion("0.10002417"));
|
Assert.assertArrayEquals(new String[]{"dev","02417"}, yacyVersion.combined2prettyVersion("0.10002417"));
|
||||||
assertEquals("dev/00000", yacyVersion.combined2prettyVersion("0.100024400")); // input is too long
|
Assert.assertArrayEquals(new String[]{"dev","0000"}, yacyVersion.combined2prettyVersion("0.100024400")); // input is too long
|
||||||
assertEquals("dev/02440", yacyVersion.combined2prettyVersion("0.1090244"));
|
Assert.assertArrayEquals(new String[]{"dev","0244"}, yacyVersion.combined2prettyVersion("0.1090244"));
|
||||||
assertEquals("0.110/02440", yacyVersion.combined2prettyVersion("0.1100244"));
|
Assert.assertArrayEquals(new String[]{"0.110","0244"}, yacyVersion.combined2prettyVersion("0.1100244"));
|
||||||
assertEquals("0.111/02440", yacyVersion.combined2prettyVersion("0.1110244"));
|
Assert.assertArrayEquals(new String[]{"0.111","0244"}, yacyVersion.combined2prettyVersion("0.1110244"));
|
||||||
assertEquals("dev/00000", yacyVersion.combined2prettyVersion("0.0")); // input is valid - no warning generated
|
Assert.assertArrayEquals(new String[]{"dev","0000"}, yacyVersion.combined2prettyVersion("0.0")); // input is valid - no warning generated
|
||||||
assertEquals("dev/00000", yacyVersion.combined2prettyVersion(" 0.11102440")); // spaces are not allowed
|
Assert.assertArrayEquals(new String[]{"dev","0000"}, yacyVersion.combined2prettyVersion(" 0.11102440")); // spaces are not allowed
|
||||||
assertEquals("0.111/00000", yacyVersion.combined2prettyVersion("0.111")); // was (input is too short)
|
Assert.assertArrayEquals(new String[]{"0.111","0000"}, yacyVersion.combined2prettyVersion("0.111")); // was (input is too short)
|
||||||
assertEquals("dev/00000", yacyVersion.combined2prettyVersion("0.1112440\t\n")); // \t and \n are not allowed
|
Assert.assertArrayEquals(new String[]{"dev","0000"}, yacyVersion.combined2prettyVersion("0.1112440\t\n")); // \t and \n are not allowed
|
||||||
assertEquals("dev/00000", yacyVersion.combined2prettyVersion("124353432xxxx4546399999")); // not a number + too long
|
Assert.assertArrayEquals(new String[]{"dev","0000"}, yacyVersion.combined2prettyVersion("124353432xxxx4546399999")); // not a number + too long
|
||||||
assertEquals("dev/00000", yacyVersion.combined2prettyVersion("123456789x")); // not a number
|
Assert.assertArrayEquals(new String[]{"dev","0000"}, yacyVersion.combined2prettyVersion("123456789x")); // not a number
|
||||||
assertEquals("dev/00000", yacyVersion.combined2prettyVersion("9999999999")); // missing decimal point
|
Assert.assertArrayEquals(new String[]{"dev","0000"}, yacyVersion.combined2prettyVersion("9999999999")); // missing decimal point
|
||||||
assertEquals("999.999/99900", yacyVersion.combined2prettyVersion("999.999999")); // was (floating point part must have 3 and SVN-Version 5 digits)
|
Assert.assertArrayEquals(new String[]{"999.999","9990"}, yacyVersion.combined2prettyVersion("999.999999")); // was (floating point part must have 3 and SVN-Version 4 digits)
|
||||||
assertEquals("0.999/99999", yacyVersion.combined2prettyVersion("0.99999999"));
|
Assert.assertArrayEquals(new String[]{"0.999","99999"}, yacyVersion.combined2prettyVersion("0.99999999"));
|
||||||
assertEquals("99999.004/56789", yacyVersion.combined2prettyVersion("99999.00456789"));
|
Assert.assertArrayEquals(new String[]{"99999.004","56789"}, yacyVersion.combined2prettyVersion("99999.00456789"));
|
||||||
assertEquals("dev/00000", yacyVersion.combined2prettyVersion("99999.003456789")); // input is too long
|
Assert.assertArrayEquals(new String[]{"dev","0000"}, yacyVersion.combined2prettyVersion("99999.003456789")); // input is too long
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue
Block a user