extract author and keywords in .doc and .ppt parser

This commit is contained in:
reger 2014-06-29 02:54:09 +02:00
parent a5707cd2eb
commit cb2c17d236
2 changed files with 18 additions and 4 deletions

View File

@ -86,6 +86,14 @@ public class docParser extends AbstractParser implements Parser {
if (title.length() == l) break;
l = title.length();
}
// get keywords (for yacy as array)
final String keywords = extractor.getSummaryInformation().getKeywords();
final String[] keywlist;
if (keywords != null && !keywords.isEmpty()) {
keywlist = keywords.split(",");
} else {
keywlist = null;
}
Document[] docs;
docs = new Document[]{new Document(
@ -94,9 +102,9 @@ public class docParser extends AbstractParser implements Parser {
"UTF-8",
this,
null,
null,
keywlist,
singleList(title),
"", // TODO: AUTHOR
extractor.getSummaryInformation().getAuthor(), // constructor can handle null
extractor.getDocSummaryInformation().getCompany(), // publisher
null,
null,

View File

@ -78,6 +78,12 @@ public class pptParser extends AbstractParser implements Parser {
if (title.length() == l) break;
l = title.length();
}
// get keywords (for yacy as array)
final String keywords = pptExtractor.getSummaryInformation().getKeywords();
final String[] keywlist;
if (keywords != null && !keywords.isEmpty()) {
keywlist = keywords.split(",");
} else keywlist = null;
/*
* create the plasmaParserDocument for the database
@ -89,9 +95,9 @@ public class pptParser extends AbstractParser implements Parser {
"UTF-8",
this,
null,
null,
keywlist,
singleList(title),
"", // TODO: AUTHOR
pptExtractor.getSummaryInformation().getAuthor(), // may be null
pptExtractor.getDocSummaryInformation().getCompany(),
null,
null,