added parsing of 'date', 'dc:date', 'dc.date' and 'last-modified' in

html meta fields to get a correct (or at least better) date timestamp. The
http:last-modified value mostly does not work because most CMS set it to
the current date.
This commit is contained in:
Michael Peter Christen 2013-09-10 10:31:57 +02:00
parent 9cc8468b30
commit 35ab2cef7b
29 changed files with 120 additions and 33 deletions

View File

@ -116,7 +116,7 @@ public class searchresult {
post.put("defType", "edismax");
post.put(CommonParams.Q, solrQ.toString());
post.put(CommonParams.ROWS, post.remove("num"));
post.put(CommonParams.ROWS, Math.min(post.getInt(CommonParams.ROWS, 10), (authenticated) ? 5000 : 100));
post.put(CommonParams.ROWS, Math.min(post.getInt(CommonParams.ROWS, 10), (authenticated) ? 100000000 : 100));
// set ranking
if (post.containsKey("sort")) {

View File

@ -94,6 +94,7 @@ public class Document {
private final double lon, lat;
private final Object parserObject; // the source object that was used to create the Document
private final Map<String, Set<String>> generic_facets; // a map from vocabulary names to the set of tags for that vocabulary which apply for this document
private final Date date;
public Document(final DigestURI location, final String mimeType, final String charset,
final Object parserObject,
@ -107,7 +108,8 @@ public class Document {
final Map<DigestURI, Properties> anchors,
final Map<DigestURI, String> rss,
final Map<DigestURI, ImageEntry> images,
final boolean indexingDenied) {
final boolean indexingDenied,
final Date date) {
this.source = location;
this.mimeType = (mimeType == null) ? "application/octet-stream" : mimeType;
this.charset = charset;
@ -143,6 +145,7 @@ public class Document {
this.indexingDenied = indexingDenied;
this.text = text == null ? "" : text;
this.generic_facets = new HashMap<String, Set<String>>();
this.date = date == null ? new Date() : date;
}
public Object getParserObject() {
@ -451,6 +454,10 @@ dc_rights
return this.emaillinks;
}
/**
 * Returns the date that was assigned to this document.
 * The constructor shown alongside this accessor stores the current time
 * when it is handed a null date.
 *
 * @return the value of the date field of this document
 */
public Date getDate() {
    final Date documentDate = this.date;
    return documentDate;
}
/**
 * Returns the value of the lon field of this document.
 * NOTE(review): presumably the geographic longitude; the merge code in this
 * class appears to treat 0.0 as "not set" - confirm.
 *
 * @return the stored lon value
 */
public double lon() {
    final double longitude = this.lon;
    return longitude;
}
@ -783,6 +790,7 @@ dc_rights
final Map<DigestURI, String> rss = new HashMap<DigestURI, String>();
final Map<DigestURI, ImageEntry> images = new HashMap<DigestURI, ImageEntry>();
double lon = 0.0d, lat = 0.0d;
Date date = new Date();
for (final Document doc: docs) {
@ -821,6 +829,7 @@ dc_rights
rss.putAll(doc.getRSS());
ContentScraper.addAllImages(images, doc.getImages());
if (doc.lon() != 0.0 && doc.lat() != 0.0) { lon = doc.lon(); lat = doc.lat(); }
if (doc.date.before(date)) date = doc.date;
}
// clean up parser data
@ -852,7 +861,8 @@ dc_rights
anchors,
rss,
images,
false);
false,
date);
}
public static Map<DigestURI, String> getHyperlinks(final Document[] documents) {

View File

@ -100,7 +100,9 @@ public class DCEntry extends MultiMapSolrParams {
*/
public Date getDate() {
String d = this.get("docdatetime");
if (d == null) d = this.get("date");
if (d == null) d = this.get("dc:date");
if (d == null) d = this.get("last-modified");
if (d == null) return null;
if (d.isEmpty()) return null;
try {
@ -286,7 +288,8 @@ public class DCEntry extends MultiMapSolrParams {
null,
null,
null,
false);
false,
getDate());
}
public void writeXML(OutputStreamWriter os) throws IOException {

View File

@ -30,6 +30,7 @@ import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.Date;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
@ -171,7 +172,8 @@ public class audioTagParser extends AbstractParser implements Parser {
null,
null,
null,
false)
false,
new Date())
};
return docs;
} catch (final Exception e) {
@ -193,7 +195,8 @@ public class audioTagParser extends AbstractParser implements Parser {
null,
null,
null,
false
false,
new Date()
)};
} finally {
try {

View File

@ -30,6 +30,7 @@ import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.UnsupportedEncodingException;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;
import net.yacy.document.AbstractParser;
@ -77,7 +78,8 @@ public class csvParser extends AbstractParser implements Parser {
null,
null,
null,
false)};
false,
new Date())};
}
private static String concatRow(String[] columns) {

View File

@ -28,6 +28,7 @@
package net.yacy.document.parser;
import java.io.InputStream;
import java.util.Date;
import net.yacy.document.AbstractParser;
import net.yacy.document.Document;
@ -103,7 +104,8 @@ public class docParser extends AbstractParser implements Parser {
null,
null,
null,
false)};
false,
new Date())};
return docs;
}

View File

@ -25,6 +25,7 @@
package net.yacy.document.parser;
import java.io.InputStream;
import java.util.Date;
import net.yacy.cora.document.MultiProtocolURI;
import net.yacy.document.AbstractParser;
@ -65,7 +66,8 @@ public class genericParser extends AbstractParser implements Parser {
null,
null,
null,
false)};
false,
new Date())};
return docs;
}
}

View File

@ -31,7 +31,9 @@ import java.io.IOException;
import java.io.Writer;
import java.net.MalformedURLException;
import java.nio.charset.Charset;
import java.text.ParseException;
import java.util.ArrayList;
import java.util.Date;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedHashSet;
@ -45,6 +47,7 @@ import java.util.regex.Pattern;
import javax.swing.event.EventListenerList;
import net.yacy.cora.date.ISO8601Formatter;
import net.yacy.cora.document.MultiProtocolURI;
import net.yacy.cora.sorting.ClusteredScoreMap;
import net.yacy.cora.storage.SizeLimitedMap;
@ -848,6 +851,28 @@ public class ContentScraper extends AbstractScraper implements Scraper {
if (s.toLowerCase().startsWith("url=")) return s.substring(4).trim();
return EMPTY_STRING;
}
/**
 * Derives a document date from the scraped html meta fields.
 * The meta names are probed in a fixed order and the first one whose
 * content can be parsed by ISO8601Formatter.FORMATTER determines the result.
 * A missing entry or content that fails to parse moves on to the next name.
 *
 * @return the first successfully parsed meta date, or the current time
 *         if none of the probed meta fields yields a date
 */
public Date getDate() {
    // probe order:
    // <meta name="date" content="YYYY-MM-DD..." />
    // <meta name="DC.date" content="YYYY-MM-DD" />
    // <meta name="DC:date" content="YYYY-MM-DD" />
    // <meta http-equiv="last-modified" content="YYYY-MM-DD" />
    final String[] metaNames = {"date", "dc.date", "dc:date", "last-modified"};
    for (final String metaName : metaNames) {
        final String value = this.metas.get(metaName);
        if (value == null) {
            continue;
        }
        try {
            return ISO8601Formatter.FORMATTER.parse(value);
        } catch (final ParseException e) {
            // content is not a parseable date: try the next meta field
        }
    }
    // nothing usable found in the meta fields
    return new Date();
}
// parse location
// <meta NAME="ICBM" CONTENT="38.90551492, 1.454004505" />

View File

@ -141,7 +141,8 @@ public class htmlParser extends AbstractParser implements Parser {
scraper.getAnchors(),
scraper.getRSS(),
scraper.getImages(),
scraper.indexingDenied());
scraper.indexingDenied(),
scraper.getDate());
ppd.setFavicon(scraper.getFavicon());
return ppd;

View File

@ -35,6 +35,7 @@ import java.io.IOException;
import java.io.InputStream;
import java.net.MalformedURLException;
import java.util.ArrayList;
import java.util.Date;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
@ -221,7 +222,8 @@ public class genericImageParser extends AbstractParser implements Parser {
anchors, // anchors
null,
images,
false)}; // images
false,
new Date())}; // images
}
@Override

View File

@ -27,6 +27,7 @@ package net.yacy.document.parser;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;
import javax.xml.parsers.ParserConfigurationException;
@ -116,7 +117,8 @@ public class mmParser extends AbstractParser implements Parser {
null,
null,
null,
false)};
false,
new Date())};
}
private class FreeMindHandler extends DefaultHandler {

View File

@ -30,6 +30,7 @@ package net.yacy.document.parser;
import java.io.File;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.Date;
import java.util.Enumeration;
import java.util.HashSet;
import java.util.List;
@ -197,7 +198,9 @@ public class odtParser extends AbstractParser implements Parser {
null,
null,
null,
false)};
false,
new Date()
)};
return docs;
} catch (final Exception e) {
if (e instanceof InterruptedException) throw (InterruptedException) e;

View File

@ -30,6 +30,7 @@ package net.yacy.document.parser;
import java.io.File;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.Date;
import java.util.Enumeration;
import java.util.HashSet;
import java.util.List;
@ -182,7 +183,8 @@ public class ooxmlParser extends AbstractParser implements Parser {
null,
null,
null,
false)};
false,
new Date())};
return docs;
} catch (final Exception e) {
if (e instanceof InterruptedException) throw (InterruptedException) e;

View File

@ -32,6 +32,7 @@ import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.util.Date;
import org.apache.pdfbox.cos.COSName;
import org.apache.pdfbox.exceptions.CryptographyException;
@ -125,6 +126,7 @@ public class pdfParser extends AbstractParser implements Parser {
// extracting some metadata
PDDocumentInformation info = pdfDoc.getDocumentInformation();
String docTitle = null, docSubject = null, docAuthor = null, docPublisher = null, docKeywordStr = null;
Date docDate = new Date();
if (info != null) {
docTitle = info.getTitle();
docSubject = info.getSubject();
@ -132,10 +134,9 @@ public class pdfParser extends AbstractParser implements Parser {
docPublisher = info.getProducer();
if (docPublisher == null || docPublisher.isEmpty()) docPublisher = info.getCreator();
docKeywordStr = info.getKeywords();
try {if (info.getModificationDate() != null) docDate = info.getModificationDate().getTime();} catch (IOException e) {}
// unused:
// info.getTrapped());
// info.getCreationDate());
// info.getModificationDate();
}
info = null;
@ -218,7 +219,8 @@ public class pdfParser extends AbstractParser implements Parser {
null,
null,
null,
false)};
false,
docDate)};
}
@SuppressWarnings("static-access")

View File

@ -29,6 +29,7 @@ package net.yacy.document.parser;
import java.io.BufferedInputStream;
import java.io.InputStream;
import java.util.Date;
import net.yacy.cora.util.ConcurrentLog;
import net.yacy.document.AbstractParser;
@ -99,7 +100,8 @@ public class pptParser extends AbstractParser implements Parser {
null,
null,
null,
false)};
false,
new Date())};
return docs;
} catch (final Exception e) {
if (e instanceof InterruptedException) throw (InterruptedException) e;

View File

@ -34,6 +34,7 @@ import java.io.FileReader;
import java.io.FileWriter;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.Date;
import net.yacy.document.AbstractParser;
import net.yacy.document.Document;
@ -115,7 +116,8 @@ public class psParser extends AbstractParser implements Parser {
null, // anchors
null, // rss
null, // images
false)}; // indexingdenied
false, // indexingdenied
new Date())};
return docs;
} catch (final Exception e) {

View File

@ -27,6 +27,7 @@ package net.yacy.document.parser;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;
import net.yacy.document.AbstractParser;
@ -59,7 +60,7 @@ public class rdfParser extends AbstractParser implements Parser {
String all = "rdfdatasource";
doc = new Document(url, mimeType, charset, null, null, null, singleList(""), "",
"", null, new ArrayList<String>(0), 0, 0, all, null, null, null, false);
"", null, new ArrayList<String>(0), 0, 0, all, null, null, null, false, new Date());
docs.add(doc);

View File

@ -13,6 +13,7 @@ import java.io.Reader;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.ArrayList;
import java.util.Date;
import java.util.HashSet;
import java.util.Set;
@ -80,7 +81,7 @@ public class RDFaParser extends AbstractParser implements Parser {
}
Document doc = new Document(url, mimeType, charset, null, null, null, singleList(""), "",
"", null, new ArrayList<String>(0), 0, 0, null, null, null, null, false);
"", null, new ArrayList<String>(0), 0, 0, null, null, null, null, false, new Date());
try {
if (allTriples.length > 0)
@ -139,7 +140,7 @@ public class RDFaParser extends AbstractParser implements Parser {
}
Document doc = new Document(url, mimeType, charset, null, null, null, singleList(""), "",
"", null, new ArrayList<String>(0), 0, 0, all, null, null, null, false);
"", null, new ArrayList<String>(0), 0, 0, all, null, null, null, false, new Date());
return doc;
}

View File

@ -102,7 +102,8 @@ public class rssParser extends AbstractParser implements Parser {
anchors,
null,
new HashMap<DigestURI, ImageEntry>(),
false);
false,
item.getPubDate());
docs.add(doc);
} catch (final MalformedURLException e) {
continue;

View File

@ -28,6 +28,7 @@
package net.yacy.document.parser;
import java.io.InputStream;
import java.util.Date;
import javax.swing.text.DefaultStyledDocument;
import javax.swing.text.rtf.RTFEditorKit;
@ -84,7 +85,8 @@ public class rtfParser extends AbstractParser implements Parser {
null,
null,
null,
false)};
false,
new Date())};
} catch (final Exception e) {
if (e instanceof InterruptedException) throw (InterruptedException) e;
if (e instanceof Parser.Failure) throw (Parser.Failure) e;

View File

@ -32,6 +32,7 @@ import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.util.Date;
import net.yacy.cora.util.ConcurrentLog;
import net.yacy.document.AbstractParser;
@ -72,7 +73,8 @@ public class sevenzipParser extends AbstractParser implements Parser {
null,
null,
null,
false);
false,
new Date());
Handler archive;
AbstractParser.log.fine("opening 7zip archive...");
try {

View File

@ -27,6 +27,7 @@ package net.yacy.document.parser;
import java.io.IOException;
import java.io.InputStream;
import java.nio.charset.Charset;
import java.util.Date;
import java.util.HashMap;
import java.util.Map;
@ -95,7 +96,8 @@ public class sidAudioParser extends AbstractParser implements Parser {
null,
null,
null,
false)};
false,
new Date())};
}
throw new Parser.Failure("Unable to parse SID file, file does seems to be incomplete (len = " + available + ").", location);
} catch (final IOException ex) {

View File

@ -96,7 +96,8 @@ public class sitemapParser extends AbstractParser implements Parser {
null,
null,
new HashMap<DigestURI, ImageEntry>(),
false);
false,
new Date());
docs.add(doc);
} catch (final MalformedURLException e) {
continue;

View File

@ -30,6 +30,7 @@ package net.yacy.document.parser;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.Date;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
@ -125,7 +126,8 @@ public class swfParser extends AbstractParser implements Parser {
anchors, // a map of extracted anchors
null,
null,
false)}; // a treeset of image URLs
false,
new Date())}; // a treeset of image URLs
} catch (final Exception e) {
if (e instanceof InterruptedException) throw (InterruptedException) e;

View File

@ -28,6 +28,7 @@ import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.util.Date;
import java.util.List;
import java.util.Map;
@ -110,7 +111,8 @@ public class torrentParser extends AbstractParser implements Parser {
null,
null,
null,
false)};
false,
new Date())};
}
public static void main(String[] args) {

View File

@ -33,6 +33,7 @@ import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.MalformedURLException;
import java.util.ArrayList;
import java.util.Date;
import java.util.HashMap;
import java.util.Iterator;
import java.util.LinkedList;
@ -227,7 +228,8 @@ public class vcfParser extends AbstractParser implements Parser {
anchors, // a map of extracted anchors
null,
null, // a treeset of image URLs
false)};
false,
new Date())};
} catch (final Exception e) {
if (e instanceof InterruptedException) throw (InterruptedException) e;
if (e instanceof Parser.Failure) throw (Parser.Failure) e;

View File

@ -29,6 +29,7 @@ package net.yacy.document.parser;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;
import net.yacy.cora.util.ConcurrentLog;
@ -115,7 +116,8 @@ public class vsdParser extends AbstractParser implements Parser {
null, // a map of extracted anchors
null,
null, // a treeset of image URLs
false)};
false,
new Date())};
} catch (final Exception e) {
if (e instanceof InterruptedException) throw (InterruptedException) e;

View File

@ -28,6 +28,7 @@
package net.yacy.document.parser;
import java.io.InputStream;
import java.util.Date;
import net.yacy.cora.util.ConcurrentLog;
import net.yacy.document.AbstractParser;
@ -129,7 +130,8 @@ public class xlsParser extends AbstractParser implements Parser {
null,
null,
null,
false)};
false,
new Date())};
} catch (final Exception e) {
if (e instanceof InterruptedException) throw (InterruptedException) e;

View File

@ -441,7 +441,11 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
add(doc, CollectionSchema.author, author);
}
if (allAttr || contains(CollectionSchema.content_type)) add(doc, CollectionSchema.content_type, new String[]{document.dc_format()});
if (allAttr || contains(CollectionSchema.last_modified)) add(doc, CollectionSchema.last_modified, responseHeader == null ? new Date() : responseHeader.lastModified());
if (allAttr || contains(CollectionSchema.last_modified)) {
Date lastModified = responseHeader == null ? new Date() : responseHeader.lastModified();
if (document.getDate().before(lastModified)) lastModified = document.getDate();
add(doc, CollectionSchema.last_modified, lastModified);
}
if (allAttr || contains(CollectionSchema.keywords)) add(doc, CollectionSchema.keywords, document.dc_subject(' '));
if (allAttr || contains(CollectionSchema.synonyms_sxt)) {
List<String> synonyms = condenser.synonyms();