mirror of
https://github.com/yacy/yacy_search_server.git
synced 2024-09-19 00:01:41 +02:00
Patch last_modified date with internal FirstSeenTime() if no date provided
to make sure updated documents are indexed with their last-modified date as provided in the current crawl. (Always patching moddate with firstseen might bear the risk of missing actual updates.)
This commit is contained in:
parent
d1b23afed6
commit
275d65fffe
|
@ -534,11 +534,18 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
|
|||
add(doc, CollectionSchema.author, author);
|
||||
}
|
||||
if (allAttr || contains(CollectionSchema.last_modified)) {
|
||||
Date lastModified = responseHeader == null ? new Date() : responseHeader.lastModified();
|
||||
if (lastModified == null) lastModified = new Date();
|
||||
if (document.getLastModified().before(lastModified)) lastModified = document.getLastModified();
|
||||
long firstSeen = segment.getFirstSeenTime(digestURL.hash());
|
||||
if (firstSeen > 0 && firstSeen < lastModified.getTime()) lastModified = new Date(firstSeen); // patch the date if we have seen the document earlier
|
||||
Date lastModified = responseHeader == null ? document.getLastModified() : responseHeader.lastModified();
|
||||
if (lastModified == null) {
|
||||
long firstSeen = segment.getFirstSeenTime(digestURL.hash());
|
||||
if (firstSeen > 0) {
|
||||
lastModified = new Date(firstSeen); // patch the date if we have seen the document earlier
|
||||
} else {
|
||||
lastModified = new Date();
|
||||
}
|
||||
}
|
||||
if (document.getLastModified().before(lastModified)) {
|
||||
lastModified = document.getLastModified();
|
||||
}
|
||||
add(doc, CollectionSchema.last_modified, lastModified);
|
||||
}
|
||||
if (allAttr || contains(CollectionSchema.dates_in_content_dts) || contains(CollectionSchema.dates_in_content_count_i)) {
|
||||
|
|
Loading…
Reference in New Issue
Block a user