yacy_search_server/source/net/yacy/crawler/retrieval/SitemapImporter.java

118 lines
4.4 KiB
Java
Raw Normal View History

//SitemapImporter.java
//------------------------
//part of YaCy
//(C) by Michael Peter Christen; mc@yacy.net
//first published on http://www.anomic.de
//Frankfurt, Germany, 2007
//
//this file was contributed by Martin Thelian
//last major change: $LastChangedDate$ by $LastChangedBy$
//Revision: $LastChangedRevision$
//
//This program is free software; you can redistribute it and/or modify
//it under the terms of the GNU General Public License as published by
//the Free Software Foundation; either version 2 of the License, or
//(at your option) any later version.
//
//This program is distributed in the hope that it will be useful,
//but WITHOUT ANY WARRANTY; without even the implied warranty of
//MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
//GNU General Public License for more details.
//
//You should have received a copy of the GNU General Public License
//along with this program; if not, write to the Free Software
//Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
2012-09-21 15:48:16 +02:00
package net.yacy.crawler.retrieval;
import java.io.IOException;
import java.net.MalformedURLException;
import java.util.Date;
import net.yacy.cora.document.encoding.ASCII;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.util.ConcurrentLog;
import net.yacy.crawler.HarvestProcess;
2012-09-21 15:48:16 +02:00
import net.yacy.crawler.data.CrawlProfile;
import net.yacy.document.parser.sitemapParser;
import net.yacy.document.parser.sitemapParser.URLEntry;
import net.yacy.kelondro.data.meta.URIMetadataNode;
import net.yacy.search.Switchboard;
/**
 * Thread that reads a sitemap and hands each listed URL to the crawl stacker.
 * <p>
 * {@link #run()} starts a {@code sitemapParser.SitemapReader} for the sitemap URL
 * and passes every entry it delivers to {@link #process(sitemapParser.URLEntry)}
 * until the reader hands out {@code sitemapParser.POISON_URLEntry}.
 */
public class SitemapImporter extends Thread {

    private static final ConcurrentLog logger = new ConcurrentLog("SITEMAP");

    /** Crawl profile whose agent, handle and timezone offset are used for the created requests. */
    private final CrawlProfile crawlingProfile;

    /** Location of the sitemap to read. */
    private final DigestURL siteMapURL;

    private final Switchboard sb;

    /**
     * Creates the importer thread; the thread is not started here.
     *
     * @param sb           switchboard used for the URL lookups and for enqueueing requests
     * @param sitemapURL   URL of the sitemap to import; asserted to be non-null
     * @param profileEntry crawl profile applied to the enqueued URLs; asserted to be non-null
     */
    public SitemapImporter(final Switchboard sb, final DigestURL sitemapURL, final CrawlProfile profileEntry) {
        // The parentheses around the conditional are required: '+' binds tighter than
        // '!=' and '?:', so without them the concatenated string is what gets compared
        // against null (always true) and the "SitemapImporter(" ... ")" wrapper is lost.
        super("SitemapImporter(" + (sitemapURL != null ? sitemapURL.toNormalform(false) : "") + ")");
        assert sitemapURL != null;
        this.sb = sb;
        this.siteMapURL = sitemapURL;
        assert profileEntry != null;
        this.crawlingProfile = profileEntry;
    }

    /**
     * Parses the sitemap and processes every delivered entry. Any exception raised
     * while parsing or processing is logged as a warning and ends the import.
     */
    @Override
    public void run() {
        try {
            logger.info("Start parsing sitemap file " + this.siteMapURL.toNormalform(true));
            final sitemapParser.SitemapReader parser = sitemapParser.parse(this.siteMapURL, this.crawlingProfile.getAgent());
            parser.start();
            URLEntry item;
            // POISON_URLEntry is the reader's end-of-input marker (compared by identity)
            while ((item = parser.take()) != sitemapParser.POISON_URLEntry) {
                process(item);
            }
        } catch (final Exception e) {
            logger.warn("Unable to parse sitemap file " + this.siteMapURL, e);
        }
    }

    /**
     * Enqueues one sitemap entry for crawling unless the index already holds a
     * newer copy of it.
     * <p>
     * Entries whose URL cannot be parsed are logged and skipped. If the entry has a
     * last-modification date and the URL is already {@link HarvestProcess#LOADED},
     * the stored modification date is compared with the sitemap's date; when the
     * stored date is later, nothing is enqueued.
     *
     * @param entry the sitemap entry to process
     */
    public void process(sitemapParser.URLEntry entry) {
        // get the url and its hash; a malformed URL cannot be crawled, so skip it
        final DigestURL url;
        final byte[] nexturlhash;
        try {
            url = new DigestURL(entry.url());
            nexturlhash = url.hash();
        } catch (final MalformedURLException e) {
            logger.warn("Skipping malformed sitemap URL '" + entry.url() + "'", e);
            return;
        }

        // check if the url is known and needs to be recrawled
        final Date lastMod = entry.lastmod(null);
        if (lastMod != null) {
            try {
                final HarvestProcess dbocc = this.sb.urlExists(ASCII.String(nexturlhash));
                if (dbocc == HarvestProcess.LOADED) {
                    // the url was already loaded. we need to check the date
                    final URIMetadataNode oldEntry = this.sb.index.fulltext().getMetadata(nexturlhash);
                    if (oldEntry != null) {
                        final Date modDate = oldEntry.moddate();
                        // a missing stored date gives no reason to skip the URL
                        if (modDate != null && modDate.after(lastMod)) {
                            return;
                        }
                    }
                }
            } catch (final IOException e) {
                // the lookup failed; fall through and enqueue the URL anyway
                ConcurrentLog.logException(e);
            }
        }

        // URL needs to be crawled
        this.sb.crawlStacker.enqueueEntry(new Request(
                ASCII.getBytes(this.sb.peers.mySeed().hash),
                url,
                null, // this.siteMapURL.toNormalform(false),
                entry.url(),
                entry.lastmod(new Date()),
                this.crawlingProfile.handle(),
                0,
                this.crawlingProfile.timezoneOffset()
        ));
        logger.info("New URL '" + entry.url() + "' added for loading.");
    }
}