yacy_search_server/htroot/IndexImportMediawiki_p.java
luccioman e97580dfc7 Fixed unsafe concurrent access to generic SimpleDateFormat instances
SimpleDateFormat must not be used by concurrent threads without
synchronization for parsing or formatting dates as it is not thread-safe
(internally holds a calendar instance that is not synchronized).

Prefer now DateTimeFormatter when possible as it is thread-safe without
concurrent access performance bottleneck (does not internally use
synchronization locks).
2018-06-28 14:59:23 +02:00

225 lines
10 KiB
Java

// IndexImportMediawiki.java
// -------------------------
// (C) 2009 by Michael Peter Christen; mc@yacy.net
// first published 04.05.2009 on http://yacy.net
// Frankfurt, Germany
//
// $LastChangedDate$
// $LastChangedRevision$
// $LastChangedBy$
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
import java.io.File;
import java.io.IOException;
import java.net.MalformedURLException;
import java.time.Instant;
import java.util.Date;
import org.apache.http.Header;
import org.apache.http.HttpResponse;
import org.apache.http.HttpStatus;
import net.yacy.cora.date.GenericFormatter;
import net.yacy.cora.document.encoding.UTF8;
import net.yacy.cora.document.id.MultiProtocolURL;
import net.yacy.cora.protocol.ClientIdentification;
import net.yacy.cora.protocol.HeaderFramework;
import net.yacy.cora.protocol.RequestHeader;
import net.yacy.cora.protocol.http.HTTPClient;
import net.yacy.cora.util.ConcurrentLog;
import net.yacy.data.TransactionManager;
import net.yacy.data.WorkTables;
import net.yacy.document.importer.MediawikiImporter;
import net.yacy.kelondro.blob.Tables.Row;
import net.yacy.search.Switchboard;
import net.yacy.server.serverObjects;
import net.yacy.server.serverSwitch;
/**
 * Import of MediaWiki dump files in the local index : either starts a new
 * import job, or reports on the current (or last) one.
 */
public class IndexImportMediawiki_p {

    /** Page name used when looking up and recording API calls */
    private static final String API_PAGE = "IndexImportMediawiki_p.html";

    /** Status : the import source is acceptable */
    private static final int STATUS_OK = 0;

    /** Status : the "file" parameter is not a well-formed URL */
    private static final int STATUS_MALFORMED_URL = 1;

    /** Status : the local source file does not exist */
    private static final int STATUS_FILE_NOT_FOUND = 2;

    /** Status : the local source file can not be read */
    private static final int STATUS_FILE_NOT_READABLE = 3;

    /** Status : the local source path is a directory */
    private static final int STATUS_FILE_IS_DIRECTORY = 4;

    /** Status : "iffresh" was requested and the dump was last modified before the last import of the same file */
    private static final int STATUS_NOT_FRESH = 5;

    /**
     * Run conditions :
     * - no MediaWiki import thread is running : allow to start a new import by filling the "file" parameter
     * - the MediaWiki import thread is running : returns monitoring information.
     * @param header servlet request header
     * @param post request parameters. Supported keys :
     *            <ul>
     *            <li>file : a dump URL or file path on this YaCy server local file system</li>
     *            <li>iffresh : when set to true, the dump file is imported only if its last modified date is unknown or after the last import trial date on this same file. </li>
     *            <li>report : when set, display the currently running thread monitoring info, or the last import report when none is running.
     *            Ignored when no import thread is known.</li>
     *            </ul>
     * @param env server environment
     * @return the servlet answer object
     */
    public static serverObjects respond(final RequestHeader header, final serverObjects post, final serverSwitch env) {
        final serverObjects prop = new serverObjects();
        final Switchboard sb = (Switchboard) env;

        /* Read the shared job reference and its liveness once, so the whole response describes one consistent state */
        final MediawikiImporter currentJob = MediawikiImporter.job;
        final boolean jobAlive = currentJob != null && currentJob.isAlive();
        final boolean reportRequested = post != null && post.containsKey("report");

        if (currentJob != null && (jobAlive || reportRequested)) {
            /* one import is running, or report was explicitly requested : no option to insert anything */
            putMonitoringInfo(currentJob, jobAlive, prop);
            return prop;
        }

        prop.put("import", 0);
        prop.put("refresh", 0);
        prop.put("import_prevReport", currentJob != null ? 1 : 0);
        if (post == null || !post.containsKey("file")) {
            /*
             * Nothing to import : only the form is displayed. This also covers requests
             * carrying parameters but no "file" (for example "report" while no job is known) :
             * the form still needs a transaction token to be submitted afterwards.
             */
            prop.put("import_status", STATUS_OK);
            putTransactionToken(header, prop);
        } else {
            handleImportRequest(header, post, sb, prop);
        }
        return prop;
    }

    /**
     * Fill the answer with the monitoring information of an import job.
     * @param job the running or last import job. Must not be null.
     * @param jobAlive true when the job thread is still alive
     * @param prop the servlet answer object to fill
     */
    private static void putMonitoringInfo(final MediawikiImporter job, final boolean jobAlive, final serverObjects prop) {
        prop.put("import", 1);
        /* Only refresh automatically when the job is running */
        prop.put("refresh", jobAlive ? 1 : 0);
        final String jobErrorMessage = job.status();
        if (jobErrorMessage != null && !jobErrorMessage.isEmpty()) {
            prop.put("import_status", 1);
            prop.put("import_status_message", jobErrorMessage);
        }
        prop.put("import_thread", jobAlive ? 2 : 0);
        prop.put("import_dump", job.source());
        prop.put("import_count", job.count());
        prop.put("import_speed", job.speed());
        /* NOTE(review): the divisions suggest running and remaining times are expressed in seconds - confirm */
        prop.put("import_runningHours", (job.runningTime() / 60) / 60);
        prop.put("import_runningMinutes", (job.runningTime() / 60) % 60);
        prop.put("import_remainingHours", (job.remainingTime() / 60) / 60);
        prop.put("import_remainingMinutes", (job.remainingTime() / 60) % 60);
    }

    /**
     * Validate the requested import source and start the import job when acceptable,
     * otherwise report the rejection status in the answer.
     * @param header servlet request header
     * @param post request parameters. Must not be null and must contain the "file" key.
     * @param sb the switchboard
     * @param prop the servlet answer object to fill
     */
    private static void handleImportRequest(final RequestHeader header, final serverObjects post,
            final Switchboard sb, final serverObjects prop) {
        /* Check the transaction is valid */
        TransactionManager.checkPostTransaction(header, post);

        final String file = post.get("file");
        final Row lastExecutedCall = WorkTables.selectLastExecutedApiCall(API_PAGE, post, sb);
        Date lastExecutionDate = null;
        if (lastExecutedCall != null) {
            lastExecutionDate = lastExecutedCall.get(WorkTables.TABLE_API_COL_DATE_LAST_EXEC, (Date) null);
        }

        MultiProtocolURL sourceURL = null;
        String sourceFilePath = "";
        int status = STATUS_OK;
        try {
            sourceURL = new MultiProtocolURL(file);
            if (sourceURL.isFile()) {
                final File sourceFile = sourceURL.getFSFile();
                sourceFilePath = sourceFile.getAbsolutePath();
                status = checkLocalSourceFile(sourceFile);
            }
            if (status == STATUS_OK && post.getBoolean("iffresh")) {
                final long lastModified = getLastModified(sourceURL);
                /* An unknown last modified date (0L) or a missing previous execution never blocks the import */
                if (lastExecutionDate != null && lastModified != 0L
                        && Instant.ofEpochMilli(lastModified).isBefore(lastExecutionDate.toInstant())) {
                    status = STATUS_NOT_FRESH;
                    prop.put("import_status_lastImportDate", GenericFormatter
                            .formatSafely(lastExecutionDate.toInstant(), GenericFormatter.FORMAT_SIMPLE));
                    /* the import is not performed, but we increase here the api call count */
                    recordApiCall(post, sb, lastExecutedCall, sourceURL);
                }
            }
        } catch (final MalformedURLException e) {
            status = STATUS_MALFORMED_URL;
        }

        if (status == STATUS_OK) {
            /* store this call as an api call */
            recordApiCall(post, sb, lastExecutedCall, sourceURL);

            final MediawikiImporter newJob = new MediawikiImporter(sourceURL, sb.surrogatesInPath);
            MediawikiImporter.job = newJob;
            newJob.start();
            prop.put("import_dump", newJob.source());
            prop.put("import_thread", 1);
            prop.put("import", 1);
            prop.put("refresh", 1);
        } else {
            prop.put("import_status", status);
            prop.put("import_status_sourceFile", sourceFilePath);
            putTransactionToken(header, prop);
        }
        prop.put("import_count", 0);
        prop.put("import_speed", 0);
        prop.put("import_runningHours", 0);
        prop.put("import_runningMinutes", 0);
        prop.put("import_remainingHours", 0);
        prop.put("import_remainingMinutes", 0);
    }

    /**
     * @param sourceFile a local dump file candidate. Must not be null.
     * @return {@link #STATUS_OK} when the file exists, is readable and is not a directory,
     *         otherwise the status code of the first failed check
     */
    private static int checkLocalSourceFile(final File sourceFile) {
        if (!sourceFile.exists()) {
            return STATUS_FILE_NOT_FOUND;
        }
        if (!sourceFile.canRead()) {
            return STATUS_FILE_NOT_READABLE;
        }
        if (sourceFile.isDirectory()) {
            return STATUS_FILE_IS_DIRECTORY;
        }
        return STATUS_OK;
    }

    /**
     * Record this request as an API call, unless the API tables are not available.
     * When a previous call with the same parameters is known, its primary key is added
     * to the parameters (if not already present) before recording.
     * @param post request parameters. Must not be null. May be modified.
     * @param sb the switchboard
     * @param lastExecutedCall the last recorded API call with the same parameters, or null when none
     * @param sourceURL the dump URL, only used in the recorded call comment
     */
    private static void recordApiCall(final serverObjects post, final Switchboard sb, final Row lastExecutedCall,
            final MultiProtocolURL sourceURL) {
        if (sb.tables == null) {
            return;
        }
        /* We avoid creating a duplicate of any already recorded API call with the same parameters */
        if (lastExecutedCall != null && !post.containsKey(WorkTables.TABLE_API_COL_APICALL_PK)) {
            final byte[] lastExecutedCallPk = lastExecutedCall.getPK();
            if (lastExecutedCallPk != null) {
                post.add(WorkTables.TABLE_API_COL_APICALL_PK, UTF8.String(lastExecutedCallPk));
            }
        }
        sb.tables.recordAPICall(post, API_PAGE, WorkTables.TABLE_API_TYPE_DUMP, "MediaWiki Dump Import for " + sourceURL);
    }

    /**
     * Acquire a transaction token for the next POST form submission and add it to the answer.
     * @param header servlet request header
     * @param prop the servlet answer object to fill
     */
    private static void putTransactionToken(final RequestHeader header, final serverObjects prop) {
        final String token = TransactionManager.getTransactionToken(header);
        prop.put(TransactionManager.TRANSACTION_TOKEN_PARAM, token);
        prop.put("import_" + TransactionManager.TRANSACTION_TOKEN_PARAM, token);
    }

    /**
     * @param fileURL the file URL. Must not be null.
     * @return the last modified date for the file at fileURL, or 0L when unknown or when an error occurred
     */
    private static long getLastModified(final MultiProtocolURL fileURL) {
        long lastModified = 0L;
        try {
            if (fileURL.isHTTP() || fileURL.isHTTPS()) {
                /* http(s) : we do not use MultiprotocolURL.lastModified() which always returns 0L for these protocols */
                final HTTPClient httpClient = new HTTPClient(ClientIdentification.yacyInternetCrawlerAgent);
                final HttpResponse headResponse = httpClient.HEADResponse(fileURL, false);
                if (headResponse != null && headResponse.getStatusLine() != null
                        && headResponse.getStatusLine().getStatusCode() == HttpStatus.SC_OK) {
                    final Header lastModifiedHeader = headResponse.getFirstHeader(HeaderFramework.LAST_MODIFIED);
                    if (lastModifiedHeader != null) {
                        final Date lastModifiedDate = HeaderFramework.parseHTTPDate(lastModifiedHeader.getValue());
                        if (lastModifiedDate != null) {
                            lastModified = lastModifiedDate.getTime();
                        }
                    }
                }
            } else {
                lastModified = fileURL.lastModified();
            }
        } catch (final IOException e) {
            /* Best effort : the date stays unknown (0L), but the cause is kept in the log */
            ConcurrentLog.warn("IndexImportMediawiki_p",
                    "Could not retrieve last modified date for dump file at " + fileURL + " : " + e);
        }
        return lastModified;
    }
}