Allow creation of vocabularies from remote CSV file URLs.

This commit is contained in:
luccioman 2018-02-21 08:41:13 +01:00
parent 17c7a85f18
commit 46c9da6428
3 changed files with 124 additions and 65 deletions

View File

@ -111,10 +111,11 @@ To see a list of all APIs, please visit the <a href="http://www.yacy-websuche.de
<fieldset>
<legend>Vocabulary Production</legend>
#(csvFileStatus)#
::<div class="alert alert-danger" role="alert">Please provide a CSV file path.</div>
::<div class="alert alert-danger" role="alert">Please provide a CSV file path or <abbr title="Uniform Resource Locator">URL</abbr>.</div>
::<div class="alert alert-danger" role="alert">CSV file not found "#[csvPath]#".</div>
::<div class="alert alert-danger" role="alert">Can not read CSV file "#[csvPath]#".</div>
::<div class="alert alert-danger" role="alert">Can not read CSV file at "#[csvFile]#".</div>
::<div class="alert alert-danger" role="alert">CSV file error : you selected a directory ("#[csvPath]#").</div>
::<div class="alert alert-danger" role="alert">CSV file URL is malformed "#[csvUrl]#".</div>
#(/csvFileStatus)#
#(vocabWriteError)#
::<div class="alert alert-danger" role="alert">Could not write vocabulary file at "#[vocabPath]#".</div>
@ -174,7 +175,7 @@ To see a list of all APIs, please visit the <a href="http://www.yacy-websuche.de
</div>
</div>
<div class="form-group">
<label for="discoverpath" class="col-xs-offset-1 col-sm-offset-0 col-sm-5 col-lg-4 control-label"><i>File Path</i></label>
<label for="discoverpath" class="col-xs-offset-1 col-sm-offset-0 col-sm-5 col-lg-4 control-label"><i>File Path or <abbr title="Uniform Resource Locator">URL</abbr></i></label>
<div class="col-xs-offset-1 col-sm-offset-0 col-sm-5">
<input type="text" id="discoverpath" name="discoverpath" value="" size="78" maxlength="256" disabled="disabled" class="form-control">
</div>

View File

@ -20,7 +20,6 @@
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStreamReader;
@ -40,18 +39,22 @@ import java.util.regex.Pattern;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.document.id.MultiProtocolURL;
import net.yacy.cora.federate.yacy.CacheStrategy;
import net.yacy.cora.language.synonyms.SynonymLibrary;
import net.yacy.cora.lod.vocabulary.DCTerms;
import net.yacy.cora.lod.vocabulary.Tagging;
import net.yacy.cora.lod.vocabulary.Tagging.SOTuple;
import net.yacy.cora.protocol.ClientIdentification;
import net.yacy.cora.protocol.RequestHeader;
import net.yacy.cora.util.CommonPattern;
import net.yacy.cora.util.ConcurrentLog;
import net.yacy.crawler.retrieval.StreamResponse;
import net.yacy.data.TransactionManager;
import net.yacy.data.WorkTables;
import net.yacy.document.LibraryProvider;
import net.yacy.kelondro.data.meta.URIMetadataNode;
import net.yacy.kelondro.util.FileUtils;
import net.yacy.repository.Blacklist.BlacklistType;
import net.yacy.search.Switchboard;
import net.yacy.search.SwitchboardConstants;
import net.yacy.search.index.Segment;
@ -105,35 +108,54 @@ public class Vocabulary_p {
final boolean discoverFromCSV = post.get("discovermethod", "").equals("csv");
final String discoverFromCSVPath = post.get("discoverpath", "").replaceAll("%20", " ");
final File discoverFromCSVFile = discoverFromCSVPath.length() > 0 ? new File(discoverFromCSVPath) : null;
final Segment segment = sb.index;
String t;
int csvFileStatus = 0;
if (!discoverNot) {
if (discoverFromCSV) {
if(discoverFromCSVFile != null) {
final String csvPath = discoverFromCSVFile.getAbsolutePath();
if (!discoverFromCSVFile.exists()) {
csvFileStatus = 2;
prop.put("create_csvFileStatus_csvPath", csvPath);
} else if (!discoverFromCSVFile.canRead()) {
csvFileStatus = 3;
prop.put("create_csvFileStatus_csvPath", csvPath);
} else if (discoverFromCSVFile.isDirectory()) {
csvFileStatus = 4;
prop.put("create_csvFileStatus_csvPath", csvPath);
} else {
try {
handleDiscoverFromCSV(post, table, discoverFromCSVFile);
} catch(final IOException e) {
LOG.warn("Could not read CSV file at " + discoverFromCSVFile, e);
csvFileStatus = 3;
prop.put("create_csvFileStatus_csvPath", csvPath);
}
}
} else {
if(discoverFromCSVPath.isEmpty()) {
csvFileStatus = 1;
} else {
DigestURL csvUrl = null;
if(discoverFromCSVPath.contains("://")) {
try {
csvUrl = new DigestURL(discoverFromCSVPath);
} catch(final MalformedURLException e) {
csvFileStatus = 5;
prop.put("create_csvFileStatus_csvUrl", discoverFromCSVPath);
}
} else {
final File discoverFromCSVFile = new File(discoverFromCSVPath);
final String csvPath = discoverFromCSVFile.getAbsolutePath();
if (!discoverFromCSVFile.exists()) {
csvFileStatus = 2;
prop.put("create_csvFileStatus_csvPath", csvPath);
} else if (!discoverFromCSVFile.canRead()) {
csvFileStatus = 3;
prop.put("create_csvFileStatus_csvFile", csvPath);
} else if (discoverFromCSVFile.isDirectory()) {
csvFileStatus = 4;
prop.put("create_csvFileStatus_csvPath", csvPath);
} else {
try {
csvUrl = new DigestURL(discoverFromCSVFile);
} catch(final MalformedURLException e) {
csvFileStatus = 5;
prop.put("create_csvFileStatus_csvUrl", "file://" + discoverFromCSVFile.getAbsolutePath());
}
}
}
if(csvUrl != null) {
try {
handleDiscoverFromCSV(sb, post, table, csvUrl);
} catch(final IOException e) {
LOG.warn("Could not read CSV file at " + csvUrl, e);
csvFileStatus = 3;
prop.put("create_csvFileStatus_csvFile", csvUrl.toString());
}
}
}
} else {
Iterator<DigestURL> ui = segment.urlSelector(discoveruri, Long.MAX_VALUE, 100000);
@ -432,15 +454,16 @@ public class Vocabulary_p {
/**
* Fill the vocabulary table from a CSV file.
* @param sb the main Switchboard instance. Must not be null.
* @param post current request parameters. Must not be null.
* @param table the vocabulary table to fill. Must not be null.
* @param discoverFromCSVFile. Must not be null.
* @param csvFileUrl the file URL. Must not be null.
* @throws IOException when a read/write error occurred
* @throws UnsupportedEncodingException
* @throws FileNotFoundException when the file does not exist or cannot be read for some reason.
*/
protected static void handleDiscoverFromCSV(final serverObjects post, final Map<String, Tagging.SOTuple> table,
final File discoverFromCSVFile)
protected static void handleDiscoverFromCSV(final Switchboard sb, final serverObjects post, final Map<String, Tagging.SOTuple> table,
final DigestURL csvFileUrl)
throws IOException, UnsupportedEncodingException, FileNotFoundException {
String charsetName = post.get("charset", StandardCharsets.UTF_8.name());
final String columnSeparator = post.get("columnSeparator", ";");
@ -451,22 +474,53 @@ public class Vocabulary_p {
final int discovercolumnobjectlink = post.getInt("discovercolumnobjectlink", -1);
final boolean discoverenrichsynonyms = post.get("discoversynonymsmethod", "none").equals("enrichsynonyms");
final boolean discoverreadcolumn = post.get("discoversynonymsmethod", "none").equals("readcolumn");
// auto-detect charset, used code from http://jchardet.sourceforge.net/; see also: http://www-archive.mozilla.org/projects/intl/chardet.html
if (charsetName.equals("autodetect")) {
List<String> charsets = FileUtils.detectCharset(discoverFromCSVFile);
charsetName = charsets.get(0);
ConcurrentLog.info("FileUtils", "detected charset: " + charsetName + " used to read " + discoverFromCSVFile.toString());
}
final Pattern separatorPattern = Pattern.compile(columnSeparator);
// read file (try-with-resource to close resources automatically)
try (final FileInputStream fileStream = new FileInputStream(discoverFromCSVFile);
final InputStreamReader reader = new InputStreamReader(fileStream, charsetName);
final BufferedReader bufferedReader = new BufferedReader(reader);) {
discoverFromCSVReader(table, escapeChar, lineStart, discovercolumnliteral, discovercolumnsynonyms,
discovercolumnobjectlink, discoverenrichsynonyms, discoverreadcolumn, separatorPattern,
bufferedReader);
// auto-detect charset, used code from http://jchardet.sourceforge.net/; see also: http://www-archive.mozilla.org/projects/intl/chardet.html
if (charsetName.equals("autodetect")) {
try (final StreamResponse streamResponse = sb.loader.openInputStream(
sb.loader.request(csvFileUrl, true, false), CacheStrategy.IFFRESH, BlacklistType.CRAWLER,
ClientIdentification.yacyInternetCrawlerAgent, Integer.MAX_VALUE);) {
if(streamResponse == null || streamResponse.getContentStream() == null) {
throw new IOException("Could not get CSV content at " + csvFileUrl);
}
charsetName = streamResponse.getResponse().getCharacterEncoding();
if(charsetName == null) {
/* Charset not provided in response headers : try to detect it from content */
final List<String> charsets = FileUtils.detectCharset(streamResponse.getContentStream());
charsetName = charsets.get(0);
LOG.info("detected charset: " + charsetName + " used to read " + csvFileUrl.toString());
} else {
LOG.info("detected charset: " + charsetName + " used to read " + csvFileUrl.toString());
/* Use now the open stream */
try (final InputStreamReader reader = new InputStreamReader(streamResponse.getContentStream(), charsetName);
final BufferedReader bufferedReader = new BufferedReader(reader);) {
discoverFromCSVReader(table, escapeChar, lineStart, discovercolumnliteral, discovercolumnsynonyms,
discovercolumnobjectlink, discoverenrichsynonyms, discoverreadcolumn, separatorPattern,
bufferedReader);
}
return;
}
}
}
// when autodetection of the content charset has been selected, a remote resource may be opened again, but it is likely to be in the cache by now
try(final StreamResponse streamResponse = sb.loader.openInputStream(
sb.loader.request(csvFileUrl, true, false), CacheStrategy.IFFRESH, BlacklistType.CRAWLER,
ClientIdentification.yacyInternetCrawlerAgent, Integer.MAX_VALUE);) {
if(streamResponse == null || streamResponse.getContentStream() == null) {
throw new IOException("Could not get CSV content at " + csvFileUrl);
}
try (final InputStreamReader reader = new InputStreamReader(streamResponse.getContentStream(), charsetName);
final BufferedReader bufferedReader = new BufferedReader(reader);) {
discoverFromCSVReader(table, escapeChar, lineStart, discovercolumnliteral, discovercolumnsynonyms,
discovercolumnobjectlink, discoverenrichsynonyms, discoverreadcolumn, separatorPattern,
bufferedReader);
}
}
}

View File

@ -1028,32 +1028,35 @@ public final class FileUtils {
}
/**
* auto-detect the charset of a file
* used code from http://jchardet.sourceforge.net/;
* see also: http://www-archive.mozilla.org/projects/intl/chardet.html
* @param file
* Auto-detect the charset of content in a stream.
* Used code from http://jchardet.sourceforge.net/.
* Don't forget to close the stream in caller.
* @see <a href="http://www-archive.mozilla.org/projects/intl/chardet.html">chardet</a>
* @param inStream an open stream
* @return a list of probable charsets
* @throws IOException
* @throws IOException when a read error occurred
*/
public static List<String> detectCharset(File file) throws IOException {
public static List<String> detectCharset(final InputStream inStream) throws IOException {
// auto-detect charset, used code from http://jchardet.sourceforge.net/; see also: http://www-archive.mozilla.org/projects/intl/chardet.html
List<String> result;
try (BufferedInputStream imp = new BufferedInputStream(new FileInputStream(file))) { // try-with-resource to close inputstream
nsDetector det = new nsDetector(nsPSMDetector.ALL);
byte[] buf = new byte[1024] ;
int len;
boolean done = false ;
boolean isAscii = true ;
while ((len = imp.read(buf,0,buf.length)) != -1) {
if (isAscii) isAscii = det.isAscii(buf,len);
if (!isAscii && !done) done = det.DoIt(buf,len, false);
} det.DataEnd();
result = new ArrayList<>();
nsDetector det = new nsDetector(nsPSMDetector.ALL);
byte[] buf = new byte[1024] ;
int len;
boolean done = false ;
boolean isAscii = true ;
while ((len = inStream.read(buf,0,buf.length)) != -1) {
if (isAscii) {
result.add(StandardCharsets.US_ASCII.name());
} else {
for (String c: det.getProbableCharsets()) result.add(c); // worst case this returns "nomatch"
isAscii = det.isAscii(buf,len);
}
if (!isAscii && !done) {
done = det.DoIt(buf,len, false);
}
} det.DataEnd();
result = new ArrayList<>();
if (isAscii) {
result.add(StandardCharsets.US_ASCII.name());
} else {
for (String c: det.getProbableCharsets()) result.add(c); // worst case this returns "nomatch"
}
return result;
}
@ -1070,8 +1073,9 @@ public final class FileUtils {
Thread t = new Thread("FileUtils.checkCharset") {
@Override
public void run() {
try {
List<String> charsets = FileUtils.detectCharset(file);
try (final FileInputStream fileStream = new FileInputStream(file);
final BufferedInputStream imp = new BufferedInputStream(fileStream)) { // try-with-resource to close resources
List<String> charsets = FileUtils.detectCharset(imp);
if (charsets.contains(givenCharset)) {
ConcurrentLog.info("checkCharset", "appropriate charset '" + givenCharset + "' for import of " + file + ", is part one detected " + charsets);
} else {