mirror of
https://github.com/yacy/yacy_search_server.git
synced 2024-09-19 00:01:41 +02:00
Allow creation of vocabularies from remote CSV file URLs.
This commit is contained in:
parent
17c7a85f18
commit
46c9da6428
|
@ -111,10 +111,11 @@ To see a list of all APIs, please visit the <a href="http://www.yacy-websuche.de
|
|||
<fieldset>
|
||||
<legend>Vocabulary Production</legend>
|
||||
#(csvFileStatus)#
|
||||
::<div class="alert alert-danger" role="alert">Please provide a CSV file path.</div>
|
||||
::<div class="alert alert-danger" role="alert">Please provide a CSV file path or <abbr title="Uniform Resource Locator">URL</abbr>.</div>
|
||||
::<div class="alert alert-danger" role="alert">CSV file not found "#[csvPath]#".</div>
|
||||
::<div class="alert alert-danger" role="alert">Can not read CSV file "#[csvPath]#".</div>
|
||||
::<div class="alert alert-danger" role="alert">Can not read CSV file at "#[csvFile]#".</div>
|
||||
::<div class="alert alert-danger" role="alert">CSV file error : you selected a directory ("#[csvPath]#").</div>
|
||||
::<div class="alert alert-danger" role="alert">CSV file URL is malformed "#[csvUrl]#".</div>
|
||||
#(/csvFileStatus)#
|
||||
#(vocabWriteError)#
|
||||
::<div class="alert alert-danger" role="alert">Could not write vocabulary file at "#[vocabPath]#".</div>
|
||||
|
@ -174,7 +175,7 @@ To see a list of all APIs, please visit the <a href="http://www.yacy-websuche.de
|
|||
</div>
|
||||
</div>
|
||||
<div class="form-group">
|
||||
<label for="discoverpath" class="col-xs-offset-1 col-sm-offset-0 col-sm-5 col-lg-4 control-label"><i>File Path</i></label>
|
||||
<label for="discoverpath" class="col-xs-offset-1 col-sm-offset-0 col-sm-5 col-lg-4 control-label"><i>File Path or <abbr title="Uniform Resource Locator">URL</abbr></i></label>
|
||||
<div class="col-xs-offset-1 col-sm-offset-0 col-sm-5">
|
||||
<input type="text" id="discoverpath" name="discoverpath" value="" size="78" maxlength="256" disabled="disabled" class="form-control">
|
||||
</div>
|
||||
|
|
|
@ -20,7 +20,6 @@
|
|||
|
||||
import java.io.BufferedReader;
|
||||
import java.io.File;
|
||||
import java.io.FileInputStream;
|
||||
import java.io.FileNotFoundException;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStreamReader;
|
||||
|
@ -40,18 +39,22 @@ import java.util.regex.Pattern;
|
|||
|
||||
import net.yacy.cora.document.id.DigestURL;
|
||||
import net.yacy.cora.document.id.MultiProtocolURL;
|
||||
import net.yacy.cora.federate.yacy.CacheStrategy;
|
||||
import net.yacy.cora.language.synonyms.SynonymLibrary;
|
||||
import net.yacy.cora.lod.vocabulary.DCTerms;
|
||||
import net.yacy.cora.lod.vocabulary.Tagging;
|
||||
import net.yacy.cora.lod.vocabulary.Tagging.SOTuple;
|
||||
import net.yacy.cora.protocol.ClientIdentification;
|
||||
import net.yacy.cora.protocol.RequestHeader;
|
||||
import net.yacy.cora.util.CommonPattern;
|
||||
import net.yacy.cora.util.ConcurrentLog;
|
||||
import net.yacy.crawler.retrieval.StreamResponse;
|
||||
import net.yacy.data.TransactionManager;
|
||||
import net.yacy.data.WorkTables;
|
||||
import net.yacy.document.LibraryProvider;
|
||||
import net.yacy.kelondro.data.meta.URIMetadataNode;
|
||||
import net.yacy.kelondro.util.FileUtils;
|
||||
import net.yacy.repository.Blacklist.BlacklistType;
|
||||
import net.yacy.search.Switchboard;
|
||||
import net.yacy.search.SwitchboardConstants;
|
||||
import net.yacy.search.index.Segment;
|
||||
|
@ -105,35 +108,54 @@ public class Vocabulary_p {
|
|||
final boolean discoverFromCSV = post.get("discovermethod", "").equals("csv");
|
||||
final String discoverFromCSVPath = post.get("discoverpath", "").replaceAll("%20", " ");
|
||||
|
||||
final File discoverFromCSVFile = discoverFromCSVPath.length() > 0 ? new File(discoverFromCSVPath) : null;
|
||||
|
||||
final Segment segment = sb.index;
|
||||
String t;
|
||||
int csvFileStatus = 0;
|
||||
if (!discoverNot) {
|
||||
if (discoverFromCSV) {
|
||||
if(discoverFromCSVFile != null) {
|
||||
final String csvPath = discoverFromCSVFile.getAbsolutePath();
|
||||
if (!discoverFromCSVFile.exists()) {
|
||||
csvFileStatus = 2;
|
||||
prop.put("create_csvFileStatus_csvPath", csvPath);
|
||||
} else if (!discoverFromCSVFile.canRead()) {
|
||||
csvFileStatus = 3;
|
||||
prop.put("create_csvFileStatus_csvPath", csvPath);
|
||||
} else if (discoverFromCSVFile.isDirectory()) {
|
||||
csvFileStatus = 4;
|
||||
prop.put("create_csvFileStatus_csvPath", csvPath);
|
||||
} else {
|
||||
try {
|
||||
handleDiscoverFromCSV(post, table, discoverFromCSVFile);
|
||||
} catch(final IOException e) {
|
||||
LOG.warn("Could not read CSV file at " + discoverFromCSVFile, e);
|
||||
csvFileStatus = 3;
|
||||
prop.put("create_csvFileStatus_csvPath", csvPath);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
if(discoverFromCSVPath.isEmpty()) {
|
||||
csvFileStatus = 1;
|
||||
} else {
|
||||
DigestURL csvUrl = null;
|
||||
if(discoverFromCSVPath.contains("://")) {
|
||||
try {
|
||||
csvUrl = new DigestURL(discoverFromCSVPath);
|
||||
} catch(final MalformedURLException e) {
|
||||
csvFileStatus = 5;
|
||||
prop.put("create_csvFileStatus_csvUrl", discoverFromCSVPath);
|
||||
}
|
||||
} else {
|
||||
final File discoverFromCSVFile = new File(discoverFromCSVPath);
|
||||
final String csvPath = discoverFromCSVFile.getAbsolutePath();
|
||||
if (!discoverFromCSVFile.exists()) {
|
||||
csvFileStatus = 2;
|
||||
prop.put("create_csvFileStatus_csvPath", csvPath);
|
||||
} else if (!discoverFromCSVFile.canRead()) {
|
||||
csvFileStatus = 3;
|
||||
prop.put("create_csvFileStatus_csvFile", csvPath);
|
||||
} else if (discoverFromCSVFile.isDirectory()) {
|
||||
csvFileStatus = 4;
|
||||
prop.put("create_csvFileStatus_csvPath", csvPath);
|
||||
} else {
|
||||
try {
|
||||
csvUrl = new DigestURL(discoverFromCSVFile);
|
||||
} catch(final MalformedURLException e) {
|
||||
csvFileStatus = 5;
|
||||
prop.put("create_csvFileStatus_csvUrl", "file://" + discoverFromCSVFile.getAbsolutePath());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if(csvUrl != null) {
|
||||
try {
|
||||
handleDiscoverFromCSV(sb, post, table, csvUrl);
|
||||
} catch(final IOException e) {
|
||||
LOG.warn("Could not read CSV file at " + csvUrl, e);
|
||||
csvFileStatus = 3;
|
||||
prop.put("create_csvFileStatus_csvFile", csvUrl.toString());
|
||||
}
|
||||
}
|
||||
}
|
||||
} else {
|
||||
Iterator<DigestURL> ui = segment.urlSelector(discoveruri, Long.MAX_VALUE, 100000);
|
||||
|
@ -432,15 +454,16 @@ public class Vocabulary_p {
|
|||
|
||||
/**
|
||||
* Fill the vocabulary table from a CSV file.
|
||||
* @param sb the main Switchboard instance. Must not be null.
|
||||
* @param post current request parameters. Must not be null.
|
||||
* @param table the vocabulary table to fill. Must not be null.
|
||||
* @param discoverFromCSVFile. Must not be null.
|
||||
* @param csvFileUrl the file URL. Must not be null.
|
||||
* @throws IOException when a read/write error occurred
|
||||
* @throws UnsupportedEncodingException
|
||||
* @throws FileNotFoundException when the file does not exist or can not be read for some reason.
|
||||
*/
|
||||
protected static void handleDiscoverFromCSV(final serverObjects post, final Map<String, Tagging.SOTuple> table,
|
||||
final File discoverFromCSVFile)
|
||||
protected static void handleDiscoverFromCSV(final Switchboard sb, final serverObjects post, final Map<String, Tagging.SOTuple> table,
|
||||
final DigestURL csvFileUrl)
|
||||
throws IOException, UnsupportedEncodingException, FileNotFoundException {
|
||||
String charsetName = post.get("charset", StandardCharsets.UTF_8.name());
|
||||
final String columnSeparator = post.get("columnSeparator", ";");
|
||||
|
@ -451,22 +474,53 @@ public class Vocabulary_p {
|
|||
final int discovercolumnobjectlink = post.getInt("discovercolumnobjectlink", -1);
|
||||
final boolean discoverenrichsynonyms = post.get("discoversynonymsmethod", "none").equals("enrichsynonyms");
|
||||
final boolean discoverreadcolumn = post.get("discoversynonymsmethod", "none").equals("readcolumn");
|
||||
|
||||
// auto-detect charset, used code from http://jchardet.sourceforge.net/; see also: http://www-archive.mozilla.org/projects/intl/chardet.html
|
||||
if (charsetName.equals("autodetect")) {
|
||||
List<String> charsets = FileUtils.detectCharset(discoverFromCSVFile);
|
||||
charsetName = charsets.get(0);
|
||||
ConcurrentLog.info("FileUtils", "detected charset: " + charsetName + " used to read " + discoverFromCSVFile.toString());
|
||||
}
|
||||
|
||||
final Pattern separatorPattern = Pattern.compile(columnSeparator);
|
||||
|
||||
// read file (try-with-resource to close resources automatically)
|
||||
try (final FileInputStream fileStream = new FileInputStream(discoverFromCSVFile);
|
||||
final InputStreamReader reader = new InputStreamReader(fileStream, charsetName);
|
||||
final BufferedReader bufferedReader = new BufferedReader(reader);) {
|
||||
discoverFromCSVReader(table, escapeChar, lineStart, discovercolumnliteral, discovercolumnsynonyms,
|
||||
discovercolumnobjectlink, discoverenrichsynonyms, discoverreadcolumn, separatorPattern,
|
||||
bufferedReader);
|
||||
// auto-detect charset, used code from http://jchardet.sourceforge.net/; see also: http://www-archive.mozilla.org/projects/intl/chardet.html
|
||||
if (charsetName.equals("autodetect")) {
|
||||
|
||||
try (final StreamResponse streamResponse = sb.loader.openInputStream(
|
||||
sb.loader.request(csvFileUrl, true, false), CacheStrategy.IFFRESH, BlacklistType.CRAWLER,
|
||||
ClientIdentification.yacyInternetCrawlerAgent, Integer.MAX_VALUE);) {
|
||||
if(streamResponse == null || streamResponse.getContentStream() == null) {
|
||||
throw new IOException("Could not get CSV content at " + csvFileUrl);
|
||||
}
|
||||
|
||||
charsetName = streamResponse.getResponse().getCharacterEncoding();
|
||||
|
||||
if(charsetName == null) {
|
||||
/* Charset not provided in response headers : try to detect it from content */
|
||||
final List<String> charsets = FileUtils.detectCharset(streamResponse.getContentStream());
|
||||
charsetName = charsets.get(0);
|
||||
LOG.info("detected charset: " + charsetName + " used to read " + csvFileUrl.toString());
|
||||
} else {
|
||||
LOG.info("detected charset: " + charsetName + " used to read " + csvFileUrl.toString());
|
||||
/* Use now the open stream */
|
||||
try (final InputStreamReader reader = new InputStreamReader(streamResponse.getContentStream(), charsetName);
|
||||
final BufferedReader bufferedReader = new BufferedReader(reader);) {
|
||||
discoverFromCSVReader(table, escapeChar, lineStart, discovercolumnliteral, discovercolumnsynonyms,
|
||||
discovercolumnobjectlink, discoverenrichsynonyms, discoverreadcolumn, separatorPattern,
|
||||
bufferedReader);
|
||||
}
|
||||
return;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// when autodetection of the content charset has been selected, a remote resource may be opened again, but it is likely to be in the cache by now
|
||||
try(final StreamResponse streamResponse = sb.loader.openInputStream(
|
||||
sb.loader.request(csvFileUrl, true, false), CacheStrategy.IFFRESH, BlacklistType.CRAWLER,
|
||||
ClientIdentification.yacyInternetCrawlerAgent, Integer.MAX_VALUE);) {
|
||||
if(streamResponse == null || streamResponse.getContentStream() == null) {
|
||||
throw new IOException("Could not get CSV content at " + csvFileUrl);
|
||||
}
|
||||
try (final InputStreamReader reader = new InputStreamReader(streamResponse.getContentStream(), charsetName);
|
||||
final BufferedReader bufferedReader = new BufferedReader(reader);) {
|
||||
discoverFromCSVReader(table, escapeChar, lineStart, discovercolumnliteral, discovercolumnsynonyms,
|
||||
discovercolumnobjectlink, discoverenrichsynonyms, discoverreadcolumn, separatorPattern,
|
||||
bufferedReader);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -1028,32 +1028,35 @@ public final class FileUtils {
|
|||
}
|
||||
|
||||
/**
|
||||
* auto-detect the charset of a file
|
||||
* used code from http://jchardet.sourceforge.net/;
|
||||
* see also: http://www-archive.mozilla.org/projects/intl/chardet.html
|
||||
* @param file
|
||||
* Auto-detect the charset of content in a stream.
|
||||
* Used code from http://jchardet.sourceforge.net/.
|
||||
* Don't forget to close the stream in caller.
|
||||
* @see <a href="http://www-archive.mozilla.org/projects/intl/chardet.html">chardet</a>
|
||||
* @param inStream an open stream
|
||||
* @return a list of probable charsets
|
||||
* @throws IOException
|
||||
* @throws IOException when a read error occurred
|
||||
*/
|
||||
public static List<String> detectCharset(File file) throws IOException {
|
||||
public static List<String> detectCharset(final InputStream inStream) throws IOException {
|
||||
// auto-detect charset, used code from http://jchardet.sourceforge.net/; see also: http://www-archive.mozilla.org/projects/intl/chardet.html
|
||||
List<String> result;
|
||||
try (BufferedInputStream imp = new BufferedInputStream(new FileInputStream(file))) { // try-with-resource to close inputstream
|
||||
nsDetector det = new nsDetector(nsPSMDetector.ALL);
|
||||
byte[] buf = new byte[1024] ;
|
||||
int len;
|
||||
boolean done = false ;
|
||||
boolean isAscii = true ;
|
||||
while ((len = imp.read(buf,0,buf.length)) != -1) {
|
||||
if (isAscii) isAscii = det.isAscii(buf,len);
|
||||
if (!isAscii && !done) done = det.DoIt(buf,len, false);
|
||||
} det.DataEnd();
|
||||
result = new ArrayList<>();
|
||||
nsDetector det = new nsDetector(nsPSMDetector.ALL);
|
||||
byte[] buf = new byte[1024] ;
|
||||
int len;
|
||||
boolean done = false ;
|
||||
boolean isAscii = true ;
|
||||
while ((len = inStream.read(buf,0,buf.length)) != -1) {
|
||||
if (isAscii) {
|
||||
result.add(StandardCharsets.US_ASCII.name());
|
||||
} else {
|
||||
for (String c: det.getProbableCharsets()) result.add(c); // worst case this returns "nomatch"
|
||||
isAscii = det.isAscii(buf,len);
|
||||
}
|
||||
if (!isAscii && !done) {
|
||||
done = det.DoIt(buf,len, false);
|
||||
}
|
||||
} det.DataEnd();
|
||||
result = new ArrayList<>();
|
||||
if (isAscii) {
|
||||
result.add(StandardCharsets.US_ASCII.name());
|
||||
} else {
|
||||
for (String c: det.getProbableCharsets()) result.add(c); // worst case this returns "nomatch"
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
@ -1070,8 +1073,9 @@ public final class FileUtils {
|
|||
Thread t = new Thread("FileUtils.checkCharset") {
|
||||
@Override
|
||||
public void run() {
|
||||
try {
|
||||
List<String> charsets = FileUtils.detectCharset(file);
|
||||
try (final FileInputStream fileStream = new FileInputStream(file);
|
||||
final BufferedInputStream imp = new BufferedInputStream(fileStream)) { // try-with-resource to close resources
|
||||
List<String> charsets = FileUtils.detectCharset(imp);
|
||||
if (charsets.contains(givenCharset)) {
|
||||
ConcurrentLog.info("checkCharset", "appropriate charset '" + givenCharset + "' for import of " + file + ", is part one detected " + charsets);
|
||||
} else {
|
||||
|
|
Loading…
Reference in New Issue
Block a user