Allow creation of vocabularies from remote CSV file URLs.

2024-09-19 00:01:41 +02:00 · 2018-02-21 08:41:13 +01:00 · 2018-02-21 08:41:13 +01:00 · 46c9da6428
commit 46c9da6428
parent 17c7a85f18
3 changed files with 124 additions and 65 deletions
--- a/htroot/Vocabulary_p.html
+++ b/htroot/Vocabulary_p.html
@ -111,10 +111,11 @@ To see a list of all APIs, please visit the <a href="http://www.yacy-websuche.de
      <fieldset>
        <legend>Vocabulary Production</legend>
      #(csvFileStatus)#
-        ::<div class="alert alert-danger" role="alert">Please provide a CSV file path.</div>
+        ::<div class="alert alert-danger" role="alert">Please provide a CSV file path or <abbr title="Uniform Resource Locator">URL</abbr>.</div>
    	::<div class="alert alert-danger" role="alert">CSV file not found "#[csvPath]#".</div>
-    	::<div class="alert alert-danger" role="alert">Can not read CSV file "#[csvPath]#".</div>
+    	::<div class="alert alert-danger" role="alert">Can not read CSV file at "#[csvFile]#".</div>
    	::<div class="alert alert-danger" role="alert">CSV file error : you selected a directory ("#[csvPath]#").</div>
+    	::<div class="alert alert-danger" role="alert">CSV file URL is malformed "#[csvUrl]#".</div>
      #(/csvFileStatus)#
      #(vocabWriteError)#
        ::<div class="alert alert-danger" role="alert">Could not write vocabulary file at "#[vocabPath]#".</div>
@ -174,7 +175,7 @@ To see a list of all APIs, please visit the <a href="http://www.yacy-websuche.de
          </div>
        </div>
        <div class="form-group">
-          <label for="discoverpath" class="col-xs-offset-1 col-sm-offset-0 col-sm-5 col-lg-4 control-label"><i>File Path</i></label>
+          <label for="discoverpath" class="col-xs-offset-1 col-sm-offset-0 col-sm-5 col-lg-4 control-label"><i>File Path or <abbr title="Uniform Resource Locator">URL</abbr></i></label>
          <div class="col-xs-offset-1 col-sm-offset-0 col-sm-5">
          	<input type="text" id="discoverpath" name="discoverpath" value="" size="78" maxlength="256" disabled="disabled" class="form-control">
          </div>
--- a/htroot/Vocabulary_p.java
+++ b/htroot/Vocabulary_p.java
@ -20,7 +20,6 @@

 import java.io.BufferedReader;
 import java.io.File;
-import java.io.FileInputStream;
 import java.io.FileNotFoundException;
 import java.io.IOException;
 import java.io.InputStreamReader;
@ -40,18 +39,22 @@ import java.util.regex.Pattern;

 import net.yacy.cora.document.id.DigestURL;
 import net.yacy.cora.document.id.MultiProtocolURL;
+import net.yacy.cora.federate.yacy.CacheStrategy;
 import net.yacy.cora.language.synonyms.SynonymLibrary;
 import net.yacy.cora.lod.vocabulary.DCTerms;
 import net.yacy.cora.lod.vocabulary.Tagging;
 import net.yacy.cora.lod.vocabulary.Tagging.SOTuple;
+import net.yacy.cora.protocol.ClientIdentification;
 import net.yacy.cora.protocol.RequestHeader;
 import net.yacy.cora.util.CommonPattern;
 import net.yacy.cora.util.ConcurrentLog;
+import net.yacy.crawler.retrieval.StreamResponse;
 import net.yacy.data.TransactionManager;
 import net.yacy.data.WorkTables;
 import net.yacy.document.LibraryProvider;
 import net.yacy.kelondro.data.meta.URIMetadataNode;
 import net.yacy.kelondro.util.FileUtils;
+import net.yacy.repository.Blacklist.BlacklistType;
 import net.yacy.search.Switchboard;
 import net.yacy.search.SwitchboardConstants;
 import net.yacy.search.index.Segment;
@ -105,35 +108,54 @@ public class Vocabulary_p {
                    final boolean discoverFromCSV = post.get("discovermethod", "").equals("csv");
                    final String discoverFromCSVPath = post.get("discoverpath", "").replaceAll("%20", " ");

-                    final File discoverFromCSVFile = discoverFromCSVPath.length() > 0 ? new File(discoverFromCSVPath) : null;

                    final Segment segment = sb.index;
                    String t;
                    int csvFileStatus = 0;
                    if (!discoverNot) {
                        if (discoverFromCSV) {
-    						if(discoverFromCSVFile != null) {
-    							final String csvPath = discoverFromCSVFile.getAbsolutePath();
-    							if (!discoverFromCSVFile.exists()) {
-    								csvFileStatus = 2;
-    								prop.put("create_csvFileStatus_csvPath", csvPath);
-    							} else if (!discoverFromCSVFile.canRead()) {
-    								csvFileStatus = 3;
-    								prop.put("create_csvFileStatus_csvPath", csvPath);
-    							} else if (discoverFromCSVFile.isDirectory()) {
-    								csvFileStatus = 4;
-    								prop.put("create_csvFileStatus_csvPath", csvPath);
-    							} else {
-    								try {
-    									handleDiscoverFromCSV(post, table, discoverFromCSVFile);
-    								} catch(final IOException e) {
-    									LOG.warn("Could not read CSV file at " + discoverFromCSVFile, e);
-    									csvFileStatus = 3;
-    									prop.put("create_csvFileStatus_csvPath", csvPath);	
-    								}
-    							}
-    						} else {
+    						if(discoverFromCSVPath.isEmpty()) {
    							csvFileStatus = 1;
+    						} else {
+    							DigestURL csvUrl = null;
+    		                    if(discoverFromCSVPath.contains("://")) {
+    		                    	try {
+    		                    		csvUrl = new DigestURL(discoverFromCSVPath);
+    		                    	} catch(final MalformedURLException e) {
+    									csvFileStatus = 5;
+    									prop.put("create_csvFileStatus_csvUrl", discoverFromCSVPath);	
+    		                    	}
+    		                    } else {
+    		                    	final File discoverFromCSVFile = new File(discoverFromCSVPath);
+    		                    	final String csvPath = discoverFromCSVFile.getAbsolutePath();
+    		                    	if (!discoverFromCSVFile.exists()) {
+    		                    		csvFileStatus = 2;
+    		                    		prop.put("create_csvFileStatus_csvPath", csvPath);
+    		                    	} else if (!discoverFromCSVFile.canRead()) {
+    		                    		csvFileStatus = 3;
+    		                    		prop.put("create_csvFileStatus_csvFile", csvPath);
+    		                    	} else if (discoverFromCSVFile.isDirectory()) {
+    		                    		csvFileStatus = 4;
+    		                    		prop.put("create_csvFileStatus_csvPath", csvPath);
+    		                    	} else {
+        		                    	try {
+        		                    		csvUrl = new DigestURL(discoverFromCSVFile);
+        		                    	} catch(final MalformedURLException e) {
+        									csvFileStatus = 5;
+        									prop.put("create_csvFileStatus_csvUrl", "file://" + discoverFromCSVFile.getAbsolutePath());	
+        		                    	}
+    		                    	}
+    		                    }
+    		                    
+    		                    if(csvUrl != null) {
+    		                    	try {
+    		                    		handleDiscoverFromCSV(sb, post, table, csvUrl);
+    		                    	} catch(final IOException e) {
+    		                    		LOG.warn("Could not read CSV file at " + csvUrl, e);
+    		                    		csvFileStatus = 3;
+    		                    		prop.put("create_csvFileStatus_csvFile", csvUrl.toString());	
+    		                    	}
+    		                    }
    						}
                        } else {
                            Iterator<DigestURL> ui = segment.urlSelector(discoveruri, Long.MAX_VALUE, 100000);
@ -432,15 +454,16 @@ public class Vocabulary_p {

    /**
     * Fill the vocabulary table from a CSV file.
+     * @param sb the main Switchboard instance. Must not be null.
     * @param post current request parameters. Must not be null.
     * @param table the vocabulary table to fill. Must not be null.
-     * @param discoverFromCSVFile. Must not be null.
+     * @param csvFileUrl the file URL. Must not be null.
     * @throws IOException when a read/write error occurred
     * @throws UnsupportedEncodingException
     * @throws FileNotFoundException when the file does not exists or can not be read for some reason.
     */
-	protected static void handleDiscoverFromCSV(final serverObjects post, final Map<String, Tagging.SOTuple> table,
-			final File discoverFromCSVFile)
+	protected static void handleDiscoverFromCSV(final Switchboard sb, final serverObjects post, final Map<String, Tagging.SOTuple> table,
+			final DigestURL csvFileUrl)
 			throws IOException, UnsupportedEncodingException, FileNotFoundException {
 		String charsetName = post.get("charset", StandardCharsets.UTF_8.name());
 		final String columnSeparator = post.get("columnSeparator", ";");
@ -451,22 +474,53 @@ public class Vocabulary_p {
 		final int discovercolumnobjectlink = post.getInt("discovercolumnobjectlink", -1);
        final boolean discoverenrichsynonyms = post.get("discoversynonymsmethod", "none").equals("enrichsynonyms");
        final boolean discoverreadcolumn = post.get("discoversynonymsmethod", "none").equals("readcolumn");
-		
-		// auto-detect charset, used code from http://jchardet.sourceforge.net/; see also: http://www-archive.mozilla.org/projects/intl/chardet.html
-		if (charsetName.equals("autodetect")) {
-		    List<String> charsets = FileUtils.detectCharset(discoverFromCSVFile);
-		    charsetName = charsets.get(0);
-		    ConcurrentLog.info("FileUtils", "detected charset: " + charsetName + " used to read " + discoverFromCSVFile.toString());
-		}
+        
 	    final Pattern separatorPattern = Pattern.compile(columnSeparator);
 	    
-		// read file (try-with-resource to close resources automatically)
-		try (final FileInputStream fileStream = new FileInputStream(discoverFromCSVFile);
-				final InputStreamReader reader = new InputStreamReader(fileStream, charsetName);
-				final BufferedReader bufferedReader = new BufferedReader(reader);) {
-			discoverFromCSVReader(table, escapeChar, lineStart, discovercolumnliteral, discovercolumnsynonyms,
-					discovercolumnobjectlink, discoverenrichsynonyms, discoverreadcolumn, separatorPattern,
-					bufferedReader);
+		// auto-detect charset, used code from http://jchardet.sourceforge.net/; see also: http://www-archive.mozilla.org/projects/intl/chardet.html
+		if (charsetName.equals("autodetect")) {
+
+			try (final StreamResponse streamResponse = sb.loader.openInputStream(
+					sb.loader.request(csvFileUrl, true, false), CacheStrategy.IFFRESH, BlacklistType.CRAWLER,
+					ClientIdentification.yacyInternetCrawlerAgent, Integer.MAX_VALUE);) {
+				if(streamResponse == null || streamResponse.getContentStream() == null) {
+					throw new IOException("Could not get CSV content at " + csvFileUrl);
+				}
+				
+				charsetName = streamResponse.getResponse().getCharacterEncoding();
+				
+				if(charsetName == null) {
+					/* Charset not provided in response headers : try to detect it from content */
+					final List<String> charsets = FileUtils.detectCharset(streamResponse.getContentStream());
+					charsetName = charsets.get(0);
+					LOG.info("detected charset: " + charsetName + " used to read " + csvFileUrl.toString());
+				} else {
+					LOG.info("detected charset: " + charsetName + " used to read " + csvFileUrl.toString());
+					/* Use now the open stream */
+					try (final InputStreamReader reader = new InputStreamReader(streamResponse.getContentStream(), charsetName);
+							final BufferedReader bufferedReader = new BufferedReader(reader);) {
+						discoverFromCSVReader(table, escapeChar, lineStart, discovercolumnliteral, discovercolumnsynonyms,
+								discovercolumnobjectlink, discoverenrichsynonyms, discoverreadcolumn, separatorPattern,
+								bufferedReader);
+					}
+					return;
+				}
+			}
+		}
+		
+		// when autodetection of content charset has been selected, a remote resource may be opened again, but it is likely to be in the cache by now
+		try(final StreamResponse streamResponse = sb.loader.openInputStream(
+				sb.loader.request(csvFileUrl, true, false), CacheStrategy.IFFRESH, BlacklistType.CRAWLER,
+				ClientIdentification.yacyInternetCrawlerAgent, Integer.MAX_VALUE);) {
+			if(streamResponse == null || streamResponse.getContentStream() == null) {
+				throw new IOException("Could not get CSV content at " + csvFileUrl);
+			}
+			try (final InputStreamReader reader = new InputStreamReader(streamResponse.getContentStream(), charsetName);
+					final BufferedReader bufferedReader = new BufferedReader(reader);) {
+				discoverFromCSVReader(table, escapeChar, lineStart, discovercolumnliteral, discovercolumnsynonyms,
+						discovercolumnobjectlink, discoverenrichsynonyms, discoverreadcolumn, separatorPattern,
+						bufferedReader);
+			}
 		}
 	}

--- a/source/net/yacy/kelondro/util/FileUtils.java
+++ b/source/net/yacy/kelondro/util/FileUtils.java
@ -1028,32 +1028,35 @@ public final class FileUtils {
    }
    
    /**
-     * auto-detect the charset of a file
-     * used code from http://jchardet.sourceforge.net/;
-     * see also: http://www-archive.mozilla.org/projects/intl/chardet.html
-     * @param file
+     * Auto-detect the charset of content in a stream.
+     * Used code from http://jchardet.sourceforge.net/.
+     * Don't forget to close the stream in caller.
+     * @see <a href="http://www-archive.mozilla.org/projects/intl/chardet.html">chardet</a>
+     * @param inStream an open stream
     * @return a list of probable charsets
-     * @throws IOException
+     * @throws IOException when a read error occurred
     */
-    public static List<String> detectCharset(File file) throws IOException {
+    public static List<String> detectCharset(final InputStream inStream) throws IOException {
        // auto-detect charset, used code from http://jchardet.sourceforge.net/; see also: http://www-archive.mozilla.org/projects/intl/chardet.html
        List<String> result;
-        try (BufferedInputStream imp = new BufferedInputStream(new FileInputStream(file))) { // try-with-resource to close inputstream
-            nsDetector det = new nsDetector(nsPSMDetector.ALL);
-            byte[] buf = new byte[1024] ;
-            int len;
-            boolean done = false ;
-            boolean isAscii = true ;
-            while ((len = imp.read(buf,0,buf.length)) != -1) {
-                if (isAscii) isAscii = det.isAscii(buf,len);
-                if (!isAscii && !done) done = det.DoIt(buf,len, false);
-            }   det.DataEnd();
-            result = new ArrayList<>();
+        nsDetector det = new nsDetector(nsPSMDetector.ALL);
+        byte[] buf = new byte[1024] ;
+        int len;
+        boolean done = false ;
+        boolean isAscii = true ;
+        while ((len = inStream.read(buf,0,buf.length)) != -1) {
            if (isAscii) {
-                result.add(StandardCharsets.US_ASCII.name());
-            } else {
-                for (String c: det.getProbableCharsets()) result.add(c); // worst case this returns "nomatch"
+            	isAscii = det.isAscii(buf,len);
            }
+            if (!isAscii && !done) {
+            	done = det.DoIt(buf,len, false);
+            }
+        }   det.DataEnd();
+        result = new ArrayList<>();
+        if (isAscii) {
+            result.add(StandardCharsets.US_ASCII.name());
+        } else {
+            for (String c: det.getProbableCharsets()) result.add(c); // worst case this returns "nomatch"
        }
        return result;
    }
@ -1070,8 +1073,9 @@ public final class FileUtils {
        Thread t = new Thread("FileUtils.checkCharset") {
            @Override
            public void run() {
-                try {
-                    List<String> charsets = FileUtils.detectCharset(file);
+            	try (final FileInputStream fileStream = new FileInputStream(file); 
+            			final BufferedInputStream imp = new BufferedInputStream(fileStream)) { // try-with-resource to close resources
+                    List<String> charsets = FileUtils.detectCharset(imp);
                    if (charsets.contains(givenCharset)) {
                        ConcurrentLog.info("checkCharset", "appropriate charset '" + givenCharset + "' for import of " + file + ", is part one detected " + charsets);
                    } else {