Use and activate charset auto-detection for Vocabulary input from file

+ revert the mistaken emptying of cn.lng
2024-09-19 00:01:41 +02:00 · 2016-05-22 05:38:26 +02:00 · 2016-05-22 05:38:26 +02:00 · f0d7b93372
commit f0d7b93372
parent 9e94989237
3 changed files with 3689 additions and 11 deletions
--- a/htroot/Vocabulary_p.java
+++ b/htroot/Vocabulary_p.java
@ -30,6 +30,7 @@ import java.util.Collection;
 import java.util.HashMap;
 import java.util.Iterator;
 import java.util.LinkedHashMap;
+import java.util.List;
 import java.util.Map;
 import java.util.Set;
 import java.util.regex.Pattern;
@ -83,7 +84,7 @@ public class Vocabulary_p {
                    final boolean discoverFromAuthor = post.get("discovermethod", "").equals("author");
                    final boolean discoverFromCSV = post.get("discovermethod", "").equals("csv");
                    final String discoverFromCSVPath = post.get("discoverpath", "").replaceAll("%20", " ");
-                    final String discoverFromCSVCharset = post.get("charset", StandardCharsets.UTF_8.name());
+                    String discoverFromCSVCharset = post.get("charset", StandardCharsets.UTF_8.name());
                    final int discovercolumnliteral = post.getInt("discovercolumnliteral", 0);
                    final int discovercolumnsynonyms = post.getInt("discovercolumnsynonyms", -1);
                    final int discovercolumnobjectlink = post.getInt("discovercolumnobjectlink", -1);
@ -95,7 +96,11 @@ public class Vocabulary_p {
                    if (!discoverNot) {
                        if (discoverFromCSV && discoverFromCSVFile != null && discoverFromCSVFile.exists()) {
                            // auto-detect charset, used code from http://jchardet.sourceforge.net/; see also: http://www-archive.mozilla.org/projects/intl/chardet.html
-                            FileUtils.checkCharset(discoverFromCSVFile, discoverFromCSVCharset, true);
+                            if (discoverFromCSVCharset.equals("autodetect")) {
+                                List<String> charsets = FileUtils.detectCharset(discoverFromCSVFile);
+                                discoverFromCSVCharset = charsets.get(0);
+                                ConcurrentLog.info("FileUtils", "detected charset: " + discoverFromCSVCharset + " used to read " + discoverFromCSVFile.toString());
+                            }
                            // read file
                            BufferedReader r = new BufferedReader(new InputStreamReader(new FileInputStream(discoverFromCSVFile), discoverFromCSVCharset));
                            String line = null;
@ -304,10 +309,12 @@ public class Vocabulary_p {
        }

        // make charset list for import method selector
-        int c = 0;
+        prop.putHTML("create_charset_" + 0 + "_name", "autodetect");
+        prop.put("create_charset_" + 0 + "_selected", 1);
+        int c = 1;
        for (String cs: Charset.availableCharsets().keySet()) {
            prop.putHTML("create_charset_" + c + "_name", cs);
-            prop.put("create_charset_" + c + "_selected", cs.equals("windows-1252") ? 1 : 0);
+            prop.put("create_charset_" + c + "_selected", 0);
            c++;
        }
        prop.put("create_charset", c);
--- a/locales/cn.lng
+++ b/locales/cn.lng
--- a/source/net/yacy/kelondro/util/FileUtils.java
+++ b/source/net/yacy/kelondro/util/FileUtils.java
@ -53,7 +53,6 @@ import java.util.Iterator;
 import java.util.LinkedList;
 import java.util.List;
 import java.util.Map;
-import java.util.Set;
 import java.util.concurrent.ConcurrentHashMap;
 import java.util.regex.Pattern;
 import java.util.zip.GZIPInputStream;
@ -935,10 +934,10 @@ public final class FileUtils {
     * used code from http://jchardet.sourceforge.net/;
     * see also: http://www-archive.mozilla.org/projects/intl/chardet.html
     * @param file
-     * @return a set of probable charsets
+     * @return a list of probable charsets
     * @throws IOException
     */
-    public static Set<String> detectCharset(File file) throws IOException {
+    public static List<String> detectCharset(File file) throws IOException {
        // auto-detect charset, used code from http://jchardet.sourceforge.net/; see also: http://www-archive.mozilla.org/projects/intl/chardet.html
        nsDetector det = new nsDetector(nsPSMDetector.ALL);
        BufferedInputStream imp = new BufferedInputStream(new FileInputStream(file));
@ -953,11 +952,11 @@ public final class FileUtils {
            if (!isAscii && !done) done = det.DoIt(buf,len, false);
        }
        det.DataEnd();
-        Set<String> result = new HashSet<>();
+        List<String> result = new ArrayList<>();
        if (isAscii) {
-            result.add("ASCII");
+            result.add(StandardCharsets.US_ASCII.name());
        } else {
-            for (String c: det.getProbableCharsets()) result.add(c);
+            for (String c: det.getProbableCharsets()) result.add(c); // worst case this returns "nomatch"
        }

        return result;
@ -976,7 +975,7 @@ public final class FileUtils {
            @Override
            public void run() {
                try {
-                    Set<String> charsets = FileUtils.detectCharset(file);
+                    List<String> charsets = FileUtils.detectCharset(file);
                    if (charsets.contains(givenCharset)) {
                        ConcurrentLog.info("checkCharset", "appropriate charset '" + givenCharset + "' for import of " + file + ", is part one detected " + charsets);
                    } else {