Use and enable charset autodetection when reading Vocabulary input from a file

+ revert the mistaken emptying of cn.lng
This commit is contained in:
reger 2016-05-22 05:38:26 +02:00
parent 9e94989237
commit f0d7b93372
3 changed files with 3689 additions and 11 deletions

View File

@ -30,6 +30,7 @@ import java.util.Collection;
import java.util.HashMap;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.regex.Pattern;
@ -83,7 +84,7 @@ public class Vocabulary_p {
final boolean discoverFromAuthor = post.get("discovermethod", "").equals("author");
final boolean discoverFromCSV = post.get("discovermethod", "").equals("csv");
final String discoverFromCSVPath = post.get("discoverpath", "").replaceAll("%20", " ");
final String discoverFromCSVCharset = post.get("charset", StandardCharsets.UTF_8.name());
String discoverFromCSVCharset = post.get("charset", StandardCharsets.UTF_8.name());
final int discovercolumnliteral = post.getInt("discovercolumnliteral", 0);
final int discovercolumnsynonyms = post.getInt("discovercolumnsynonyms", -1);
final int discovercolumnobjectlink = post.getInt("discovercolumnobjectlink", -1);
@ -95,7 +96,11 @@ public class Vocabulary_p {
if (!discoverNot) {
if (discoverFromCSV && discoverFromCSVFile != null && discoverFromCSVFile.exists()) {
// auto-detect charset, used code from http://jchardet.sourceforge.net/; see also: http://www-archive.mozilla.org/projects/intl/chardet.html
FileUtils.checkCharset(discoverFromCSVFile, discoverFromCSVCharset, true);
if (discoverFromCSVCharset.equals("autodetect")) {
List<String> charsets = FileUtils.detectCharset(discoverFromCSVFile);
discoverFromCSVCharset = charsets.get(0);
ConcurrentLog.info("FileUtils", "detected charset: " + discoverFromCSVCharset + " used to read " + discoverFromCSVFile.toString());
}
// read file
BufferedReader r = new BufferedReader(new InputStreamReader(new FileInputStream(discoverFromCSVFile), discoverFromCSVCharset));
String line = null;
@ -304,10 +309,12 @@ public class Vocabulary_p {
}
// make charset list for import method selector
int c = 0;
prop.putHTML("create_charset_" + 0 + "_name", "autodetect");
prop.put("create_charset_" + 0 + "_selected", 1);
int c = 1;
for (String cs: Charset.availableCharsets().keySet()) {
prop.putHTML("create_charset_" + c + "_name", cs);
prop.put("create_charset_" + c + "_selected", cs.equals("windows-1252") ? 1 : 0);
prop.put("create_charset_" + c + "_selected", 0);
c++;
}
prop.put("create_charset", c);

File diff suppressed because it is too large Load Diff

View File

@ -53,7 +53,6 @@ import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.ConcurrentHashMap;
import java.util.regex.Pattern;
import java.util.zip.GZIPInputStream;
@ -935,10 +934,10 @@ public final class FileUtils {
* used code from http://jchardet.sourceforge.net/;
* see also: http://www-archive.mozilla.org/projects/intl/chardet.html
* @param file
* @return a set of probable charsets
* @return a list of probable charsets
* @throws IOException
*/
public static Set<String> detectCharset(File file) throws IOException {
public static List<String> detectCharset(File file) throws IOException {
// auto-detect charset, used code from http://jchardet.sourceforge.net/; see also: http://www-archive.mozilla.org/projects/intl/chardet.html
nsDetector det = new nsDetector(nsPSMDetector.ALL);
BufferedInputStream imp = new BufferedInputStream(new FileInputStream(file));
@ -953,11 +952,11 @@ public final class FileUtils {
if (!isAscii && !done) done = det.DoIt(buf,len, false);
}
det.DataEnd();
Set<String> result = new HashSet<>();
List<String> result = new ArrayList<>();
if (isAscii) {
result.add("ASCII");
result.add(StandardCharsets.US_ASCII.name());
} else {
for (String c: det.getProbableCharsets()) result.add(c);
for (String c: det.getProbableCharsets()) result.add(c); // worst case this returns "nomatch"
}
return result;
@ -976,7 +975,7 @@ public final class FileUtils {
@Override
public void run() {
try {
Set<String> charsets = FileUtils.detectCharset(file);
List<String> charsets = FileUtils.detectCharset(file);
if (charsets.contains(givenCharset)) {
ConcurrentLog.info("checkCharset", "appropriate charset '" + givenCharset + "' for import of " + file + ", is part one detected " + charsets);
} else {