mirror of
https://github.com/yacy/yacy_search_server.git
synced 2024-09-19 00:01:41 +02:00
Use and activate charset autodetection for Vocabulary input from file
+ revert the mistakenly emptied cn.lng
This commit is contained in:
parent
9e94989237
commit
f0d7b93372
|
@ -30,6 +30,7 @@ import java.util.Collection;
|
|||
import java.util.HashMap;
|
||||
import java.util.Iterator;
|
||||
import java.util.LinkedHashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
import java.util.regex.Pattern;
|
||||
|
@ -83,7 +84,7 @@ public class Vocabulary_p {
|
|||
final boolean discoverFromAuthor = post.get("discovermethod", "").equals("author");
|
||||
final boolean discoverFromCSV = post.get("discovermethod", "").equals("csv");
|
||||
final String discoverFromCSVPath = post.get("discoverpath", "").replaceAll("%20", " ");
|
||||
final String discoverFromCSVCharset = post.get("charset", StandardCharsets.UTF_8.name());
|
||||
String discoverFromCSVCharset = post.get("charset", StandardCharsets.UTF_8.name());
|
||||
final int discovercolumnliteral = post.getInt("discovercolumnliteral", 0);
|
||||
final int discovercolumnsynonyms = post.getInt("discovercolumnsynonyms", -1);
|
||||
final int discovercolumnobjectlink = post.getInt("discovercolumnobjectlink", -1);
|
||||
|
@ -95,7 +96,11 @@ public class Vocabulary_p {
|
|||
if (!discoverNot) {
|
||||
if (discoverFromCSV && discoverFromCSVFile != null && discoverFromCSVFile.exists()) {
|
||||
// auto-detect charset, used code from http://jchardet.sourceforge.net/; see also: http://www-archive.mozilla.org/projects/intl/chardet.html
|
||||
FileUtils.checkCharset(discoverFromCSVFile, discoverFromCSVCharset, true);
|
||||
if (discoverFromCSVCharset.equals("autodetect")) {
|
||||
List<String> charsets = FileUtils.detectCharset(discoverFromCSVFile);
|
||||
discoverFromCSVCharset = charsets.get(0);
|
||||
ConcurrentLog.info("FileUtils", "detected charset: " + discoverFromCSVCharset + " used to read " + discoverFromCSVFile.toString());
|
||||
}
|
||||
// read file
|
||||
BufferedReader r = new BufferedReader(new InputStreamReader(new FileInputStream(discoverFromCSVFile), discoverFromCSVCharset));
|
||||
String line = null;
|
||||
|
@ -304,10 +309,12 @@ public class Vocabulary_p {
|
|||
}
|
||||
|
||||
// make charset list for import method selector
|
||||
int c = 0;
|
||||
prop.putHTML("create_charset_" + 0 + "_name", "autodetect");
|
||||
prop.put("create_charset_" + 0 + "_selected", 1);
|
||||
int c = 1;
|
||||
for (String cs: Charset.availableCharsets().keySet()) {
|
||||
prop.putHTML("create_charset_" + c + "_name", cs);
|
||||
prop.put("create_charset_" + c + "_selected", cs.equals("windows-1252") ? 1 : 0);
|
||||
prop.put("create_charset_" + c + "_selected", 0);
|
||||
c++;
|
||||
}
|
||||
prop.put("create_charset", c);
|
||||
|
|
3672
locales/cn.lng
3672
locales/cn.lng
File diff suppressed because it is too large
Load Diff
|
@ -53,7 +53,6 @@ import java.util.Iterator;
|
|||
import java.util.LinkedList;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
import java.util.concurrent.ConcurrentHashMap;
|
||||
import java.util.regex.Pattern;
|
||||
import java.util.zip.GZIPInputStream;
|
||||
|
@ -935,10 +934,10 @@ public final class FileUtils {
|
|||
* used code from http://jchardet.sourceforge.net/;
|
||||
* see also: http://www-archive.mozilla.org/projects/intl/chardet.html
|
||||
* @param file
|
||||
* @return a set of probable charsets
|
||||
* @return a list of probable charsets
|
||||
* @throws IOException
|
||||
*/
|
||||
public static Set<String> detectCharset(File file) throws IOException {
|
||||
public static List<String> detectCharset(File file) throws IOException {
|
||||
// auto-detect charset, used code from http://jchardet.sourceforge.net/; see also: http://www-archive.mozilla.org/projects/intl/chardet.html
|
||||
nsDetector det = new nsDetector(nsPSMDetector.ALL);
|
||||
BufferedInputStream imp = new BufferedInputStream(new FileInputStream(file));
|
||||
|
@ -953,11 +952,11 @@ public final class FileUtils {
|
|||
if (!isAscii && !done) done = det.DoIt(buf,len, false);
|
||||
}
|
||||
det.DataEnd();
|
||||
Set<String> result = new HashSet<>();
|
||||
List<String> result = new ArrayList<>();
|
||||
if (isAscii) {
|
||||
result.add("ASCII");
|
||||
result.add(StandardCharsets.US_ASCII.name());
|
||||
} else {
|
||||
for (String c: det.getProbableCharsets()) result.add(c);
|
||||
for (String c: det.getProbableCharsets()) result.add(c); // worst case this returns "nomatch"
|
||||
}
|
||||
|
||||
return result;
|
||||
|
@ -976,7 +975,7 @@ public final class FileUtils {
|
|||
@Override
|
||||
public void run() {
|
||||
try {
|
||||
Set<String> charsets = FileUtils.detectCharset(file);
|
||||
List<String> charsets = FileUtils.detectCharset(file);
|
||||
if (charsets.contains(givenCharset)) {
|
||||
ConcurrentLog.info("checkCharset", "appropriate charset '" + givenCharset + "' for import of " + file + ", is part one detected " + charsets);
|
||||
} else {
|
||||
|
|
Loading…
Reference in New Issue
Block a user