mirror of
https://github.com/yacy/yacy_search_server.git
synced 2024-09-19 00:01:41 +02:00
- added language detection using metadata from documents: html and odt documents provide this information
- metadata and results from statistical analysis are compared and result is printed out as debug lines - added ranking profile for wanted language - added class with ISO 639 table, a list of all valid language codes that will be used for the language identification git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@5187 6c8d7289-2bf4-0310-a012-ef5d649a1542
This commit is contained in:
parent
3768a1bd32
commit
bfcf9b7aa3
|
@ -75,6 +75,7 @@ public class Ranking_p {
|
|||
rankingParameters.put(plasmaSearchRankingProfile.WORDSINTEXT, "Words In Text");
|
||||
rankingParameters.put(plasmaSearchRankingProfile.WORDSINTITLE, "Words In Title");
|
||||
rankingParameters.put(plasmaSearchRankingProfile.YBR, "YaCy Block Rank");
|
||||
rankingParameters.put(plasmaSearchRankingProfile.LANGUAGE, "Preferred Language");
|
||||
}
|
||||
|
||||
private static serverObjects defaultValues() {
|
||||
|
|
|
@ -29,6 +29,7 @@ package xml.util;
|
|||
import java.io.IOException;
|
||||
import java.io.Writer;
|
||||
import java.net.MalformedURLException;
|
||||
import java.util.Set;
|
||||
|
||||
import de.anomic.crawler.HTTPLoader;
|
||||
import de.anomic.htmlFilter.htmlFilterContentScraper;
|
||||
|
@ -103,8 +104,9 @@ public class getpageinfo_p {
|
|||
prop.put("tags", count);
|
||||
// put description
|
||||
prop.putHTML("desc", scraper.getDescription(), true);
|
||||
// put language
|
||||
prop.putHTML("lang", scraper.getContentLanguages()[0], true);
|
||||
// put language
|
||||
Set<String> languages = scraper.getContentLanguages();
|
||||
prop.putHTML("lang", (languages == null) ? "unknown" : languages.iterator().next(), true);
|
||||
|
||||
} catch (final MalformedURLException e) { /* ignore this */
|
||||
} catch (final IOException e) { /* ignore this */
|
||||
|
|
|
@ -50,6 +50,7 @@ import de.anomic.http.httpRequestHeader;
|
|||
import de.anomic.server.serverCharBuffer;
|
||||
import de.anomic.server.serverFileUtils;
|
||||
import de.anomic.yacy.yacyURL;
|
||||
import de.anomic.tools.iso639;
|
||||
|
||||
public class htmlFilterContentScraper extends htmlFilterAbstractScraper implements htmlFilterScraper {
|
||||
|
||||
|
@ -381,11 +382,21 @@ public class htmlFilterContentScraper extends htmlFilterAbstractScraper implemen
|
|||
return s;
|
||||
}
|
||||
|
||||
public String[] getContentLanguages() {
|
||||
public HashSet<String> getContentLanguages() {
|
||||
String s = metas.get("content-language");
|
||||
if (s == null) s = metas.get("dc.language");
|
||||
if (s == null) s = "";
|
||||
return s.split(" |,");
|
||||
if (s == null) return null;
|
||||
HashSet<String> hs = new HashSet<String>();
|
||||
String[] cl = s.split(" |,");
|
||||
int p;
|
||||
for (int i = 0; i < cl.length; i++) {
|
||||
cl[i] = cl[i].toLowerCase();
|
||||
p = cl[i].indexOf('-');
|
||||
if (p > 0) cl[i] = cl[i].substring(0, p);
|
||||
if (iso639.exists(cl[i])) hs.add(cl[i]);
|
||||
}
|
||||
if (hs.size() == 0) return null;
|
||||
return hs;
|
||||
}
|
||||
|
||||
public String[] getKeywords() {
|
||||
|
|
|
@ -118,6 +118,7 @@ public final class indexContainerHeap {
|
|||
int urlCount = 0;
|
||||
synchronized (cache) {
|
||||
for (final indexContainer container : new heapFileEntries(heapFile, this.payloadrow)) {
|
||||
// TODO: in this loop a lot of memory may be allocated. A check if the memory gets low is necessary. But what do when the memory is low?
|
||||
if (container == null) break;
|
||||
cache.put(container.getWordHash(), container);
|
||||
urlCount += container.size();
|
||||
|
@ -252,6 +253,10 @@ public final class indexContainerHeap {
|
|||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* return an index container
|
||||
* because they may get very large, it is wise to deallocate some memory before calling next()
|
||||
*/
|
||||
public indexContainer next() {
|
||||
final indexContainer n = this.nextContainer;
|
||||
this.nextContainer = next0();
|
||||
|
|
|
@ -70,6 +70,7 @@ public class docParser extends AbstractParser implements Parser {
|
|||
mimeType,
|
||||
"UTF-8",
|
||||
null,
|
||||
null,
|
||||
((contents.length() > 80)? contents.substring(0, 80):contents.trim()).
|
||||
replaceAll("\r\n"," ").
|
||||
replaceAll("\n"," ").
|
||||
|
|
|
@ -32,7 +32,9 @@ import java.io.OutputStreamWriter;
|
|||
import java.io.Writer;
|
||||
import java.nio.charset.Charset;
|
||||
import java.util.Enumeration;
|
||||
import java.util.HashSet;
|
||||
import java.util.Hashtable;
|
||||
import java.util.Set;
|
||||
import java.util.zip.ZipEntry;
|
||||
import java.util.zip.ZipFile;
|
||||
|
||||
|
@ -89,6 +91,7 @@ public class odtParser extends AbstractParser implements Parser {
|
|||
String docShortTitle = null;
|
||||
String docLongTitle = null;
|
||||
String docAuthor = null;
|
||||
String docLanguage = null;
|
||||
|
||||
// opening the file as zip file
|
||||
final ZipFile zipFile= new ZipFile(dest);
|
||||
|
@ -134,9 +137,14 @@ public class odtParser extends AbstractParser implements Parser {
|
|||
docShortTitle = metaData.getTitle();
|
||||
docLongTitle = metaData.getSubject();
|
||||
docAuthor = metaData.getCreator();
|
||||
docLanguage = metaData.getLanguage();
|
||||
}
|
||||
}
|
||||
|
||||
// make the languages set
|
||||
Set<String> languages = new HashSet<String>(1);
|
||||
if (docLanguage != null) languages.add(docLanguage);
|
||||
|
||||
// if there is no title available we generate one
|
||||
if (docLongTitle == null) {
|
||||
if (docShortTitle != null) {
|
||||
|
@ -156,6 +164,7 @@ public class odtParser extends AbstractParser implements Parser {
|
|||
location,
|
||||
mimeType,
|
||||
"UTF-8",
|
||||
languages,
|
||||
docKeywords,
|
||||
docLongTitle,
|
||||
docAuthor,
|
||||
|
@ -169,6 +178,7 @@ public class odtParser extends AbstractParser implements Parser {
|
|||
location,
|
||||
mimeType,
|
||||
"UTF-8",
|
||||
languages,
|
||||
docKeywords,
|
||||
docLongTitle,
|
||||
docAuthor,
|
||||
|
|
|
@ -143,6 +143,7 @@ public class pdfParser extends AbstractParser implements Parser {
|
|||
location,
|
||||
mimeType,
|
||||
"UTF-8",
|
||||
null,
|
||||
docKeywords,
|
||||
(docTitle == null) ? docSubject : docTitle,
|
||||
docAuthor,
|
||||
|
@ -156,6 +157,7 @@ public class pdfParser extends AbstractParser implements Parser {
|
|||
location,
|
||||
mimeType,
|
||||
"UTF-8",
|
||||
null,
|
||||
docKeywords,
|
||||
(docTitle == null) ? docSubject : docTitle,
|
||||
docAuthor,
|
||||
|
|
|
@ -88,6 +88,7 @@ public class pptParser extends AbstractParser implements Parser {
|
|||
mimeType,
|
||||
"UTF-8",
|
||||
null,
|
||||
null,
|
||||
((contents.length() > 80) ? contents.substring(0, 80) : contents.trim()).
|
||||
replaceAll("\r\n"," ").
|
||||
replaceAll("\n"," ").
|
||||
|
|
|
@ -117,6 +117,7 @@ public class psParser extends AbstractParser implements Parser {
|
|||
"UTF-8",
|
||||
null,
|
||||
null,
|
||||
null,
|
||||
"",
|
||||
null,
|
||||
null,
|
||||
|
|
|
@ -134,6 +134,7 @@ public class rpmParser extends AbstractParser implements Parser {
|
|||
mimeType,
|
||||
"UTF-8",
|
||||
null,
|
||||
null,
|
||||
summary,
|
||||
packager,
|
||||
null,
|
||||
|
|
|
@ -158,6 +158,7 @@ public class rssParser extends AbstractParser implements Parser {
|
|||
mimeType,
|
||||
"UTF-8",
|
||||
null,
|
||||
null,
|
||||
feedTitle,
|
||||
(authors.length() > 0)?authors.toString(1,authors.length()):"",
|
||||
feedSections.toArray(new String[feedSections.size()]),
|
||||
|
|
|
@ -75,6 +75,7 @@ public class rtfParser extends AbstractParser implements Parser {
|
|||
mimeType,
|
||||
"UTF-8",
|
||||
null,
|
||||
null,
|
||||
((bodyText.length() > 80)? bodyText.substring(0, 80):bodyText.trim()).
|
||||
replaceAll("\r\n"," ").
|
||||
replaceAll("\n"," ").
|
||||
|
|
|
@ -63,7 +63,7 @@ public class sevenzipParser extends AbstractParser implements Parser {
|
|||
|
||||
public plasmaParserDocument parse(final yacyURL location, final String mimeType, final String charset,
|
||||
final IInStream source, final long maxRamSize) throws ParserException, InterruptedException {
|
||||
final plasmaParserDocument doc = new plasmaParserDocument(location, mimeType, charset);
|
||||
final plasmaParserDocument doc = new plasmaParserDocument(location, mimeType, charset, null);
|
||||
Handler archive;
|
||||
super.theLogger.logFine("opening 7zip archive...");
|
||||
try {
|
||||
|
|
|
@ -107,6 +107,7 @@ public class swfParser extends AbstractParser implements Parser {
|
|||
location, // url of the source document
|
||||
mimeType, // the documents mime type
|
||||
"UTF-8", // charset of the document text
|
||||
null,
|
||||
null, //keywords
|
||||
((contents.length() > 80)? contents.substring(0, 80):contents.trim()).
|
||||
replaceAll("\r\n"," ").
|
||||
|
|
|
@ -188,6 +188,7 @@ public class tarParser extends AbstractParser implements Parser {
|
|||
location,
|
||||
mimeType,
|
||||
null,
|
||||
null,
|
||||
docKeywords.toString().split(" |,"),
|
||||
docLongTitle.toString(),
|
||||
"", // TODO: AUTHOR
|
||||
|
@ -201,6 +202,7 @@ public class tarParser extends AbstractParser implements Parser {
|
|||
location,
|
||||
mimeType,
|
||||
null,
|
||||
null,
|
||||
docKeywords.toString().split(" |,"),
|
||||
docLongTitle.toString(),
|
||||
"", // TODO: AUTHOR
|
||||
|
|
|
@ -78,7 +78,7 @@ public class vcfParser extends AbstractParser implements Parser {
|
|||
return SUPPORTED_MIME_TYPES;
|
||||
}
|
||||
|
||||
public plasmaParserDocument parse(final yacyURL location, final String mimeType, final String charset, final InputStream source) throws ParserException, InterruptedException {
|
||||
public plasmaParserDocument parse(final yacyURL url, final String mimeType, final String charset, final InputStream source) throws ParserException, InterruptedException {
|
||||
|
||||
try {
|
||||
final StringBuffer parsedTitle = new StringBuffer();
|
||||
|
@ -213,7 +213,7 @@ public class vcfParser extends AbstractParser implements Parser {
|
|||
|
||||
} else {
|
||||
if (theLogger.isFinest()) this.theLogger.logFinest("Invalid data in vcf file" +
|
||||
"\n\tURL: " + location +
|
||||
"\n\tURL: " + url +
|
||||
"\n\tLine: " + line +
|
||||
"\n\tLine-Nr: " + lineNr);
|
||||
}
|
||||
|
@ -222,10 +222,11 @@ public class vcfParser extends AbstractParser implements Parser {
|
|||
final String[] sections = parsedNames.toArray(new String[parsedNames.size()]);
|
||||
final byte[] text = parsedDataText.toString().getBytes();
|
||||
final plasmaParserDocument theDoc = new plasmaParserDocument(
|
||||
location, // url of the source document
|
||||
url, // url of the source document
|
||||
mimeType, // the documents mime type
|
||||
null,
|
||||
null, // a list of extracted keywords
|
||||
null, // the language
|
||||
parsedTitle.toString(), // a long document title
|
||||
"", // TODO: AUTHOR
|
||||
sections, // an array of section headlines
|
||||
|
@ -238,7 +239,7 @@ public class vcfParser extends AbstractParser implements Parser {
|
|||
if (e instanceof InterruptedException) throw (InterruptedException) e;
|
||||
if (e instanceof ParserException) throw (ParserException) e;
|
||||
|
||||
throw new ParserException("Unexpected error while parsing vcf resource. " + e.getMessage(),location);
|
||||
throw new ParserException("Unexpected error while parsing vcf resource. " + e.getMessage(),url);
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -116,6 +116,7 @@ public class xlsParser extends AbstractParser implements Parser, HSSFListener {
|
|||
mimeType,
|
||||
"UTF-8",
|
||||
null,
|
||||
null,
|
||||
((contents.length() > 80) ? contents.substring(0, 80) : contents.trim()).
|
||||
replaceAll("\r\n"," ").
|
||||
replaceAll("\n"," ").
|
||||
|
|
|
@ -172,6 +172,7 @@ public class zipParser extends AbstractParser implements Parser {
|
|||
location,
|
||||
mimeType,
|
||||
null,
|
||||
null,
|
||||
docKeywords.toString().split(" |,"),
|
||||
docLongTitle.toString(),
|
||||
"", // TODO: AUTHOR
|
||||
|
@ -185,6 +186,7 @@ public class zipParser extends AbstractParser implements Parser {
|
|||
location,
|
||||
mimeType,
|
||||
null,
|
||||
null,
|
||||
docKeywords.toString().split(" |,"),
|
||||
docLongTitle.toString(),
|
||||
"", // TODO: AUTHOR
|
||||
|
|
|
@ -727,6 +727,7 @@ public final class plasmaParser {
|
|||
location,
|
||||
mimeType,
|
||||
charSet,
|
||||
scraper.getContentLanguages(),
|
||||
scraper.getKeywords(),
|
||||
scraper.getTitle(),
|
||||
scraper.getAuthor(),
|
||||
|
|
|
@ -36,6 +36,7 @@ import java.util.Iterator;
|
|||
import java.util.LinkedList;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
import java.util.TreeSet;
|
||||
|
||||
import de.anomic.htmlFilter.htmlFilterContentScraper;
|
||||
|
@ -67,8 +68,9 @@ public class plasmaParserDocument {
|
|||
private boolean resorted;
|
||||
private InputStream textStream;
|
||||
private int inboundLinks, outboundLinks; // counters for inbound and outbound links, are counted after calling notifyWebStructure
|
||||
private Set<String> languages;
|
||||
|
||||
protected plasmaParserDocument(final yacyURL location, final String mimeType, final String charset,
|
||||
protected plasmaParserDocument(final yacyURL location, final String mimeType, final String charset, final Set<String> languages,
|
||||
final String[] keywords, final String title, final String author,
|
||||
final String[] sections, final String abstrct,
|
||||
final Object text, final Map<yacyURL, String> anchors, final HashMap<String, htmlFilterImageEntry> images) {
|
||||
|
@ -90,6 +92,7 @@ public class plasmaParserDocument {
|
|||
this.resorted = false;
|
||||
this.inboundLinks = -1;
|
||||
this.outboundLinks = -1;
|
||||
this.languages = languages;
|
||||
|
||||
if (text == null) try {
|
||||
this.text = new serverCachedFileOutputStream(Parser.MAX_KEEP_IN_MEMORY_SIZE);
|
||||
|
@ -101,31 +104,48 @@ public class plasmaParserDocument {
|
|||
}
|
||||
}
|
||||
|
||||
public plasmaParserDocument(final yacyURL location, final String mimeType, final String charset) {
|
||||
this(location, mimeType, charset, null, null, null, null, null, (Object)null, null, null);
|
||||
public plasmaParserDocument(final yacyURL location, final String mimeType, final String charset, final Set<String> languages) {
|
||||
this(location, mimeType, charset, languages, null, null, null, null, null, (Object)null, null, null);
|
||||
}
|
||||
|
||||
public plasmaParserDocument(final yacyURL location, final String mimeType, final String charset,
|
||||
public plasmaParserDocument(final yacyURL location, final String mimeType, final String charset, final Set<String> languages,
|
||||
final String[] keywords, final String title, final String author,
|
||||
final String[] sections, final String abstrct,
|
||||
final byte[] text, final Map<yacyURL, String> anchors, final HashMap<String, htmlFilterImageEntry> images) {
|
||||
this(location, mimeType, charset, keywords, title, author, sections, abstrct, (Object)text, anchors, images);
|
||||
this(location, mimeType, charset, languages, keywords, title, author, sections, abstrct, (Object)text, anchors, images);
|
||||
}
|
||||
|
||||
public plasmaParserDocument(final yacyURL location, final String mimeType, final String charset,
|
||||
public plasmaParserDocument(final yacyURL location, final String mimeType, final String charset, final Set<String> languages,
|
||||
final String[] keywords, final String title, final String author,
|
||||
final String[] sections, final String abstrct,
|
||||
final File text, final Map<yacyURL, String> anchors, final HashMap<String, htmlFilterImageEntry> images) {
|
||||
this(location, mimeType, charset, keywords, title, author, sections, abstrct, (Object)text, anchors, images);
|
||||
this(location, mimeType, charset, languages, keywords, title, author, sections, abstrct, (Object)text, anchors, images);
|
||||
}
|
||||
|
||||
public plasmaParserDocument(final yacyURL location, final String mimeType, final String charset,
|
||||
public plasmaParserDocument(final yacyURL location, final String mimeType, final String charset, final Set<String> languages,
|
||||
final String[] keywords, final String title, final String author,
|
||||
final String[] sections, final String abstrct,
|
||||
final serverCachedFileOutputStream text, final Map<yacyURL, String> anchors, final HashMap<String, htmlFilterImageEntry> images) {
|
||||
this(location, mimeType, charset, keywords, title, author, sections, abstrct, (Object)text, anchors, images);
|
||||
this(location, mimeType, charset, languages, keywords, title, author, sections, abstrct, (Object)text, anchors, images);
|
||||
}
|
||||
|
||||
/**
|
||||
* compute a set of languages that this document contains
|
||||
* the language is not computed using a statistical analysis of the content, only from given metadata that came with the document
|
||||
* if there are several languages defined in the document, the TLD is taken to check which one should be picked
|
||||
* If there is no metadata at all, null is returned
|
||||
* @return a string with a language name using the alpha-2 code of ISO 639
|
||||
*/
|
||||
public String languageByMetadata() {
|
||||
if (this.languages == null) return null;
|
||||
if (this.languages.size() == 0) return null;
|
||||
if (this.languages.size() == 1) return languages.iterator().next();
|
||||
if (this.languages.contains(this.source.language())) return this.source.language();
|
||||
// now we are confused: the declared languages differ all from the TLD
|
||||
// just pick one of the languages that we have
|
||||
return languages.iterator().next();
|
||||
}
|
||||
|
||||
/*
|
||||
DC according to rfc 5013
|
||||
|
||||
|
|
|
@ -57,6 +57,7 @@ public class plasmaSearchRankingProfile {
|
|||
public static final String CATHASVIDEO = "cathasvideo";
|
||||
public static final String CATHASAPP = "cathasapp";
|
||||
public static final String TERMFREQUENCY = "tf";
|
||||
public static final String LANGUAGE = "language"; // ranking of preferred language
|
||||
|
||||
// post-sort predicates
|
||||
public static final String URLCOMPINTOPLIST = "urlcompintoplist";
|
||||
|
@ -74,7 +75,7 @@ public class plasmaSearchRankingProfile {
|
|||
coeff_appurl, coeff_app_dc_title, coeff_app_dc_creator, coeff_app_dc_subject, coeff_app_dc_description, coeff_appemph,
|
||||
coeff_catindexof, coeff_cathasimage, coeff_cathasaudio, coeff_cathasvideo, coeff_cathasapp,
|
||||
coeff_urlcompintoplist, coeff_descrcompintoplist, coeff_prefer,
|
||||
coeff_termfrequency;
|
||||
coeff_termfrequency, coeff_language;
|
||||
|
||||
public plasmaSearchRankingProfile(final int mediatype) {
|
||||
// set default-values
|
||||
|
@ -109,6 +110,7 @@ public class plasmaSearchRankingProfile {
|
|||
coeff_urlcompintoplist = 3;
|
||||
coeff_descrcompintoplist = 2;
|
||||
coeff_prefer = 14;
|
||||
coeff_language = 13;
|
||||
}
|
||||
|
||||
public plasmaSearchRankingProfile(final String prefix, final String profile) {
|
||||
|
@ -160,6 +162,7 @@ public class plasmaSearchRankingProfile {
|
|||
coeff_urlcompintoplist = parseMap(coeff, URLCOMPINTOPLIST, coeff_urlcompintoplist);
|
||||
coeff_descrcompintoplist = parseMap(coeff, DESCRCOMPINTOPLIST, coeff_descrcompintoplist);
|
||||
coeff_prefer = parseMap(coeff, PREFER, coeff_prefer);
|
||||
coeff_language = parseMap(coeff, LANGUAGE, coeff_language);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -209,6 +212,7 @@ public class plasmaSearchRankingProfile {
|
|||
ext.put(prefix + CATHASVIDEO, Integer.toString(coeff_cathasvideo));
|
||||
ext.put(prefix + CATHASAPP, Integer.toString(coeff_cathasapp));
|
||||
ext.put(prefix + TERMFREQUENCY, Integer.toString(coeff_termfrequency));
|
||||
ext.put(prefix + LANGUAGE, Integer.toString(coeff_language));
|
||||
return ext;
|
||||
}
|
||||
|
||||
|
|
|
@ -819,12 +819,29 @@ public final class plasmaWordIndex implements indexRI {
|
|||
final yacyURL referrerURL = entry.referrerURL();
|
||||
final Date docDate = entry.getModificationDate();
|
||||
String language = condenser.language();
|
||||
String bymetadata = document.languageByMetadata(); // the languageByMetadata may return null if there was no declaration
|
||||
if (language == null) {
|
||||
System.out.println("*** DEBUG LANGUAGE: identification of " + entry.url() + " FAILED, taking TLD");
|
||||
language = entry.url().language();
|
||||
language = (bymetadata == null) ? entry.url().language() : bymetadata;
|
||||
System.out.println("*** DEBUG LANGUAGE-BY-STATISTICS: " + entry.url() + " FAILED, taking " + ((bymetadata == null) ? "TLD" : "metadata") + ": " + language);
|
||||
} else {
|
||||
System.out.println("*** DEBUG LANGUAGE: identification of " + entry.url() + " SUCCESS: " + language);
|
||||
if (language.equals("pl")) language = entry.url().language(); // patch a bug TODO: remove this if bug is fixed
|
||||
if (language.equals("pl")) {
|
||||
System.out.println("*** DEBUG LANGUAGE-BY-STATISTICS: " + entry.url() + " HAS BUG: " + language);
|
||||
language = (bymetadata == null) ? entry.url().language() : bymetadata; // extra handling of this case: overwrite with bymetadata
|
||||
} else {
|
||||
if (bymetadata == null) {
|
||||
if (language.equals(entry.url().language()))
|
||||
System.out.println("*** DEBUG LANGUAGE-BY-STATISTICS: " + entry.url() + " CONFIRMED - TLD IDENTICAL: " + language);
|
||||
else {
|
||||
System.out.println("*** DEBUG LANGUAGE-BY-STATISTICS: " + entry.url() + " CONFLICTING: " + language + " (the language given by the TLD is " + entry.url().language() + ")");
|
||||
language = entry.url().language();
|
||||
}
|
||||
} else {
|
||||
if (language.equals(bymetadata))
|
||||
System.out.println("*** DEBUG LANGUAGE-BY-STATISTICS: " + entry.url() + " CONFIRMED - METADATA IDENTICAL: " + language);
|
||||
else
|
||||
System.out.println("*** DEBUG LANGUAGE-BY-STATISTICS: " + entry.url() + " CONFLICTING: " + language + " (the language given by metadata is " + bymetadata + ")");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// create a new loaded URL db entry
|
||||
|
|
197
source/de/anomic/tools/iso639.java
Executable file
197
source/de/anomic/tools/iso639.java
Executable file
|
@ -0,0 +1,197 @@
|
|||
// iso639.java
|
||||
// (C) 2008 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany
|
||||
// first published 19.09.2008 on http://yacy.net
|
||||
//
|
||||
// This is a part of YaCy, a peer-to-peer based web search engine
|
||||
//
|
||||
// $LastChangedDate: 2006-04-02 22:40:07 +0200 (So, 02 Apr 2006) $
|
||||
// $LastChangedRevision: 1986 $
|
||||
// $LastChangedBy: orbiter $
|
||||
//
|
||||
// LICENSE
|
||||
//
|
||||
// This program is free software; you can redistribute it and/or modify
|
||||
// it under the terms of the GNU General Public License as published by
|
||||
// the Free Software Foundation; either version 2 of the License, or
|
||||
// (at your option) any later version.
|
||||
//
|
||||
// This program is distributed in the hope that it will be useful,
|
||||
// but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
// GNU General Public License for more details.
|
||||
//
|
||||
// You should have received a copy of the GNU General Public License
|
||||
// along with this program; if not, write to the Free Software
|
||||
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
||||
|
||||
package de.anomic.tools;
|
||||
|
||||
import java.util.HashMap;
|
||||
|
||||
/**
 * Lookup table for ISO 639 alpha-2 language codes.
 * Note: despite the method name {@code country()} (kept for interface
 * compatibility), the entries are LANGUAGE codes (ISO 639), not country
 * codes (that would be ISO 3166). The English names follow the legacy
 * two-letter table this class was transcribed from.
 * Thread-safe for reads: the table is built once in a static initializer
 * and never modified afterwards.
 */
public class iso639 {

    // each entry is "<alpha-2 code>-<English language name>"
    static final String[] codes = {
        "aa-Afar",
        "ab-Abkhazian",
        "af-Afrikaans",
        "am-Amharic",
        "ar-Arabic",
        "as-Assamese",
        "ay-Aymara",
        "az-Azerbaijani",
        "ba-Bashkir",
        "be-Byelorussian",
        "bg-Bulgarian",
        "bh-Bihari",
        "bi-Bislama",
        "bn-Bengali;-Bangla",
        "bo-Tibetan",
        "br-Breton",
        "ca-Catalan",
        "co-Corsican",
        "cs-Czech",
        "cy-Welsh",
        "da-Danish",
        "de-German",
        "dz-Bhutani",
        "el-Greek",
        "en-English",
        "eo-Esperanto",
        "es-Spanish",
        "et-Estonian",
        "eu-Basque",
        "fa-Persian",
        "fi-Finnish",
        "fj-Fiji",
        "fo-Faeroese",
        "fr-French",
        "fy-Frisian",
        "ga-Irish",
        "gd-Scots-Gaelic",
        "gl-Galician",
        "gn-Guarani",
        "gu-Gujarati",
        "ha-Hausa",
        "hi-Hindi",
        "hr-Croatian",
        "hu-Hungarian",
        "hy-Armenian",
        "ia-Interlingua",
        "ie-Interlingue",
        "ik-Inupiak",
        "in-Indonesian",
        "is-Icelandic",
        "it-Italian",
        "iw-Hebrew",
        "ja-Japanese",
        "ji-Yiddish",
        "jw-Javanese",
        "ka-Georgian",
        "kk-Kazakh",
        "kl-Greenlandic",
        "km-Cambodian",
        "kn-Kannada",
        "ko-Korean",
        "ks-Kashmiri",
        "ku-Kurdish",
        "ky-Kirghiz",
        "la-Latin",
        "ln-Lingala",
        "lo-Laothian",
        "lt-Lithuanian",
        "lv-Latvian,-Lettish",
        "mg-Malagasy",
        "mi-Maori",
        "mk-Macedonian",
        "ml-Malayalam",
        "mn-Mongolian",
        "mo-Moldavian",
        "mr-Marathi",
        "ms-Malay",
        "mt-Maltese",
        "my-Burmese",
        "na-Nauru",
        "ne-Nepali",
        "nl-Dutch",
        "no-Norwegian",
        "oc-Occitan",
        "om-(Afan)-Oromo",
        "or-Oriya",
        "pa-Punjabi",
        "pl-Polish",
        "ps-Pashto,-Pushto",
        "pt-Portuguese",
        "qu-Quechua",
        "rm-Rhaeto-Romance",
        "rn-Kirundi",
        "ro-Romanian",
        "ru-Russian",
        "rw-Kinyarwanda",
        "sa-Sanskrit",
        "sd-Sindhi",
        "sg-Sangro",
        "sh-Serbo-Croatian",
        "si-Singhalese",
        "sk-Slovak",
        "sl-Slovenian",
        "sm-Samoan",
        "sn-Shona",
        "so-Somali",
        "sq-Albanian",
        "sr-Serbian",
        "ss-Siswati",
        "st-Sesotho",
        "su-Sundanese",
        "sv-Swedish",
        "sw-Swahili",
        "ta-Tamil",
        "te-Tegulu",
        "tg-Tajik",
        "th-Thai",
        "ti-Tigrinya",
        "tk-Turkmen",
        "tl-Tagalog",
        "tn-Setswana",
        "to-Tonga",
        "tr-Turkish",
        "ts-Tsonga",
        "tt-Tatar",
        "tw-Twi",
        "uk-Ukrainian",
        "ur-Urdu",
        "uz-Uzbek",
        "vi-Vietnamese",
        "vo-Volapuk",
        "wo-Wolof",
        "xh-Xhosa",
        "yo-Yoruba",
        "zh-Chinese",
        "zu-Zulu"};

    // code -> name map, built once from the table above; final so the
    // reference can never be swapped out after class initialization
    static final HashMap<String, String> mapping = new HashMap<String, String>();

    static {
        for (int i = 0; i < codes.length; i++) {
            // "de-German" -> key "de", value "German"
            mapping.put(codes[i].substring(0, 2), codes[i].substring(3));
        }
    }

    /**
     * Get the English name of a language given by its ISO 639 alpha-2 code.
     * (The method name says "country" for historical reasons; this is a
     * language lookup.)
     * @param code the alpha-2 language code, case-insensitive; may be null
     * @return the English language name, or null if the code is unknown or null
     */
    public static final String country(String code) {
        if (code == null) return null;
        return mapping.get(code.toLowerCase());
    }

    /**
     * Check whether a given ISO 639 alpha-2 language code exists in the table.
     * @param code the alpha-2 language code, case-insensitive; may be null
     * @return true if the code exists, false otherwise (including null input)
     */
    public static final boolean exists(String code) {
        if (code == null) return false;
        return mapping.containsKey(code.toLowerCase());
    }

}
|
|
@ -848,7 +848,7 @@ public class yacyURL implements Serializable {
|
|||
|
||||
// language calculation
|
||||
public String language() {
|
||||
String language = "uk";
|
||||
String language = "en";
|
||||
final int pos = host.lastIndexOf(".");
|
||||
if ((pos > 0) && (host.length() - pos == 3)) language = host.substring(pos + 1).toLowerCase();
|
||||
return language;
|
||||
|
|
Loading…
Reference in New Issue
Block a user