- added language detection using metadata from documents: html and odt documents provide this information

- metadata and results from statistical analysis are compared and result is printed out as debug lines
- added ranking profile for wanted language
- added class with ISO 639 table, a list of all valid language codes that will be used for the language identification

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@5187 6c8d7289-2bf4-0310-a012-ef5d649a1542
This commit is contained in:
orbiter 2008-09-19 22:19:11 +00:00
parent 3768a1bd32
commit bfcf9b7aa3
24 changed files with 308 additions and 25 deletions

View File

@ -75,6 +75,7 @@ public class Ranking_p {
rankingParameters.put(plasmaSearchRankingProfile.WORDSINTEXT, "Words In Text");
rankingParameters.put(plasmaSearchRankingProfile.WORDSINTITLE, "Words In Title");
rankingParameters.put(plasmaSearchRankingProfile.YBR, "YaCy Block Rank");
rankingParameters.put(plasmaSearchRankingProfile.LANGUAGE, "Preferred Language");
}
private static serverObjects defaultValues() {

View File

@ -29,6 +29,7 @@ package xml.util;
import java.io.IOException;
import java.io.Writer;
import java.net.MalformedURLException;
import java.util.Set;
import de.anomic.crawler.HTTPLoader;
import de.anomic.htmlFilter.htmlFilterContentScraper;
@ -103,8 +104,9 @@ public class getpageinfo_p {
prop.put("tags", count);
// put description
prop.putHTML("desc", scraper.getDescription(), true);
// put language
prop.putHTML("lang", scraper.getContentLanguages()[0], true);
// put language
Set<String> languages = scraper.getContentLanguages();
prop.putHTML("lang", (languages == null) ? "unknown" : languages.iterator().next(), true);
} catch (final MalformedURLException e) { /* ignore this */
} catch (final IOException e) { /* ignore this */

View File

@ -50,6 +50,7 @@ import de.anomic.http.httpRequestHeader;
import de.anomic.server.serverCharBuffer;
import de.anomic.server.serverFileUtils;
import de.anomic.yacy.yacyURL;
import de.anomic.tools.iso639;
public class htmlFilterContentScraper extends htmlFilterAbstractScraper implements htmlFilterScraper {
@ -381,11 +382,21 @@ public class htmlFilterContentScraper extends htmlFilterAbstractScraper implemen
return s;
}
public String[] getContentLanguages() {
public HashSet<String> getContentLanguages() {
String s = metas.get("content-language");
if (s == null) s = metas.get("dc.language");
if (s == null) s = "";
return s.split(" |,");
if (s == null) return null;
HashSet<String> hs = new HashSet<String>();
String[] cl = s.split(" |,");
int p;
for (int i = 0; i < cl.length; i++) {
cl[i] = cl[i].toLowerCase();
p = cl[i].indexOf('-');
if (p > 0) cl[i] = cl[i].substring(0, p);
if (iso639.exists(cl[i])) hs.add(cl[i]);
}
if (hs.size() == 0) return null;
return hs;
}
public String[] getKeywords() {

View File

@ -118,6 +118,7 @@ public final class indexContainerHeap {
int urlCount = 0;
synchronized (cache) {
for (final indexContainer container : new heapFileEntries(heapFile, this.payloadrow)) {
// TODO: in this loop a lot of memory may be allocated. A check whether memory is getting low is necessary. But what should be done when the memory is low?
if (container == null) break;
cache.put(container.getWordHash(), container);
urlCount += container.size();
@ -252,6 +253,10 @@ public final class indexContainerHeap {
}
}
/**
* return an index container
* because they may get very large, it is wise to deallocate some memory before calling next()
*/
public indexContainer next() {
final indexContainer n = this.nextContainer;
this.nextContainer = next0();

View File

@ -70,6 +70,7 @@ public class docParser extends AbstractParser implements Parser {
mimeType,
"UTF-8",
null,
null,
((contents.length() > 80)? contents.substring(0, 80):contents.trim()).
replaceAll("\r\n"," ").
replaceAll("\n"," ").

View File

@ -32,7 +32,9 @@ import java.io.OutputStreamWriter;
import java.io.Writer;
import java.nio.charset.Charset;
import java.util.Enumeration;
import java.util.HashSet;
import java.util.Hashtable;
import java.util.Set;
import java.util.zip.ZipEntry;
import java.util.zip.ZipFile;
@ -89,6 +91,7 @@ public class odtParser extends AbstractParser implements Parser {
String docShortTitle = null;
String docLongTitle = null;
String docAuthor = null;
String docLanguage = null;
// opening the file as zip file
final ZipFile zipFile= new ZipFile(dest);
@ -134,9 +137,14 @@ public class odtParser extends AbstractParser implements Parser {
docShortTitle = metaData.getTitle();
docLongTitle = metaData.getSubject();
docAuthor = metaData.getCreator();
docLanguage = metaData.getLanguage();
}
}
// make the languages set
Set<String> languages = new HashSet<String>(1);
if (docLanguage != null) languages.add(docLanguage);
// if there is no title available we generate one
if (docLongTitle == null) {
if (docShortTitle != null) {
@ -156,6 +164,7 @@ public class odtParser extends AbstractParser implements Parser {
location,
mimeType,
"UTF-8",
languages,
docKeywords,
docLongTitle,
docAuthor,
@ -169,6 +178,7 @@ public class odtParser extends AbstractParser implements Parser {
location,
mimeType,
"UTF-8",
languages,
docKeywords,
docLongTitle,
docAuthor,

View File

@ -143,6 +143,7 @@ public class pdfParser extends AbstractParser implements Parser {
location,
mimeType,
"UTF-8",
null,
docKeywords,
(docTitle == null) ? docSubject : docTitle,
docAuthor,
@ -156,6 +157,7 @@ public class pdfParser extends AbstractParser implements Parser {
location,
mimeType,
"UTF-8",
null,
docKeywords,
(docTitle == null) ? docSubject : docTitle,
docAuthor,

View File

@ -88,6 +88,7 @@ public class pptParser extends AbstractParser implements Parser {
mimeType,
"UTF-8",
null,
null,
((contents.length() > 80) ? contents.substring(0, 80) : contents.trim()).
replaceAll("\r\n"," ").
replaceAll("\n"," ").

View File

@ -117,6 +117,7 @@ public class psParser extends AbstractParser implements Parser {
"UTF-8",
null,
null,
null,
"",
null,
null,

View File

@ -134,6 +134,7 @@ public class rpmParser extends AbstractParser implements Parser {
mimeType,
"UTF-8",
null,
null,
summary,
packager,
null,

View File

@ -158,6 +158,7 @@ public class rssParser extends AbstractParser implements Parser {
mimeType,
"UTF-8",
null,
null,
feedTitle,
(authors.length() > 0)?authors.toString(1,authors.length()):"",
feedSections.toArray(new String[feedSections.size()]),

View File

@ -75,6 +75,7 @@ public class rtfParser extends AbstractParser implements Parser {
mimeType,
"UTF-8",
null,
null,
((bodyText.length() > 80)? bodyText.substring(0, 80):bodyText.trim()).
replaceAll("\r\n"," ").
replaceAll("\n"," ").

View File

@ -63,7 +63,7 @@ public class sevenzipParser extends AbstractParser implements Parser {
public plasmaParserDocument parse(final yacyURL location, final String mimeType, final String charset,
final IInStream source, final long maxRamSize) throws ParserException, InterruptedException {
final plasmaParserDocument doc = new plasmaParserDocument(location, mimeType, charset);
final plasmaParserDocument doc = new plasmaParserDocument(location, mimeType, charset, null);
Handler archive;
super.theLogger.logFine("opening 7zip archive...");
try {

View File

@ -107,6 +107,7 @@ public class swfParser extends AbstractParser implements Parser {
location, // url of the source document
mimeType, // the documents mime type
"UTF-8", // charset of the document text
null,
null, //keywords
((contents.length() > 80)? contents.substring(0, 80):contents.trim()).
replaceAll("\r\n"," ").

View File

@ -188,6 +188,7 @@ public class tarParser extends AbstractParser implements Parser {
location,
mimeType,
null,
null,
docKeywords.toString().split(" |,"),
docLongTitle.toString(),
"", // TODO: AUTHOR
@ -201,6 +202,7 @@ public class tarParser extends AbstractParser implements Parser {
location,
mimeType,
null,
null,
docKeywords.toString().split(" |,"),
docLongTitle.toString(),
"", // TODO: AUTHOR

View File

@ -78,7 +78,7 @@ public class vcfParser extends AbstractParser implements Parser {
return SUPPORTED_MIME_TYPES;
}
public plasmaParserDocument parse(final yacyURL location, final String mimeType, final String charset, final InputStream source) throws ParserException, InterruptedException {
public plasmaParserDocument parse(final yacyURL url, final String mimeType, final String charset, final InputStream source) throws ParserException, InterruptedException {
try {
final StringBuffer parsedTitle = new StringBuffer();
@ -213,7 +213,7 @@ public class vcfParser extends AbstractParser implements Parser {
} else {
if (theLogger.isFinest()) this.theLogger.logFinest("Invalid data in vcf file" +
"\n\tURL: " + location +
"\n\tURL: " + url +
"\n\tLine: " + line +
"\n\tLine-Nr: " + lineNr);
}
@ -222,10 +222,11 @@ public class vcfParser extends AbstractParser implements Parser {
final String[] sections = parsedNames.toArray(new String[parsedNames.size()]);
final byte[] text = parsedDataText.toString().getBytes();
final plasmaParserDocument theDoc = new plasmaParserDocument(
location, // url of the source document
url, // url of the source document
mimeType, // the documents mime type
null,
null, // a list of extracted keywords
null, // the language
parsedTitle.toString(), // a long document title
"", // TODO: AUTHOR
sections, // an array of section headlines
@ -238,7 +239,7 @@ public class vcfParser extends AbstractParser implements Parser {
if (e instanceof InterruptedException) throw (InterruptedException) e;
if (e instanceof ParserException) throw (ParserException) e;
throw new ParserException("Unexpected error while parsing vcf resource. " + e.getMessage(),location);
throw new ParserException("Unexpected error while parsing vcf resource. " + e.getMessage(),url);
}
}

View File

@ -116,6 +116,7 @@ public class xlsParser extends AbstractParser implements Parser, HSSFListener {
mimeType,
"UTF-8",
null,
null,
((contents.length() > 80) ? contents.substring(0, 80) : contents.trim()).
replaceAll("\r\n"," ").
replaceAll("\n"," ").

View File

@ -172,6 +172,7 @@ public class zipParser extends AbstractParser implements Parser {
location,
mimeType,
null,
null,
docKeywords.toString().split(" |,"),
docLongTitle.toString(),
"", // TODO: AUTHOR
@ -185,6 +186,7 @@ public class zipParser extends AbstractParser implements Parser {
location,
mimeType,
null,
null,
docKeywords.toString().split(" |,"),
docLongTitle.toString(),
"", // TODO: AUTHOR

View File

@ -727,6 +727,7 @@ public final class plasmaParser {
location,
mimeType,
charSet,
scraper.getContentLanguages(),
scraper.getKeywords(),
scraper.getTitle(),
scraper.getAuthor(),

View File

@ -36,6 +36,7 @@ import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.TreeSet;
import de.anomic.htmlFilter.htmlFilterContentScraper;
@ -67,8 +68,9 @@ public class plasmaParserDocument {
private boolean resorted;
private InputStream textStream;
private int inboundLinks, outboundLinks; // counters for inbound and outbound links, are counted after calling notifyWebStructure
private Set<String> languages;
protected plasmaParserDocument(final yacyURL location, final String mimeType, final String charset,
protected plasmaParserDocument(final yacyURL location, final String mimeType, final String charset, final Set<String> languages,
final String[] keywords, final String title, final String author,
final String[] sections, final String abstrct,
final Object text, final Map<yacyURL, String> anchors, final HashMap<String, htmlFilterImageEntry> images) {
@ -90,6 +92,7 @@ public class plasmaParserDocument {
this.resorted = false;
this.inboundLinks = -1;
this.outboundLinks = -1;
this.languages = languages;
if (text == null) try {
this.text = new serverCachedFileOutputStream(Parser.MAX_KEEP_IN_MEMORY_SIZE);
@ -101,31 +104,48 @@ public class plasmaParserDocument {
}
}
public plasmaParserDocument(final yacyURL location, final String mimeType, final String charset) {
this(location, mimeType, charset, null, null, null, null, null, (Object)null, null, null);
public plasmaParserDocument(final yacyURL location, final String mimeType, final String charset, final Set<String> languages) {
this(location, mimeType, charset, languages, null, null, null, null, null, (Object)null, null, null);
}
public plasmaParserDocument(final yacyURL location, final String mimeType, final String charset,
public plasmaParserDocument(final yacyURL location, final String mimeType, final String charset, final Set<String> languages,
final String[] keywords, final String title, final String author,
final String[] sections, final String abstrct,
final byte[] text, final Map<yacyURL, String> anchors, final HashMap<String, htmlFilterImageEntry> images) {
this(location, mimeType, charset, keywords, title, author, sections, abstrct, (Object)text, anchors, images);
this(location, mimeType, charset, languages, keywords, title, author, sections, abstrct, (Object)text, anchors, images);
}
public plasmaParserDocument(final yacyURL location, final String mimeType, final String charset,
public plasmaParserDocument(final yacyURL location, final String mimeType, final String charset, final Set<String> languages,
final String[] keywords, final String title, final String author,
final String[] sections, final String abstrct,
final File text, final Map<yacyURL, String> anchors, final HashMap<String, htmlFilterImageEntry> images) {
this(location, mimeType, charset, keywords, title, author, sections, abstrct, (Object)text, anchors, images);
this(location, mimeType, charset, languages, keywords, title, author, sections, abstrct, (Object)text, anchors, images);
}
public plasmaParserDocument(final yacyURL location, final String mimeType, final String charset,
public plasmaParserDocument(final yacyURL location, final String mimeType, final String charset, final Set<String> languages,
final String[] keywords, final String title, final String author,
final String[] sections, final String abstrct,
final serverCachedFileOutputStream text, final Map<yacyURL, String> anchors, final HashMap<String, htmlFilterImageEntry> images) {
this(location, mimeType, charset, keywords, title, author, sections, abstrct, (Object)text, anchors, images);
this(location, mimeType, charset, languages, keywords, title, author, sections, abstrct, (Object)text, anchors, images);
}
/**
 * pick a single language for this document using only the metadata that came with the document;
 * no statistical analysis of the content is done here.
 * - no declared languages (null or empty set): null is returned
 * - exactly one declared language: that language is returned
 * - several declared languages: the language of the source url (derived from its TLD) is returned
 *   if it is among the declared ones, otherwise any one of the declared languages is returned
 * @return a language as alpha-2 code of ISO 639, or null if there is no language metadata
 */
public String languageByMetadata() {
    if (this.languages == null || this.languages.size() == 0) return null;
    if (this.languages.size() > 1) {
        // several candidates: let the language of the source url decide if it is one of them
        final String urlLanguage = this.source.language();
        if (this.languages.contains(urlLanguage)) return urlLanguage;
        // the declared languages all differ from the url language; fall through and just pick one
    }
    return this.languages.iterator().next();
}
/*
DC according to rfc 5013

View File

@ -57,6 +57,7 @@ public class plasmaSearchRankingProfile {
public static final String CATHASVIDEO = "cathasvideo";
public static final String CATHASAPP = "cathasapp";
public static final String TERMFREQUENCY = "tf";
public static final String LANGUAGE = "language"; // ranking of preferred language
// post-sort predicates
public static final String URLCOMPINTOPLIST = "urlcompintoplist";
@ -74,7 +75,7 @@ public class plasmaSearchRankingProfile {
coeff_appurl, coeff_app_dc_title, coeff_app_dc_creator, coeff_app_dc_subject, coeff_app_dc_description, coeff_appemph,
coeff_catindexof, coeff_cathasimage, coeff_cathasaudio, coeff_cathasvideo, coeff_cathasapp,
coeff_urlcompintoplist, coeff_descrcompintoplist, coeff_prefer,
coeff_termfrequency;
coeff_termfrequency, coeff_language;
public plasmaSearchRankingProfile(final int mediatype) {
// set default-values
@ -109,6 +110,7 @@ public class plasmaSearchRankingProfile {
coeff_urlcompintoplist = 3;
coeff_descrcompintoplist = 2;
coeff_prefer = 14;
coeff_language = 13;
}
public plasmaSearchRankingProfile(final String prefix, final String profile) {
@ -160,6 +162,7 @@ public class plasmaSearchRankingProfile {
coeff_urlcompintoplist = parseMap(coeff, URLCOMPINTOPLIST, coeff_urlcompintoplist);
coeff_descrcompintoplist = parseMap(coeff, DESCRCOMPINTOPLIST, coeff_descrcompintoplist);
coeff_prefer = parseMap(coeff, PREFER, coeff_prefer);
coeff_language = parseMap(coeff, LANGUAGE, coeff_language);
}
}
@ -209,6 +212,7 @@ public class plasmaSearchRankingProfile {
ext.put(prefix + CATHASVIDEO, Integer.toString(coeff_cathasvideo));
ext.put(prefix + CATHASAPP, Integer.toString(coeff_cathasapp));
ext.put(prefix + TERMFREQUENCY, Integer.toString(coeff_termfrequency));
ext.put(prefix + LANGUAGE, Integer.toString(coeff_language));
return ext;
}

View File

@ -819,12 +819,29 @@ public final class plasmaWordIndex implements indexRI {
final yacyURL referrerURL = entry.referrerURL();
final Date docDate = entry.getModificationDate();
String language = condenser.language();
String bymetadata = document.languageByMetadata(); // the languageByMetadata may return null if there was no declaration
if (language == null) {
System.out.println("*** DEBUG LANGUAGE: identification of " + entry.url() + " FAILED, taking TLD");
language = entry.url().language();
language = (bymetadata == null) ? entry.url().language() : bymetadata;
System.out.println("*** DEBUG LANGUAGE-BY-STATISTICS: " + entry.url() + " FAILED, taking " + ((bymetadata == null) ? "TLD" : "metadata") + ": " + language);
} else {
System.out.println("*** DEBUG LANGUAGE: identification of " + entry.url() + " SUCCESS: " + language);
if (language.equals("pl")) language = entry.url().language(); // patch a bug TODO: remove this if bug is fixed
if (language.equals("pl")) {
System.out.println("*** DEBUG LANGUAGE-BY-STATISTICS: " + entry.url() + " HAS BUG: " + language);
language = (bymetadata == null) ? entry.url().language() : bymetadata; // extra handling of this case: overwrite with bymetadata
} else {
if (bymetadata == null) {
if (language.equals(entry.url().language()))
System.out.println("*** DEBUG LANGUAGE-BY-STATISTICS: " + entry.url() + " CONFIRMED - TLD IDENTICAL: " + language);
else {
System.out.println("*** DEBUG LANGUAGE-BY-STATISTICS: " + entry.url() + " CONFLICTING: " + language + " (the language given by the TLD is " + entry.url().language() + ")");
language = entry.url().language();
}
} else {
if (language.equals(bymetadata))
System.out.println("*** DEBUG LANGUAGE-BY-STATISTICS: " + entry.url() + " CONFIRMED - METADATA IDENTICAL: " + language);
else
System.out.println("*** DEBUG LANGUAGE-BY-STATISTICS: " + entry.url() + " CONFLICTING: " + language + " (the language given by metadata is " + bymetadata + ")");
}
}
}
// create a new loaded URL db entry

View File

@ -0,0 +1,197 @@
// iso639.java
// (C) 2008 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany
// first published 19.09.2008 on http://yacy.net
//
// This is a part of YaCy, a peer-to-peer based web search engine
//
// $LastChangedDate: 2006-04-02 22:40:07 +0200 (So, 02 Apr 2006) $
// $LastChangedRevision: 1986 $
// $LastChangedBy: orbiter $
//
// LICENSE
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
package de.anomic.tools;
import java.util.HashMap;
import java.util.Locale;
/**
 * Static lookup table for ISO 639 alpha-2 language codes.
 * The table maps the two-letter code (lower case) to the name of the language.
 * Lookups are case-insensitive and do not depend on the default locale of the JVM.
 */
public class iso639 {

    /** raw table; every entry has the form "&lt;alpha-2 code&gt;-&lt;language name&gt;" */
    static final String[] codes = {
        "aa-Afar",
        "ab-Abkhazian",
        "af-Afrikaans",
        "am-Amharic",
        "ar-Arabic",
        "as-Assamese",
        "ay-Aymara",
        "az-Azerbaijani",
        "ba-Bashkir",
        "be-Byelorussian",
        "bg-Bulgarian",
        "bh-Bihari",
        "bi-Bislama",
        "bn-Bengali;-Bangla",
        "bo-Tibetan",
        "br-Breton",
        "ca-Catalan",
        "co-Corsican",
        "cs-Czech",
        "cy-Welsh",
        "da-Danish",
        "de-German",
        "dz-Bhutani",
        "el-Greek",
        "en-English",
        "eo-Esperanto",
        "es-Spanish",
        "et-Estonian",
        "eu-Basque",
        "fa-Persian",
        "fi-Finnish",
        "fj-Fiji",
        "fo-Faeroese",
        "fr-French",
        "fy-Frisian",
        "ga-Irish",
        "gd-Scots-Gaelic",
        "gl-Galician",
        "gn-Guarani",
        "gu-Gujarati",
        "ha-Hausa",
        "hi-Hindi",
        "hr-Croatian",
        "hu-Hungarian",
        "hy-Armenian",
        "ia-Interlingua",
        "ie-Interlingue",
        "ik-Inupiak",
        "in-Indonesian",
        "is-Icelandic",
        "it-Italian",
        "iw-Hebrew",
        "ja-Japanese",
        "ji-Yiddish",
        "jw-Javanese",
        "ka-Georgian",
        "kk-Kazakh",
        "kl-Greenlandic",
        "km-Cambodian",
        "kn-Kannada",
        "ko-Korean",
        "ks-Kashmiri",
        "ku-Kurdish",
        "ky-Kirghiz",
        "la-Latin",
        "ln-Lingala",
        "lo-Laothian",
        "lt-Lithuanian",
        "lv-Latvian,-Lettish",
        "mg-Malagasy",
        "mi-Maori",
        "mk-Macedonian",
        "ml-Malayalam",
        "mn-Mongolian",
        "mo-Moldavian",
        "mr-Marathi",
        "ms-Malay",
        "mt-Maltese",
        "my-Burmese",
        "na-Nauru",
        "ne-Nepali",
        "nl-Dutch",
        "no-Norwegian",
        "oc-Occitan",
        "om-(Afan)-Oromo",
        "or-Oriya",
        "pa-Punjabi",
        "pl-Polish",
        "ps-Pashto,-Pushto",
        "pt-Portuguese",
        "qu-Quechua",
        "rm-Rhaeto-Romance",
        "rn-Kirundi",
        "ro-Romanian",
        "ru-Russian",
        "rw-Kinyarwanda",
        "sa-Sanskrit",
        "sd-Sindhi",
        "sg-Sangro",
        "sh-Serbo-Croatian",
        "si-Singhalese",
        "sk-Slovak",
        "sl-Slovenian",
        "sm-Samoan",
        "sn-Shona",
        "so-Somali",
        "sq-Albanian",
        "sr-Serbian",
        "ss-Siswati",
        "st-Sesotho",
        "su-Sundanese",
        "sv-Swedish",
        "sw-Swahili",
        "ta-Tamil",
        "te-Tegulu",
        "tg-Tajik",
        "th-Thai",
        "ti-Tigrinya",
        "tk-Turkmen",
        "tl-Tagalog",
        "tn-Setswana",
        "to-Tonga",
        "tr-Turkish",
        "ts-Tsonga",
        "tt-Tatar",
        "tw-Twi",
        "uk-Ukrainian",
        "ur-Urdu",
        "uz-Uzbek",
        "vi-Vietnamese",
        "vo-Volapuk",
        "wo-Wolof",
        "xh-Xhosa",
        "yo-Yoruba",
        "zh-Chinese",
        "zu-Zulu"};

    /** lookup from the lower-case alpha-2 code to the language name; filled once at class load */
    static final HashMap<String, String> mapping = new HashMap<String, String>();

    static {
        // split every table entry at the fixed position: two code letters, one separator, then the name
        for (int i = 0; i < codes.length; i++) {
            mapping.put(codes[i].substring(0, 2), codes[i].substring(3));
        }
    }

    /**
     * get the name of the language that belongs to an alpha-2 language code
     * (the method name is historical; ISO 639 codes identify languages, not countries)
     * @param code the alpha-2 mnemonic of the language, in any letter case; may be null
     * @return the name of the language, or null if the code is null or not in the table
     */
    public static final String country(String code) {
        if (code == null) return null;
        // use a fixed locale: with a Turkish default locale "IT" would otherwise become "\u0131t"
        return mapping.get(code.toLowerCase(Locale.ENGLISH));
    }

    /**
     * see if the given alpha-2 language code exists in the ISO 639 table
     * @param code the alpha-2 mnemonic of the language, in any letter case; may be null
     * @return true if the code exists, false if it is null or unknown
     */
    public static final boolean exists(String code) {
        if (code == null) return false;
        // use a fixed locale: with a Turkish default locale "IT" would otherwise become "\u0131t"
        return mapping.containsKey(code.toLowerCase(Locale.ENGLISH));
    }
}

View File

@ -848,7 +848,7 @@ public class yacyURL implements Serializable {
// language calculation
public String language() {
String language = "uk";
String language = "en";
final int pos = host.lastIndexOf(".");
if ((pos > 0) && (host.length() - pos == 3)) language = host.substring(pos + 1).toLowerCase();
return language;