added new word recommendation library in DictionaryLoader_p.html

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@7913 6c8d7289-2bf4-0310-a012-ef5d649a1542
This commit is contained in:
orbiter 2011-09-01 10:14:17 +00:00
parent 1c007188ad
commit b5252ef91f
4 changed files with 196 additions and 75 deletions

View File

@ -94,6 +94,48 @@
</fieldset>
</form>
<form action="DictionaryLoader_p.html" method="post" enctype="multipart/form-data" accept-charset="UTF-8">
<fieldset>
<legend>Suggestions</legend>
Suggestion dictionaries will help YaCy to provide better suggestions during the input of search words
<h4><a href="http://www.ids-mannheim.de/kl/derewo/">DeReWo - Korpusbasierte Grund-/Wortformenlisten (German) of 'Institut f&uuml;r Deutsche Sprache'</a></h4>
<p>This file provides the 100000 most common German words for suggestions</p>
<dl>
<dt><label>Download from</label></dt>
<dd>#[drw0URL]#</dd>
<dt><label>Storage location</label></dt>
<dd>#[drw0Storage]#</dd>
<dt><label>Status</label></dt>
<dd>#(drw0Status)#<div class="info">not loaded</div>::<div class="commit">loaded</div>::deactivated#(/drw0Status)#</dd>
<dt>Action</dt>
<dd>#(drw0Status)#
<input type="submit" name="drw0Load" value="Load" />::
<input type="submit" name="drw0Deactivate" value="Deactivate" />
<input type="submit" name="drw0Remove" value="Remove" />::
<input type="submit" name="drw0Activate" value="Activate" />
<input type="submit" name="drw0Remove" value="Remove" />
#(/drw0Status)#</dd>
#(drw0ActionLoaded)#::
<dt>Result</dt><dd><div class="commit">loaded and activated dictionary file</div></dd>::
<dt>Result</dt><dd><div class="error">loading of dictionary file failed: #[error]#</div></dd>
#(/drw0ActionLoaded)#
#(drw0ActionRemoved)#::
<dt>Result</dt><dd><div class="commit">deactivated and removed dictionary file</div></dd>::
<dt>Result</dt><dd><div class="error">cannot remove dictionary file: #[error]#</div></dd>
#(/drw0ActionRemoved)#
#(drw0ActionDeactivated)#::
<dt>Result</dt><dd><div class="commit">deactivated dictionary file</div></dd>::
<dt>Result</dt><dd><div class="error">cannot deactivate dictionary file: #[error]#</div></dd>
#(/drw0ActionDeactivated)#
#(drw0ActionActivated)#::
<dt>Result</dt><dd><div class="commit">activated dictionary file</div></dd>::
<dt>Result</dt><dd><div class="error">cannot activate dictionary file: #[error]#</div></dd>
#(/drw0ActionActivated)#
</dl>
</fieldset>
</form>
#%env/templates/footer.template%#
</body>
</html>

View File

@ -140,6 +140,50 @@ public class DictionaryLoader_p {
prop.put("geo1ActionActivated", 1);
}
// DRW0
if (post.containsKey("drw0Load")) {
    // Download the DeReWo archive (CacheStrategy.NOCACHE), store it as the
    // dictionary source file, then rebuild the word list and the
    // "did you mean" library from it.
    final LibraryProvider.Dictionary derewoDict = LibraryProvider.Dictionary.DRW0;
    try {
        final DigestURI derewoSource = new DigestURI(derewoDict.url);
        final Response response = sb.loader.load(sb.loader.request(derewoSource, false, true), CacheStrategy.NOCACHE, Integer.MAX_VALUE, false);
        FileUtils.copy(response.getContent(), derewoDict.file());
        LibraryProvider.integrateDeReWo();
        LibraryProvider.initDidYouMean();
        if (derewoDict.file().exists()) {
            prop.put("drw0Status", 1);
        } else {
            prop.put("drw0Status", 0);
        }
        prop.put("drw0ActionLoaded", 1);
    } catch (final IOException e) {
        // Also covers MalformedURLException (a subclass of IOException);
        // both failures are reported to the template in the same way.
        Log.logException(e);
        prop.put("drw0ActionLoaded", 2);
        prop.put("drw0ActionLoaded_error", e.getMessage());
    }
}
if (post.containsKey("drw0Remove")) {
LibraryProvider.removeDeReWo();
LibraryProvider.initDidYouMean();
FileUtils.deletedelete(LibraryProvider.Dictionary.DRW0.file());
FileUtils.deletedelete(LibraryProvider.Dictionary.DRW0.fileDisabled());
prop.put("drw0ActionRemoved", 1);
}
if (post.containsKey("drw0Deactivate")) {
    // Deactivation renames the source archive to its ".disabled" name.
    // File.renameTo reports failure through its return value, so check it:
    // only on success is the generated word list removed and the
    // "did you mean" library reloaded. On failure nothing is changed and
    // the template's error branch is filled instead of claiming success.
    if (LibraryProvider.Dictionary.DRW0.file().renameTo(LibraryProvider.Dictionary.DRW0.fileDisabled())) {
        LibraryProvider.removeDeReWo();
        LibraryProvider.initDidYouMean();
        prop.put("drw0ActionDeactivated", 1);
    } else {
        prop.put("drw0ActionDeactivated", 2);
        prop.put("drw0ActionDeactivated_error", "could not rename " + LibraryProvider.Dictionary.DRW0.file() + " to " + LibraryProvider.Dictionary.DRW0.fileDisabled());
    }
}
if (post.containsKey("drw0Activate")) {
    // Activation renames the ".disabled" archive back to its active name.
    // File.renameTo reports failure through its return value, so check it:
    // only on success is the word list regenerated and the "did you mean"
    // library reloaded. On failure the template's error branch is filled
    // instead of claiming success.
    if (LibraryProvider.Dictionary.DRW0.fileDisabled().renameTo(LibraryProvider.Dictionary.DRW0.file())) {
        LibraryProvider.integrateDeReWo();
        LibraryProvider.initDidYouMean();
        prop.put("drw0ActionActivated", 1);
    } else {
        prop.put("drw0ActionActivated", 2);
        prop.put("drw0ActionActivated_error", "could not rename " + LibraryProvider.Dictionary.DRW0.fileDisabled() + " to " + LibraryProvider.Dictionary.DRW0.file());
    }
}
// check status again
for (final LibraryProvider.Dictionary dictionary: LibraryProvider.Dictionary.values()) {
prop.put(dictionary.nickname + "Status", dictionary.file().exists() ? 1 : dictionary.fileDisabled().exists() ? 2 : 0);

View File

@ -2,19 +2,19 @@
* LibraryProvider.java
* Copyright 2009 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany
* first published 01.10.2009 on http://yacy.net
*
*
* This file is part of YaCy Content Integration
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program in the file lgpl21.txt
* If not, see <http://www.gnu.org/licenses/>.
@ -25,56 +25,60 @@ package net.yacy.document;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.PrintWriter;
import java.net.MalformedURLException;
import java.util.ArrayList;
import java.util.List;
import java.util.Set;
import java.util.TreeSet;
import java.util.zip.ZipException;
import java.util.zip.ZipFile;
import net.yacy.cora.document.MultiProtocolURI;
import net.yacy.document.geolocalization.GeonamesLocalization;
import net.yacy.document.geolocalization.OpenGeoDBLocalization;
import net.yacy.document.geolocalization.OverarchingLocalization;
import net.yacy.kelondro.logging.Log;
import net.yacy.kelondro.util.FileUtils;
public class LibraryProvider {
private static final String path_to_source_dictionaries = "source";
private static final String path_to_did_you_mean_dictionaries = "didyoumean";
public static final String disabledExtension = ".disabled";
public static WordCache dymLib = new WordCache(null);
public static OverarchingLocalization geoLoc = new OverarchingLocalization();
private static File dictSource = null;
private static File dictRoot = null;
public static enum Dictionary {
GEODB0("geo0",
"http://downloads.sourceforge.net/project/opengeodb/Data/0.2.5a/opengeodb-0.2.5a-UTF8-sql.gz",
"opengeodb-0.2.5a-UTF8-sql.gz"),
GEODB1("geo1",
"http://fa-technik.adfc.de/code/opengeodb/dump/opengeodb-02621_2010-03-16.sql.gz",
"opengeodb-02621_2010-03-16.sql.gz"),
GEON0("geon0",
"http://download.geonames.org/export/dump/cities1000.zip",
"cities1000.zip");
GEODB0("geo0", "http://downloads.sourceforge.net/project/opengeodb/Data/0.2.5a/opengeodb-0.2.5a-UTF8-sql.gz"),
GEODB1("geo1", "http://fa-technik.adfc.de/code/opengeodb/dump/opengeodb-02621_2010-03-16.sql.gz"),
GEON0("geon0", "http://download.geonames.org/export/dump/cities1000.zip"),
DRW0("drw0", "http://www.ids-mannheim.de/kl/derewo/derewo-v-100000t-2009-04-30-0.1.zip");
public String nickname, url, filename;
private Dictionary(String nickname, String url, String filename) {
private Dictionary(final String nickname, final String url) {
try {
this.filename = new MultiProtocolURI(url).getFileName();
} catch (final MalformedURLException e) {
assert false;
}
this.nickname = nickname;
this.url = url;
this.filename = filename;
}
public File file() {
return new File(dictSource, filename);
return new File(dictSource, this.filename);
}
public File fileDisabled() {
return new File(dictSource, filename + disabledExtension);
return new File(dictSource, this.filename + disabledExtension);
}
}
@ -83,7 +87,7 @@ public class LibraryProvider {
* This assigns default paths, and initializes the dictionary classes
* Additionally, if default dictionaries are given in the source path,
* they are translated into the input format inside the DATA/DICTIONARIES directory
*
*
* @param pathToSource
* @param pathToDICTIONARIES
*/
@ -91,17 +95,17 @@ public class LibraryProvider {
dictSource = new File(rootPath, path_to_source_dictionaries);
if (!dictSource.exists()) dictSource.mkdirs();
dictRoot = rootPath;
// initialize libraries
integrateDeReWo();
initDidYouMean();
integrateOpenGeoDB();
integrateGeonames();
}
public static void integrateOpenGeoDB() {
File geo1 = Dictionary.GEODB1.file();
File geo0 = Dictionary.GEODB0.file();
final File geo1 = Dictionary.GEODB1.file();
final File geo0 = Dictionary.GEODB0.file();
if (geo1.exists()) {
if (geo0.exists()) geo0.renameTo(Dictionary.GEODB0.fileDisabled());
geoLoc.addLocalization(Dictionary.GEODB1.nickname, new OpenGeoDBLocalization(geo1, false));
@ -112,39 +116,45 @@ public class LibraryProvider {
return;
}
}
public static void integrateGeonames() {
File geon = Dictionary.GEON0.file();
final File geon = Dictionary.GEON0.file();
if (geon.exists()) {
geoLoc.addLocalization(Dictionary.GEON0.nickname, new GeonamesLocalization(geon));
return;
}
}
/**
 * (Re)creates the "did you mean" word cache.
 * The word directory below the dictionary root is created if it is missing,
 * and {@link #dymLib} is replaced by a new {@link WordCache} on that directory.
 */
public static void initDidYouMean() {
    final File wordDirectory = new File(dictRoot, path_to_did_you_mean_dictionaries);
    if (!wordDirectory.exists()) {
        wordDirectory.mkdirs();
    }
    dymLib = new WordCache(wordDirectory);
}
/**
 * Deletes the word list that was generated from the DeReWo source archive.
 * The generated file lives in the "did you mean" directory and is named
 * after the source archive with ".words" appended. The source archive
 * itself is not touched here.
 */
public static void removeDeReWo() {
    final String wordListName = LibraryProvider.Dictionary.DRW0.file().getName() + ".words";
    final File wordDirectory = new File(dictRoot, path_to_did_you_mean_dictionaries);
    FileUtils.deletedelete(new File(wordDirectory, wordListName));
}
public static void integrateDeReWo() {
// translate input files (once..)
final File dymDict = new File(dictRoot, path_to_did_you_mean_dictionaries);
if (!dymDict.exists()) dymDict.mkdirs();
final File pathToSource = new File(dictRoot, path_to_source_dictionaries);
final File derewoInput = new File(pathToSource, "derewo-v-30000g-2007-12-31-0.1.txt");
final File derewoOutput = new File(dymDict, "derewo-v-30000g-2007-12-31-0.1.words");
final File derewoInput = LibraryProvider.Dictionary.DRW0.file();
final File derewoOutput = new File(dymDict, derewoInput.getName() + ".words");
if (!derewoOutput.exists() && derewoInput.exists()) {
// create the translation of the derewo file (which is easy in this case)
final ArrayList<String> derewo = loadDeReWo(derewoInput, true);
try {
writeWords(derewoOutput, derewo);
} catch (IOException e) {
} catch (final IOException e) {
Log.logException(e);
}
}
}
/*
private static ArrayList<String> loadList(final File file, String comment, boolean toLowerCase) {
final ArrayList<String> list = new ArrayList<String>();
@ -168,44 +178,68 @@ public class LibraryProvider {
return list;
}
*/
/**
 * Collects the given words into a sorted set without duplicates.
 *
 * @param list the words to collect
 * @return a new {@link TreeSet} holding every distinct word in natural String order
 */
private static Set<String> sortUnique(final List<String> list) {
    // the copy constructor adds every element, which sorts and de-duplicates in one step
    return new TreeSet<String>(list);
}
/**
 * Writes the given words to a file, one word per line, sorted and without
 * duplicates (see {@code sortUnique}).
 * NOTE(review): FileWriter encodes with the platform default charset;
 * confirm this matches what the reader of the .words files expects.
 *
 * @param f the file to write; an existing file is overwritten
 * @param list the words to write
 * @throws IOException if the file cannot be opened or a write error occurred
 */
private static void writeWords(final File f, final ArrayList<String> list) throws IOException {
    final Set<String> words = sortUnique(list);
    final PrintWriter writer = new PrintWriter(new BufferedWriter(new FileWriter(f)));
    try {
        for (final String word : words) {
            writer.println(word);
        }
        // PrintWriter never throws on write errors; checkError() flushes and
        // reveals them so a truncated word list is not silently accepted.
        if (writer.checkError()) {
            throw new IOException("failed to write word list to " + f);
        }
    } finally {
        // release the file handle even if writing failed
        writer.close();
    }
}
private static ArrayList<String> loadDeReWo(final File file, final boolean toLowerCase) {
final ArrayList<String> list = new ArrayList<String>();
// get the zip file entry from the file
InputStream derewoTxtEntry;
try {
final ZipFile zip = new ZipFile(file);
/*
final Enumeration<? extends ZipEntry> i = zip.entries();
while (i.hasMoreElements()) {
final ZipEntry e = i.nextElement();
System.out.println("loadDeReWo: " + e.getName());
}
*/
derewoTxtEntry = zip.getInputStream(zip.getEntry("derewo-v-100000t-2009-04-30-0.1"));
} catch (final ZipException e) {
Log.logException(e);
return list;
} catch (final IOException e) {
Log.logException(e);
return list;
}
BufferedReader reader = null;
try {
reader = new BufferedReader(new InputStreamReader(new FileInputStream(file), "UTF-8"));
reader = new BufferedReader(new InputStreamReader(derewoTxtEntry, "UTF-8"));
String line;
// read until text starts
while ((line = reader.readLine()) != null) {
if (line.startsWith("-----")) break;
if (line.startsWith("# -----")) break;
}
// read empty line
line = reader.readLine();
// read lines
int p;
int c;
//int c;
String w;
while ((line = reader.readLine()) != null) {
line = line.trim();
p = line.indexOf("\t");
p = line.indexOf(" ");
if (p > 0) {
c = Integer.parseInt(line.substring(p + 1));
if (c < 1) continue;
list.add((toLowerCase) ? line.substring(0, p).trim().toLowerCase() : line.substring(0, p).trim());
//c = Integer.parseInt(line.substring(p + 1));
//if (c < 1) continue;
w = (toLowerCase) ? line.substring(0, p).trim().toLowerCase() : line.substring(0, p).trim();
if (w.length() < 4) continue;
list.add(w);
}
}
reader.close();
@ -216,13 +250,13 @@ public class LibraryProvider {
}
return list;
}
public static void main(String[] args) {
File here = new File("dummy").getParentFile();
public static void main(final String[] args) {
final File here = new File("dummy").getParentFile();
initialize(new File(here, "DATA/DICTIONARIES"));
System.out.println("dymDict-size = " + dymLib.size());
Set<String> r = dymLib.recommend("da");
for (String s: r) {
final Set<String> r = dymLib.recommend("da");
for (final String s: r) {
System.out.println("$ " + s);
}
System.out.println("recommendations: " + r.size());

View File

@ -7,12 +7,12 @@
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program in the file lgpl21.txt
* If not, see <http://www.gnu.org/licenses/>.
@ -45,12 +45,12 @@ import net.yacy.kelondro.util.MemoryControl;
*
*/
public class WordCache {
// common word cache
private static final int commonWordsMaxSize = 100000; // maximum size of common word cache
private static final int commonWordsMinLength = 5; // words must have that length at minimum
private OrderedScoreMap<String> commonWords = new OrderedScoreMap<String>(String.CASE_INSENSITIVE_ORDER);
private static OrderedScoreMap<String> commonWords = new OrderedScoreMap<String>(String.CASE_INSENSITIVE_ORDER);
// dictionaries
private final File dictionaryPath;
private TreeSet<String> dict; // the word dictionary
@ -67,12 +67,12 @@ public class WordCache {
this.dictionaryPath = dictionaryPath;
reload();
}
/**
* add a word to the generic dictionary
* @param word
*/
public void learn(String word) {
public static void learn(final String word) {
if (word == null) return;
if (word.length() < commonWordsMinLength) return;
if (MemoryControl.shortStatus()) commonWords.clear();
@ -81,24 +81,24 @@ public class WordCache {
commonWords.shrinkToMaxSize(commonWordsMaxSize / 2);
}
}
/**
* scan the input directory and load all dictionaries (again)
*/
public void reload() {
this.dict = new TreeSet<String>();
this.tcid = new TreeSet<String>();
if (dictionaryPath == null || !dictionaryPath.exists()) return;
final String[] files = dictionaryPath.list();
if (this.dictionaryPath == null || !this.dictionaryPath.exists()) return;
final String[] files = this.dictionaryPath.list();
for (final String f: files) {
if (f.endsWith(".words")) try {
inputStream(new File(dictionaryPath, f));
} catch (IOException e) {
inputStream(new File(this.dictionaryPath, f));
} catch (final IOException e) {
Log.logException(e);
}
}
}
private void inputStream(final File file) throws IOException {
InputStream is = new FileInputStream(file);
if (file.getName().endsWith(".gz")) {
@ -110,22 +110,23 @@ public class WordCache {
while ((l = reader.readLine()) != null) {
if (l.length() == 0 || l.charAt(0) == '#') continue;
l = l.trim().toLowerCase();
if (l.length() < 4) continue;
this.dict.add(l);
this.tcid.add(reverse(l));
}
} catch (IOException e) {
} catch (final IOException e) {
// finish
}
}
private static String reverse(final String s) {
StringBuilder sb = new StringBuilder(s.length());
final StringBuilder sb = new StringBuilder(s.length());
for (int i = s.length() - 1; i >= 0; i--) {
sb.append(s.charAt(i));
}
return sb.toString();
}
/**
* read the dictionary and construct a set of recommendations to a given string
* @param s input value that is used to match recommendations
@ -138,14 +139,14 @@ public class WordCache {
for (final String r: t) {
if (r.startsWith(string) && r.length() > string.length()) ret.add(r); else break;
}
SortedMap<String, AtomicInteger> u = this.commonWords.tailMap(string);
final SortedMap<String, AtomicInteger> u = commonWords.tailMap(string);
String vv;
try {
for (final Map.Entry<String, AtomicInteger> v: u.entrySet()) {
vv = v.getKey();
if (vv.startsWith(string) && vv.length() > string.length()) ret.add(vv); else break;
}
} catch (ConcurrentModificationException e) {}
} catch (final ConcurrentModificationException e) {}
string = reverse(string);
t = this.tcid.tailSet(string);
for (final String r: t) {
@ -153,7 +154,7 @@ public class WordCache {
}
return ret;
}
/**
* check if the library contains the given word
* @param s the given word
@ -164,7 +165,7 @@ public class WordCache {
// if the above case is true then it is also true for this.tcid and vice versa
// that means it does not need to be tested as well
}
/**
* check if the library supports the given word
* A word is supported, if the library contains a word
@ -185,7 +186,7 @@ public class WordCache {
}
return false;
}
/**
* the size of the dictionary
* @return the number of words in the dictionary
@ -193,7 +194,7 @@ public class WordCache {
public int size() {
return this.dict.size();
}
/**
* a property that is used during the construction of recommendation:
@ -206,5 +207,5 @@ public class WordCache {
public boolean isRelevant(final int minimumWords) {
return this.dict.size() >= minimumWords;
}
}