added option to enrich vocabularies with synonyms from synonym database

This commit is contained in:
Michael Peter Christen 2014-11-19 18:12:43 +01:00
parent 6a2a669db4
commit 0dc6e0a5f2
3 changed files with 17 additions and 4 deletions

View File

@ -134,6 +134,8 @@ To see a list of all APIs, please visit the <a href="http://www.yacy-websuche.de
<dd><input type="number" id="discovercolumnobjectlink" name="discovercolumnobjectlink" min="-1" max="99" step="1" size="2" value="-1" disabled="disabled" style="width:50px;"> (first has index 0, if unused set -1)</dd>
<dt><i>Charset of Import File</i></dt>
<dd><select name="charset">#{charset}#<option value="#[name]#" #(selected)#::selected="selected"#(/selected)#>#[name]#</option>#{/charset}#</select></dd>
<dt><i>Auto-Enrich with Synonyms from Stemming Library</i></dt>
<dd><input type="checkbox" name="discoverenrichsynonyms" id="discoverenrichsynonyms" checked="checked" /></dd>
</dl>
</dd>
<dt></dt><dd><input type="submit" name="create" value="Create" /></dd>

View File

@ -29,9 +29,11 @@ import java.util.Collection;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.Map;
import java.util.Set;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.document.id.MultiProtocolURL;
import net.yacy.cora.language.synonyms.SynonymLibrary;
import net.yacy.cora.lod.vocabulary.DCTerms;
import net.yacy.cora.lod.vocabulary.Tagging;
import net.yacy.cora.lod.vocabulary.Tagging.SOTuple;
@ -80,6 +82,7 @@ public class Vocabulary_p {
final int discovercolumnliteral = post.getInt("discovercolumnliteral", 0);
final int discovercolumnobjectlink = post.getInt("discovercolumnobjectlink", -1);
final File discoverFromCSVFile = discoverFromCSVPath.length() > 0 ? new File(discoverFromCSVPath) : null;
final boolean discoverenrichsynonyms = post.getBoolean("discoverenrichsynonyms");
Segment segment = sb.index;
String t;
if (!discoverNot) {
@ -88,10 +91,17 @@ public class Vocabulary_p {
String line = null;
while ((line = r.readLine()) != null) {
String[] l = line.split(";");
String literal = discovercolumnliteral < 0 || l.length <= discovercolumnliteral ? null : l[discovercolumnliteral];
String objectlink = discovercolumnobjectlink < 0 || l.length <= discovercolumnobjectlink ? null : l[discovercolumnobjectlink];
String literal = discovercolumnliteral < 0 || l.length <= discovercolumnliteral ? null : l[discovercolumnliteral].trim();
String objectlink = discovercolumnobjectlink < 0 || l.length <= discovercolumnobjectlink ? null : l[discovercolumnobjectlink].trim();
if (literal != null && literal.length() > 0) {
table.put(literal, new Tagging.SOTuple(Tagging.normalizeTerm(literal), objectlink == null ? "" : objectlink));
String synonyms = Tagging.normalizeTerm(literal);
if (discoverenrichsynonyms) {
Set<String> sy = SynonymLibrary.getSynonyms(literal);
if (sy != null) {
for (String s: sy) synonyms += "," + s;
}
}
table.put(literal, new Tagging.SOTuple(synonyms, objectlink == null ? "" : objectlink));
}
}
} else {

View File

@ -94,7 +94,8 @@ public class SynonymLibrary {
* @return a set of synonyms, but without the requested word
*/
public static Set<String> getSynonyms(String word) {
word = word.toLowerCase();
if (word == null) return null;
word = word.toLowerCase().trim();
if (word.length() < 2) return null;
String key = word.substring(0, 2);
List<Set<String>> symsetlist = lib.get(key);