From 0dc6e0a5f24b4976e534124401ec9f936fb16c2a Mon Sep 17 00:00:00 2001 From: Michael Peter Christen Date: Wed, 19 Nov 2014 18:12:43 +0100 Subject: [PATCH] added option to enrich vocabularies with synonyms from synonym database --- htroot/Vocabulary_p.html | 2 ++ htroot/Vocabulary_p.java | 16 +++++++++++++--- .../cora/language/synonyms/SynonymLibrary.java | 3 ++- 3 files changed, 17 insertions(+), 4 deletions(-) diff --git a/htroot/Vocabulary_p.html b/htroot/Vocabulary_p.html index d4418c2f0..e0b505459 100644 --- a/htroot/Vocabulary_p.html +++ b/htroot/Vocabulary_p.html @@ -134,6 +134,8 @@ To see a list of all APIs, please visit the (first has index 0, if unused set -1)
Charset of Import File
+
Auto-Enrich with Synonyms from Stemming Library
+
diff --git a/htroot/Vocabulary_p.java b/htroot/Vocabulary_p.java index 7abd40f1b..8fd2fe3ec 100644 --- a/htroot/Vocabulary_p.java +++ b/htroot/Vocabulary_p.java @@ -29,9 +29,11 @@ import java.util.Collection; import java.util.Iterator; import java.util.LinkedHashMap; import java.util.Map; +import java.util.Set; import net.yacy.cora.document.id.DigestURL; import net.yacy.cora.document.id.MultiProtocolURL; +import net.yacy.cora.language.synonyms.SynonymLibrary; import net.yacy.cora.lod.vocabulary.DCTerms; import net.yacy.cora.lod.vocabulary.Tagging; import net.yacy.cora.lod.vocabulary.Tagging.SOTuple; @@ -80,6 +82,7 @@ public class Vocabulary_p { final int discovercolumnliteral = post.getInt("discovercolumnliteral", 0); final int discovercolumnobjectlink = post.getInt("discovercolumnobjectlink", -1); final File discoverFromCSVFile = discoverFromCSVPath.length() > 0 ? new File(discoverFromCSVPath) : null; + final boolean discoverenrichsynonyms = post.getBoolean("discoverenrichsynonyms"); Segment segment = sb.index; String t; if (!discoverNot) { @@ -88,10 +91,17 @@ public class Vocabulary_p { String line = null; while ((line = r.readLine()) != null) { String[] l = line.split(";"); - String literal = discovercolumnliteral < 0 || l.length <= discovercolumnliteral ? null : l[discovercolumnliteral]; - String objectlink = discovercolumnobjectlink < 0 || l.length <= discovercolumnobjectlink ? null : l[discovercolumnobjectlink]; + String literal = discovercolumnliteral < 0 || l.length <= discovercolumnliteral ? null : l[discovercolumnliteral].trim(); + String objectlink = discovercolumnobjectlink < 0 || l.length <= discovercolumnobjectlink ? null : l[discovercolumnobjectlink].trim(); if (literal != null && literal.length() > 0) { - table.put(literal, new Tagging.SOTuple(Tagging.normalizeTerm(literal), objectlink == null ? "" : objectlink)); + String synonyms = Tagging.normalizeTerm(literal); + if (discoverenrichsynonyms) { + Set sy = SynonymLibrary.getSynonyms(literal); + if (sy != null) { + for (String s: sy) synonyms += "," + s; + } + } + table.put(literal, new Tagging.SOTuple(synonyms, objectlink == null ? "" : objectlink)); } } } else { diff --git a/source/net/yacy/cora/language/synonyms/SynonymLibrary.java b/source/net/yacy/cora/language/synonyms/SynonymLibrary.java index ecd39a808..e0657cad3 100644 --- a/source/net/yacy/cora/language/synonyms/SynonymLibrary.java +++ b/source/net/yacy/cora/language/synonyms/SynonymLibrary.java @@ -94,7 +94,8 @@ public class SynonymLibrary { * @return a list of synonyms bot without the requested word */ public static Set getSynonyms(String word) { - word = word.toLowerCase(); + if (word == null) return null; + word = word.toLowerCase().trim(); if (word.length() < 2) return null; String key = word.substring(0, 2); List> symsetlist = lib.get(key);