diff --git a/source/net/yacy/cora/language/phonetic/ColognePhonetic.java b/source/net/yacy/cora/language/phonetic/ColognePhonetic.java new file mode 100644 index 000000000..be37e2035 --- /dev/null +++ b/source/net/yacy/cora/language/phonetic/ColognePhonetic.java @@ -0,0 +1,429 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package net.yacy.cora.language.phonetic; + +import java.util.Locale; + +import org.apache.commons.codec.EncoderException; +import org.apache.commons.codec.StringEncoder; + +/** + *

+ * Encodes a string into a Cologne Phonetic value. + *

+ *

+ * Implements the Kölner Phonetik (Cologne Phonetic) + * algorithm issued by Hans Joachim Postel in 1969. + *

+ * + *

+ * The Kölner Phonetik is a phonetic algorithm which is optimized for the German language. It is related to the + * well-known soundex algorithm. + *

+ * + *

Algorithm

+ * + * + * + * @see Wikipedia (de): Kölner Phonetik (in German) + * @author Apache Software Foundation + * @since 1.5 + */ +public class ColognePhonetic implements StringEncoder { + + private abstract class CologneBuffer { + + protected final char[] data; + + protected int length = 0; + + public CologneBuffer(char[] data) { + this.data = data; + this.length = data.length; + } + + public CologneBuffer(int buffSize) { + this.data = new char[buffSize]; + this.length = 0; + } + + protected abstract char[] copyData(int start, final int length); + + public int length() { + return length; + } + + @Override + public String toString() { + return new String(copyData(0, length)); + } + } + + private class CologneOutputBuffer extends CologneBuffer { + + public CologneOutputBuffer(int buffSize) { + super(buffSize); + } + + public void addRight(char chr) { + data[length] = chr; + length++; + } + + @Override + protected char[] copyData(int start, final int length) { + char[] newData = new char[length]; + System.arraycopy(data, start, newData, 0, length); + return newData; + } + } + + private class CologneInputBuffer extends CologneBuffer { + + public CologneInputBuffer(char[] data) { + super(data); + } + + public void addLeft(char ch) { + length++; + data[getNextPos()] = ch; + } + + @Override + protected char[] copyData(int start, final int length) { + char[] newData = new char[length]; + System.arraycopy(data, data.length - this.length + start, newData, 0, length); + return newData; + } + + public char getNextChar() { + return data[getNextPos()]; + } + + protected int getNextPos() { + return data.length - length; + } + + public char removeNext() { + char ch = getNextChar(); + length--; + return ch; + } + } + + /** + * Maps some Germanic characters to plain for internal processing. The following characters are mapped: + * + */ + private static final char[][] PREPROCESS_MAP = new char[][]{ + {'\u00C4', 'A'}, // capital a, umlaut mark + {'\u00DC', 'U'}, // capital u, umlaut mark + {'\u00D6', 'O'}, // capital o, umlaut mark + {'\u00DF', 'S'} // small sharp s, German + }; + + /* + * Returns whether the array contains the key, or not. + */ + private static boolean arrayContains(char[] arr, char key) { + for (char element : arr) { + if (element == key) { + return true; + } + } + return false; + } + + /** + *

+ * Implements the Kölner Phonetik algorithm. + *

+ *

+ * In contrast to the initial description of the algorithm, this implementation does the encoding in one pass. + *

+ * + * @param text + * @return the corresponding encoding according to the Kölner Phonetik algorithm + */ + public String colognePhonetic(String text) { + if (text == null) { + return null; + } + + text = preprocess(text); + + CologneOutputBuffer output = new CologneOutputBuffer(text.length() * 2); + CologneInputBuffer input = new CologneInputBuffer(text.toCharArray()); + + char nextChar; + + char lastChar = '-'; + char lastCode = '/'; + char code; + char chr; + + int rightLength = input.length(); + + while (rightLength > 0) { + chr = input.removeNext(); + + if ((rightLength = input.length()) > 0) { + nextChar = input.getNextChar(); + } else { + nextChar = '-'; + } + + if (arrayContains(new char[]{'A', 'E', 'I', 'J', 'O', 'U', 'Y'}, chr)) { + code = '0'; + } else if (chr == 'H' || chr < 'A' || chr > 'Z') { + if (lastCode == '/') { + continue; + } + code = '-'; + } else if (chr == 'B' || (chr == 'P' && nextChar != 'H')) { + code = '1'; + } else if ((chr == 'D' || chr == 'T') && !arrayContains(new char[]{'S', 'C', 'Z'}, nextChar)) { + code = '2'; + } else if (arrayContains(new char[]{'W', 'F', 'P', 'V'}, chr)) { + code = '3'; + } else if (arrayContains(new char[]{'G', 'K', 'Q'}, chr)) { + code = '4'; + } else if (chr == 'X' && !arrayContains(new char[]{'C', 'K', 'Q'}, lastChar)) { + code = '4'; + input.addLeft('S'); + rightLength++; + } else if (chr == 'S' || chr == 'Z') { + code = '8'; + } else if (chr == 'C') { + if (lastCode == '/') { + if (arrayContains(new char[]{'A', 'H', 'K', 'L', 'O', 'Q', 'R', 'U', 'X'}, nextChar)) { + code = '4'; + } else { + code = '8'; + } + } else { + if (arrayContains(new char[]{'S', 'Z'}, lastChar) || + !arrayContains(new char[]{'A', 'H', 'O', 'U', 'K', 'Q', 'X'}, nextChar)) { + code = '8'; + } else { + code = '4'; + } + } + } else if (arrayContains(new char[]{'T', 'D', 'X'}, chr)) { + code = '8'; + } else if (chr == 'R') { + code = '7'; + } else if (chr == 'L') { + code = '5'; + } else if (chr == 'M' || chr == 'N') { + code = '6'; + } else { + code = chr; + } + + if (code != '-' && (lastCode != code && (code != '0' || lastCode == '/') || code < '0' || code > '8')) { + output.addRight(code); + } + + lastChar = chr; + lastCode = code; + } + return output.toString(); + } + + public Object encode(Object object) throws EncoderException { + if (!(object instanceof String)) { + throw new EncoderException("This method's parameter was expected to be of the type " + + String.class.getName() + + ". But actually it was of the type " + + object.getClass().getName() + + "."); + } + return encode((String) object); + } + + public String encode(String text) { + return colognePhonetic(text); + } + + public boolean isEncodeEqual(String text1, String text2) { + return colognePhonetic(text1).equals(colognePhonetic(text2)); + } + + /** + * Converts the string to upper case and replaces germanic characters as defined in {@link #PREPROCESS_MAP}. + */ + private String preprocess(String text) { + text = text.toUpperCase(Locale.GERMAN); + + char[] chrs = text.toCharArray(); + + for (int index = 0; index < chrs.length; index++) { + if (chrs[index] > 'Z') { + for (char[] element : PREPROCESS_MAP) { + if (chrs[index] == element[0]) { + chrs[index] = element[1]; + break; + } + } + } + } + return new String(chrs); + } +} diff --git a/source/net/yacy/cora/language/phonetic/DoubleMetaphone.java b/source/net/yacy/cora/language/phonetic/DoubleMetaphone.java new file mode 100644 index 000000000..1ccbf73c2 --- /dev/null +++ b/source/net/yacy/cora/language/phonetic/DoubleMetaphone.java @@ -0,0 +1,1105 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package net.yacy.cora.language.phonetic; + +import org.apache.commons.codec.EncoderException; +import org.apache.commons.codec.StringEncoder; + +/** + * Encodes a string into a double metaphone value. + * This Implementation is based on the algorithm by Lawrence Philips. + * + * + * @author Apache Software Foundation + * @version $Id: DoubleMetaphone.java 1157192 2011-08-12 17:27:38Z ggregory $ + */ +public class DoubleMetaphone implements StringEncoder { + + /** + * "Vowels" to test for + */ + private static final String VOWELS = "AEIOUY"; + + /** + * Prefixes when present which are not pronounced + */ + private static final String[] SILENT_START = + { "GN", "KN", "PN", "WR", "PS" }; + private static final String[] L_R_N_M_B_H_F_V_W_SPACE = + { "L", "R", "N", "M", "B", "H", "F", "V", "W", " " }; + private static final String[] ES_EP_EB_EL_EY_IB_IL_IN_IE_EI_ER = + { "ES", "EP", "EB", "EL", "EY", "IB", "IL", "IN", "IE", "EI", "ER" }; + private static final String[] L_T_K_S_N_M_B_Z = + { "L", "T", "K", "S", "N", "M", "B", "Z" }; + + /** + * Maximum length of an encoding, default is 4 + */ + private int maxCodeLen = 4; + + /** + * Creates an instance of this DoubleMetaphone encoder + */ + public DoubleMetaphone() { + super(); + } + + /** + * Encode a value with Double Metaphone + * + * @param value String to encode + * @return an encoded string + */ + public String doubleMetaphone(String value) { + return doubleMetaphone(value, false); + } + + /** + * Encode a value with Double Metaphone, optionally using the alternate + * encoding. + * + * @param value String to encode + * @param alternate use alternate encode + * @return an encoded string + */ + public String doubleMetaphone(String value, boolean alternate) { + value = cleanInput(value); + if (value == null) { + return null; + } + + boolean slavoGermanic = isSlavoGermanic(value); + int index = isSilentStart(value) ? 1 : 0; + + DoubleMetaphoneResult result = new DoubleMetaphoneResult(this.getMaxCodeLen()); + + while (!result.isComplete() && index <= value.length() - 1) { + switch (value.charAt(index)) { + case 'A': + case 'E': + case 'I': + case 'O': + case 'U': + case 'Y': + index = handleAEIOUY(result, index); + break; + case 'B': + result.append('P'); + index = charAt(value, index + 1) == 'B' ? index + 2 : index + 1; + break; + case '\u00C7': + // A C with a Cedilla + result.append('S'); + index++; + break; + case 'C': + index = handleC(value, result, index); + break; + case 'D': + index = handleD(value, result, index); + break; + case 'F': + result.append('F'); + index = charAt(value, index + 1) == 'F' ? index + 2 : index + 1; + break; + case 'G': + index = handleG(value, result, index, slavoGermanic); + break; + case 'H': + index = handleH(value, result, index); + break; + case 'J': + index = handleJ(value, result, index, slavoGermanic); + break; + case 'K': + result.append('K'); + index = charAt(value, index + 1) == 'K' ? index + 2 : index + 1; + break; + case 'L': + index = handleL(value, result, index); + break; + case 'M': + result.append('M'); + index = conditionM0(value, index) ? index + 2 : index + 1; + break; + case 'N': + result.append('N'); + index = charAt(value, index + 1) == 'N' ? index + 2 : index + 1; + break; + case '\u00D1': + // N with a tilde (spanish ene) + result.append('N'); + index++; + break; + case 'P': + index = handleP(value, result, index); + break; + case 'Q': + result.append('K'); + index = charAt(value, index + 1) == 'Q' ? index + 2 : index + 1; + break; + case 'R': + index = handleR(value, result, index, slavoGermanic); + break; + case 'S': + index = handleS(value, result, index, slavoGermanic); + break; + case 'T': + index = handleT(value, result, index); + break; + case 'V': + result.append('F'); + index = charAt(value, index + 1) == 'V' ? index + 2 : index + 1; + break; + case 'W': + index = handleW(value, result, index); + break; + case 'X': + index = handleX(value, result, index); + break; + case 'Z': + index = handleZ(value, result, index, slavoGermanic); + break; + default: + index++; + break; + } + } + + return alternate ? result.getAlternate() : result.getPrimary(); + } + + /** + * Encode the value using DoubleMetaphone. It will only work if + * obj is a String (like Metaphone). + * + * @param obj Object to encode (should be of type String) + * @return An encoded Object (will be of type String) + * @throws EncoderException encode parameter is not of type String + */ + public Object encode(Object obj) throws EncoderException { + if (!(obj instanceof String)) { + throw new EncoderException("DoubleMetaphone encode parameter is not of type String"); + } + return doubleMetaphone((String) obj); + } + + /** + * Encode the value using DoubleMetaphone. + * + * @param value String to encode + * @return An encoded String + */ + public String encode(String value) { + return doubleMetaphone(value); + } + + /** + * Check if the Double Metaphone values of two String values + * are equal. + * + * @param value1 The left-hand side of the encoded {@link String#equals(Object)}. + * @param value2 The right-hand side of the encoded {@link String#equals(Object)}. + * @return true if the encoded Strings are equal; + * false otherwise. + * @see #isDoubleMetaphoneEqual(String,String,boolean) + */ + public boolean isDoubleMetaphoneEqual(String value1, String value2) { + return isDoubleMetaphoneEqual(value1, value2, false); + } + + /** + * Check if the Double Metaphone values of two String values + * are equal, optionally using the alternate value. + * + * @param value1 The left-hand side of the encoded {@link String#equals(Object)}. + * @param value2 The right-hand side of the encoded {@link String#equals(Object)}. + * @param alternate use the alternate value if true. + * @return true if the encoded Strings are equal; + * false otherwise. + */ + public boolean isDoubleMetaphoneEqual(String value1, + String value2, + boolean alternate) { + return doubleMetaphone(value1, alternate).equals(doubleMetaphone + (value2, alternate)); + } + + /** + * Returns the maxCodeLen. + * @return int + */ + public int getMaxCodeLen() { + return this.maxCodeLen; + } + + /** + * Sets the maxCodeLen. + * @param maxCodeLen The maxCodeLen to set + */ + public void setMaxCodeLen(int maxCodeLen) { + this.maxCodeLen = maxCodeLen; + } + + //-- BEGIN HANDLERS --// + + /** + * Handles 'A', 'E', 'I', 'O', 'U', and 'Y' cases + */ + private int handleAEIOUY(DoubleMetaphoneResult result, int + index) { + if (index == 0) { + result.append('A'); + } + return index + 1; + } + + /** + * Handles 'C' cases + */ + private int handleC(String value, + DoubleMetaphoneResult result, + int index) { + if (conditionC0(value, index)) { // very confusing, moved out + result.append('K'); + index += 2; + } else if (index == 0 && contains(value, index, 6, "CAESAR")) { + result.append('S'); + index += 2; + } else if (contains(value, index, 2, "CH")) { + index = handleCH(value, result, index); + } else if (contains(value, index, 2, "CZ") && + !contains(value, index - 2, 4, "WICZ")) { + //-- "Czerny" --// + result.append('S', 'X'); + index += 2; + } else if (contains(value, index + 1, 3, "CIA")) { + //-- "focaccia" --// + result.append('X'); + index += 3; + } else if (contains(value, index, 2, "CC") && + !(index == 1 && charAt(value, 0) == 'M')) { + //-- double "cc" but not "McClelland" --// + return handleCC(value, result, index); + } else if (contains(value, index, 2, "CK", "CG", "CQ")) { + result.append('K'); + index += 2; + } else if (contains(value, index, 2, "CI", "CE", "CY")) { + //-- Italian vs. English --// + if (contains(value, index, 3, "CIO", "CIE", "CIA")) { + result.append('S', 'X'); + } else { + result.append('S'); + } + index += 2; + } else { + result.append('K'); + if (contains(value, index + 1, 2, " C", " Q", " G")) { + //-- Mac Caffrey, Mac Gregor --// + index += 3; + } else if (contains(value, index + 1, 1, "C", "K", "Q") && + !contains(value, index + 1, 2, "CE", "CI")) { + index += 2; + } else { + index++; + } + } + + return index; + } + + /** + * Handles 'CC' cases + */ + private int handleCC(String value, + DoubleMetaphoneResult result, + int index) { + if (contains(value, index + 2, 1, "I", "E", "H") && + !contains(value, index + 2, 2, "HU")) { + //-- "bellocchio" but not "bacchus" --// + if ((index == 1 && charAt(value, index - 1) == 'A') || + contains(value, index - 1, 5, "UCCEE", "UCCES")) { + //-- "accident", "accede", "succeed" --// + result.append("KS"); + } else { + //-- "bacci", "bertucci", other Italian --// + result.append('X'); + } + index += 3; + } else { // Pierce's rule + result.append('K'); + index += 2; + } + + return index; + } + + /** + * Handles 'CH' cases + */ + private int handleCH(String value, + DoubleMetaphoneResult result, + int index) { + if (index > 0 && contains(value, index, 4, "CHAE")) { // Michael + result.append('K', 'X'); + return index + 2; + } else if (conditionCH0(value, index)) { + //-- Greek roots ("chemistry", "chorus", etc.) --// + result.append('K'); + return index + 2; + } else if (conditionCH1(value, index)) { + //-- Germanic, Greek, or otherwise 'ch' for 'kh' sound --// + result.append('K'); + return index + 2; + } else { + if (index > 0) { + if (contains(value, 0, 2, "MC")) { + result.append('K'); + } else { + result.append('X', 'K'); + } + } else { + result.append('X'); + } + return index + 2; + } + } + + /** + * Handles 'D' cases + */ + private int handleD(String value, + DoubleMetaphoneResult result, + int index) { + if (contains(value, index, 2, "DG")) { + //-- "Edge" --// + if (contains(value, index + 2, 1, "I", "E", "Y")) { + result.append('J'); + index += 3; + //-- "Edgar" --// + } else { + result.append("TK"); + index += 2; + } + } else if (contains(value, index, 2, "DT", "DD")) { + result.append('T'); + index += 2; + } else { + result.append('T'); + index++; + } + return index; + } + + /** + * Handles 'G' cases + */ + private int handleG(String value, + DoubleMetaphoneResult result, + int index, + boolean slavoGermanic) { + if (charAt(value, index + 1) == 'H') { + index = handleGH(value, result, index); + } else if (charAt(value, index + 1) == 'N') { + if (index == 1 && isVowel(charAt(value, 0)) && !slavoGermanic) { + result.append("KN", "N"); + } else if (!contains(value, index + 2, 2, "EY") && + charAt(value, index + 1) != 'Y' && !slavoGermanic) { + result.append("N", "KN"); + } else { + result.append("KN"); + } + index = index + 2; + } else if (contains(value, index + 1, 2, "LI") && !slavoGermanic) { + result.append("KL", "L"); + index += 2; + } else if (index == 0 && (charAt(value, index + 1) == 'Y' || contains(value, index + 1, 2, ES_EP_EB_EL_EY_IB_IL_IN_IE_EI_ER))) { + //-- -ges-, -gep-, -gel-, -gie- at beginning --// + result.append('K', 'J'); + index += 2; + } else if ((contains(value, index + 1, 2, "ER") || + charAt(value, index + 1) == 'Y') && + !contains(value, 0, 6, "DANGER", "RANGER", "MANGER") && + !contains(value, index - 1, 1, "E", "I") && + !contains(value, index - 1, 3, "RGY", "OGY")) { + //-- -ger-, -gy- --// + result.append('K', 'J'); + index += 2; + } else if (contains(value, index + 1, 1, "E", "I", "Y") || + contains(value, index - 1, 4, "AGGI", "OGGI")) { + //-- Italian "biaggi" --// + if ((contains(value, 0 ,4, "VAN ", "VON ") || contains(value, 0, 3, "SCH")) || contains(value, index + 1, 2, "ET")) { + //-- obvious germanic --// + result.append('K'); + } else if (contains(value, index + 1, 3, "IER")) { + result.append('J'); + } else { + result.append('J', 'K'); + } + index += 2; + } else if (charAt(value, index + 1) == 'G') { + index += 2; + result.append('K'); + } else { + index++; + result.append('K'); + } + return index; + } + + /** + * Handles 'GH' cases + */ + private int handleGH(String value, + DoubleMetaphoneResult result, + int index) { + if (index > 0 && !isVowel(charAt(value, index - 1))) { + result.append('K'); + index += 2; + } else if (index == 0) { + if (charAt(value, index + 2) == 'I') { + result.append('J'); + } else { + result.append('K'); + } + index += 2; + } else if ((index > 1 && contains(value, index - 2, 1, "B", "H", "D")) || + (index > 2 && contains(value, index - 3, 1, "B", "H", "D")) || + (index > 3 && contains(value, index - 4, 1, "B", "H"))) { + //-- Parker's rule (with some further refinements) - "hugh" + index += 2; + } else { + if (index > 2 && charAt(value, index - 1) == 'U' && + contains(value, index - 3, 1, "C", "G", "L", "R", "T")) { + //-- "laugh", "McLaughlin", "cough", "gough", "rough", "tough" + result.append('F'); + } else if (index > 0 && charAt(value, index - 1) != 'I') { + result.append('K'); + } + index += 2; + } + return index; + } + + /** + * Handles 'H' cases + */ + private int handleH(String value, + DoubleMetaphoneResult result, + int index) { + //-- only keep if first & before vowel or between 2 vowels --// + if ((index == 0 || isVowel(charAt(value, index - 1))) && + isVowel(charAt(value, index + 1))) { + result.append('H'); + index += 2; + //-- also takes car of "HH" --// + } else { + index++; + } + return index; + } + + /** + * Handles 'J' cases + */ + private int handleJ(String value, DoubleMetaphoneResult result, int index, + boolean slavoGermanic) { + if (contains(value, index, 4, "JOSE") || contains(value, 0, 4, "SAN ")) { + //-- obvious Spanish, "Jose", "San Jacinto" --// + if ((index == 0 && (charAt(value, index + 4) == ' ') || + value.length() == 4) || contains(value, 0, 4, "SAN ")) { + result.append('H'); + } else { + result.append('J', 'H'); + } + index++; + } else { + if (index == 0 && !contains(value, index, 4, "JOSE")) { + result.append('J', 'A'); + } else if (isVowel(charAt(value, index - 1)) && !slavoGermanic && + (charAt(value, index + 1) == 'A' || charAt(value, index + 1) == 'O')) { + result.append('J', 'H'); + } else if (index == value.length() - 1) { + result.append('J', ' '); + } else if (!contains(value, index + 1, 1, L_T_K_S_N_M_B_Z) && !contains(value, index - 1, 1, "S", "K", "L")) { + result.append('J'); + } + + if (charAt(value, index + 1) == 'J') { + index += 2; + } else { + index++; + } + } + return index; + } + + /** + * Handles 'L' cases + */ + private int handleL(String value, + DoubleMetaphoneResult result, + int index) { + if (charAt(value, index + 1) == 'L') { + if (conditionL0(value, index)) { + result.appendPrimary('L'); + } else { + result.append('L'); + } + index += 2; + } else { + index++; + result.append('L'); + } + return index; + } + + /** + * Handles 'P' cases + */ + private int handleP(String value, + DoubleMetaphoneResult result, + int index) { + if (charAt(value, index + 1) == 'H') { + result.append('F'); + index += 2; + } else { + result.append('P'); + index = contains(value, index + 1, 1, "P", "B") ? index + 2 : index + 1; + } + return index; + } + + /** + * Handles 'R' cases + */ + private int handleR(String value, + DoubleMetaphoneResult result, + int index, + boolean slavoGermanic) { + if (index == value.length() - 1 && !slavoGermanic && + contains(value, index - 2, 2, "IE") && + !contains(value, index - 4, 2, "ME", "MA")) { + result.appendAlternate('R'); + } else { + result.append('R'); + } + return charAt(value, index + 1) == 'R' ? index + 2 : index + 1; + } + + /** + * Handles 'S' cases + */ + private int handleS(String value, + DoubleMetaphoneResult result, + int index, + boolean slavoGermanic) { + if (contains(value, index - 1, 3, "ISL", "YSL")) { + //-- special cases "island", "isle", "carlisle", "carlysle" --// + index++; + } else if (index == 0 && contains(value, index, 5, "SUGAR")) { + //-- special case "sugar-" --// + result.append('X', 'S'); + index++; + } else if (contains(value, index, 2, "SH")) { + if (contains(value, index + 1, 4, + "HEIM", "HOEK", "HOLM", "HOLZ")) { + //-- germanic --// + result.append('S'); + } else { + result.append('X'); + } + index += 2; + } else if (contains(value, index, 3, "SIO", "SIA") || contains(value, index, 4, "SIAN")) { + //-- Italian and Armenian --// + if (slavoGermanic) { + result.append('S'); + } else { + result.append('S', 'X'); + } + index += 3; + } else if ((index == 0 && contains(value, index + 1, 1, "M", "N", "L", "W")) || contains(value, index + 1, 1, "Z")) { + //-- german & anglicisations, e.g. "smith" match "schmidt" // + // "snider" match "schneider" --// + //-- also, -sz- in slavic language altho in hungarian it // + // is pronounced "s" --// + result.append('S', 'X'); + index = contains(value, index + 1, 1, "Z") ? index + 2 : index + 1; + } else if (contains(value, index, 2, "SC")) { + index = handleSC(value, result, index); + } else { + if (index == value.length() - 1 && contains(value, index - 2, + 2, "AI", "OI")){ + //-- french e.g. "resnais", "artois" --// + result.appendAlternate('S'); + } else { + result.append('S'); + } + index = contains(value, index + 1, 1, "S", "Z") ? index + 2 : index + 1; + } + return index; + } + + /** + * Handles 'SC' cases + */ + private int handleSC(String value, + DoubleMetaphoneResult result, + int index) { + if (charAt(value, index + 2) == 'H') { + //-- Schlesinger's rule --// + if (contains(value, index + 3, + 2, "OO", "ER", "EN", "UY", "ED", "EM")) { + //-- Dutch origin, e.g. "school", "schooner" --// + if (contains(value, index + 3, 2, "ER", "EN")) { + //-- "schermerhorn", "schenker" --// + result.append("X", "SK"); + } else { + result.append("SK"); + } + } else { + if (index == 0 && !isVowel(charAt(value, 3)) && charAt(value, 3) != 'W') { + result.append('X', 'S'); + } else { + result.append('X'); + } + } + } else if (contains(value, index + 2, 1, "I", "E", "Y")) { + result.append('S'); + } else { + result.append("SK"); + } + return index + 3; + } + + /** + * Handles 'T' cases + */ + private int handleT(String value, + DoubleMetaphoneResult result, + int index) { + if (contains(value, index, 4, "TION")) { + result.append('X'); + index += 3; + } else if (contains(value, index, 3, "TIA", "TCH")) { + result.append('X'); + index += 3; + } else if (contains(value, index, 2, "TH") || contains(value, index, + 3, "TTH")) { + if (contains(value, index + 2, 2, "OM", "AM") || + //-- special case "thomas", "thames" or germanic --// + contains(value, 0, 4, "VAN ", "VON ") || + contains(value, 0, 3, "SCH")) { + result.append('T'); + } else { + result.append('0', 'T'); + } + index += 2; + } else { + result.append('T'); + index = contains(value, index + 1, 1, "T", "D") ? index + 2 : index + 1; + } + return index; + } + + /** + * Handles 'W' cases + */ + private int handleW(String value, + DoubleMetaphoneResult result, + int index) { + if (contains(value, index, 2, "WR")) { + //-- can also be in middle of word --// + result.append('R'); + index += 2; + } else { + if (index == 0 && (isVowel(charAt(value, index + 1)) || + contains(value, index, 2, "WH"))) { + if (isVowel(charAt(value, index + 1))) { + //-- Wasserman should match Vasserman --// + result.append('A', 'F'); + } else { + //-- need Uomo to match Womo --// + result.append('A'); + } + index++; + } else if ((index == value.length() - 1 && isVowel(charAt(value, index - 1))) || + contains(value, index - 1, + 5, "EWSKI", "EWSKY", "OWSKI", "OWSKY") || + contains(value, 0, 3, "SCH")) { + //-- Arnow should match Arnoff --// + result.appendAlternate('F'); + index++; + } else if (contains(value, index, 4, "WICZ", "WITZ")) { + //-- Polish e.g. "filipowicz" --// + result.append("TS", "FX"); + index += 4; + } else { + index++; + } + } + return index; + } + + /** + * Handles 'X' cases + */ + private int handleX(String value, + DoubleMetaphoneResult result, + int index) { + if (index == 0) { + result.append('S'); + index++; + } else { + if (!((index == value.length() - 1) && + (contains(value, index - 3, 3, "IAU", "EAU") || + contains(value, index - 2, 2, "AU", "OU")))) { + //-- French e.g. breaux --// + result.append("KS"); + } + index = contains(value, index + 1, 1, "C", "X") ? index + 2 : index + 1; + } + return index; + } + + /** + * Handles 'Z' cases + */ + private int handleZ(String value, DoubleMetaphoneResult result, int index, + boolean slavoGermanic) { + if (charAt(value, index + 1) == 'H') { + //-- Chinese pinyin e.g. "zhao" or Angelina "Zhang" --// + result.append('J'); + index += 2; + } else { + if (contains(value, index + 1, 2, "ZO", "ZI", "ZA") || (slavoGermanic && (index > 0 && charAt(value, index - 1) != 'T'))) { + result.append("S", "TS"); + } else { + result.append('S'); + } + index = charAt(value, index + 1) == 'Z' ? index + 2 : index + 1; + } + return index; + } + + //-- BEGIN CONDITIONS --// + + /** + * Complex condition 0 for 'C' + */ + private boolean conditionC0(String value, int index) { + if (contains(value, index, 4, "CHIA")) { + return true; + } else if (index <= 1) { + return false; + } else if (isVowel(charAt(value, index - 2))) { + return false; + } else if (!contains(value, index - 1, 3, "ACH")) { + return false; + } else { + char c = charAt(value, index + 2); + return (c != 'I' && c != 'E') || + contains(value, index - 2, 6, "BACHER", "MACHER"); + } + } + + /** + * Complex condition 0 for 'CH' + */ + private boolean conditionCH0(String value, int index) { + if (index != 0) { + return false; + } else if (!contains(value, index + 1, 5, "HARAC", "HARIS") && + !contains(value, index + 1, 3, "HOR", "HYM", "HIA", "HEM")) { + return false; + } else if (contains(value, 0, 5, "CHORE")) { + return false; + } else { + return true; + } + } + + /** + * Complex condition 1 for 'CH' + */ + private boolean conditionCH1(String value, int index) { + return ((contains(value, 0, 4, "VAN ", "VON ") || contains(value, 0, + 3, "SCH")) || + contains(value, index - 2, 6, "ORCHES", "ARCHIT", "ORCHID") || + contains(value, index + 2, 1, "T", "S") || + ((contains(value, index - 1, 1, "A", "O", "U", "E") || index == 0) && + (contains(value, index + 2, 1, L_R_N_M_B_H_F_V_W_SPACE) || index + 1 == value.length() - 1))); + } + + /** + * Complex condition 0 for 'L' + */ + private boolean conditionL0(String value, int index) { + if (index == value.length() - 3 && + contains(value, index - 1, 4, "ILLO", "ILLA", "ALLE")) { + return true; + } else if ((contains(value, value.length() - 2, 2, "AS", "OS") || + contains(value, value.length() - 1, 1, "A", "O")) && + contains(value, index - 1, 4, "ALLE")) { + return true; + } else { + return false; + } + } + + /** + * Complex condition 0 for 'M' + */ + private boolean conditionM0(String value, int index) { + if (charAt(value, index + 1) == 'M') { + return true; + } + return contains(value, index - 1, 3, "UMB") && + ((index + 1) == value.length() - 1 || contains(value, + index + 2, 2, "ER")); + } + + //-- BEGIN HELPER FUNCTIONS --// + + /** + * Determines whether or not a value is of slavo-germanic orgin. A value is + * of slavo-germanic origin if it contians any of 'W', 'K', 'CZ', or 'WITZ'. + */ + private boolean isSlavoGermanic(String value) { + return value.indexOf('W') > -1 || value.indexOf('K') > -1 || + value.indexOf("CZ") > -1 || value.indexOf("WITZ") > -1; + } + + /** + * Determines whether or not a character is a vowel or not + */ + private boolean isVowel(char ch) { + return VOWELS.indexOf(ch) != -1; + } + + /** + * Determines whether or not the value starts with a silent letter. It will + * return true if the value starts with any of 'GN', 'KN', + * 'PN', 'WR' or 'PS'. + */ + private boolean isSilentStart(String value) { + boolean result = false; + for (String element : SILENT_START) { + if (value.startsWith(element)) { + result = true; + break; + } + } + return result; + } + + /** + * Cleans the input + */ + private String cleanInput(String input) { + if (input == null) { + return null; + } + input = input.trim(); + if (input.length() == 0) { + return null; + } + return input.toUpperCase(java.util.Locale.ENGLISH); + } + + /** + * Gets the character at index index if available, otherwise + * it returns Character.MIN_VALUE so that there is some sort + * of a default + */ + protected char charAt(String value, int index) { + if (index < 0 || index >= value.length()) { + return Character.MIN_VALUE; + } + return value.charAt(index); + } + + /** + * Shortcut method with 1 criteria + */ + private static boolean contains(String value, int start, int length, + String criteria) { + return contains(value, start, length, + new String[] { criteria }); + } + + /** + * Shortcut method with 2 criteria + */ + private static boolean contains(String value, int start, int length, + String criteria1, String criteria2) { + return contains(value, start, length, + new String[] { criteria1, criteria2 }); + } + + /** + * Shortcut method with 3 criteria + */ + private static boolean contains(String value, int start, int length, + String criteria1, String criteria2, + String criteria3) { + return contains(value, start, length, + new String[] { criteria1, criteria2, criteria3 }); + } + + /** + * Shortcut method with 4 criteria + */ + private static boolean contains(String value, int start, int length, + String criteria1, String criteria2, + String criteria3, String criteria4) { + return contains(value, start, length, + new String[] { criteria1, criteria2, criteria3, + criteria4 }); + } + + /** + * Shortcut method with 5 criteria + */ + private static boolean contains(String value, int start, int length, + String criteria1, String criteria2, + String criteria3, String criteria4, + String criteria5) { + return contains(value, start, length, + new String[] { criteria1, criteria2, criteria3, + criteria4, criteria5 }); + } + + /** + * Shortcut method with 6 criteria + */ + private static boolean contains(String value, int start, int length, + String criteria1, String criteria2, + String criteria3, String criteria4, + String criteria5, String criteria6) { + return contains(value, start, length, + new String[] { criteria1, criteria2, criteria3, + criteria4, criteria5, criteria6 }); + } + + /** + * Determines whether value contains any of the criteria starting at index start and + * matching up to length length + */ + protected static boolean contains(String value, int start, int length, + String[] criteria) { + boolean result = false; + if (start >= 0 && start + length <= value.length()) { + String target = value.substring(start, start + length); + + for (String element : criteria) { + if (target.equals(element)) { + result = true; + break; + } + } + } + return result; + } + + //-- BEGIN INNER CLASSES --// + + /** + * Inner class for storing results, since there is the optional alternate + * encoding. + */ + public class DoubleMetaphoneResult { + + private final StringBuffer primary = new StringBuffer(getMaxCodeLen()); + private final StringBuffer alternate = new StringBuffer(getMaxCodeLen()); + private final int maxLength; + + public DoubleMetaphoneResult(int maxLength) { + this.maxLength = maxLength; + } + + public void append(char value) { + appendPrimary(value); + appendAlternate(value); + } + + public void append(char primary, char alternate) { + appendPrimary(primary); + appendAlternate(alternate); + } + + public void appendPrimary(char value) { + if (this.primary.length() < this.maxLength) { + this.primary.append(value); + } + } + + public void appendAlternate(char value) { + if (this.alternate.length() < this.maxLength) { + this.alternate.append(value); + } + } + + public void append(String value) { + appendPrimary(value); + appendAlternate(value); + } + + public void append(String primary, String alternate) { + appendPrimary(primary); + appendAlternate(alternate); + } + + public void appendPrimary(String value) { + int addChars = this.maxLength - this.primary.length(); + if (value.length() <= addChars) { + this.primary.append(value); + } else { + this.primary.append(value.substring(0, addChars)); + } + } + + public void appendAlternate(String value) { + int addChars = this.maxLength - this.alternate.length(); + if (value.length() <= addChars) { + this.alternate.append(value); + } else { + this.alternate.append(value.substring(0, addChars)); + } + } + + public String getPrimary() { + return this.primary.toString(); + } + + public String getAlternate() { + return this.alternate.toString(); + } + + public boolean isComplete() { + return this.primary.length() >= this.maxLength && + this.alternate.length() >= this.maxLength; + } + } +} diff --git a/source/net/yacy/cora/language/phonetic/Metaphone.java b/source/net/yacy/cora/language/phonetic/Metaphone.java new file mode 100644 index 000000000..65f4f0716 --- /dev/null +++ b/source/net/yacy/cora/language/phonetic/Metaphone.java @@ -0,0 +1,407 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package net.yacy.cora.language.phonetic; + +import org.apache.commons.codec.EncoderException; +import org.apache.commons.codec.StringEncoder; + +/** + * Encodes a string into a Metaphone value. + *

+ * Initial Java implementation by William B. Brogden. December, 1997. + * Permission given by wbrogden for code to be used anywhere. + *

+ *

+ * Hanging on the Metaphone by Lawrence Philips in Computer Language of Dec. 1990, p + * 39. + *

+ *

+ * Note, that this does not match the algorithm that ships with PHP, or the algorithm + * found in the Perl Text:Metaphone-1.96. + * They have had undocumented changes from the originally published algorithm. + * For more information, see CODEC-57. + *

+ * + * @author Apache Software Foundation + * @version $Id: Metaphone.java 1157192 2011-08-12 17:27:38Z ggregory $ + */ +public class Metaphone implements StringEncoder { + + /** + * Five values in the English language + */ + private static final String VOWELS = "AEIOU" ; + + /** + * Variable used in Metaphone algorithm + */ + private static final String FRONTV = "EIY" ; + + /** + * Variable used in Metaphone algorithm + */ + private static final String VARSON = "CSPTG" ; + + /** + * The max code length for metaphone is 4 + */ + private int maxCodeLen = 4 ; + + /** + * Creates an instance of the Metaphone encoder + */ + public Metaphone() { + super(); + } + + /** + * Find the metaphone value of a String. This is similar to the + * soundex algorithm, but better at finding similar sounding words. + * All input is converted to upper case. + * Limitations: Input format is expected to be a single ASCII word + * with only characters in the A - Z range, no punctuation or numbers. + * + * @param txt String to find the metaphone code for + * @return A metaphone code corresponding to the String supplied + */ + public String metaphone(String txt) { + boolean hard = false ; + if ((txt == null) || (txt.length() == 0)) { + return "" ; + } + // single character is itself + if (txt.length() == 1) { + return txt.toUpperCase(java.util.Locale.ENGLISH) ; + } + + char[] inwd = txt.toUpperCase(java.util.Locale.ENGLISH).toCharArray() ; + + StringBuffer local = new StringBuffer(40); // manipulate + StringBuffer code = new StringBuffer(10) ; // output + // handle initial 2 characters exceptions + switch(inwd[0]) { + case 'K' : + case 'G' : + case 'P' : /* looking for KN, etc*/ + if (inwd[1] == 'N') { + local.append(inwd, 1, inwd.length - 1); + } else { + local.append(inwd); + } + break; + case 'A': /* looking for AE */ + if (inwd[1] == 'E') { + local.append(inwd, 1, inwd.length - 1); + } else { + local.append(inwd); + } + break; + case 'W' : /* looking for WR or WH */ + if (inwd[1] == 'R') { // WR -> R + local.append(inwd, 1, inwd.length - 1); + break ; + } + if (inwd[1] == 'H') { + local.append(inwd, 1, inwd.length - 1); + local.setCharAt(0, 'W'); // WH -> W + } else { + local.append(inwd); + } + break; + case 'X' : /* initial X becomes S */ + inwd[0] = 'S'; + local.append(inwd); + break ; + default : + local.append(inwd); + } // now local has working string with initials fixed + + int wdsz = local.length(); + int n = 0 ; + + while ((code.length() < this.getMaxCodeLen()) && + (n < wdsz) ) { // max code size of 4 works well + char symb = local.charAt(n) ; + // remove duplicate letters except C + if ((symb != 'C') && (isPreviousChar( local, n, symb )) ) { + n++ ; + } else { // not dup + switch(symb) { + case 'A' : case 'E' : case 'I' : case 'O' : case 'U' : + if (n == 0) { + code.append(symb); + } + break ; // only use vowel if leading char + case 'B' : + if ( isPreviousChar(local, n, 'M') && + isLastChar(wdsz, n) ) { // B is silent if word ends in MB + break; + } + code.append(symb); + break; + case 'C' : // lots of C special cases + /* discard if SCI, SCE or SCY */ + if ( isPreviousChar(local, n, 'S') && + !isLastChar(wdsz, n) && + (FRONTV.indexOf(local.charAt(n + 1)) >= 0) ) { + break; + } + if (regionMatch(local, n, "CIA")) { // "CIA" -> X + code.append('X'); + break; + } + if (!isLastChar(wdsz, n) && + (FRONTV.indexOf(local.charAt(n + 1)) >= 0)) { + code.append('S'); + break; // CI,CE,CY -> S + } + if (isPreviousChar(local, n, 'S') && + isNextChar(local, n, 'H') ) { // SCH->sk + code.append('K') ; + break ; + } + if (isNextChar(local, n, 'H')) { // detect CH + if ((n == 0) && + (wdsz >= 3) && + isVowel(local,2) ) { // CH consonant -> K consonant + code.append('K'); + } else { + code.append('X'); // CHvowel -> X + } + } else { + code.append('K'); + } + break ; + case 'D' : + if (!isLastChar(wdsz, n + 1) && + isNextChar(local, n, 'G') && + (FRONTV.indexOf(local.charAt(n + 2)) >= 0)) { // DGE DGI DGY -> J + code.append('J'); n += 2 ; + } else { + code.append('T'); + } + break ; + case 'G' : // GH silent at end or before consonant + if (isLastChar(wdsz, n + 1) && + isNextChar(local, n, 'H')) { + break; + } + if (!isLastChar(wdsz, n + 1) && + isNextChar(local,n,'H') && + !isVowel(local,n+2)) { + break; + } + if ((n > 0) && + ( regionMatch(local, n, "GN") || + regionMatch(local, n, "GNED") ) ) { + break; // silent G + } + if (isPreviousChar(local, n, 'G')) { + // NOTE: Given that duplicated chars are removed, I don't see how this can ever be true + hard = true ; + } else { + hard = false ; + } + if (!isLastChar(wdsz, n) && + (FRONTV.indexOf(local.charAt(n + 1)) >= 0) && + (!hard)) { + code.append('J'); + } else { + code.append('K'); + } + break ; + case 'H': + if (isLastChar(wdsz, n)) { + break ; // terminal H + } + if ((n > 0) && + (VARSON.indexOf(local.charAt(n - 1)) >= 0)) { + break; + } + if (isVowel(local,n+1)) { + code.append('H'); // Hvowel + } + break; + case 'F': + case 'J' : + case 'L' : + case 'M': + case 'N' : + case 'R' : + code.append(symb); + break; + case 'K' : + if (n > 0) { // not initial + if (!isPreviousChar(local, n, 'C')) { + code.append(symb); + } + } else { + code.append(symb); // initial K + } + break ; + case 'P' : + if (isNextChar(local,n,'H')) { + // PH -> F + code.append('F'); + } else { + code.append(symb); + } + break ; + case 'Q' : + code.append('K'); + break; + case 'S' : + if (regionMatch(local,n,"SH") || + regionMatch(local,n,"SIO") || + regionMatch(local,n,"SIA")) { + code.append('X'); + } else { + code.append('S'); + } + break; + case 'T' : + if (regionMatch(local,n,"TIA") || + regionMatch(local,n,"TIO")) { + code.append('X'); + break; + } + if (regionMatch(local,n,"TCH")) { + // Silent if in "TCH" + break; + } + // substitute numeral 0 for TH (resembles theta after all) + if (regionMatch(local,n,"TH")) { + code.append('0'); + } else { + code.append('T'); + } + break ; + case 'V' : + code.append('F'); break ; + case 'W' : case 'Y' : // silent if not followed by vowel + if (!isLastChar(wdsz,n) && + isVowel(local,n+1)) { + code.append(symb); + } + break ; + case 'X' : + code.append('K'); code.append('S'); + break ; + case 'Z' : + code.append('S'); break ; + } // end switch + n++ ; + } // end else from symb != 'C' + if (code.length() > this.getMaxCodeLen()) { + code.setLength(this.getMaxCodeLen()); + } + } + return code.toString(); + } + + private boolean isVowel(StringBuffer string, int index) { + return VOWELS.indexOf(string.charAt(index)) >= 0; + } + + private boolean isPreviousChar(StringBuffer string, int index, char c) { + boolean matches = false; + if( index > 0 && + index < string.length() ) { + matches = string.charAt(index - 1) == c; + } + return matches; + } + + private boolean isNextChar(StringBuffer string, int index, char c) { + boolean matches = false; + if( index >= 0 && + index < string.length() - 1 ) { + matches = string.charAt(index + 1) == c; + } + return matches; + } + + private boolean regionMatch(StringBuffer string, int index, String test) { + boolean matches = false; + if( index >= 0 && + (index + test.length() - 1) < string.length() ) { + String substring = string.substring( index, index + test.length()); + matches = substring.equals( test ); + } + return matches; + } + + private boolean isLastChar(int wdsz, int n) { + return n + 1 == wdsz; + } + + + /** + * Encodes an Object using the metaphone algorithm. This method + * is provided in order to satisfy the requirements of the + * Encoder interface, and will throw an EncoderException if the + * supplied object is not of type java.lang.String. + * + * @param pObject Object to encode + * @return An object (or type java.lang.String) containing the + * metaphone code which corresponds to the String supplied. + * @throws EncoderException if the parameter supplied is not + * of type java.lang.String + */ + public Object encode(Object pObject) throws EncoderException { + if (!(pObject instanceof String)) { + throw new EncoderException("Parameter supplied to Metaphone encode is not of type java.lang.String"); + } + return metaphone((String) pObject); + } + + /** + * Encodes a String using the Metaphone algorithm. + * + * @param pString String object to encode + * @return The metaphone code corresponding to the String supplied + */ + public String encode(String pString) { + return metaphone(pString); + } + + /** + * Tests is the metaphones of two strings are identical. + * + * @param str1 First of two strings to compare + * @param str2 Second of two strings to compare + * @return true if the metaphones of these strings are identical, + * false otherwise. + */ + public boolean isMetaphoneEqual(String str1, String str2) { + return metaphone(str1).equals(metaphone(str2)); + } + + /** + * Returns the maxCodeLen. + * @return int + */ + public int getMaxCodeLen() { return this.maxCodeLen; } + + /** + * Sets the maxCodeLen. + * @param maxCodeLen The maxCodeLen to set + */ + public void setMaxCodeLen(int maxCodeLen) { this.maxCodeLen = maxCodeLen; } + +} diff --git a/source/net/yacy/cora/language/phonetic/Phonetic.java b/source/net/yacy/cora/language/phonetic/Phonetic.java new file mode 100644 index 000000000..f59bbd137 --- /dev/null +++ b/source/net/yacy/cora/language/phonetic/Phonetic.java @@ -0,0 +1,73 @@ +/** + * Phonetic + * Copyright 201 by Michael Peter Christen, mc@yacy.net, Frankfurt a. M., Germany + * First released 13.12.2011 at http://yacy.net + * + * $LastChangedDate$ + * $LastChangedRevision$ + * $LastChangedBy$ + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this program in the file lgpl21.txt + * If not, see . + */ + +package net.yacy.cora.language.phonetic; + +public class Phonetic { + + public enum Encoder { + SOUNDEX("Soundex", ""), + COLONE_PHONETIC("Koelner Phonetik", "http://de.wikipedia.org/wiki/K%C3%B6lner_Phonetik"), + METAPHONE("Metaphone", ""), + DOUBLE_METAPHONE("Double Metaphone", ""), + NONE("", ""); + + final String printName; + final String infoUrl; + + private Encoder(final String printName, final String infoUrl) { + this.printName = printName; + this.infoUrl = infoUrl; + } + } + + private static final Soundex soundexEncoder = new Soundex(); + private static final Metaphone metaphoneEncoder = new Metaphone(); + private static final DoubleMetaphone doubleMetaphoneEncoder = new DoubleMetaphone(); + private static final ColognePhonetic colognePhonetic = new ColognePhonetic(); + + public static String encode(final Encoder encoder, final String s) { + try { + if (encoder == Encoder.SOUNDEX) return soundexEncoder.encode(s); + if (encoder == Encoder.COLONE_PHONETIC) return colognePhonetic.encode(s); + if (encoder == Encoder.METAPHONE) return metaphoneEncoder.encode(s); + if (encoder == Encoder.DOUBLE_METAPHONE) return doubleMetaphoneEncoder.encode(s); + return s; + } catch (Throwable e) { + // some encoders do not work with non-ASCII charachters and throw an exception + return s; + } + } + + public static void main(String[] args) { + for (Encoder encoder: Encoder.values()) { + for (String s: args) { + System.out.print(Phonetic.encode(encoder, s)); + System.out.print(" "); + } + System.out.println(); + } + } + +} diff --git a/source/net/yacy/cora/language/phonetic/Soundex.java b/source/net/yacy/cora/language/phonetic/Soundex.java new file mode 100644 index 000000000..7581aa944 --- /dev/null +++ b/source/net/yacy/cora/language/phonetic/Soundex.java @@ -0,0 +1,340 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package net.yacy.cora.language.phonetic; + +import org.apache.commons.codec.EncoderException; +import org.apache.commons.codec.StringEncoder; + +/** + * Encodes a string into a Soundex value. Soundex is an encoding used to relate similar names, but can also be used as a + * general purpose scheme to find word with similar phonemes. + * + * @author Apache Software Foundation + * @version $Id: Soundex.java 1201529 2011-11-13 21:57:16Z ggregory $ + */ +public class Soundex implements StringEncoder { + + /** + * This is a default mapping of the 26 letters used in US English. A value of 0 for a letter position + * means do not encode. + *

+ * (This constant is provided as both an implementation convenience and to allow Javadoc to pick + * up the value for the constant values page.) + *

+ * + * @see #US_ENGLISH_MAPPING + */ + public static final String US_ENGLISH_MAPPING_STRING = "01230120022455012623010202"; + + /** + * This is a default mapping of the 26 letters used in US English. A value of 0 for a letter position + * means do not encode. + * + * @see Soundex#Soundex(char[]) + */ + private static final char[] US_ENGLISH_MAPPING = US_ENGLISH_MAPPING_STRING.toCharArray(); + + /** + * An instance of Soundex using the US_ENGLISH_MAPPING mapping. + * + * @see #US_ENGLISH_MAPPING + */ + public static final Soundex US_ENGLISH = new Soundex(); + + /** + * Every letter of the alphabet is "mapped" to a numerical value. This char array holds the values to which each + * letter is mapped. This implementation contains a default map for US_ENGLISH + */ + private final char[] soundexMapping; + + /** + * Creates an instance using US_ENGLISH_MAPPING + * + * @see Soundex#Soundex(char[]) + * @see Soundex#US_ENGLISH_MAPPING + */ + public Soundex() { + this.soundexMapping = US_ENGLISH_MAPPING; + } + + /** + * Creates a soundex instance using the given mapping. This constructor can be used to provide an internationalized + * mapping for a non-Western character set. + * + * Every letter of the alphabet is "mapped" to a numerical value. This char array holds the values to which each + * letter is mapped. This implementation contains a default map for US_ENGLISH + * + * @param mapping + * Mapping array to use when finding the corresponding code for a given character + */ + public Soundex(char[] mapping) { + this.soundexMapping = new char[mapping.length]; + System.arraycopy(mapping, 0, this.soundexMapping, 0, mapping.length); + } + + /** + * Creates a refined soundex instance using a custom mapping. This constructor can be used to customize the mapping, + * and/or possibly provide an internationalized mapping for a non-Western character set. + * + * @param mapping + * Mapping string to use when finding the corresponding code for a given character + * @since 1.4 + */ + public Soundex(String mapping) { + this.soundexMapping = mapping.toCharArray(); + } + + /** + * Encodes the Strings and returns the number of characters in the two encoded Strings that are the same. This + * return value ranges from 0 through 4: 0 indicates little or no similarity, and 4 indicates strong similarity or + * identical values. + * + * @param s1 + * A String that will be encoded and compared. + * @param s2 + * A String that will be encoded and compared. + * @return The number of characters in the two encoded Strings that are the same from 0 to 4. + * + * @see SoundexUtils#difference(StringEncoder,String,String) + * @see MS + * T-SQL DIFFERENCE + * + * @throws EncoderException + * if an error occurs encoding one of the strings + * @since 1.3 + */ + public int difference(String s1, String s2) throws EncoderException { + return difference(this, s1, s2); + } + + /** + * Encodes an Object using the soundex algorithm. This method is provided in order to satisfy the requirements of + * the Encoder interface, and will throw an EncoderException if the supplied object is not of type java.lang.String. + * + * @param pObject + * Object to encode + * @return An object (or type java.lang.String) containing the soundex code which corresponds to the String + * supplied. + * @throws EncoderException + * if the parameter supplied is not of type java.lang.String + * @throws IllegalArgumentException + * if a character is not mapped + */ + public Object encode(Object pObject) throws EncoderException { + if (!(pObject instanceof String)) { + throw new EncoderException("Parameter supplied to Soundex encode is not of type java.lang.String"); + } + return soundex((String) pObject); + } + + /** + * Encodes a String using the soundex algorithm. + * + * @param pString + * A String object to encode + * @return A Soundex code corresponding to the String supplied + * @throws IllegalArgumentException + * if a character is not mapped + */ + public String encode(String pString) { + return soundex(pString); + } + + /** + * Used internally by the SoundEx algorithm. + * + * Consonants from the same code group separated by W or H are treated as one. + * + * @param str + * the cleaned working string to encode (in upper case). + * @param index + * the character position to encode + * @return Mapping code for a particular character + * @throws IllegalArgumentException + * if the character is not mapped + */ + private char getMappingCode(String str, int index) { + // map() throws IllegalArgumentException + char mappedChar = this.map(str.charAt(index)); + // HW rule check + if (index > 1 && mappedChar != '0') { + char hwChar = str.charAt(index - 1); + if ('H' == hwChar || 'W' == hwChar) { + char preHWChar = str.charAt(index - 2); + char firstCode = this.map(preHWChar); + if (firstCode == mappedChar || 'H' == preHWChar || 'W' == preHWChar) { + return 0; + } + } + } + return mappedChar; + } + + /** + * Returns the soundex mapping. + * + * @return soundexMapping. + */ + private char[] getSoundexMapping() { + return this.soundexMapping; + } + + /** + * Maps the given upper-case character to its Soundex code. + * + * @param ch + * An upper-case character. + * @return A Soundex code. + * @throws IllegalArgumentException + * Thrown if ch is not mapped. + */ + private char map(char ch) { + int index = ch - 'A'; + if (index < 0 || index >= this.getSoundexMapping().length) { + throw new IllegalArgumentException("The character is not mapped: " + ch); + } + return this.getSoundexMapping()[index]; + } + + /** + * Retrieves the Soundex code for a given String object. + * + * @param str + * String to encode using the Soundex algorithm + * @return A soundex code for the String supplied + * @throws IllegalArgumentException + * if a character is not mapped + */ + public String soundex(String str) { + if (str == null) { + return null; + } + str = clean(str); + if (str.length() == 0) { + return str; + } + char out[] = {'0', '0', '0', '0'}; + char last, mapped; + int incount = 1, count = 1; + out[0] = str.charAt(0); + // getMappingCode() throws IllegalArgumentException + last = getMappingCode(str, 0); + while ((incount < str.length()) && (count < out.length)) { + mapped = getMappingCode(str, incount++); + if (mapped != 0) { + if ((mapped != '0') && (mapped != last)) { + out[count++] = mapped; + } + last = mapped; + } + } + return new String(out); + } + + + /** + * Cleans up the input string before Soundex processing by only returning + * upper case letters. + * + * @param str + * The String to clean. + * @return A clean String. + */ + static String clean(String str) { + if (str == null || str.length() == 0) { + return str; + } + int len = str.length(); + char[] chars = new char[len]; + int count = 0; + for (int i = 0; i < len; i++) { + if (Character.isLetter(str.charAt(i))) { + chars[count++] = str.charAt(i); + } + } + if (count == len) { + return str.toUpperCase(java.util.Locale.ENGLISH); + } + return new String(chars, 0, count).toUpperCase(java.util.Locale.ENGLISH); + } + + /** + * Encodes the Strings and returns the number of characters in the two + * encoded Strings that are the same. + * + * + * @param encoder + * The encoder to use to encode the Strings. + * @param s1 + * A String that will be encoded and compared. + * @param s2 + * A String that will be encoded and compared. + * @return The number of characters in the two Soundex encoded Strings that + * are the same. + * + * @see #differenceEncoded(String,String) + * @see + * MS T-SQL DIFFERENCE + * + * @throws EncoderException + * if an error occurs encoding one of the strings + */ + static int difference(StringEncoder encoder, String s1, String s2) throws EncoderException { + return differenceEncoded(encoder.encode(s1), encoder.encode(s2)); + } + + /** + * Returns the number of characters in the two Soundex encoded Strings that + * are the same. + * + * + * @param es1 + * An encoded String. + * @param es2 + * An encoded String. + * @return The number of characters in the two Soundex encoded Strings that + * are the same. + * + * @see + * MS T-SQL DIFFERENCE + */ + static int differenceEncoded(String es1, String es2) { + + if (es1 == null || es2 == null) { + return 0; + } + int lengthToMatch = Math.min(es1.length(), es2.length()); + int diff = 0; + for (int i = 0; i < lengthToMatch; i++) { + if (es1.charAt(i) == es2.charAt(i)) { + diff++; + } + } + return diff; + } +} diff --git a/source/net/yacy/cora/plugin/ClassProvider.java b/source/net/yacy/cora/plugin/ClassProvider.java index 31cc58a02..988631067 100644 --- a/source/net/yacy/cora/plugin/ClassProvider.java +++ b/source/net/yacy/cora/plugin/ClassProvider.java @@ -1,3 +1,27 @@ +/** + * ClassProvider + * Copyright 201 by Michael Peter Christen, mc@yacy.net, Frankfurt a. M., Germany + * First released 13.12.2011 at http://yacy.net + * + * $LastChangedDate$ + * $LastChangedRevision$ + * $LastChangedBy$ + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this program in the file lgpl21.txt + * If not, see . + */ + package net.yacy.cora.plugin; import java.io.File;