diff --git a/source/net/yacy/cora/language/phonetic/ColognePhonetic.java b/source/net/yacy/cora/language/phonetic/ColognePhonetic.java new file mode 100644 index 000000000..be37e2035 --- /dev/null +++ b/source/net/yacy/cora/language/phonetic/ColognePhonetic.java @@ -0,0 +1,429 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package net.yacy.cora.language.phonetic; + +import java.util.Locale; + +import org.apache.commons.codec.EncoderException; +import org.apache.commons.codec.StringEncoder; + +/** + *
+ * Encodes a string into a Cologne Phonetic value. + *
+ *+ * Implements the Kölner Phonetik (Cologne Phonetic) + * algorithm issued by Hans Joachim Postel in 1969. + *
+ * + *+ * The Kölner Phonetik is a phonetic algorithm which is optimized for the German language. It is related to the + * well-known soundex algorithm. + *
+ * + *Letter | + *Context | + *Code | + *
---|---|---|
A, E, I, J, O, U, Y | + *+ * | 0 | + *
H | + *+ * | - | + *
B | + *+ * | 1 | + *
P | + *not before H | + * + *|
D, T | + *not before C, S, Z | + *2 | + *
F, V, W | + *+ * | 3 | + *
P | + *before H | + *|
G, K, Q | + *+ * | 4 | + *
C | + *at onset before A, H, K, L, O, Q, R, U, X | + * + *|
before A, H, K, O, Q, U, X except after S, Z | + *||
X | + *not after C, K, Q | + *48 | + *
L | + *+ * + * | 5 | + *
M, N | + *+ * | 6 | + *
R | + *+ * | 7 | + *
S, Z | + *+ * | 8 | + *
C | + *after S, Z | + *|
at onset except before A, H, K, L, O, Q, R, U, X | + *||
not before A, H, K, O, Q, U, X | + *||
D, T | + *before C, S, Z | + *|
X | + *after C, K, Q | + *
+ * (Source: Wikipedia (de): + * Kölner Phonetik -- Buchstabencodes) + *
+ * + *+ * Implements the Kölner Phonetik algorithm. + *
+ *+ * In contrast to the initial description of the algorithm, this implementation does the encoding in one pass. + *
+ * + * @param text + * @return the corresponding encoding according to the Kölner Phonetik algorithm + */ + public String colognePhonetic(String text) { + if (text == null) { + return null; + } + + text = preprocess(text); + + CologneOutputBuffer output = new CologneOutputBuffer(text.length() * 2); + CologneInputBuffer input = new CologneInputBuffer(text.toCharArray()); + + char nextChar; + + char lastChar = '-'; + char lastCode = '/'; + char code; + char chr; + + int rightLength = input.length(); + + while (rightLength > 0) { + chr = input.removeNext(); + + if ((rightLength = input.length()) > 0) { + nextChar = input.getNextChar(); + } else { + nextChar = '-'; + } + + if (arrayContains(new char[]{'A', 'E', 'I', 'J', 'O', 'U', 'Y'}, chr)) { + code = '0'; + } else if (chr == 'H' || chr < 'A' || chr > 'Z') { + if (lastCode == '/') { + continue; + } + code = '-'; + } else if (chr == 'B' || (chr == 'P' && nextChar != 'H')) { + code = '1'; + } else if ((chr == 'D' || chr == 'T') && !arrayContains(new char[]{'S', 'C', 'Z'}, nextChar)) { + code = '2'; + } else if (arrayContains(new char[]{'W', 'F', 'P', 'V'}, chr)) { + code = '3'; + } else if (arrayContains(new char[]{'G', 'K', 'Q'}, chr)) { + code = '4'; + } else if (chr == 'X' && !arrayContains(new char[]{'C', 'K', 'Q'}, lastChar)) { + code = '4'; + input.addLeft('S'); + rightLength++; + } else if (chr == 'S' || chr == 'Z') { + code = '8'; + } else if (chr == 'C') { + if (lastCode == '/') { + if (arrayContains(new char[]{'A', 'H', 'K', 'L', 'O', 'Q', 'R', 'U', 'X'}, nextChar)) { + code = '4'; + } else { + code = '8'; + } + } else { + if (arrayContains(new char[]{'S', 'Z'}, lastChar) || + !arrayContains(new char[]{'A', 'H', 'O', 'U', 'K', 'Q', 'X'}, nextChar)) { + code = '8'; + } else { + code = '4'; + } + } + } else if (arrayContains(new char[]{'T', 'D', 'X'}, chr)) { + code = '8'; + } else if (chr == 'R') { + code = '7'; + } else if (chr == 'L') { + code = '5'; + } else if (chr == 'M' || chr == 'N') { + code = '6'; + 
} else { + code = chr; + } + + if (code != '-' && (lastCode != code && (code != '0' || lastCode == '/') || code < '0' || code > '8')) { + output.addRight(code); + } + + lastChar = chr; + lastCode = code; + } + return output.toString(); + } + + public Object encode(Object object) throws EncoderException { + if (!(object instanceof String)) { + throw new EncoderException("This method's parameter was expected to be of the type " + + String.class.getName() + + ". But actually it was of the type " + + object.getClass().getName() + + "."); + } + return encode((String) object); + } + + public String encode(String text) { + return colognePhonetic(text); + } + + public boolean isEncodeEqual(String text1, String text2) { + return colognePhonetic(text1).equals(colognePhonetic(text2)); + } + + /** + * Converts the string to upper case and replaces germanic characters as defined in {@link #PREPROCESS_MAP}. + */ + private String preprocess(String text) { + text = text.toUpperCase(Locale.GERMAN); + + char[] chrs = text.toCharArray(); + + for (int index = 0; index < chrs.length; index++) { + if (chrs[index] > 'Z') { + for (char[] element : PREPROCESS_MAP) { + if (chrs[index] == element[0]) { + chrs[index] = element[1]; + break; + } + } + } + } + return new String(chrs); + } +} diff --git a/source/net/yacy/cora/language/phonetic/DoubleMetaphone.java b/source/net/yacy/cora/language/phonetic/DoubleMetaphone.java new file mode 100644 index 000000000..1ccbf73c2 --- /dev/null +++ b/source/net/yacy/cora/language/phonetic/DoubleMetaphone.java @@ -0,0 +1,1105 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package net.yacy.cora.language.phonetic; + +import org.apache.commons.codec.EncoderException; +import org.apache.commons.codec.StringEncoder; + +/** + * Encodes a string into a double metaphone value. + * This Implementation is based on the algorithm by Lawrence Philips. + *obj
is a String
(like Metaphone
).
+ *
+ * @param obj Object to encode (should be of type String)
+ * @return An encoded Object (will be of type String)
+ * @throws EncoderException encode parameter is not of type String
+ */
+ public Object encode(Object obj) throws EncoderException {
+     // Only String input is accepted; instanceof is false for null, so null is rejected too.
+     if (obj instanceof String) {
+         return doubleMetaphone((String) obj);
+     }
+     throw new EncoderException("DoubleMetaphone encode parameter is not of type String");
+ }
+
+ /**
+ * Encode the value using DoubleMetaphone.
+ * This is a direct delegate to {@code doubleMetaphone(String)}; it adds no
+ * validation or conversion of its own.
+ *
+ * @param value String to encode
+ * @return An encoded String, exactly as returned by {@code doubleMetaphone(value)}
+ */
+ public String encode(String value) {
+ return doubleMetaphone(value);
+ }
+
+ /**
+ * Check if the Double Metaphone values of two {@code String} values
+ * are equal.
+ * Equivalent to calling {@code isDoubleMetaphoneEqual(value1, value2, false)}.
+ *
+ * @param value1 The left-hand side of the encoded {@link String#equals(Object)}.
+ * @param value2 The right-hand side of the encoded {@link String#equals(Object)}.
+ * @return {@code true} if the encoded {@code String}s are equal;
+ * {@code false} otherwise.
+ * @see #isDoubleMetaphoneEqual(String,String,boolean)
+ */
+ public boolean isDoubleMetaphoneEqual(String value1, String value2) {
+ return isDoubleMetaphoneEqual(value1, value2, false);
+ }
+
+ /**
+  * Check if the Double Metaphone values of two {@code String} values
+  * are equal, optionally using the alternate value.
+  *
+  * The comparison is null-safe: two {@code null} codes compare as equal and a
+  * {@code null} code never equals a non-null one.
+  * NOTE(review): {@code doubleMetaphone(String, boolean)} is not visible from this
+  * method; since {@code cleanInput} returns {@code null} for null or blank input, a
+  * {@code null} code is presumably what such input produces - confirm against that method.
+  *
+  * @param value1 The left-hand side of the encoded {@link String#equals(Object)}.
+  * @param value2 The right-hand side of the encoded {@link String#equals(Object)}.
+  * @param alternate use the alternate value if {@code true}.
+  * @return {@code true} if the encoded {@code String}s are equal;
+  *         {@code false} otherwise.
+  */
+ public boolean isDoubleMetaphoneEqual(String value1,
+                                       String value2,
+                                       boolean alternate) {
+     final String code1 = doubleMetaphone(value1, alternate);
+     final String code2 = doubleMetaphone(value2, alternate);
+     // Calling code1.equals(code2) directly would throw NullPointerException when code1 is null.
+     return code1 == null ? code2 == null : code1.equals(code2);
+ }
+
+ /**
+ * Returns the maxCodeLen.
+ * This is the value stored in the {@code maxCodeLen} field, as last set through
+ * {@link #setMaxCodeLen(int)}; it is also used as the initial buffer capacity of
+ * {@code DoubleMetaphoneResult}.
+ *
+ * @return int the current maximum code length
+ */
+ public int getMaxCodeLen() {
+ return this.maxCodeLen;
+ }
+
+ /**
+ * Sets the maxCodeLen.
+ * The value is stored as given; no range check is performed here.
+ *
+ * @param maxCodeLen The maxCodeLen to set
+ */
+ public void setMaxCodeLen(int maxCodeLen) {
+ this.maxCodeLen = maxCodeLen;
+ }
+
+ //-- BEGIN HANDLERS --//
+
+ /**
+ * Handles 'A', 'E', 'I', 'O', 'U', and 'Y' cases
+ */
+ private int handleAEIOUY(DoubleMetaphoneResult result, int
+ index) {
+ if (index == 0) {
+ result.append('A');
+ }
+ return index + 1;
+ }
+
+ /**
+ * Handles 'C' cases.
+ * The branches are tried in order and the first match wins, so their sequence is
+ * significant. Each branch appends zero or more code characters to {@code result}
+ * and decides how many input characters are consumed.
+ *
+ * @param value the string being encoded (the literals compared against are upper case)
+ * @param result the result buffers to append to
+ * @param index position of the 'C' being handled
+ * @return the index of the next character to process
+ */
+ private int handleC(String value,
+ DoubleMetaphoneResult result,
+ int index) {
+ if (conditionC0(value, index)) { // very confusing, moved out
+ result.append('K');
+ index += 2;
+ } else if (index == 0 && contains(value, index, 6, "CAESAR")) {
+ result.append('S');
+ index += 2;
+ } else if (contains(value, index, 2, "CH")) {
+ index = handleCH(value, result, index);
+ } else if (contains(value, index, 2, "CZ") &&
+ !contains(value, index - 2, 4, "WICZ")) {
+ //-- "Czerny" --//
+ result.append('S', 'X');
+ index += 2;
+ } else if (contains(value, index + 1, 3, "CIA")) {
+ //-- "focaccia" --//
+ result.append('X');
+ index += 3;
+ } else if (contains(value, index, 2, "CC") &&
+ !(index == 1 && charAt(value, 0) == 'M')) {
+ //-- double "cc" but not "McClelland" --//
+ return handleCC(value, result, index);
+ } else if (contains(value, index, 2, "CK", "CG", "CQ")) {
+ result.append('K');
+ index += 2;
+ } else if (contains(value, index, 2, "CI", "CE", "CY")) {
+ //-- Italian vs. English --//
+ if (contains(value, index, 3, "CIO", "CIE", "CIA")) {
+ result.append('S', 'X');
+ } else {
+ result.append('S');
+ }
+ index += 2;
+ } else {
+ result.append('K'); // default: hard 'C'
+ if (contains(value, index + 1, 2, " C", " Q", " G")) {
+ //-- Mac Caffrey, Mac Gregor --//
+ index += 3;
+ } else if (contains(value, index + 1, 1, "C", "K", "Q") &&
+ !contains(value, index + 1, 2, "CE", "CI")) {
+ index += 2; // skip the following C/K/Q as well
+ } else {
+ index++;
+ }
+ }
+
+ return index;
+ }
+
+ /**
+ * Handles 'CC' cases
+ */
+ private int handleCC(String value,
+ DoubleMetaphoneResult result,
+ int index) {
+ if (contains(value, index + 2, 1, "I", "E", "H") &&
+ !contains(value, index + 2, 2, "HU")) {
+ //-- "bellocchio" but not "bacchus" --//
+ if ((index == 1 && charAt(value, index - 1) == 'A') ||
+ contains(value, index - 1, 5, "UCCEE", "UCCES")) {
+ //-- "accident", "accede", "succeed" --//
+ result.append("KS");
+ } else {
+ //-- "bacci", "bertucci", other Italian --//
+ result.append('X');
+ }
+ index += 3;
+ } else { // Pierce's rule
+ result.append('K');
+ index += 2;
+ }
+
+ return index;
+ }
+
+ /**
+  * Handles 'CH' cases.
+  * Every branch consumes exactly two characters; the branches differ only in
+  * which code(s) they append.
+  *
+  * @param value  the string being encoded
+  * @param result the result buffers to append to
+  * @param index  position of the 'C' of the "CH" pair
+  * @return {@code index + 2}
+  */
+ private int handleCH(String value,
+                      DoubleMetaphoneResult result,
+                      int index) {
+     if (index > 0 && contains(value, index, 4, "CHAE")) {
+         // Michael
+         result.append('K', 'X');
+     } else if (conditionCH0(value, index) || conditionCH1(value, index)) {
+         //-- Greek roots ("chemistry", "chorus", etc.), or --//
+         //-- Germanic, Greek, or otherwise 'ch' for 'kh' sound --//
+         result.append('K');
+     } else if (index == 0) {
+         result.append('X');
+     } else if (contains(value, 0, 2, "MC")) {
+         result.append('K');
+     } else {
+         result.append('X', 'K');
+     }
+     return index + 2;
+ }
+
+ /**
+  * Handles 'D' cases.
+  * "DG" followed by I, E or Y becomes 'J' (three characters consumed); any other "DG"
+  * becomes "TK" (two consumed). Everything else becomes 'T', consuming two characters
+  * for "DT"/"DD" and one otherwise.
+  *
+  * @param value  the string being encoded
+  * @param result the result buffers to append to
+  * @param index  position of the 'D' being handled
+  * @return the index of the next character to process
+  */
+ private int handleD(String value,
+                     DoubleMetaphoneResult result,
+                     int index) {
+     if (contains(value, index, 2, "DG")) {
+         if (contains(value, index + 2, 1, "I", "E", "Y")) {
+             //-- "Edge" --//
+             result.append('J');
+             return index + 3;
+         }
+         //-- "Edgar" --//
+         result.append("TK");
+         return index + 2;
+     }
+
+     result.append('T');
+     return contains(value, index, 2, "DT", "DD") ? index + 2 : index + 1;
+ }
+
+ /**
+ * Handles 'G' cases.
+ * The branches are tried in order and the first match wins: "GH" (delegated to
+ * {@code handleGH}), "GN", "GLI", a set of word-initial prefixes, "-ger-"/"-gy-",
+ * 'G' before E/I/Y (or inside "AGGI"/"OGGI"), "GG", and finally a plain 'G'.
+ *
+ * @param value the string being encoded
+ * @param result the result buffers to append to
+ * @param index position of the 'G' being handled
+ * @param slavoGermanic flag supplied by the caller; presumably the result of
+ * {@code isSlavoGermanic(value)} - confirm against the caller
+ * @return the index of the next character to process
+ */
+ private int handleG(String value,
+ DoubleMetaphoneResult result,
+ int index,
+ boolean slavoGermanic) {
+ if (charAt(value, index + 1) == 'H') {
+ index = handleGH(value, result, index);
+ } else if (charAt(value, index + 1) == 'N') {
+ if (index == 1 && isVowel(charAt(value, 0)) && !slavoGermanic) {
+ result.append("KN", "N");
+ } else if (!contains(value, index + 2, 2, "EY") &&
+ charAt(value, index + 1) != 'Y' && !slavoGermanic) { // NOTE(review): index + 1 is 'N' in this branch, so the != 'Y' test is always true
+ result.append("N", "KN");
+ } else {
+ result.append("KN");
+ }
+ index = index + 2;
+ } else if (contains(value, index + 1, 2, "LI") && !slavoGermanic) {
+ result.append("KL", "L");
+ index += 2;
+ } else if (index == 0 && (charAt(value, index + 1) == 'Y' || contains(value, index + 1, 2, ES_EP_EB_EL_EY_IB_IL_IN_IE_EI_ER))) {
+ //-- -ges-, -gep-, -gel-, -gie- at beginning --//
+ result.append('K', 'J');
+ index += 2;
+ } else if ((contains(value, index + 1, 2, "ER") ||
+ charAt(value, index + 1) == 'Y') &&
+ !contains(value, 0, 6, "DANGER", "RANGER", "MANGER") &&
+ !contains(value, index - 1, 1, "E", "I") &&
+ !contains(value, index - 1, 3, "RGY", "OGY")) {
+ //-- -ger-, -gy- --//
+ result.append('K', 'J');
+ index += 2;
+ } else if (contains(value, index + 1, 1, "E", "I", "Y") ||
+ contains(value, index - 1, 4, "AGGI", "OGGI")) {
+ //-- Italian "biaggi" --//
+ if ((contains(value, 0 ,4, "VAN ", "VON ") || contains(value, 0, 3, "SCH")) || contains(value, index + 1, 2, "ET")) {
+ //-- obvious germanic --//
+ result.append('K');
+ } else if (contains(value, index + 1, 3, "IER")) {
+ result.append('J');
+ } else {
+ result.append('J', 'K');
+ }
+ index += 2;
+ } else if (charAt(value, index + 1) == 'G') {
+ index += 2; // "GG": both letters are consumed, one 'K' is emitted
+ result.append('K');
+ } else {
+ index++;
+ result.append('K');
+ }
+ return index;
+ }
+
+ /**
+ * Handles 'GH' cases.
+ * Every branch consumes two characters; depending on the surrounding letters the
+ * pair is encoded as 'K', 'J', 'F', or not at all.
+ *
+ * @param value the string being encoded
+ * @param result the result buffers to append to
+ * @param index position of the 'G' of the "GH" pair
+ * @return {@code index + 2}
+ */
+ private int handleGH(String value,
+ DoubleMetaphoneResult result,
+ int index) {
+ if (index > 0 && !isVowel(charAt(value, index - 1))) {
+ result.append('K'); // "GH" after a non-vowel
+ index += 2;
+ } else if (index == 0) {
+ if (charAt(value, index + 2) == 'I') {
+ result.append('J');
+ } else {
+ result.append('K');
+ }
+ index += 2;
+ } else if ((index > 1 && contains(value, index - 2, 1, "B", "H", "D")) ||
+ (index > 2 && contains(value, index - 3, 1, "B", "H", "D")) ||
+ (index > 3 && contains(value, index - 4, 1, "B", "H"))) {
+ //-- Parker's rule (with some further refinements) - "hugh"
+ index += 2; // nothing is appended in this branch
+ } else {
+ if (index > 2 && charAt(value, index - 1) == 'U' &&
+ contains(value, index - 3, 1, "C", "G", "L", "R", "T")) {
+ //-- "laugh", "McLaughlin", "cough", "gough", "rough", "tough"
+ result.append('F');
+ } else if (index > 0 && charAt(value, index - 1) != 'I') {
+ result.append('K');
+ }
+ index += 2;
+ }
+ return index;
+ }
+
+ /**
+ * Handles 'H' cases
+ */
+ private int handleH(String value,
+ DoubleMetaphoneResult result,
+ int index) {
+ //-- only keep if first & before vowel or between 2 vowels --//
+ if ((index == 0 || isVowel(charAt(value, index - 1))) &&
+ isVowel(charAt(value, index + 1))) {
+ result.append('H');
+ index += 2;
+ //-- also takes car of "HH" --//
+ } else {
+ index++;
+ }
+ return index;
+ }
+
+ /**
+ * Handles 'J' cases.
+ * Input containing "JOSE" at {@code index}, or starting with "SAN ", is handled by
+ * the first branch and always consumes one character. All other input goes through
+ * the second branch, which consumes two characters for "JJ" and one otherwise.
+ *
+ * @param value the string being encoded
+ * @param result the result buffers to append to
+ * @param index position of the 'J' being handled
+ * @param slavoGermanic flag supplied by the caller; presumably the result of
+ * {@code isSlavoGermanic(value)} - confirm against the caller
+ * @return the index of the next character to process
+ */
+ private int handleJ(String value, DoubleMetaphoneResult result, int index,
+ boolean slavoGermanic) {
+ if (contains(value, index, 4, "JOSE") || contains(value, 0, 4, "SAN ")) {
+ //-- obvious Spanish, "Jose", "San Jacinto" --//
+ if ((index == 0 && (charAt(value, index + 4) == ' ') ||
+ value.length() == 4) || contains(value, 0, 4, "SAN ")) {
+ result.append('H');
+ } else {
+ result.append('J', 'H');
+ }
+ index++;
+ } else {
+ if (index == 0 && !contains(value, index, 4, "JOSE")) { // NOTE(review): "JOSE" at index was already excluded by the outer if, so the second test is always true here
+ result.append('J', 'A');
+ } else if (isVowel(charAt(value, index - 1)) && !slavoGermanic &&
+ (charAt(value, index + 1) == 'A' || charAt(value, index + 1) == 'O')) {
+ result.append('J', 'H');
+ } else if (index == value.length() - 1) {
+ result.append('J', ' '); // the alternate code receives a literal space here
+ } else if (!contains(value, index + 1, 1, L_T_K_S_N_M_B_Z) && !contains(value, index - 1, 1, "S", "K", "L")) {
+ result.append('J');
+ }
+
+ if (charAt(value, index + 1) == 'J') {
+ index += 2;
+ } else {
+ index++;
+ }
+ }
+ return index;
+ }
+
+ /**
+  * Handles 'L' cases.
+  * A single 'L' is appended to both codes. For "LL" both letters are consumed and,
+  * when {@code conditionL0} holds, the 'L' is appended to the primary code only.
+  *
+  * @param value  the string being encoded
+  * @param result the result buffers to append to
+  * @param index  position of the 'L' being handled
+  * @return the index of the next character to process
+  */
+ private int handleL(String value,
+                     DoubleMetaphoneResult result,
+                     int index) {
+     if (charAt(value, index + 1) != 'L') {
+         result.append('L');
+         return index + 1;
+     }
+
+     if (conditionL0(value, index)) {
+         result.appendPrimary('L');
+     } else {
+         result.append('L');
+     }
+     return index + 2;
+ }
+
+ /**
+  * Handles 'P' cases.
+  * "PH" is encoded as 'F'; any other 'P' as 'P', also consuming a directly
+  * following 'P' or 'B'.
+  *
+  * @param value  the string being encoded
+  * @param result the result buffers to append to
+  * @param index  position of the 'P' being handled
+  * @return the index of the next character to process
+  */
+ private int handleP(String value,
+                     DoubleMetaphoneResult result,
+                     int index) {
+     if (charAt(value, index + 1) == 'H') {
+         result.append('F');
+         return index + 2;
+     }
+
+     result.append('P');
+     if (contains(value, index + 1, 1, "P", "B")) {
+         return index + 2;
+     }
+     return index + 1;
+ }
+
+ /**
+ * Handles 'R' cases
+ */
+ private int handleR(String value,
+ DoubleMetaphoneResult result,
+ int index,
+ boolean slavoGermanic) {
+ if (index == value.length() - 1 && !slavoGermanic &&
+ contains(value, index - 2, 2, "IE") &&
+ !contains(value, index - 4, 2, "ME", "MA")) {
+ result.appendAlternate('R');
+ } else {
+ result.append('R');
+ }
+ return charAt(value, index + 1) == 'R' ? index + 2 : index + 1;
+ }
+
+ /**
+ * Handles 'S' cases.
+ * The branches are tried in order and the first match wins: silent 'S' in
+ * "ISL"/"YSL", word-initial "SUGAR", "SH", "SIO"/"SIA", word-initial "SM"/"SN"/"SL"/"SW"
+ * or "SZ", "SC" (delegated to {@code handleSC}), and finally a plain 'S'.
+ *
+ * @param value the string being encoded
+ * @param result the result buffers to append to
+ * @param index position of the 'S' being handled
+ * @param slavoGermanic flag supplied by the caller; presumably the result of
+ * {@code isSlavoGermanic(value)} - confirm against the caller
+ * @return the index of the next character to process
+ */
+ private int handleS(String value,
+ DoubleMetaphoneResult result,
+ int index,
+ boolean slavoGermanic) {
+ if (contains(value, index - 1, 3, "ISL", "YSL")) {
+ //-- special cases "island", "isle", "carlisle", "carlysle" --//
+ index++;
+ } else if (index == 0 && contains(value, index, 5, "SUGAR")) {
+ //-- special case "sugar-" --//
+ result.append('X', 'S');
+ index++;
+ } else if (contains(value, index, 2, "SH")) {
+ if (contains(value, index + 1, 4,
+ "HEIM", "HOEK", "HOLM", "HOLZ")) {
+ //-- germanic --//
+ result.append('S');
+ } else {
+ result.append('X');
+ }
+ index += 2;
+ } else if (contains(value, index, 3, "SIO", "SIA") || contains(value, index, 4, "SIAN")) { // NOTE(review): any "SIAN" match also matches "SIA", so the second test adds nothing
+ //-- Italian and Armenian --//
+ if (slavoGermanic) {
+ result.append('S');
+ } else {
+ result.append('S', 'X');
+ }
+ index += 3;
+ } else if ((index == 0 && contains(value, index + 1, 1, "M", "N", "L", "W")) || contains(value, index + 1, 1, "Z")) {
+ //-- german & anglicisations, e.g. "smith" match "schmidt" //
+ // "snider" match "schneider" --//
+ //-- also, -sz- in slavic language altho in hungarian it //
+ // is pronounced "s" --//
+ result.append('S', 'X');
+ index = contains(value, index + 1, 1, "Z") ? index + 2 : index + 1;
+ } else if (contains(value, index, 2, "SC")) {
+ index = handleSC(value, result, index);
+ } else {
+ if (index == value.length() - 1 && contains(value, index - 2,
+ 2, "AI", "OI")){
+ //-- french e.g. "resnais", "artois" --//
+ result.appendAlternate('S');
+ } else {
+ result.append('S');
+ }
+ index = contains(value, index + 1, 1, "S", "Z") ? index + 2 : index + 1;
+ }
+ return index;
+ }
+
+ /**
+ * Handles 'SC' cases.
+ * Every branch consumes three characters (the "SC" pair plus the character after it).
+ *
+ * @param value the string being encoded
+ * @param result the result buffers to append to
+ * @param index position of the 'S' of the "SC" pair
+ * @return {@code index + 3}
+ */
+ private int handleSC(String value,
+ DoubleMetaphoneResult result,
+ int index) {
+ if (charAt(value, index + 2) == 'H') {
+ //-- Schlesinger's rule --//
+ if (contains(value, index + 3,
+ 2, "OO", "ER", "EN", "UY", "ED", "EM")) {
+ //-- Dutch origin, e.g. "school", "schooner" --//
+ if (contains(value, index + 3, 2, "ER", "EN")) {
+ //-- "schermerhorn", "schenker" --//
+ result.append("X", "SK");
+ } else {
+ result.append("SK");
+ }
+ } else {
+ if (index == 0 && !isVowel(charAt(value, 3)) && charAt(value, 3) != 'W') { // word-initial "SCH" followed by a consonant other than 'W'
+ result.append('X', 'S');
+ } else {
+ result.append('X');
+ }
+ }
+ } else if (contains(value, index + 2, 1, "I", "E", "Y")) {
+ result.append('S');
+ } else {
+ result.append("SK");
+ }
+ return index + 3;
+ }
+
+ /**
+ * Handles 'T' cases.
+ * "TION", "TIA" and "TCH" are encoded as 'X' (three characters consumed); "TH"/"TTH"
+ * as 'T', or as primary '0' with alternate 'T' (two consumed); anything else as 'T',
+ * also consuming a directly following 'T' or 'D'.
+ *
+ * @param value the string being encoded
+ * @param result the result buffers to append to
+ * @param index position of the 'T' being handled
+ * @return the index of the next character to process
+ */
+ private int handleT(String value,
+ DoubleMetaphoneResult result,
+ int index) {
+ if (contains(value, index, 4, "TION")) {
+ result.append('X');
+ index += 3;
+ } else if (contains(value, index, 3, "TIA", "TCH")) {
+ result.append('X');
+ index += 3;
+ } else if (contains(value, index, 2, "TH") || contains(value, index,
+ 3, "TTH")) {
+ if (contains(value, index + 2, 2, "OM", "AM") ||
+ //-- special case "thomas", "thames" or germanic --//
+ contains(value, 0, 4, "VAN ", "VON ") ||
+ contains(value, 0, 3, "SCH")) {
+ result.append('T');
+ } else {
+ result.append('0', 'T'); // primary gets the character '0', alternate gets 'T'
+ }
+ index += 2;
+ } else {
+ result.append('T');
+ index = contains(value, index + 1, 1, "T", "D") ? index + 2 : index + 1;
+ }
+ return index;
+ }
+
+ /**
+ * Handles 'W' cases.
+ * "WR" is encoded as 'R'. Otherwise a 'W' is encoded only at the start of the value
+ * (before a vowel or as "WH"), at the end after a vowel, inside the listed
+ * "-WSKI"/"-WSKY" endings, in values starting with "SCH", or as part of
+ * "WICZ"/"WITZ"; any other 'W' is skipped without output.
+ *
+ * @param value the string being encoded
+ * @param result the result buffers to append to
+ * @param index position of the 'W' being handled
+ * @return the index of the next character to process
+ */
+ private int handleW(String value,
+ DoubleMetaphoneResult result,
+ int index) {
+ if (contains(value, index, 2, "WR")) {
+ //-- can also be in middle of word --//
+ result.append('R');
+ index += 2;
+ } else {
+ if (index == 0 && (isVowel(charAt(value, index + 1)) ||
+ contains(value, index, 2, "WH"))) {
+ if (isVowel(charAt(value, index + 1))) {
+ //-- Wasserman should match Vasserman --//
+ result.append('A', 'F');
+ } else {
+ //-- need Uomo to match Womo --//
+ result.append('A');
+ }
+ index++;
+ } else if ((index == value.length() - 1 && isVowel(charAt(value, index - 1))) ||
+ contains(value, index - 1,
+ 5, "EWSKI", "EWSKY", "OWSKI", "OWSKY") ||
+ contains(value, 0, 3, "SCH")) {
+ //-- Arnow should match Arnoff --//
+ result.appendAlternate('F');
+ index++;
+ } else if (contains(value, index, 4, "WICZ", "WITZ")) {
+ //-- Polish e.g. "filipowicz" --//
+ result.append("TS", "FX");
+ index += 4;
+ } else {
+ index++; // silent 'W': nothing is appended
+ }
+ }
+ return index;
+ }
+
+ /**
+ * Handles 'X' cases
+ */
+ private int handleX(String value,
+ DoubleMetaphoneResult result,
+ int index) {
+ if (index == 0) {
+ result.append('S');
+ index++;
+ } else {
+ if (!((index == value.length() - 1) &&
+ (contains(value, index - 3, 3, "IAU", "EAU") ||
+ contains(value, index - 2, 2, "AU", "OU")))) {
+ //-- French e.g. breaux --//
+ result.append("KS");
+ }
+ index = contains(value, index + 1, 1, "C", "X") ? index + 2 : index + 1;
+ }
+ return index;
+ }
+
+ /**
+ * Handles 'Z' cases
+ */
+ private int handleZ(String value, DoubleMetaphoneResult result, int index,
+ boolean slavoGermanic) {
+ if (charAt(value, index + 1) == 'H') {
+ //-- Chinese pinyin e.g. "zhao" or Angelina "Zhang" --//
+ result.append('J');
+ index += 2;
+ } else {
+ if (contains(value, index + 1, 2, "ZO", "ZI", "ZA") || (slavoGermanic && (index > 0 && charAt(value, index - 1) != 'T'))) {
+ result.append("S", "TS");
+ } else {
+ result.append('S');
+ }
+ index = charAt(value, index + 1) == 'Z' ? index + 2 : index + 1;
+ }
+ return index;
+ }
+
+ //-- BEGIN CONDITIONS --//
+
+ /**
+ * Complex condition 0 for 'C'
+ */
+ private boolean conditionC0(String value, int index) {
+ if (contains(value, index, 4, "CHIA")) {
+ return true;
+ } else if (index <= 1) {
+ return false;
+ } else if (isVowel(charAt(value, index - 2))) {
+ return false;
+ } else if (!contains(value, index - 1, 3, "ACH")) {
+ return false;
+ } else {
+ char c = charAt(value, index + 2);
+ return (c != 'I' && c != 'E') ||
+ contains(value, index - 2, 6, "BACHER", "MACHER");
+ }
+ }
+
+ /**
+ * Complex condition 0 for 'CH'
+ */
+ private boolean conditionCH0(String value, int index) {
+ if (index != 0) {
+ return false;
+ } else if (!contains(value, index + 1, 5, "HARAC", "HARIS") &&
+ !contains(value, index + 1, 3, "HOR", "HYM", "HIA", "HEM")) {
+ return false;
+ } else if (contains(value, 0, 5, "CHORE")) {
+ return false;
+ } else {
+ return true;
+ }
+ }
+
+ /**
+ * Complex condition 1 for 'CH'.
+ * Holds when any of the following is true: the value starts with "VAN ", "VON " or
+ * "SCH"; the "CH" is inside "ORCHES", "ARCHIT" or "ORCHID"; the "CH" is followed by
+ * 'T' or 'S'; or the "CH" is at the start or preceded by A, O, U or E and is either
+ * followed by one of the strings in {@code L_R_N_M_B_H_F_V_W_SPACE} or ends the value.
+ *
+ * @param value the string being encoded
+ * @param index position of the 'C' being examined
+ * @return {@code true} if the condition holds
+ */
+ private boolean conditionCH1(String value, int index) {
+ return ((contains(value, 0, 4, "VAN ", "VON ") || contains(value, 0,
+ 3, "SCH")) ||
+ contains(value, index - 2, 6, "ORCHES", "ARCHIT", "ORCHID") ||
+ contains(value, index + 2, 1, "T", "S") ||
+ ((contains(value, index - 1, 1, "A", "O", "U", "E") || index == 0) &&
+ (contains(value, index + 2, 1, L_R_N_M_B_H_F_V_W_SPACE) || index + 1 == value.length() - 1))); // index + 1 is the 'H'; the last test means "CH" ends the value
+ }
+
+ /**
+ * Complex condition 0 for 'L'
+ */
+ private boolean conditionL0(String value, int index) {
+ if (index == value.length() - 3 &&
+ contains(value, index - 1, 4, "ILLO", "ILLA", "ALLE")) {
+ return true;
+ } else if ((contains(value, value.length() - 2, 2, "AS", "OS") ||
+ contains(value, value.length() - 1, 1, "A", "O")) &&
+ contains(value, index - 1, 4, "ALLE")) {
+ return true;
+ } else {
+ return false;
+ }
+ }
+
+ /**
+ * Complex condition 0 for 'M'
+ */
+ private boolean conditionM0(String value, int index) {
+ if (charAt(value, index + 1) == 'M') {
+ return true;
+ }
+ return contains(value, index - 1, 3, "UMB") &&
+ ((index + 1) == value.length() - 1 || contains(value,
+ index + 2, 2, "ER"));
+ }
+
+ //-- BEGIN HELPER FUNCTIONS --//
+
+ /**
+ * Determines whether or not a value is of slavo-germanic orgin. A value is
+ * of slavo-germanic origin if it contians any of 'W', 'K', 'CZ', or 'WITZ'.
+ */
+ private boolean isSlavoGermanic(String value) {
+ return value.indexOf('W') > -1 || value.indexOf('K') > -1 ||
+ value.indexOf("CZ") > -1 || value.indexOf("WITZ") > -1;
+ }
+
+ /**
+ * Determines whether or not a character is a vowel or not.
+ * A character counts as a vowel when it occurs in the {@code VOWELS} constant.
+ *
+ * @param ch the character to test
+ * @return {@code true} if {@code ch} is contained in {@code VOWELS}
+ */
+ private boolean isVowel(char ch) {
+ return VOWELS.indexOf(ch) != -1;
+ }
+
+ /**
+  * Determines whether or not the value starts with a silent letter. It will
+  * return {@code true} if the value starts with any of 'GN', 'KN',
+  * 'PN', 'WR' or 'PS'.
+  * NOTE(review): the prefixes are read from {@code SILENT_START}; the list above is
+  * carried over from the original comment - confirm it against that constant.
+  *
+  * @param value the string to inspect
+  * @return {@code true} if {@code value} starts with an entry of {@code SILENT_START}
+  */
+ private boolean isSilentStart(String value) {
+     for (String prefix : SILENT_START) {
+         if (value.startsWith(prefix)) {
+             return true;
+         }
+     }
+     return false;
+ }
+
+ /**
+ * Cleans the input
+ */
+ private String cleanInput(String input) {
+ if (input == null) {
+ return null;
+ }
+ input = input.trim();
+ if (input.length() == 0) {
+ return null;
+ }
+ return input.toUpperCase(java.util.Locale.ENGLISH);
+ }
+
+ /**
+ * Gets the character at index index
if available, otherwise
+ * it returns Character.MIN_VALUE
so that there is some sort
+ * of a default
+ */
+ protected char charAt(String value, int index) {
+ if (index < 0 || index >= value.length()) {
+ return Character.MIN_VALUE;
+ }
+ return value.charAt(index);
+ }
+
+ /**
+ * Shortcut method with 1 criteria
+ */
+ private static boolean contains(String value, int start, int length,
+ String criteria) {
+ return contains(value, start, length,
+ new String[] { criteria });
+ }
+
+ /**
+ * Shortcut method with 2 criteria
+ */
+ private static boolean contains(String value, int start, int length,
+ String criteria1, String criteria2) {
+ return contains(value, start, length,
+ new String[] { criteria1, criteria2 });
+ }
+
+ /**
+ * Shortcut method with 3 criteria
+ */
+ private static boolean contains(String value, int start, int length,
+ String criteria1, String criteria2,
+ String criteria3) {
+ return contains(value, start, length,
+ new String[] { criteria1, criteria2, criteria3 });
+ }
+
+ /**
+ * Shortcut method with 4 criteria
+ */
+ private static boolean contains(String value, int start, int length,
+ String criteria1, String criteria2,
+ String criteria3, String criteria4) {
+ return contains(value, start, length,
+ new String[] { criteria1, criteria2, criteria3,
+ criteria4 });
+ }
+
+ /**
+ * Shortcut method with 5 criteria
+ */
+ private static boolean contains(String value, int start, int length,
+ String criteria1, String criteria2,
+ String criteria3, String criteria4,
+ String criteria5) {
+ return contains(value, start, length,
+ new String[] { criteria1, criteria2, criteria3,
+ criteria4, criteria5 });
+ }
+
+ /**
+ * Shortcut method with 6 criteria
+ */
+ private static boolean contains(String value, int start, int length,
+ String criteria1, String criteria2,
+ String criteria3, String criteria4,
+ String criteria5, String criteria6) {
+ return contains(value, start, length,
+ new String[] { criteria1, criteria2, criteria3,
+ criteria4, criteria5, criteria6 });
+ }
+
+ /**
+ * Determines whether value
contains any of the criteria starting at index start
and
+ * matching up to length length
+ */
+ protected static boolean contains(String value, int start, int length,
+ String[] criteria) {
+ boolean result = false;
+ if (start >= 0 && start + length <= value.length()) {
+ String target = value.substring(start, start + length);
+
+ for (String element : criteria) {
+ if (target.equals(element)) {
+ result = true;
+ break;
+ }
+ }
+ }
+ return result;
+ }
+
+ //-- BEGIN INNER CLASSES --//
+
+ /**
+  * Inner class for storing results, since there is the optional alternate
+  * encoding.
+  *
+  * Two codes are accumulated side by side, a primary and an alternate one. Neither
+  * ever grows beyond the {@code maxLength} given at construction: characters that
+  * do not fit are silently dropped, and strings are truncated to the remaining room.
+  * Instances are not synchronized.
+  */
+ public class DoubleMetaphoneResult {
+
+     private final StringBuilder primary;
+     private final StringBuilder alternate;
+     private final int maxLength;
+
+     /**
+      * @param maxLength maximum number of characters kept in each of the two codes;
+      *                  values below one mean that nothing is ever stored
+      */
+     public DoubleMetaphoneResult(int maxLength) {
+         this.maxLength = maxLength;
+         // Capacity is only a sizing hint; clamp it so a negative limit cannot throw.
+         final int capacity = Math.max(maxLength, 0);
+         this.primary = new StringBuilder(capacity);
+         this.alternate = new StringBuilder(capacity);
+     }
+
+     /** Appends {@code value} to both codes. */
+     public void append(char value) {
+         appendPrimary(value);
+         appendAlternate(value);
+     }
+
+     /** Appends {@code primary} to the primary code and {@code alternate} to the alternate code. */
+     public void append(char primary, char alternate) {
+         appendPrimary(primary);
+         appendAlternate(alternate);
+     }
+
+     /** Appends {@code value} to the primary code if there is room left. */
+     public void appendPrimary(char value) {
+         if (this.primary.length() < this.maxLength) {
+             this.primary.append(value);
+         }
+     }
+
+     /** Appends {@code value} to the alternate code if there is room left. */
+     public void appendAlternate(char value) {
+         if (this.alternate.length() < this.maxLength) {
+             this.alternate.append(value);
+         }
+     }
+
+     /** Appends {@code value} to both codes, truncating where necessary. */
+     public void append(String value) {
+         appendPrimary(value);
+         appendAlternate(value);
+     }
+
+     /** Appends {@code primary} to the primary code and {@code alternate} to the alternate code. */
+     public void append(String primary, String alternate) {
+         appendPrimary(primary);
+         appendAlternate(alternate);
+     }
+
+     /** Appends as much of {@code value} to the primary code as still fits. */
+     public void appendPrimary(String value) {
+         appendLimited(this.primary, value);
+     }
+
+     /** Appends as much of {@code value} to the alternate code as still fits. */
+     public void appendAlternate(String value) {
+         appendLimited(this.alternate, value);
+     }
+
+     /** @return the primary code accumulated so far */
+     public String getPrimary() {
+         return this.primary.toString();
+     }
+
+     /** @return the alternate code accumulated so far */
+     public String getAlternate() {
+         return this.alternate.toString();
+     }
+
+     /** @return {@code true} once both codes have reached {@code maxLength} */
+     public boolean isComplete() {
+         return this.primary.length() >= this.maxLength &&
+                this.alternate.length() >= this.maxLength;
+     }
+
+     /** Appends the leading part of {@code value} that fits into {@code target} without exceeding {@code maxLength}. */
+     private void appendLimited(StringBuilder target, String value) {
+         final int room = this.maxLength - target.length();
+         if (room <= 0) {
+             // Full already (or the limit is not positive): nothing can be added.
+             return;
+         }
+         target.append(value, 0, Math.min(room, value.length()));
+     }
+ }
+}
diff --git a/source/net/yacy/cora/language/phonetic/Metaphone.java b/source/net/yacy/cora/language/phonetic/Metaphone.java
new file mode 100644
index 000000000..65f4f0716
--- /dev/null
+++ b/source/net/yacy/cora/language/phonetic/Metaphone.java
@@ -0,0 +1,407 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package net.yacy.cora.language.phonetic;
+
+import org.apache.commons.codec.EncoderException;
+import org.apache.commons.codec.StringEncoder;
+
+/**
+ * Encodes a string into a Metaphone value.
+ * + * Initial Java implementation by William B. Brogden. December, 1997. + * Permission given by wbrogden for code to be used anywhere. + *
+ *+ * Hanging on the Metaphone by Lawrence Philips in Computer Language of Dec. 1990, p + * 39. + *
+ *+ * Note, that this does not match the algorithm that ships with PHP, or the algorithm + * found in the Perl Text:Metaphone-1.96. + * They have had undocumented changes from the originally published algorithm. + * For more information, see CODEC-57. + *
+ * + * @author Apache Software Foundation + * @version $Id: Metaphone.java 1157192 2011-08-12 17:27:38Z ggregory $ + */ +public class Metaphone implements StringEncoder { + + /** + * Five values in the English language + */ + private static final String VOWELS = "AEIOU" ; + + /** + * Variable used in Metaphone algorithm + */ + private static final String FRONTV = "EIY" ; + + /** + * Variable used in Metaphone algorithm + */ + private static final String VARSON = "CSPTG" ; + + /** + * The max code length for metaphone is 4 + */ + private int maxCodeLen = 4 ; + + /** + * Creates an instance of the Metaphone encoder + */ + public Metaphone() { + super(); + } + + /** + * Find the metaphone value of a String. This is similar to the + * soundex algorithm, but better at finding similar sounding words. + * All input is converted to upper case. + * Limitations: Input format is expected to be a single ASCII word + * with only characters in the A - Z range, no punctuation or numbers. + * + * @param txt String to find the metaphone code for + * @return A metaphone code corresponding to the String supplied + */ + public String metaphone(String txt) { + boolean hard = false ; + if ((txt == null) || (txt.length() == 0)) { + return "" ; + } + // single character is itself + if (txt.length() == 1) { + return txt.toUpperCase(java.util.Locale.ENGLISH) ; + } + + char[] inwd = txt.toUpperCase(java.util.Locale.ENGLISH).toCharArray() ; + + StringBuffer local = new StringBuffer(40); // manipulate + StringBuffer code = new StringBuffer(10) ; // output + // handle initial 2 characters exceptions + switch(inwd[0]) { + case 'K' : + case 'G' : + case 'P' : /* looking for KN, etc*/ + if (inwd[1] == 'N') { + local.append(inwd, 1, inwd.length - 1); + } else { + local.append(inwd); + } + break; + case 'A': /* looking for AE */ + if (inwd[1] == 'E') { + local.append(inwd, 1, inwd.length - 1); + } else { + local.append(inwd); + } + break; + case 'W' : /* looking for WR or WH */ + if (inwd[1] == 
'R') { // WR -> R + local.append(inwd, 1, inwd.length - 1); + break ; + } + if (inwd[1] == 'H') { + local.append(inwd, 1, inwd.length - 1); + local.setCharAt(0, 'W'); // WH -> W + } else { + local.append(inwd); + } + break; + case 'X' : /* initial X becomes S */ + inwd[0] = 'S'; + local.append(inwd); + break ; + default : + local.append(inwd); + } // now local has working string with initials fixed + + int wdsz = local.length(); + int n = 0 ; + + while ((code.length() < this.getMaxCodeLen()) && + (n < wdsz) ) { // max code size of 4 works well + char symb = local.charAt(n) ; + // remove duplicate letters except C + if ((symb != 'C') && (isPreviousChar( local, n, symb )) ) { + n++ ; + } else { // not dup + switch(symb) { + case 'A' : case 'E' : case 'I' : case 'O' : case 'U' : + if (n == 0) { + code.append(symb); + } + break ; // only use vowel if leading char + case 'B' : + if ( isPreviousChar(local, n, 'M') && + isLastChar(wdsz, n) ) { // B is silent if word ends in MB + break; + } + code.append(symb); + break; + case 'C' : // lots of C special cases + /* discard if SCI, SCE or SCY */ + if ( isPreviousChar(local, n, 'S') && + !isLastChar(wdsz, n) && + (FRONTV.indexOf(local.charAt(n + 1)) >= 0) ) { + break; + } + if (regionMatch(local, n, "CIA")) { // "CIA" -> X + code.append('X'); + break; + } + if (!isLastChar(wdsz, n) && + (FRONTV.indexOf(local.charAt(n + 1)) >= 0)) { + code.append('S'); + break; // CI,CE,CY -> S + } + if (isPreviousChar(local, n, 'S') && + isNextChar(local, n, 'H') ) { // SCH->sk + code.append('K') ; + break ; + } + if (isNextChar(local, n, 'H')) { // detect CH + if ((n == 0) && + (wdsz >= 3) && + isVowel(local,2) ) { // CH consonant -> K consonant + code.append('K'); + } else { + code.append('X'); // CHvowel -> X + } + } else { + code.append('K'); + } + break ; + case 'D' : + if (!isLastChar(wdsz, n + 1) && + isNextChar(local, n, 'G') && + (FRONTV.indexOf(local.charAt(n + 2)) >= 0)) { // DGE DGI DGY -> J + code.append('J'); n += 2 ; + } else { 
+ code.append('T'); + } + break ; + case 'G' : // GH silent at end or before consonant + if (isLastChar(wdsz, n + 1) && + isNextChar(local, n, 'H')) { + break; + } + if (!isLastChar(wdsz, n + 1) && + isNextChar(local,n,'H') && + !isVowel(local,n+2)) { + break; + } + if ((n > 0) && + ( regionMatch(local, n, "GN") || + regionMatch(local, n, "GNED") ) ) { + break; // silent G + } + if (isPreviousChar(local, n, 'G')) { + // NOTE: Given that duplicated chars are removed, I don't see how this can ever be true + hard = true ; + } else { + hard = false ; + } + if (!isLastChar(wdsz, n) && + (FRONTV.indexOf(local.charAt(n + 1)) >= 0) && + (!hard)) { + code.append('J'); + } else { + code.append('K'); + } + break ; + case 'H': + if (isLastChar(wdsz, n)) { + break ; // terminal H + } + if ((n > 0) && + (VARSON.indexOf(local.charAt(n - 1)) >= 0)) { + break; + } + if (isVowel(local,n+1)) { + code.append('H'); // Hvowel + } + break; + case 'F': + case 'J' : + case 'L' : + case 'M': + case 'N' : + case 'R' : + code.append(symb); + break; + case 'K' : + if (n > 0) { // not initial + if (!isPreviousChar(local, n, 'C')) { + code.append(symb); + } + } else { + code.append(symb); // initial K + } + break ; + case 'P' : + if (isNextChar(local,n,'H')) { + // PH -> F + code.append('F'); + } else { + code.append(symb); + } + break ; + case 'Q' : + code.append('K'); + break; + case 'S' : + if (regionMatch(local,n,"SH") || + regionMatch(local,n,"SIO") || + regionMatch(local,n,"SIA")) { + code.append('X'); + } else { + code.append('S'); + } + break; + case 'T' : + if (regionMatch(local,n,"TIA") || + regionMatch(local,n,"TIO")) { + code.append('X'); + break; + } + if (regionMatch(local,n,"TCH")) { + // Silent if in "TCH" + break; + } + // substitute numeral 0 for TH (resembles theta after all) + if (regionMatch(local,n,"TH")) { + code.append('0'); + } else { + code.append('T'); + } + break ; + case 'V' : + code.append('F'); break ; + case 'W' : case 'Y' : // silent if not followed by vowel + if 
(!isLastChar(wdsz,n) && + isVowel(local,n+1)) { + code.append(symb); + } + break ; + case 'X' : + code.append('K'); code.append('S'); + break ; + case 'Z' : + code.append('S'); break ; + } // end switch + n++ ; + } // end else from symb != 'C' + if (code.length() > this.getMaxCodeLen()) { + code.setLength(this.getMaxCodeLen()); + } + } + return code.toString(); + } + + private boolean isVowel(StringBuffer string, int index) { + return VOWELS.indexOf(string.charAt(index)) >= 0; + } + + private boolean isPreviousChar(StringBuffer string, int index, char c) { + boolean matches = false; + if( index > 0 && + index < string.length() ) { + matches = string.charAt(index - 1) == c; + } + return matches; + } + + private boolean isNextChar(StringBuffer string, int index, char c) { + boolean matches = false; + if( index >= 0 && + index < string.length() - 1 ) { + matches = string.charAt(index + 1) == c; + } + return matches; + } + + private boolean regionMatch(StringBuffer string, int index, String test) { + boolean matches = false; + if( index >= 0 && + (index + test.length() - 1) < string.length() ) { + String substring = string.substring( index, index + test.length()); + matches = substring.equals( test ); + } + return matches; + } + + private boolean isLastChar(int wdsz, int n) { + return n + 1 == wdsz; + } + + + /** + * Encodes an Object using the metaphone algorithm. This method + * is provided in order to satisfy the requirements of the + * Encoder interface, and will throw an EncoderException if the + * supplied object is not of type java.lang.String. + * + * @param pObject Object to encode + * @return An object (or type java.lang.String) containing the + * metaphone code which corresponds to the String supplied. 
+ * @throws EncoderException if the parameter supplied is not + * of type java.lang.String + */ + public Object encode(Object pObject) throws EncoderException { + if (!(pObject instanceof String)) { + throw new EncoderException("Parameter supplied to Metaphone encode is not of type java.lang.String"); + } + return metaphone((String) pObject); + } + + /** + * Encodes a String using the Metaphone algorithm. + * + * @param pString String object to encode + * @return The metaphone code corresponding to the String supplied + */ + public String encode(String pString) { + return metaphone(pString); + } + + /** + * Tests is the metaphones of two strings are identical. + * + * @param str1 First of two strings to compare + * @param str2 Second of two strings to compare + * @returntrue
if the metaphones of these strings are identical,
+ * false
otherwise.
+ */
+ public boolean isMetaphoneEqual(String str1, String str2) {
+ return metaphone(str1).equals(metaphone(str2));
+ }
+
+ /**
+ * Returns the maxCodeLen.
+ * @return int
+ */
+ public int getMaxCodeLen() { return this.maxCodeLen; }
+
+ /**
+ * Sets the maxCodeLen.
+ * @param maxCodeLen The maxCodeLen to set
+ */
+ public void setMaxCodeLen(int maxCodeLen) { this.maxCodeLen = maxCodeLen; }
+
+}
diff --git a/source/net/yacy/cora/language/phonetic/Phonetic.java b/source/net/yacy/cora/language/phonetic/Phonetic.java
new file mode 100644
index 000000000..f59bbd137
--- /dev/null
+++ b/source/net/yacy/cora/language/phonetic/Phonetic.java
@@ -0,0 +1,73 @@
+/**
+ * Phonetic
+ * Copyright 2011 by Michael Peter Christen, mc@yacy.net, Frankfurt a. M., Germany
+ * First released 13.12.2011 at http://yacy.net
+ *
+ * $LastChangedDate$
+ * $LastChangedRevision$
+ * $LastChangedBy$
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with this program in the file lgpl21.txt
+ * If not, see <http://www.gnu.org/licenses/>.
+ *
+ * A value of <code>0</code> for a letter position
+ * means do not encode.
+ * <p>
+ * (This constant is provided as both an implementation convenience and to allow Javadoc to pick
+ * up the value for the constant values page.)
+ * </p>
+     *
+     * @see #US_ENGLISH_MAPPING
+     */
+    public static final String US_ENGLISH_MAPPING_STRING = "01230120022455012623010202";
+
+    /**
+     * This is a default mapping of the 26 letters used in US English. A value of <code>0</code> for a letter position
+     * means do not encode.
+     * It is the character-array form of {@link #US_ENGLISH_MAPPING_STRING}.
+     *
+     * @see Soundex#Soundex(char[])
+     */
+    private static final char[] US_ENGLISH_MAPPING = US_ENGLISH_MAPPING_STRING.toCharArray();
+
+    /**
+     * An instance of Soundex using the US_ENGLISH_MAPPING mapping.
+     * It is created by the no-argument constructor and therefore must stay declared
+     * after US_ENGLISH_MAPPING, which that constructor reads.
+     *
+     * @see #US_ENGLISH_MAPPING
+     */
+    public static final Soundex US_ENGLISH = new Soundex();
+
+    /**
+     * Every letter of the alphabet is "mapped" to a numerical value. This char array holds the values to which each
+     * letter is mapped. This implementation contains a default map for US_ENGLISH
+     */
+    private final char[] soundexMapping;
+
+    /**
+     * Builds an encoder backed by the shared default table US_ENGLISH_MAPPING.
+     * The table is referenced directly, not copied.
+     *
+     * @see Soundex#Soundex(char[])
+     * @see Soundex#US_ENGLISH_MAPPING
+     */
+    public Soundex() {
+        soundexMapping = US_ENGLISH_MAPPING;
+    }
+
+ /**
+ * Creates a soundex instance using the given mapping. This constructor can be used to provide an internationalized
+ * mapping for a non-Western character set.
+ *
+ * Every letter of the alphabet is "mapped" to a numerical value. This char array holds the values to which each
+ * letter is mapped. This implementation contains a default map for US_ENGLISH
+ *
+ * @param mapping
+ * Mapping array to use when finding the corresponding code for a given character
+ */
+ public Soundex(char[] mapping) {
+ this.soundexMapping = new char[mapping.length];
+ System.arraycopy(mapping, 0, this.soundexMapping, 0, mapping.length);
+ }
+
+ /**
+ * Creates a refined soundex instance using a custom mapping. This constructor can be used to customize the mapping,
+ * and/or possibly provide an internationalized mapping for a non-Western character set.
+ *
+ * @param mapping
+ * Mapping string to use when finding the corresponding code for a given character
+ * @since 1.4
+ */
+ public Soundex(String mapping) {
+ this.soundexMapping = mapping.toCharArray();
+ }
+
+    /**
+     * Encodes both Strings with this encoder and counts the positions at which the two codes carry the same
+     * character. The result ranges from 0 through 4: 0 indicates little or no similarity, and 4 indicates strong
+     * similarity or identical values.
+     *
+     * @param s1
+     *                  A String that will be encoded and compared.
+     * @param s2
+     *                  A String that will be encoded and compared.
+     * @return The number of characters in the two encoded Strings that are the same from 0 to 4.
+     *
+     * @see SoundexUtils#difference(StringEncoder,String,String)
+     * @see "MS T-SQL DIFFERENCE"
+     *
+     * @throws EncoderException
+     *                  if an error occurs encoding one of the strings
+     * @since 1.3
+     */
+    public int difference(String s1, String s2) throws EncoderException {
+        final int matchingPositions = difference(this, s1, s2);
+        return matchingPositions;
+    }
+
+    /**
+     * Object-typed entry point required by the Encoder interface. Only String arguments are accepted; they are
+     * passed on to {@link #soundex(String)}.
+     *
+     * @param pObject
+     *                  Object to encode
+     * @return An object (or type java.lang.String) containing the soundex code which corresponds to the String
+     *             supplied.
+     * @throws EncoderException
+     *                  if the parameter supplied is not of type java.lang.String
+     * @throws IllegalArgumentException
+     *                  if a character is not mapped
+     */
+    public Object encode(Object pObject) throws EncoderException {
+        if (pObject instanceof String) {
+            return soundex((String) pObject);
+        }
+        throw new EncoderException("Parameter supplied to Soundex encode is not of type java.lang.String");
+    }
+
+    /**
+     * String-typed entry point of the encoder; delegates to {@link #soundex(String)}.
+     *
+     * @param pString
+     *                  A String object to encode
+     * @return A Soundex code corresponding to the String supplied
+     * @throws IllegalArgumentException
+     *                  if a character is not mapped
+     */
+    public String encode(String pString) {
+        final String soundexCode = this.soundex(pString);
+        return soundexCode;
+    }
+
+ /**
+ * Used internally by the SoundEx algorithm.
+ *
+ * Consonants from the same code group separated by W or H are treated as one.
+ *
+ * @param str
+ * the cleaned working string to encode (in upper case).
+ * @param index
+ * the character position to encode
+ * @return Mapping code for a particular character
+ * @throws IllegalArgumentException
+ * if the character is not mapped
+ */
+ private char getMappingCode(String str, int index) {
+ // map() throws IllegalArgumentException
+ char mappedChar = this.map(str.charAt(index));
+ // HW rule check
+ if (index > 1 && mappedChar != '0') {
+ char hwChar = str.charAt(index - 1);
+ if ('H' == hwChar || 'W' == hwChar) {
+ char preHWChar = str.charAt(index - 2);
+ char firstCode = this.map(preHWChar);
+ if (firstCode == mappedChar || 'H' == preHWChar || 'W' == preHWChar) {
+ return 0;
+ }
+ }
+ }
+ return mappedChar;
+ }
+
+    /**
+     * Gives access to the mapping table used by this instance (the array itself, not a copy).
+     *
+     * @return soundexMapping.
+     */
+    private char[] getSoundexMapping() {
+        return soundexMapping;
+    }
+
+ /**
+ * Maps the given upper-case character to its Soundex code.
+ *
+ * @param ch
+ * An upper-case character.
+ * @return A Soundex code.
+ * @throws IllegalArgumentException
+ * Thrown if ch
is not mapped.
+ */
+ private char map(char ch) {
+ int index = ch - 'A';
+ if (index < 0 || index >= this.getSoundexMapping().length) {
+ throw new IllegalArgumentException("The character is not mapped: " + ch);
+ }
+ return this.getSoundexMapping()[index];
+ }
+
+    /**
+     * Computes the four-character Soundex code of a String: the first letter of the cleaned input followed by
+     * up to three mapping codes, padded with '0'. A null input yields null and an input without letters yields
+     * the empty cleaned string.
+     *
+     * @param str
+     *                  String to encode using the Soundex algorithm
+     * @return A soundex code for the String supplied
+     * @throws IllegalArgumentException
+     *                  if a character is not mapped
+     */
+    public String soundex(String str) {
+        if (str == null) {
+            return null;
+        }
+        final String word = clean(str);
+        if (word.length() == 0) {
+            return word;
+        }
+        final char[] code = {'0', '0', '0', '0'};
+        code[0] = word.charAt(0);
+        // getMappingCode() throws IllegalArgumentException
+        char previous = getMappingCode(word, 0);
+        int filled = 1;
+        for (int pos = 1; pos < word.length() && filled < code.length; pos++) {
+            final char mapped = getMappingCode(word, pos);
+            if (mapped == 0) {
+                // suppressed by the H/W rule: does not even update the previous code
+                continue;
+            }
+            if (mapped != '0' && mapped != previous) {
+                code[filled++] = mapped;
+            }
+            previous = mapped;
+        }
+        return new String(code);
+    }
+
+
+ /**
+ * Cleans up the input string before Soundex processing by only returning
+ * upper case letters.
+ *
+ * @param str
+ * The String to clean.
+ * @return A clean String.
+ */
+ static String clean(String str) {
+ if (str == null || str.length() == 0) {
+ return str;
+ }
+ int len = str.length();
+ char[] chars = new char[len];
+ int count = 0;
+ for (int i = 0; i < len; i++) {
+ if (Character.isLetter(str.charAt(i))) {
+ chars[count++] = str.charAt(i);
+ }
+ }
+ if (count == len) {
+ return str.toUpperCase(java.util.Locale.ENGLISH);
+ }
+ return new String(chars, 0, count).toUpperCase(java.util.Locale.ENGLISH);
+ }
+
+ /**
+ * Encodes the Strings and returns the number of characters in the two
+ * encoded Strings that are the same.
+ *