Add German compound word decomposition. Doesn't help much; we need more context words beyond the product name.

Mario Zechner 2023-06-17 20:29:43 +02:00
parent b6a9ec80b5
commit ad01cc0658
4 changed files with 14582 additions and 9 deletions
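
The idea, roughly: hyphenate each token with hyphen/de, then greedily join syllables into the longest runs that appear in a German word list, and emit those runs as extra tokens. A minimal sketch of that approach (the toy dictionary and the decompose name are illustrative only; the real word list is the new site/data/dictionary-de.txt and the real code is in the tokenize change below):

const { hyphenateSync } = require("hyphen/de");

// Toy dictionary; the commit ships a ~14k entry word list instead.
const dictionary = new Set(["kartoffel", "salat"]);

function decompose(token) {
    // e.g. "kartoffelsalat" -> roughly ["kar", "tof", "fel", "sa", "lat"]
    const syllables = hyphenateSync(token, { hyphenChar: "*" }).split("*");
    const parts = [];
    let i = 0;
    while (i < syllables.length) {
        // Find the longest run of syllables starting at i that forms a dictionary word.
        let longest = null;
        let next = i + 1;
        let candidate = "";
        for (let j = i; j < syllables.length; j++) {
            candidate += syllables[j];
            if (dictionary.has(candidate)) {
                longest = candidate;
                next = j + 1;
            }
        }
        if (longest) {
            parts.push(longest);
            i = next;
        } else {
            // No dictionary word starts here; skip this syllable.
            i++;
        }
    }
    return parts;
}

console.log(decompose("kartoffelsalat")); // expected: ["kartoffel", "salat"]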

package-lock.json

@@ -17,6 +17,7 @@
"chokidar": "^3.5.3",
"compression": "^1.7.4",
"express": "^4.18.2",
"hyphen": "^1.6.6",
"moment": "^2.29.4",
"node-html-parser": "^6.1.5",
"readline-sync": "^1.4.10"
@@ -2130,6 +2131,11 @@
"url": "https://github.com/sponsors/typicode"
}
},
"node_modules/hyphen": {
"version": "1.6.6",
"resolved": "https://registry.npmjs.org/hyphen/-/hyphen-1.6.6.tgz",
"integrity": "sha512-XtqmnT+b9n5MX+MsqluFAVTIenbtC25iskW0Z+jLd+awfhA+ZbWKWQMIvLJccGoa2bM1R6juWJ27cZxIFOmkWw=="
},
"node_modules/iconv-lite": {
"version": "0.4.24",
"resolved": "https://registry.npmjs.org/iconv-lite/-/iconv-lite-0.4.24.tgz",

package.json

@@ -28,6 +28,7 @@
"chokidar": "^3.5.3",
"compression": "^1.7.4",
"express": "^4.18.2",
"hyphen": "^1.6.6",
"moment": "^2.29.4",
"node-html-parser": "^6.1.5",
"readline-sync": "^1.4.10"

site/data/dictionary-de.txt (new file, 14526 lines added)

File diff suppressed because it is too large

View File

@@ -1,17 +1,46 @@
const { readJSON, writeJSON } = require("../../analysis");
const { stem } = require("../js/stem");
const { start } = require("repl");
const { deltaTime } = require("./misc");
const { stem } = require("./stem");
const { hyphenateSync } = require("hyphen/de");
const whitespaceRegex = /[^\p{Letter}\s]|_/gu;
const whitespaceRegex2 = /\s+/;
const isNumber = /^\d+\.\d+$/;
let dictionary = new Set();
exports.tokenize = (str) => {
const name = str.toLowerCase().replace(whitespaceRegex, " ").replace("erdapfel", "kartoffel").replace("erdäpfel", "kartoffeln");
return name
.split(whitespaceRegex2)
.map((token) => stem(token))
.filter((token) => !isNumber.test(token) && token.length > 0);
.filter((token) => !isNumber.test(token) && token.length > 0)
.flatMap((token) => {
const hyphens = hyphenateSync(token, { hyphenChar: "*" }).split("*");
const newTokens = [];
let i = 0;
while (i < hyphens.length) {
let longestMatch = null;
let nextIndex = -1;
let match = "";
for (let j = i; j < hyphens.length; j++) {
match += hyphens[j];
if (dictionary.has(match)) {
longestMatch = match;
nextIndex = j + 1;
}
}
if (!longestMatch) {
i++;
} else {
newTokens.push(longestMatch);
i = nextIndex;
}
}
return [token, ...newTokens];
})
.map((token) => stem(token));
};
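
For illustration (ignoring stemming, and assuming the dictionary contains "vollkorn" and "brot" — hypothetical entries, the real list is in dictionary-de.txt), a compound product-name token would now expand roughly like this:

// tokenize("Vollkornbrot"), sketched:
//   hyphenation:     ["voll", "korn", "brot"]
//   longest matches: "vollkorn" (voll + korn), then "brot"
//   emitted tokens:  ["vollkornbrot", "vollkorn", "brot"]  (each then stemmed)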
exports.index = (items) => {
@@ -73,9 +102,6 @@ exports.index = (items) => {
index.words[key].idf = Math.max(Math.log10(num / denom), 0.01);
}
// console.log(index);
console.log(`Words: ${Object.keys(index.words).length}`);
console.log(`Building index took: ${deltaTime(start)}`);
return index;
};
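
For context on the idf line kept above: assuming num is the total number of items and denom the number of items a word occurs in (an assumption — both variables sit outside this hunk), it is the usual inverse document frequency with a small floor:

// idf = max(log10(num / denom), 0.01)
// e.g. num = 10000 items, denom = 50 items containing the word:
//   log10(10000 / 50) = log10(200) ≈ 2.30
// a word occurring in every item would give log10(1) = 0 and is floored to 0.01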
@@ -112,15 +138,29 @@ exports.search = (index, query) => {
};
if (require.main === module) {
const fs = require("fs");
const { readJSON } = require("../../analysis");
let items = readJSON("data/latest-canonical.json.br");
if (items.items) items = items.items;
const dictionaryItems = fs
.readFileSync("site/data/dictionary-de.txt")
.toString()
.split("\n")
.filter((line) => !line.includes("#"));
dictionary = new Set(dictionaryItems);
console.log("Indexing ...");
const index = this.index(items);
const start = performance.now();
const index = exports.index(items);
console.log(`Building index took: ${deltaTime(start)}`);
console.log(`Words: ${Object.keys(index.words).length}`);
const readline = require("readline-sync");
while (true) {
const query = readline.question("> ");
const result = this.search(index, query);
const result = exports.search(index, query);
for (let i = 0; i < Math.min(result.length, 20); i++) {
const doc = result[i];
console.log(`${doc.score} ${doc.body}`);