mirror of
https://github.com/badlogic/heissepreise.git
synced 2024-09-22 00:00:59 +02:00
Add German compound word decomposition. It doesn't help a lot; we need more context words apart from the product name.
This commit is contained in:
parent
b6a9ec80b5
commit
ad01cc0658
6
package-lock.json
generated
6
package-lock.json
generated
|
@ -17,6 +17,7 @@
|
||||||
"chokidar": "^3.5.3",
|
"chokidar": "^3.5.3",
|
||||||
"compression": "^1.7.4",
|
"compression": "^1.7.4",
|
||||||
"express": "^4.18.2",
|
"express": "^4.18.2",
|
||||||
|
"hyphen": "^1.6.6",
|
||||||
"moment": "^2.29.4",
|
"moment": "^2.29.4",
|
||||||
"node-html-parser": "^6.1.5",
|
"node-html-parser": "^6.1.5",
|
||||||
"readline-sync": "^1.4.10"
|
"readline-sync": "^1.4.10"
|
||||||
|
@ -2130,6 +2131,11 @@
|
||||||
"url": "https://github.com/sponsors/typicode"
|
"url": "https://github.com/sponsors/typicode"
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
|
"node_modules/hyphen": {
|
||||||
|
"version": "1.6.6",
|
||||||
|
"resolved": "https://registry.npmjs.org/hyphen/-/hyphen-1.6.6.tgz",
|
||||||
|
"integrity": "sha512-XtqmnT+b9n5MX+MsqluFAVTIenbtC25iskW0Z+jLd+awfhA+ZbWKWQMIvLJccGoa2bM1R6juWJ27cZxIFOmkWw=="
|
||||||
|
},
|
||||||
"node_modules/iconv-lite": {
|
"node_modules/iconv-lite": {
|
||||||
"version": "0.4.24",
|
"version": "0.4.24",
|
||||||
"resolved": "https://registry.npmjs.org/iconv-lite/-/iconv-lite-0.4.24.tgz",
|
"resolved": "https://registry.npmjs.org/iconv-lite/-/iconv-lite-0.4.24.tgz",
|
||||||
|
|
|
@ -28,6 +28,7 @@
|
||||||
"chokidar": "^3.5.3",
|
"chokidar": "^3.5.3",
|
||||||
"compression": "^1.7.4",
|
"compression": "^1.7.4",
|
||||||
"express": "^4.18.2",
|
"express": "^4.18.2",
|
||||||
|
"hyphen": "^1.6.6",
|
||||||
"moment": "^2.29.4",
|
"moment": "^2.29.4",
|
||||||
"node-html-parser": "^6.1.5",
|
"node-html-parser": "^6.1.5",
|
||||||
"readline-sync": "^1.4.10"
|
"readline-sync": "^1.4.10"
|
||||||
|
|
14526
site/data/dictionary-de.txt
Normal file
14526
site/data/dictionary-de.txt
Normal file
File diff suppressed because it is too large
Load Diff
|
@ -1,17 +1,46 @@
|
||||||
const { readJSON, writeJSON } = require("../../analysis");
|
const { start } = require("repl");
|
||||||
const { stem } = require("../js/stem");
|
|
||||||
const { deltaTime } = require("./misc");
|
const { deltaTime } = require("./misc");
|
||||||
|
const { stem } = require("./stem");
|
||||||
|
const { hyphenateSync } = require("hyphen/de");
|
||||||
|
|
||||||
const whitespaceRegex = /[^\p{Letter}\s]|_/gu;
|
const whitespaceRegex = /[^\p{Letter}\s]|_/gu;
|
||||||
const whitespaceRegex2 = /\s+/;
|
const whitespaceRegex2 = /\s+/;
|
||||||
const isNumber = /^\d+\.\d+$/;
|
const isNumber = /^\d+\.\d+$/;
|
||||||
|
|
||||||
|
let dictionary = new Set();
|
||||||
|
|
||||||
exports.tokenize = (str) => {
|
exports.tokenize = (str) => {
|
||||||
const name = str.toLowerCase().replace(whitespaceRegex, " ").replace("erdapfel", "kartoffel").replace("erdäpfel", "kartoffeln");
|
const name = str.toLowerCase().replace(whitespaceRegex, " ").replace("erdapfel", "kartoffel").replace("erdäpfel", "kartoffeln");
|
||||||
return name
|
return name
|
||||||
.split(whitespaceRegex2)
|
.split(whitespaceRegex2)
|
||||||
.map((token) => stem(token))
|
.filter((token) => !isNumber.test(token) && token.length > 0)
|
||||||
.filter((token) => !isNumber.test(token) && token.length > 0);
|
.flatMap((token) => {
|
||||||
|
const hyphens = hyphenateSync(token, { hyphenChar: "*" }).split("*");
|
||||||
|
const newTokens = [];
|
||||||
|
|
||||||
|
let i = 0;
|
||||||
|
while (i < hyphens.length) {
|
||||||
|
let longestMatch = null;
|
||||||
|
let nextIndex = -1;
|
||||||
|
let match = "";
|
||||||
|
for (let j = i; j < hyphens.length; j++) {
|
||||||
|
match += hyphens[j];
|
||||||
|
if (dictionary.has(match)) {
|
||||||
|
longestMatch = match;
|
||||||
|
nextIndex = j + 1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (!longestMatch) {
|
||||||
|
i++;
|
||||||
|
} else {
|
||||||
|
newTokens.push(longestMatch);
|
||||||
|
i = nextIndex;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return [token, ...newTokens];
|
||||||
|
})
|
||||||
|
.map((token) => stem(token));
|
||||||
};
|
};
|
||||||
|
|
||||||
exports.index = (items) => {
|
exports.index = (items) => {
|
||||||
|
@ -73,9 +102,6 @@ exports.index = (items) => {
|
||||||
index.words[key].idf = Math.max(Math.log10(num / denom), 0.01);
|
index.words[key].idf = Math.max(Math.log10(num / denom), 0.01);
|
||||||
}
|
}
|
||||||
|
|
||||||
// console.log(index);
|
|
||||||
console.log(`Words: ${Object.keys(index.words).length}`);
|
|
||||||
console.log(`Building index took: ${deltaTime(start)}`);
|
|
||||||
return index;
|
return index;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
@ -112,15 +138,29 @@ exports.search = (index, query) => {
|
||||||
};
|
};
|
||||||
|
|
||||||
if (require.main === module) {
|
if (require.main === module) {
|
||||||
|
const fs = require("fs");
|
||||||
|
const { readJSON } = require("../../analysis");
|
||||||
|
|
||||||
let items = readJSON("data/latest-canonical.json.br");
|
let items = readJSON("data/latest-canonical.json.br");
|
||||||
if (items.items) items = items.items;
|
if (items.items) items = items.items;
|
||||||
|
|
||||||
|
const dictionaryItems = fs
|
||||||
|
.readFileSync("site/data/dictionary-de.txt")
|
||||||
|
.toString()
|
||||||
|
.split("\n")
|
||||||
|
.filter((line) => !line.includes("#"));
|
||||||
|
dictionary = new Set(dictionaryItems);
|
||||||
|
|
||||||
console.log("Indexing ...");
|
console.log("Indexing ...");
|
||||||
const index = this.index(items);
|
const start = performance.now();
|
||||||
|
const index = exports.index(items);
|
||||||
|
console.log(`Building index took: ${deltaTime(start)}`);
|
||||||
|
console.log(`Words: ${Object.keys(index.words).length}`);
|
||||||
|
|
||||||
const readline = require("readline-sync");
|
const readline = require("readline-sync");
|
||||||
while (true) {
|
while (true) {
|
||||||
const query = readline.question("> ");
|
const query = readline.question("> ");
|
||||||
const result = this.search(index, query);
|
const result = exports.search(index, query);
|
||||||
for (let i = 0; i < Math.min(result.length, 20); i++) {
|
for (let i = 0; i < Math.min(result.length, 20); i++) {
|
||||||
const doc = result[i];
|
const doc = result[i];
|
||||||
console.log(`${doc.score} ${doc.body}`);
|
console.log(`${doc.score} ${doc.body}`);
|
Loading…
Reference in New Issue
Block a user