mirror of
https://github.com/badlogic/heissepreise.git
synced 2024-06-20 15:35:49 +02:00
Added BM25 search index implementation. Run node site/js/bm25.js
for testing.
This commit is contained in:
parent
c7537c341e
commit
b6a9ec80b5
20
package-lock.json
generated
20
package-lock.json
generated
|
@ -18,8 +18,8 @@
|
|||
"compression": "^1.7.4",
|
||||
"express": "^4.18.2",
|
||||
"moment": "^2.29.4",
|
||||
"n-readlines": "^1.0.1",
|
||||
"node-html-parser": "^6.1.5"
|
||||
"node-html-parser": "^6.1.5",
|
||||
"readline-sync": "^1.4.10"
|
||||
},
|
||||
"devDependencies": {
|
||||
"autoprefixer": "^10.4.14",
|
||||
|
@ -2499,14 +2499,6 @@
|
|||
"thenify-all": "^1.0.0"
|
||||
}
|
||||
},
|
||||
"node_modules/n-readlines": {
|
||||
"version": "1.0.1",
|
||||
"resolved": "https://registry.npmjs.org/n-readlines/-/n-readlines-1.0.1.tgz",
|
||||
"integrity": "sha512-z4SyAIVgMy7CkgsoNw7YVz40v0g4+WWvvqy8+ZdHrCtgevcEO758WQyrYcw3XPxcLxF+//RszTz/rO48nzD0wQ==",
|
||||
"engines": {
|
||||
"node": ">=6.x.x"
|
||||
}
|
||||
},
|
||||
"node_modules/nanoid": {
|
||||
"version": "3.3.6",
|
||||
"resolved": "https://registry.npmjs.org/nanoid/-/nanoid-3.3.6.tgz",
|
||||
|
@ -3189,6 +3181,14 @@
|
|||
"node": ">=8.10.0"
|
||||
}
|
||||
},
|
||||
"node_modules/readline-sync": {
|
||||
"version": "1.4.10",
|
||||
"resolved": "https://registry.npmjs.org/readline-sync/-/readline-sync-1.4.10.tgz",
|
||||
"integrity": "sha512-gNva8/6UAe8QYepIQH/jQ2qn91Qj0B9sYjMBBs3QOB8F2CXcKgLxQaJRP76sWVRQt+QU+8fAkCbCvjjMFu7Ycw==",
|
||||
"engines": {
|
||||
"node": ">= 0.8.0"
|
||||
}
|
||||
},
|
||||
"node_modules/regenerator-runtime": {
|
||||
"version": "0.13.11",
|
||||
"resolved": "https://registry.npmjs.org/regenerator-runtime/-/regenerator-runtime-0.13.11.tgz",
|
||||
|
|
|
@ -29,8 +29,8 @@
|
|||
"compression": "^1.7.4",
|
||||
"express": "^4.18.2",
|
||||
"moment": "^2.29.4",
|
||||
"n-readlines": "^1.0.1",
|
||||
"node-html-parser": "^6.1.5"
|
||||
"node-html-parser": "^6.1.5",
|
||||
"readline-sync": "^1.4.10"
|
||||
},
|
||||
"devDependencies": {
|
||||
"autoprefixer": "^10.4.14",
|
||||
|
|
130
site/js/bm25.js
Normal file
130
site/js/bm25.js
Normal file
|
@ -0,0 +1,130 @@
|
|||
const { readJSON, writeJSON } = require("../../analysis");
|
||||
const { stem } = require("../js/stem");
|
||||
const { deltaTime } = require("./misc");
|
||||
|
||||
const whitespaceRegex = /[^\p{Letter}\s]|_/gu;
|
||||
const whitespaceRegex2 = /\s+/;
|
||||
const isNumber = /^\d+\.\d+$/;
|
||||
|
||||
exports.tokenize = (str) => {
|
||||
const name = str.toLowerCase().replace(whitespaceRegex, " ").replace("erdapfel", "kartoffel").replace("erdäpfel", "kartoffeln");
|
||||
return name
|
||||
.split(whitespaceRegex2)
|
||||
.map((token) => stem(token))
|
||||
.filter((token) => !isNumber.test(token) && token.length > 0);
|
||||
};
|
||||
|
||||
exports.index = (items) => {
|
||||
const start = performance.now();
|
||||
|
||||
const index = {
|
||||
docs: [],
|
||||
totalDocLength: 0,
|
||||
averageDocLength: 0,
|
||||
words: {},
|
||||
k1: 1.3,
|
||||
b: 0.75,
|
||||
};
|
||||
|
||||
for (let i = 0; i < items.length; i++) {
|
||||
const item = items[i];
|
||||
const tokens = this.tokenize(item.name);
|
||||
const doc = {
|
||||
id: item.store + item.id,
|
||||
body: item.name,
|
||||
words: {},
|
||||
wordCount: tokens.length,
|
||||
};
|
||||
const words = doc.words;
|
||||
for (const token of tokens) {
|
||||
let word = words[token];
|
||||
if (!word) {
|
||||
word = words[token] = {
|
||||
count: 0,
|
||||
frequency: 0,
|
||||
};
|
||||
}
|
||||
word.count++;
|
||||
}
|
||||
|
||||
for (const key in words) {
|
||||
const word = words[key];
|
||||
word.frequency = word.count / doc.wordCount;
|
||||
|
||||
let indexWord = index.words[key];
|
||||
if (!indexWord) {
|
||||
indexWord = index.words[key] = {
|
||||
docs: [],
|
||||
idf: 0,
|
||||
};
|
||||
}
|
||||
indexWord.docs.push(i);
|
||||
}
|
||||
|
||||
index.docs.push(doc);
|
||||
index.totalDocLength += tokens.length;
|
||||
index.averageDocLength = index.totalDocLength / index.docs.length;
|
||||
}
|
||||
|
||||
var keys = Object.keys(index.words);
|
||||
for (const key of keys) {
|
||||
var num = index.docs.length - index.words[key].docs.length + 0.5;
|
||||
var denom = index.words[key].docs.length + 0.5;
|
||||
index.words[key].idf = Math.max(Math.log10(num / denom), 0.01);
|
||||
}
|
||||
|
||||
// console.log(index);
|
||||
console.log(`Words: ${Object.keys(index.words).length}`);
|
||||
console.log(`Building index took: ${deltaTime(start)}`);
|
||||
return index;
|
||||
};
|
||||
|
||||
exports.search = (index, query) => {
|
||||
const tokens = this.tokenize(query);
|
||||
const results = [];
|
||||
const candidateDocs = new Set();
|
||||
for (const token of tokens) {
|
||||
const word = index.words[token];
|
||||
if (word) {
|
||||
for (const doc of word.docs) candidateDocs.add(doc);
|
||||
}
|
||||
}
|
||||
|
||||
for (const docId of candidateDocs) {
|
||||
const doc = index.docs[docId];
|
||||
doc.score = 0;
|
||||
|
||||
for (const token of tokens) {
|
||||
if (!index.words[token]) continue;
|
||||
if (!doc.words[token]) continue;
|
||||
const idf = index.words[token].idf;
|
||||
const num = doc.words[token].count * (index.k1 + 1);
|
||||
const denom = doc.words[token].count + index.k1 * (1 - index.b + (index.b * doc.wordCount) / index.averageDocLength);
|
||||
doc.score += (idf * num) / denom;
|
||||
}
|
||||
|
||||
if (!isNaN(doc.score) && doc.score > 0) {
|
||||
results.push(doc);
|
||||
}
|
||||
}
|
||||
results.sort((a, b) => b.score - a.score);
|
||||
return results;
|
||||
};
|
||||
|
||||
// Interactive test harness: when this file is run directly, build an index over
// the canonical item list and answer queries typed on the console.
if (require.main === module) {
    const loaded = readJSON("data/latest-canonical.json.br");
    // The file may hold either a bare array or an object wrapping it in `items`.
    const items = loaded.items ? loaded.items : loaded;

    console.log("Indexing ...");
    const index = exports.index(items);

    const readline = require("readline-sync");
    const maxShown = 20;
    for (;;) {
        const query = readline.question("> ");
        const result = exports.search(index, query);
        // Print only the best-scoring hits, then the total number of matches.
        for (const doc of result.slice(0, maxShown)) {
            console.log(`${doc.score} ${doc.body}`);
        }
        console.log(`${result.length} results`);
    }
}
|
|
@ -117,7 +117,7 @@ exports.parseNumber = (value, defaultValue) => {
|
|||
}
|
||||
};
|
||||
|
||||
exports.queryItems = (query, items, exactWord) => {
|
||||
exports.queryItemsAlasql = (query, items) => {
|
||||
alasql.fn.hasPriceChange = (priceHistory, date, endDate) => {
|
||||
if (!endDate) return priceHistory.some((price) => price.date == date);
|
||||
else return priceHistory.some((price) => price.date >= date && price.date <= endDate);
|
||||
|
@ -127,16 +127,16 @@ exports.queryItems = (query, items, exactWord) => {
|
|||
return priceHistory.some((price) => price.date.indexOf(date) >= 0);
|
||||
};
|
||||
|
||||
if (query.charAt(0) == "!") {
|
||||
query = query.substring(1);
|
||||
try {
|
||||
return alasql("select * from ? where " + query, [items]);
|
||||
} catch (e) {
|
||||
console.error(e);
|
||||
return [];
|
||||
}
|
||||
query = query.substring(1);
|
||||
try {
|
||||
return alasql("select * from ? where " + query, [items]);
|
||||
} catch (e) {
|
||||
console.error(e);
|
||||
return [];
|
||||
}
|
||||
};
|
||||
|
||||
exports.queryItems = (query, items, exactWord) => {
|
||||
let tokens = query.split(/\s+/).map((token) => token.toLowerCase().replace(",", "."));
|
||||
|
||||
// Find quantity/unit query
|
||||
|
|
|
@ -211,7 +211,11 @@ class ItemsFilter extends View {
|
|||
}
|
||||
|
||||
if (query.length > 0) {
|
||||
filteredItems = queryItems(query, filteredItems, elements.exact.checked);
|
||||
if (query.charAt(0) == "!") {
|
||||
filteredItems = queryItemsAlasql(query, filteredItems);
|
||||
} else {
|
||||
filteredItems = queryItems(query, filteredItems, elements.exact.checked);
|
||||
}
|
||||
}
|
||||
|
||||
if (this.model.lastQuery && this.model.lastQuery != query && !this._noChartClear) {
|
||||
|
|
Loading…
Reference in New Issue
Block a user