Added BM25 search index implementation. Run node site/js/bm25.js for testing.

This commit is contained in:
Mario Zechner 2023-06-17 14:40:09 +02:00
parent c7537c341e
commit b6a9ec80b5
5 changed files with 156 additions and 22 deletions

20
package-lock.json generated
View File

@ -18,8 +18,8 @@
"compression": "^1.7.4",
"express": "^4.18.2",
"moment": "^2.29.4",
"n-readlines": "^1.0.1",
"node-html-parser": "^6.1.5"
"node-html-parser": "^6.1.5",
"readline-sync": "^1.4.10"
},
"devDependencies": {
"autoprefixer": "^10.4.14",
@ -2499,14 +2499,6 @@
"thenify-all": "^1.0.0"
}
},
"node_modules/n-readlines": {
"version": "1.0.1",
"resolved": "https://registry.npmjs.org/n-readlines/-/n-readlines-1.0.1.tgz",
"integrity": "sha512-z4SyAIVgMy7CkgsoNw7YVz40v0g4+WWvvqy8+ZdHrCtgevcEO758WQyrYcw3XPxcLxF+//RszTz/rO48nzD0wQ==",
"engines": {
"node": ">=6.x.x"
}
},
"node_modules/nanoid": {
"version": "3.3.6",
"resolved": "https://registry.npmjs.org/nanoid/-/nanoid-3.3.6.tgz",
@ -3189,6 +3181,14 @@
"node": ">=8.10.0"
}
},
"node_modules/readline-sync": {
"version": "1.4.10",
"resolved": "https://registry.npmjs.org/readline-sync/-/readline-sync-1.4.10.tgz",
"integrity": "sha512-gNva8/6UAe8QYepIQH/jQ2qn91Qj0B9sYjMBBs3QOB8F2CXcKgLxQaJRP76sWVRQt+QU+8fAkCbCvjjMFu7Ycw==",
"engines": {
"node": ">= 0.8.0"
}
},
"node_modules/regenerator-runtime": {
"version": "0.13.11",
"resolved": "https://registry.npmjs.org/regenerator-runtime/-/regenerator-runtime-0.13.11.tgz",

View File

@ -29,8 +29,8 @@
"compression": "^1.7.4",
"express": "^4.18.2",
"moment": "^2.29.4",
"n-readlines": "^1.0.1",
"node-html-parser": "^6.1.5"
"node-html-parser": "^6.1.5",
"readline-sync": "^1.4.10"
},
"devDependencies": {
"autoprefixer": "^10.4.14",

130
site/js/bm25.js Normal file
View File

@ -0,0 +1,130 @@
const { readJSON, writeJSON } = require("../../analysis");
const { stem } = require("../js/stem");
const { deltaTime } = require("./misc");
// Matches every character that is neither a Unicode letter nor whitespace.
// Despite its name it does not match whitespace: tokenize() uses it to replace
// punctuation, digits and symbols with a space. The explicit "_" alternative is
// already covered by the character class ("_" is not a letter).
const whitespaceRegex = /[^\p{Letter}\s]|_/gu;
// Token separator: one or more whitespace characters.
const whitespaceRegex2 = /\s+/;
// Matches a decimal number such as "1.5" (digits, a dot, digits); plain integers do not match.
// NOTE(review): tokenize() applies this only after whitespaceRegex has already replaced
// digits and dots with spaces, so it looks unable to match there unless stem() produces
// such a string - confirm before relying on it.
const isNumber = /^\d+\.\d+$/;
exports.tokenize = (str) => {
const name = str.toLowerCase().replace(whitespaceRegex, " ").replace("erdapfel", "kartoffel").replace("erdäpfel", "kartoffeln");
return name
.split(whitespaceRegex2)
.map((token) => stem(token))
.filter((token) => !isNumber.test(token) && token.length > 0);
};
exports.index = (items) => {
const start = performance.now();
const index = {
docs: [],
totalDocLength: 0,
averageDocLength: 0,
words: {},
k1: 1.3,
b: 0.75,
};
for (let i = 0; i < items.length; i++) {
const item = items[i];
const tokens = this.tokenize(item.name);
const doc = {
id: item.store + item.id,
body: item.name,
words: {},
wordCount: tokens.length,
};
const words = doc.words;
for (const token of tokens) {
let word = words[token];
if (!word) {
word = words[token] = {
count: 0,
frequency: 0,
};
}
word.count++;
}
for (const key in words) {
const word = words[key];
word.frequency = word.count / doc.wordCount;
let indexWord = index.words[key];
if (!indexWord) {
indexWord = index.words[key] = {
docs: [],
idf: 0,
};
}
indexWord.docs.push(i);
}
index.docs.push(doc);
index.totalDocLength += tokens.length;
index.averageDocLength = index.totalDocLength / index.docs.length;
}
var keys = Object.keys(index.words);
for (const key of keys) {
var num = index.docs.length - index.words[key].docs.length + 0.5;
var denom = index.words[key].docs.length + 0.5;
index.words[key].idf = Math.max(Math.log10(num / denom), 0.01);
}
// console.log(index);
console.log(`Words: ${Object.keys(index.words).length}`);
console.log(`Building index took: ${deltaTime(start)}`);
return index;
};
exports.search = (index, query) => {
const tokens = this.tokenize(query);
const results = [];
const candidateDocs = new Set();
for (const token of tokens) {
const word = index.words[token];
if (word) {
for (const doc of word.docs) candidateDocs.add(doc);
}
}
for (const docId of candidateDocs) {
const doc = index.docs[docId];
doc.score = 0;
for (const token of tokens) {
if (!index.words[token]) continue;
if (!doc.words[token]) continue;
const idf = index.words[token].idf;
const num = doc.words[token].count * (index.k1 + 1);
const denom = doc.words[token].count + index.k1 * (1 - index.b + (index.b * doc.wordCount) / index.averageDocLength);
doc.score += (idf * num) / denom;
}
if (!isNaN(doc.score) && doc.score > 0) {
results.push(doc);
}
}
results.sort((a, b) => b.score - a.score);
return results;
};
// Interactive smoke test, active only when this file is run directly
// (`node site/js/bm25.js`): loads the canonical item file, builds the index and
// then answers queries typed at the prompt until the process is terminated.
if (require.main === module) {
    const loaded = readJSON("data/latest-canonical.json.br");
    // The file holds either the item array itself or an object wrapping it in `items`.
    const items = loaded.items ? loaded.items : loaded;
    console.log("Indexing ...");
    const searchIndex = exports.index(items);
    const prompt = require("readline-sync");
    for (;;) {
        const hits = exports.search(searchIndex, prompt.question("> "));
        // Print at most the 20 best matches, then the total number of matches.
        hits.slice(0, 20).forEach((hit) => {
            console.log(`${hit.score} ${hit.body}`);
        });
        console.log(`${hits.length} results`);
    }
}

View File

@ -117,7 +117,7 @@ exports.parseNumber = (value, defaultValue) => {
}
};
exports.queryItems = (query, items, exactWord) => {
exports.queryItemsAlasql = (query, items) => {
alasql.fn.hasPriceChange = (priceHistory, date, endDate) => {
if (!endDate) return priceHistory.some((price) => price.date == date);
else return priceHistory.some((price) => price.date >= date && price.date <= endDate);
@ -127,16 +127,16 @@ exports.queryItems = (query, items, exactWord) => {
return priceHistory.some((price) => price.date.indexOf(date) >= 0);
};
if (query.charAt(0) == "!") {
query = query.substring(1);
try {
return alasql("select * from ? where " + query, [items]);
} catch (e) {
console.error(e);
return [];
}
query = query.substring(1);
try {
return alasql("select * from ? where " + query, [items]);
} catch (e) {
console.error(e);
return [];
}
};
exports.queryItems = (query, items, exactWord) => {
let tokens = query.split(/\s+/).map((token) => token.toLowerCase().replace(",", "."));
// Find quantity/unit query

View File

@ -211,7 +211,11 @@ class ItemsFilter extends View {
}
if (query.length > 0) {
filteredItems = queryItems(query, filteredItems, elements.exact.checked);
if (query.charAt(0) == "!") {
filteredItems = queryItemsAlasql(query, filteredItems);
} else {
filteredItems = queryItems(query, filteredItems, elements.exact.checked);
}
}
if (this.model.lastQuery && this.model.lastQuery != query && !this._noChartClear) {