mirror of
https://github.com/badlogic/heissepreise.git
synced 2024-06-15 21:24:15 +02:00
More categorization work. Run node categorize.js <cart-or-items-json> to see kNN results.
This commit is contained in:
parent
a5cd0f5de8
commit
353e3cc77e
114
categorize.js
114
categorize.js
|
@ -1,66 +1,92 @@
|
|||
const fs = require("fs");
|
||||
const axios = require("axios");
|
||||
const analysis = require("./analysis");
|
||||
const stores = require("./stores");
|
||||
const utils = require("./stores/utils");
|
||||
const siteUtils = require("./site/utils");
|
||||
// Page size requested from the Billa search API so that a single request returns a
// whole sub-category. The value is randomized on every run.
// NOTE(review): the reason for the randomization is not visible here — confirm.
const HITS = Math.floor(30000 + Math.random() * 2000);

/**
 * Converts a 1-based position into the single-character code used to build Billa
 * category ids: "1".."9" for the first nine positions, then "A", "B", ... afterwards.
 *
 * @param {number} position 1-based position of a category or sub-category.
 * @returns {string} the one-character code.
 */
function positionToCode(position) {
    return position < 10 ? "" + position : String.fromCharCode("A".charCodeAt(0) + (position - 10));
}

/**
 * Downloads the items of every sub-category listed in utils.globalCategories from the
 * Billa search API and returns them grouped by category and sub-category.
 * Requests are issued one after another.
 *
 * @returns {Promise<Array>} one entry per global category: { code, name, subCategories },
 *     where each sub-category is { code, name, items }.
 */
async function fetchCategories() {
    const categories = [];
    let total = 0;

    for (let i = 1; i <= utils.globalCategories.length; i++) {
        const globalCategory = utils.globalCategories[i - 1];
        const categoryCode = positionToCode(i);
        const category = {
            code: categoryCode,
            // NOTE(review): this stores the whole globalCategories entry (which also carries
            // .name and .subcategories), not just its name. Kept as-is so categories.json
            // keeps its shape — confirm whether `globalCategory.name` was intended.
            name: globalCategory,
            subCategories: [],
        };
        categories.push(category);

        for (let j = 1; j <= globalCategory.subcategories.length; j++) {
            const subCategoryName = globalCategory.subcategories[j - 1];
            const subCategoryCode = positionToCode(j);
            const code = `B2-${categoryCode}${subCategoryCode}`;
            const subCategory = {
                code: subCategoryCode,
                name: `${globalCategory.name}>${subCategoryName}`,
                items: [],
            };
            category.subCategories.push(subCategory);

            console.log("Fetching items for category " + code + ` ${globalCategory.name} > ${subCategoryName}`);
            const BILLA_SEARCH = `https://shop.billa.at/api/search/full?searchTerm=*&storeId=00-10&pageSize=${HITS}&category=${code}`;
            const data = (await axios.get(BILLA_SEARCH)).data;

            for (const tile of data.tiles) {
                try {
                    const canonicalItem = stores.billa.getCanonical(tile);
                    canonicalItem.categoryCode = `${categoryCode}${subCategoryCode}`;
                    canonicalItem.categoryName = `${globalCategory.name} > ${subCategoryName}`;

                    // Append the alternative word for potato to names that use "Erdapfel"/"Erdäpfel",
                    // so both spellings end up in the item name.
                    const lowerName = canonicalItem.name.toLowerCase();
                    if (lowerName.includes("erdapfel")) canonicalItem.name += " kartoffel";
                    if (lowerName.includes("erdäpfel")) canonicalItem.name += " kartoffeln";

                    subCategory.items.push(canonicalItem);
                } catch (e) {
                    // Ignore super tiles
                }
            }

            total += subCategory.items.length;
            console.log(subCategory.items.length + " items");
        }
    }

    console.log("Total: " + total);
    return categories;
}

// Entry point: load (or fetch and cache) the categorized Billa items, then print the
// 9 most similar categorized items for every item of the file given on the command line.
(async () => {
    let categories;
    if (!fs.existsSync("categories.json")) {
        categories = await fetchCategories();
        analysis.writeJSON("categories.json", categories);
    } else {
        // Reuse the cached download from a previous run.
        categories = analysis.readJSON("categories.json");
    }

    // Flatten all items into one list and report ids that occur in more than one place.
    const items = [];
    const lookup = new Map();
    for (const category of categories) {
        for (const subCategory of category.subCategories) {
            for (const item of subCategory.items) {
                const previous = lookup.get(item.id);
                if (previous) {
                    console.log(`Duplicate item: ${item.name} in category ${item.categoryName} and ${previous.categoryName}`);
                }
                items.push(item);
                lookup.set(item.id, item);
            }
        }
    }

    const vectorize = true;
    console.log("Vectorizing items");
    siteUtils.vectorizeItems(items, vectorize);

    console.log("Categorizing items");
    const file = process.argv[2] ?? "site/data/momentum-cart.json";
    let momentumItems = analysis.readJSON(file);
    // Accept either a plain array of items or an object wrapping it in `.items`.
    if (momentumItems.items) momentumItems = momentumItems.items;
    siteUtils.vectorizeItems(momentumItems, vectorize);

    const start = performance.now();
    for (const item of momentumItems) {
        const similar = siteUtils.findMostSimilarItems(item, items, 9);
        console.log(`${item.name}`);
        similar.sort((a, b) => b.similarity - a.similarity);
        similar.forEach((s) => console.log(`${s.item.categoryName}, ${s.item.name}, ${s.similarity}`));
        console.log();
    }
    const took = (performance.now() - start) / 1000;
    console.log("Took: " + took.toFixed(3) + ", " + (momentumItems.length / took).toFixed(2) + "/s");
})().catch((e) => {
    // Surface failures (network, file access, malformed JSON) instead of leaving the promise floating.
    console.error(e);
    process.exitCode = 1;
});
|
||||
|
|
|
@ -1141,38 +1141,81 @@ function magnitude(vector) {
|
|||
return Math.sqrt(sumOfSquares);
|
||||
}
|
||||
|
||||
/**
 * Finds the entry of `items` whose vector has the largest dot product with `refItem.vector`.
 *
 * @param {{vector: *}} refItem item to compare against.
 * @param {Array<{vector: *}>} items candidates to search.
 * @returns {{similarity: number, item: (Object|null), index: number}} the best candidate, its
 *     similarity and its position in `items`; `{ similarity: -1, item: null, index: -1 }` when
 *     no candidate scores above -1 (e.g. `items` is empty).
 */
function findMostSimilarItem(refItem, items) {
    const best = {
        similarity: -1,
        item: null,
        index: -1,
    };

    items.forEach((candidate, position) => {
        const score = dotProduct(refItem.vector, candidate.vector);
        // Strict comparison: on ties the earliest candidate wins.
        if (!(score > best.similarity)) return;
        best.similarity = score;
        best.item = candidate;
        best.index = position;
    });

    return best;
}
|
||||
|
||||
/**
 * Finds the `k` entries of `items` whose vectors have the largest dot product with
 * `refItem.vector`.
 *
 * The result is NOT sorted by similarity: the first `k` items fill the result in input
 * order, and every later, better item replaces the first weakest entry in place. Callers
 * that need a ranking must sort the result themselves.
 *
 * @param {{vector: *}} refItem item to compare against.
 * @param {Array<{vector: *}>} items candidates to search.
 * @param {number} [k=5] maximum number of results.
 * @returns {Array<{similarity: number, item: Object, index: number}>} up to `k` results;
 *     `index` is the position of the result in `items`.
 */
function findMostSimilarItems(refItem, items, k = 5) {
    // Each candidate carries its own position, so the index never has to be recovered by
    // searching `items` again (which is slow and picks the wrong position when the same
    // object occurs more than once).
    const top = [];

    items.forEach((item, index) => {
        const similarity = dotProduct(refItem.vector, item.vector);

        if (top.length < k) {
            top.push({ similarity, item, index });
            return;
        }
        // k <= 0: nothing can ever be admitted.
        if (top.length === 0) return;

        // Locate the first weakest candidate kept so far.
        let weakest = 0;
        for (let slot = 1; slot < top.length; slot++) {
            if (top[slot].similarity < top[weakest].similarity) weakest = slot;
        }

        if (similarity > top[weakest].similarity) {
            top[weakest] = { similarity, item, index };
        }
    });

    return top;
}
|
||||
|
||||
/**
 * Orders items into a chain of nearest neighbours: starts with the first item, then
 * repeatedly appends the remaining item most similar (per findMostSimilarItem) to the
 * one appended last.
 *
 * NOTE: `items` is consumed — every element is removed from it while the chain is built,
 * so it is empty when the function returns (unless it was empty to begin with, in which
 * case it is returned unchanged).
 *
 * @param {Array<{vector: *}>} items items to order; emptied in place.
 * @returns {Array} the same items in chain order (or `items` itself when it is empty).
 */
function similaritySortItems(items) {
    if (items.length === 0) return items;

    const sortedItems = [items.shift()];
    let refItem = sortedItems[0];

    while (items.length > 0) {
        let { item, index } = findMostSimilarItem(refItem, items);
        if (index < 0) {
            // No candidate was selected (no similarity above -1, e.g. NaN scores):
            // fall back to the next remaining item instead of appending null.
            index = 0;
            item = items[0];
        }
        sortedItems.push(item);
        items.splice(index, 1);
        refItem = item;
    }

    return sortedItems;
}
|
||||
|
||||
/**
 * Computes `tokens` and `vector` for every item, in place.
 *
 * The name is lower-cased, stripped of every character that is not an ASCII word
 * character or whitespace (underscores are removed too), split on whitespace and each
 * token is passed through `stem`. The token list is then turned into `item.vector` via `vector`.
 *
 * @param {Array<{name: string, quantity: *, unit: *}>} items items to vectorize; each
 *     gets `tokens` and `vector` assigned.
 * @param {boolean} [useUnit=true] when true, the item's quantity (as a string) and unit
 *     are appended as extra tokens, if they are truthy.
 */
function vectorizeItems(items, useUnit = true) {
    items.forEach((item) => {
        const name = item.name
            .toLowerCase()
            .replace(/[^\w\s]|_/g, "")
            // NOTE(review): this replace can never match — hyphens were already removed by the
            // regex above, so "bio-milch" becomes "biomilch", not "bio milch". Kept as-is
            // because swapping the two calls would change the tokens of every hyphenated name.
            .replace("-", " ");
        item.tokens = name.split(/\s+/).map((token) => stem(token));
        if (useUnit) {
            if (item.quantity) item.tokens.push("" + item.quantity);
            if (item.unit) item.tokens.push(item.unit);
        }
        item.vector = vector(item.tokens);
    });
}
|
||||
|
@ -1189,9 +1232,9 @@ try {
|
|||
exports.scaleVector = scaleVector;
|
||||
exports.normalizeVector = normalizeVector;
|
||||
exports.stem = stem;
|
||||
exports.cluster = cluster;
|
||||
exports.flattenClusters = flattenClusters;
|
||||
exports.vectorizeItems = vectorizeItems;
|
||||
exports.findMostSimilarItem = findMostSimilarItem;
|
||||
exports.findMostSimilarItems = findMostSimilarItems;
|
||||
exports.similaritySortItems = similaritySortItems;
|
||||
} catch (e) {
|
||||
// hax
|
||||
|
|
Loading…
Reference in New Issue
Block a user