From 353e3cc77ecd6f0e016d2096591a5516c604f24b Mon Sep 17 00:00:00 2001
From: Mario Zechner
Date: Tue, 6 Jun 2023 16:18:01 +0200
Subject: [PATCH] More categorization work. Run node categorize.js to see kNN results.

---
Review notes (this section is not part of the commit message):
 - categorize.js: the "erdapfel"/"erdäpfel" checks used indexOf() as a
   boolean (-1, i.e. "not found", is truthy; 0, a match at the start, is
   falsy) and discarded the result of the string concatenation. They now
   use includes() and append the synonym with +=.
 - site/utils.js: similaritySortItems() took `.item` from the
   findMostSimilarItem() result and then read `.item` and `.index` from that
   item again (both undefined). It now keeps the whole result object.
 - Both fixes edit added lines in place, so the hunk line counts and the
   diffstat below are unchanged.

 categorize.js | 114 +++++++++++++++++++++++++++++++-------------------
 site/utils.js |  81 ++++++++++++++++++++++++++---------
 2 files changed, 132 insertions(+), 63 deletions(-)

diff --git a/categorize.js b/categorize.js
index 941ac7e..e3749e2 100644
--- a/categorize.js
+++ b/categorize.js
@@ -1,66 +1,92 @@
+const fs = require("fs");
 const axios = require("axios");
 const analysis = require("./analysis");
 const stores = require("./stores");
 const utils = require("./stores/utils");
+const siteUtils = require("./site/utils");
 
 const HITS = Math.floor(30000 + Math.random() * 2000);
 
 (async () => {
     let total = 0;
-    let result = [];
+    let categories;
+    if (!fs.existsSync("categories.json")) {
+        categories = [];
+        for (let i = 1; i <= utils.globalCategories.length; i++) {
+            const categoryName = utils.globalCategories[i - 1];
+            const categoryCode = i < 10 ? "" + i : String.fromCharCode("A".charCodeAt(0) + (i - 10));
 
-    for (let i = 1; i <= utils.globalCategories.length; i++) {
-        const categoryName = utils.globalCategories[i - 1];
-        const categoryCode = i < 10 ? "" + i : String.fromCharCode("A".charCodeAt(0) + (i - 10));
-
-        const category = {
-            code: categoryCode,
-            name: categoryName,
-            subCategories: [],
-        };
-        result.push(category);
-
-        for (let j = 1; j <= categoryName.subcategories.length; j++) {
-            const subCategoryName = categoryName.subcategories[j - 1];
-            const subCategoryCode = j < 10 ? "" + j : String.fromCharCode("A".charCodeAt(0) + (j - 10));
-            const code = `B2-${categoryCode}${subCategoryCode}`;
-            const subCategory = {
-                code: subCategoryCode,
-                name: `${categoryName.name}>${subCategoryName}`,
-                items: [],
+            const category = {
+                code: categoryCode,
+                name: categoryName,
+                subCategories: [],
             };
-            category.subCategories.push(subCategory);
+            categories.push(category);
 
-            console.log("Fetching items for category " + code + ` ${categoryName.name} > ${subCategoryName}`);
-            const BILLA_SEARCH = `https://shop.billa.at/api/search/full?searchTerm=*&storeId=00-10&pageSize=${HITS}&category=${code}`;
-            const data = (await axios.get(BILLA_SEARCH)).data;
-            data.tiles.forEach((item) => {
-                try {
-                    const canonicalItem = stores.billa.getCanonical(item);
-                    canonicalItem.categoryCode = `${categoryCode}${subCategoryCode}`;
-                    canonicalItem.categoryName = `${categoryName.name} > ${subCategoryName}`;
-                    subCategory.items.push(canonicalItem);
-                } catch (e) {
-                    // Ignore super tiles
-                }
-            });
-            total += subCategory.items.length;
-            console.log(subCategory.items.length + " items");
+            for (let j = 1; j <= categoryName.subcategories.length; j++) {
+                const subCategoryName = categoryName.subcategories[j - 1];
+                const subCategoryCode = j < 10 ? "" + j : String.fromCharCode("A".charCodeAt(0) + (j - 10));
+                const code = `B2-${categoryCode}${subCategoryCode}`;
+                const subCategory = {
+                    code: subCategoryCode,
+                    name: `${categoryName.name}>${subCategoryName}`,
+                    items: [],
+                };
+                category.subCategories.push(subCategory);
+
+                console.log("Fetching items for category " + code + ` ${categoryName.name} > ${subCategoryName}`);
+                const BILLA_SEARCH = `https://shop.billa.at/api/search/full?searchTerm=*&storeId=00-10&pageSize=${HITS}&category=${code}`;
+                const data = (await axios.get(BILLA_SEARCH)).data;
+                data.tiles.forEach((item) => {
+                    try {
+                        const canonicalItem = stores.billa.getCanonical(item);
+                        canonicalItem.categoryCode = `${categoryCode}${subCategoryCode}`;
+                        canonicalItem.categoryName = `${categoryName.name} > ${subCategoryName}`;
+                        if (canonicalItem.name.toLowerCase().includes("erdapfel")) canonicalItem.name += " kartoffel";
+                        if (canonicalItem.name.toLowerCase().includes("erdäpfel")) canonicalItem.name += " kartoffeln";
+                        subCategory.items.push(canonicalItem);
+                    } catch (e) {
+                        // Ignore super tiles
+                    }
+                });
+                total += subCategory.items.length;
+                console.log(subCategory.items.length + " items");
+            }
         }
+        console.log("Total: " + total);
+        analysis.writeJSON("categories.json", categories);
+    } else {
+        categories = analysis.readJSON("categories.json");
     }
-    console.log("Total: " + total);
-    analysis.writeJSON("categories.json", result);
 
+    const items = [];
     const lookup = {};
-    for (const category of result) {
+    for (const category of categories) {
         for (const subCategory of category.subCategories) {
             for (const item of subCategory.items) {
-                if (lookup[item.id]) {
-                    console.log(`Duplicate item: ${item.name} in category ${item.categoryName} and ${lookup[item.id].categoryName}`);
-                } else {
-                    lookup[(item.id = item)];
-                }
+                items.push(item);
+                lookup[item.id] = item;
             }
         }
     }
+
+    const vectorize = true;
+    console.log("Vectorizing items");
+    siteUtils.vectorizeItems(items, vectorize);
+
+    console.log("Categorizing items");
+    const file = process?.argv?.[2] ?? "site/data/momentum-cart.json";
+    let momentumItems = analysis.readJSON(file);
+    if (momentumItems.items) momentumItems = momentumItems.items;
+    siteUtils.vectorizeItems(momentumItems, vectorize);
+    const start = performance.now();
+    for (const item of momentumItems) {
+        const similar = siteUtils.findMostSimilarItems(item, items, 9);
+        console.log(`${item.name}`);
+        similar.sort((a, b) => b.similarity - a.similarity);
+        similar.forEach((s) => console.log(`${s.item.categoryName}, ${s.item.name}, ${s.similarity}`));
+        console.log();
+    }
+    const took = (performance.now() - start) / 1000;
+    console.log("Took: " + took.toFixed(3) + ", " + (momentumItems.length / took).toFixed(2) + "/s");
 })();
diff --git a/site/utils.js b/site/utils.js
index 022f2f9..812ab21 100644
--- a/site/utils.js
+++ b/site/utils.js
@@ -1141,38 +1141,81 @@ function magnitude(vector) {
     return Math.sqrt(sumOfSquares);
 }
 
+function findMostSimilarItem(refItem, items) {
+    let maxSimilarity = -1;
+    let similarItem = null;
+    let similarItemIdx = -1;
+    items.forEach((item, idx) => {
+        let similarity = dotProduct(refItem.vector, item.vector);
+        if (similarity > maxSimilarity) {
+            maxSimilarity = similarity;
+            similarItem = item;
+            similarItemIdx = idx;
+        }
+    });
+    return {
+        similarity: maxSimilarity,
+        item: similarItem,
+        index: similarItemIdx,
+    };
+}
+
+function findMostSimilarItems(refItem, items, k = 5) {
+    let topSimilarItems = [];
+    let topSimilarities = [];
+
+    items.forEach((item, idx) => {
+        let similarity = dotProduct(refItem.vector, item.vector);
+
+        if (topSimilarItems.length < k) {
+            topSimilarItems.push(item);
+            topSimilarities.push(similarity);
+        } else {
+            let minSimilarity = Math.min(...topSimilarities);
+            let minIndex = topSimilarities.indexOf(minSimilarity);
+
+            if (similarity > minSimilarity) {
+                topSimilarItems[minIndex] = item;
+                topSimilarities[minIndex] = similarity;
+            }
+        }
+    });
+
+    let similarItemsWithIndices = topSimilarItems.map((item, index) => {
+        return {
+            similarity: topSimilarities[index],
+            item: item,
+            index: items.indexOf(item),
+        };
+    });
+
+    return similarItemsWithIndices;
+}
+
 function similaritySortItems(items) {
     if (items.length == 0) return items;
     sortedItems = [items.shift()];
     let refItem = sortedItems[0];
     while (items.length > 0) {
-        let maxSimilarity = -1;
-        let similarItem = null;
-        let similarItemIdx = -1;
-        items.forEach((item, idx) => {
-            let similarity = dotProduct(refItem.vector, item.vector);
-            if (similarity > maxSimilarity) {
-                maxSimilarity = similarity;
-                similarItem = item;
-                similarItemIdx = idx;
-            }
-        });
-        sortedItems.push(similarItem);
-        items.splice(similarItemIdx, 1);
-        refItem = similarItem;
+        const similarItem = findMostSimilarItem(refItem, items);
+        sortedItems.push(similarItem.item);
+        items.splice(similarItem.index, 1);
+        refItem = similarItem.item;
     }
     return sortedItems;
 }
 
-function vectorizeItems(items) {
+function vectorizeItems(items, useUnit = true) {
     items.forEach((item) => {
         let name = item.name
             .toLowerCase()
             .replace(/[^\w\s]|_/g, "")
             .replace("-", " ");
         item.tokens = name.split(/\s+/).map((token) => stem(token));
-        if (item.quantity) item.tokens.push("" + item.quantity);
-        if (item.unit) item.tokens.push(item.unit);
+        if (useUnit) {
+            if (item.quantity) item.tokens.push("" + item.quantity);
+            if (item.unit) item.tokens.push(item.unit);
+        }
         item.vector = vector(item.tokens);
     });
 }
@@ -1189,9 +1232,9 @@ try {
     exports.scaleVector = scaleVector;
     exports.normalizeVector = normalizeVector;
     exports.stem = stem;
-    exports.cluster = cluster;
-    exports.flattenClusters = flattenClusters;
     exports.vectorizeItems = vectorizeItems;
+    exports.findMostSimilarItem = findMostSimilarItem;
+    exports.findMostSimilarItems = findMostSimilarItems;
     exports.similaritySortItems = similaritySortItems;
 } catch (e) {
     // hax