Categories for Billa & Spar, infra to add categories for other stores.

Billa maps directly to the canonical categories. Spar uses a mapping file stores/spar-categories.json.

Each store has a generateCategoryMapping() function, which is called once in analysis.js:updateData() and analysis.js:replay(). The function is responsible for:

* Fetching the latest categories
* Merging them with already mapped categories
* Reporting new categories that haven't been mapped yet
* Reporting categories that have been mapped but are no longer part of the latest set of categories
* Saving the merged mappings to disk

This scheme might not work for all stores, in which case updateData() and replay() will use a k-NN approach to figure out the category for an item. See #81
This commit is contained in:
Mario Zechner 2023-06-21 01:28:38 +02:00
parent 55b8a79107
commit 303d25ccb5
20 changed files with 3328 additions and 1291 deletions

View File

@ -2,9 +2,7 @@ const fs = require("fs");
const fsAsync = require("fs").promises;
const zlib = require("zlib");
const stores = require("./stores");
const { FILE } = require("dns");
const { promisify } = require("util");
const { dateToUint16 } = require("./site/js/misc");
const STORE_KEYS = Object.keys(stores);
exports.STORE_KEYS = STORE_KEYS;
@ -149,7 +147,7 @@ function sortItems(items) {
});
}
// Keep this in sync with utils.js:decompress
// Keep this in sync with items.js:decompress
function compress(items) {
const compressed = {
stores: STORE_KEYS,
@ -170,6 +168,7 @@ function compress(items) {
data.push(STORE_KEYS.indexOf(item.store));
data.push(item.id);
data.push(item.name);
data.push(item.category ?? "A0");
data.push(item.priceHistory.length);
for (price of item.priceHistory) {
data.push(uniqueDates[price.date.replaceAll("-", "")]);
@ -179,7 +178,7 @@ function compress(items) {
data.push(item.quantity);
data.push(item.isWeighted ? 1 : 0);
data.push(item.bio ? 1 : 0);
data.push(item.url?.replace(stores[item.store].urlBase, ""));
data.push(item.url?.replace(stores[item.store].urlBase, "") ?? "");
}
return compressed;
}
@ -210,10 +209,11 @@ exports.replay = function (rawDataDir) {
const canonicalFiles = {};
for (const store of STORE_KEYS) {
stores[store].generateCategoryMapping();
storeFiles[store] = getFilteredFilesFor(store);
canonicalFiles[store] = storeFiles[store].map((file) => {
console.log(`Creating canonical items for ${file}`);
return getCanonicalFor(store, readJSON(file), file.match(/\d{4}-\d{2}-\d{2}/)[0]);
getCanonicalFor(store, readJSON(file), file.match(/\d{4}-\d{2}-\d{2}/)[0]);
});
canonicalFiles[store].reverse();
}
@ -248,6 +248,8 @@ exports.updateData = async function (dataDir, done) {
console.log("Fetching data for date: " + today);
const storeFetchPromises = [];
for (const store of STORE_KEYS) {
await stores[store].initializeCategoryMapping();
storeFetchPromises.push(
new Promise(async (resolve) => {
const start = performance.now();
@ -261,7 +263,18 @@ exports.updateData = async function (dataDir, done) {
writeJSON(rawDataFile, storeItems, FILE_COMPRESSOR);
}
const storeItemsCanonical = getCanonicalFor(store, storeItems, today);
console.log(`Fetched ${store.toUpperCase()} data, took ${(performance.now() - start) / 1000} seconds`);
let numUncategorized = 0;
for (let i = 0; i < storeItemsCanonical.length; i++) {
const rawItem = storeItems[i];
const item = storeItemsCanonical[i];
item.category = stores[store].mapCategory(rawItem);
if (item.category == null) numUncategorized++;
}
console.log(
`Fetched ${store.toUpperCase()} data, took ${(performance.now() - start) / 1000} seconds, ${numUncategorized}/${
storeItemsCanonical.length
} items without category.`
);
resolve(storeItemsCanonical);
} catch (e) {
console.error(`Error while fetching data from ${store}, continuing after ${(performance.now() - start) / 1000} seconds...`, e);

View File

@ -14,6 +14,7 @@ function copyItemsToSite(dataDir) {
const storeItems = items.filter((item) => item.store === store);
analysis.writeJSON(`site/output/data/latest-canonical.${store}.compressed.json`, storeItems, false, 0, true);
}
console.log("Copied latest items to site.");
}
function scheduleFunction(hour, minute, second, func) {

View File

@ -207,5 +207,6 @@ function loadCart() {
cartList.model = cartFilter.model = cart;
productsList.model = productsFilter.model = models.items;
if (c || d) itemsChart.render();
cartFilter.filter();
document.querySelector('[x-id="loader"]').classList.add("hidden");
})();

View File

@ -149,6 +149,8 @@ exports.categories = [
},
];
exports.categories.forEach((category, index) => (category.index = index));
exports.toCategoryCode = (i, j) => {
return (
(i < 10 ? "" + i : String.fromCharCode("A".charCodeAt(0) + (i - 10))) + (j < 10 ? "" + j : String.fromCharCode("A".charCodeAt(0) + (j - 10)))
@ -156,7 +158,7 @@ exports.toCategoryCode = (i, j) => {
};
exports.fromCategoryCode = (code) => {
if (code.length != 2) return [exports.categories.length - 1, 0];
if (!code || code.length != 2) return [exports.categories.length - 1, 0];
const codeI = code.charCodeAt(0);
const codeJ = code.charCodeAt(1);
return [

View File

@ -175,6 +175,7 @@ class Items extends Model {
const store = storeLookup[data[i++]];
const id = data[i++];
const name = data[i++];
const category = data[i++];
const numPrices = data[i++];
const prices = new Array(numPrices);
for (let j = 0; j < numPrices; j++) {
@ -195,6 +196,7 @@ class Items extends Model {
store,
id,
name,
category,
price: prices[0].price,
priceHistory: prices,
isWeighted,

View File

@ -13,12 +13,19 @@ class CustomCheckbox extends View {
<svg class="h-2 w-2 stroke-gray-600 fill-gray-100 peer-checked:fill-gray-600" viewBox="0 0 6 6">
<circle cx="3" cy="3" r="2" />
</svg>
${this.hasAttribute("abbr") ? `<abbr title="${abbr}">${label}</abbr>` : label}
${
this.hasAttribute("abbr")
? `<abbr x-id="label" title="${abbr}"><span x-id="label">${label}</span></abbr>`
: `<span x-id="label">${label}</span>`
}
</label>
`;
this.classList.add("customcheckbox");
this._checkbox = View.elements(this).checkbox;
this.setupEventHandlers();
this._checkbox.addEventListener("change", (event) => {
event.stopPropagation();
this.fireChangeEvent();
});
}
get checkbox() {
@ -32,5 +39,9 @@ class CustomCheckbox extends View {
// Sets the checked state of the wrapped checkbox element (this._checkbox).
set checked(value) {
this._checkbox.checked = value;
}
// Replaces the text of the element registered under the id "label".
// NOTE(review): presumably `this.elements` is provided by the View base class — confirm.
set label(value) {
this.elements.label.innerText = value;
}
}
customElements.define("custom-checkbox", CustomCheckbox);

View File

@ -1,5 +1,6 @@
const { today, parseNumber, dom, getBooleanAttribute, queryItems, queryItemsAlasql, log, deltaTime } = require("../js/misc");
const { stores, STORE_KEYS, BUDGET_BRANDS } = require("../model/stores");
const { fromCategoryCode, categories } = require("../model/categories");
const { settings } = require("../model");
const { View } = require("./view");
@ -69,6 +70,19 @@ class ItemsFilter extends View {
<custom-checkbox x-id="priceIncreased" x-state x-change label="Teurer" checked class="gray"></custom-checkbox>
<custom-checkbox x-id="priceDecreased" x-state x-change label="Billiger" checked class="gray"></custom-checkbox>
</div>
<div x-id="categories" class="flex justify-center gap-2 flex-wrap mt-4 hidden">
${categories
.map(
(category, index) => /*html*/ `
<custom-checkbox
x-id="category-${index}" x-state x-change
label="${category.name}"
checked
></custom-checkbox>`
)
.join("")}
</div>
`;
this.classList.add("items-filter");
@ -102,7 +116,7 @@ class ItemsFilter extends View {
})
);
elements.allStores.addEventListener("change", handleChangeAll);
elements.allStores.addEventListener("x-change", handleChangeAll);
elements.priceChangesToday.addEventListener("change", () => {
if (elements.priceChangesToday.checked) elements.priceDirection.classList.remove("hidden");
@ -218,11 +232,42 @@ class ItemsFilter extends View {
}
}
if (this.model.lastQuery && this.model.lastQuery != query && !this._noChartClear) {
let queryChanged = this.model.lastQuery && this.model.lastQuery != query;
if (queryChanged && !this._noChartClear) {
filteredItems.forEach((item) => (item.chart = false));
}
this.model.lastQuery = query;
if (this.model.numItemsBeforeCategories != filteredItems.length) queryChanged = true;
this.model.numItemsBeforeCategories = filteredItems.length; // This is not entirely correct, but I'm too lazy...
const filteredCategories = {};
filteredItems.forEach((item) => {
const category = categories[fromCategoryCode(item.category)[0]];
filteredCategories[category.index] = filteredCategories[category.index] ? filteredCategories[category.index] + 1 : 1;
});
for (const category of categories) {
const checkbox = elements["category-" + category.index];
if (filteredCategories[category.index] > 0) {
if (queryChanged) checkbox.checked = true;
checkbox.label = `${category.name} (${filteredCategories[category.index]})`;
checkbox.classList.remove("hidden");
} else {
if (queryChanged) checkbox.checked = false;
checkbox.classList.add("hidden");
}
}
if (Object.keys(filteredCategories).length == 0) {
elements.categories.classList.add("hidden");
} else {
elements.categories.classList.remove("hidden");
}
filteredItems = filteredItems.filter((item) => {
const category = categories[fromCategoryCode(item.category)[0]];
return elements["category-" + category.index].checked;
});
log(`ItemsFilter - Filtering ${this.model.items.length} took ${deltaTime(start).toFixed(4)} secs, ${filteredItems.length} results.`);
this.model.removeListener(this._listener);

View File

@ -23,27 +23,11 @@ exports.getCanonical = function (item, today) {
if (grammage) [quantity, unit] = grammage.trim().split(" ").splice(0, 2);
}
let billaCategory = null;
for (const groupId of item.data.articleGroupIds) {
if (billaCategory == null) {
billaCategory = groupId;
continue;
}
if (groupId.charCodeAt(3) < billaCategory.charCodeAt(3)) {
billaCategory = groupId;
}
}
let categoryCode = billaCategory.replace("B2-", "").substring(0, 2);
let [ci, cj] = fromCategoryCode(categoryCode);
categoryCode = toCategoryCode(ci - 1, cj - 1);
return utils.convertUnit(
{
id: item.data.articleId,
name: item.data.name,
description: item.data.description ?? "",
categoryCode,
price: item.data.price.final,
priceHistory: [{ date: today, price: item.data.price.final }],
isWeighted: item.data.isWeightArticle,
@ -90,4 +74,27 @@ exports.fetchData = async function () {
return items;
};
exports.initializeCategoryMapping = async () => {
// FIXME check if categories have changed.
console.log("No mapping for Billa");
};
exports.mapCategory = (rawItem) => {
let billaCategory = null;
for (const groupId of rawItem.data.articleGroupIds) {
if (billaCategory == null) {
billaCategory = groupId;
continue;
}
if (groupId.charCodeAt(3) < billaCategory.charCodeAt(3)) {
billaCategory = groupId;
}
}
let categoryCode = billaCategory.replace("B2-", "").substring(0, 2);
let [ci, cj] = fromCategoryCode(categoryCode);
categoryCode = toCategoryCode(ci - 1, cj - 1);
return categoryCode;
};
exports.urlBase = "https://shop.billa.at";

View File

@ -93,4 +93,8 @@ exports.fetchData = async function () {
return dmItems;
};
// Category mapping is not implemented for this store: there is nothing to
// initialize, and mapCategory() returns undefined for every raw item.
exports.initializeCategoryMapping = async () => {};
exports.mapCategory = (rawItem) => {};
exports.urlBase = "https://www.dm.de/product-p";

View File

@ -91,4 +91,8 @@ exports.fetchData = async function () {
return dmItems;
};
// Category mapping is not implemented for this store: there is nothing to
// initialize, and mapCategory() returns undefined for every raw item.
exports.initializeCategoryMapping = async () => {};
exports.mapCategory = (rawItem) => {};
exports.urlBase = "https://www.dm.at/product-p";

View File

@ -1,3 +1,5 @@
const fs = require("fs");
const path = require("path");
const axios = require("axios");
const utils = require("./utils");
@ -35,31 +37,21 @@ exports.getCanonical = function (item, today) {
);
};
// Base URL of the shop service queried for Hofer data.
const HOFER_BASE_URL = `https://shopservice.roksh.at`;
// Endpoint that getCategories() posts to in order to obtain the category list.
const CATEGORIES = HOFER_BASE_URL + `/category/GetFullCategoryList/`;
// Shared axios request config; getCategories() fills in the bearer token before using it.
const CONFIG = { headers: { authorization: null } };
// Payload posted to the session/configure endpoint when requesting a token.
const TOKEN_DATA = {
OwnWebshopProviderCode: "",
SetUserSelectedShopsOnFirstSiteLoad: true,
RedirectToDashboardNeeded: false,
ShopsSelectedForRoot: "hofer",
BrandProviderSelectedForRoot: null,
UserSelectedShops: [],
};
// Product list endpoint — presumably queried per sub-category by fetchData(); verify against its body.
const ITEMS = HOFER_BASE_URL + `/productlist/CategoryProductList`;
exports.fetchData = async function () {
const HOFER_BASE_URL = `https://shopservice.roksh.at`;
const CATEGORIES = HOFER_BASE_URL + `/category/GetFullCategoryList/`;
const CONFIG = { headers: { authorization: null } };
const ITEMS = HOFER_BASE_URL + `/productlist/CategoryProductList`;
// fetch access token
const token_data = {
OwnWebshopProviderCode: "",
SetUserSelectedShopsOnFirstSiteLoad: true,
RedirectToDashboardNeeded: false,
ShopsSelectedForRoot: "hofer",
BrandProviderSelectedForRoot: null,
UserSelectedShops: [],
};
const token = (
await axios.post("https://shopservice.roksh.at/session/configure", token_data, {
headers: { Accept: "application/json", "Content-Type": "application/json" },
})
).headers["jwt-auth"];
CONFIG.headers.authorization = "Bearer " + token;
// concat all subcategories (categories.[i].ChildList)
const categories = (await axios.post(CATEGORIES, {}, CONFIG)).data;
const subCategories = categories.reduce((acc, category) => acc.concat(category.ChildList), []);
const { subCategories } = await exports.getCategories();
let hoferItems = [];
for (let subCategory of subCategories) {
@ -78,4 +70,73 @@ exports.fetchData = async function () {
return hoferItems;
};
exports.getCategories = async () => {
const token = (
await axios.post("https://shopservice.roksh.at/session/configure", TOKEN_DATA, {
headers: { Accept: "application/json", "Content-Type": "application/json" },
})
).headers["jwt-auth"];
CONFIG.headers.authorization = "Bearer " + token;
// concat all subcategories (categories.[i].ChildList)
const categories = (await axios.post(CATEGORIES, {}, CONFIG)).data;
const subCategories = categories.reduce((acc, category) => acc.concat(category.ChildList), []);
return { categories, subCategories };
};
exports.urlBase = "https://www.roksh.at/hofer/produkte/";
// Category mapping is not wired up for this store: there is nothing to
// initialize, and mapCategory() returns undefined for every raw item.
exports.initializeCategoryMapping = async () => {};
exports.mapCategory = (rawItem) => {};
exports.generateCategoryMapping = async (rawItems) => {
const { categories } = await exports.getCategories();
const lookup = {};
const processCategory = (category) => {
lookup[category.ProgID] = {
category: category.ProgID,
url: category.Url,
code: "",
numItems: 0,
};
for (const child of category.ChildList) {
processCategory(child);
}
};
for (const category of categories) {
processCategory(category);
}
let total = 0;
for (const item of rawItems) {
if (!lookup[item.CategorySEOName]) {
console.log(`Couldn't find category '${item.CategorySEOName}' for Hofer product ${item.ProductName}`);
total++;
lookup[item.CategorySEOName] = {
category: item.CategorySEOName,
url: "",
code: "",
numItems: 1,
};
} else {
const category = lookup[item.CategorySEOName];
category.item = item;
category.numItems++;
}
}
const output = Object.keys(lookup).map((key) => lookup[key]);
const oldCategories = path.join(__dirname, "hofer-categories.json");
fs.writeFileSync(path.join(__dirname, "hofer-categories.json"), JSON.stringify(output, null, 2));
};
// Generate JSON for category mapping in stores/hofer-categories.json
// when this module is run directly as a script.
if (require.main === module) {
    (async () => {
        const { readJSON } = require("../analysis");
        // Works from a previously downloaded snapshot; use `await exports.fetchData()`
        // here instead to generate the mapping from live data.
        const rawItems = readJSON("data/hofer-2023-06-20.json.br");
        await exports.generateCategoryMapping(rawItems);
    })().catch((error) => {
        // Do not leave the promise floating: report the failure and exit non-zero.
        console.error("Failed to generate the Hofer category mapping", error);
        process.exitCode = 1;
    });
}

View File

@ -57,4 +57,8 @@ exports.fetchData = async function () {
return (await axios.get(LIDL_SEARCH)).data.filter((item) => !!item.price.price);
};
// Category mapping is not implemented for this store: there is nothing to
// initialize, and mapCategory() returns undefined for every raw item.
exports.initializeCategoryMapping = async () => {};
exports.mapCategory = (rawItem) => {};
exports.urlBase = "https://www.lidl.at";

View File

@ -58,4 +58,8 @@ exports.fetchData = async function () {
return mpreisItems;
};
// Category mapping is not implemented for this store: there is nothing to
// initialize, and mapCategory() returns undefined for every raw item.
exports.initializeCategoryMapping = async () => {};
exports.mapCategory = (rawItem) => {};
exports.urlBase = "https://www.mpreis.at/shop/p/";

View File

@ -47,4 +47,8 @@ exports.fetchData = async function () {
return result;
};
// Category mapping is not implemented for this store: there is nothing to
// initialize, and mapCategory() returns undefined for every raw item.
exports.initializeCategoryMapping = async () => {};
exports.mapCategory = (rawItem) => {};
exports.urlBase = "https://www.penny.at/produkte/";

View File

@ -111,4 +111,8 @@ exports.fetchData = async function () {
}
};
// Category mapping is not implemented for this store: there is nothing to
// initialize, and mapCategory() returns undefined for every raw item.
exports.initializeCategoryMapping = async () => {};
exports.mapCategory = (rawItem) => {};
exports.urlBase = "";

3002
stores/spar-categories.json Normal file

File diff suppressed because it is too large Load Diff

View File

@ -83,4 +83,8 @@ exports.fetchData = async function () {
return rawItems?.hits || rawItems;
};
// Category mapping is not implemented for this store: there is nothing to
// initialize, and mapCategory() returns undefined for every raw item.
exports.initializeCategoryMapping = async () => {};
exports.mapCategory = (rawItem) => {};
exports.urlBase = "https://www.spar.si/online";

File diff suppressed because it is too large Load Diff

View File

@ -67,4 +67,8 @@ exports.fetchData = async function () {
return unimarktItems;
};
// Category mapping is not implemented for this store: there is nothing to
// initialize, and mapCategory() returns undefined for every raw item.
exports.initializeCategoryMapping = async () => {};
exports.mapCategory = (rawItem) => {};
exports.urlBase = "https://shop.unimarkt.at";

View File

@ -1,3 +1,6 @@
const fs = require("fs");
const path = require("path");
// These are a match of the Billa categories, which are organized in a 2-level hierarchy.
// Each category in the top level gets a code from 1-Z, each sub category also gets a code.
// Together the two codes from a unique id for the category, which we store in the item.category
@ -115,6 +118,37 @@ exports.globalCategories = [
},
];
exports.mergeAndSaveCategories = (store, categories) => {
const mappingFile = path.join(__dirname, `${store}-categories.json`);
if (fs.existsSync(mappingFile)) {
const oldMapping = JSON.parse(fs.readFileSync(mappingFile));
const oldLookup = {};
for (const category of oldMapping) {
oldLookup[category.id] = category;
}
for (const category of categories) {
const oldCategory = oldLookup[category.id];
if (oldCategory == null) {
console.log(`Found new unmapped category for ${store}: ${category.id} - ${category.description}`);
} else {
category.code = oldCategory.code;
delete oldLookup[category.id];
}
}
if (Object.keys(oldLookup).length > 0) {
for (const key in oldLookup) {
const category = oldLookup[key];
console.log(`Found category absent in latest mapping for ${store}: ${category.id} - ${category.description}`);
categories.push(category);
}
}
}
fs.writeFileSync(mappingFile, JSON.stringify(categories, null, 2));
return categories;
};
exports.globalUnits = {
"stk.": { unit: "stk", factor: 1 },
blatt: { unit: "stk", factor: 1 },