heissepreise/analysis.js

215 lines
6.6 KiB
JavaScript
Raw Normal View History

2023-05-17 16:17:45 +02:00
const fs = require("fs");
const zlib = require("zlib");
2023-05-25 13:54:28 +02:00
const stores = require("./stores");
const STORE_KEYS = Object.keys(stores);
2023-05-17 16:17:45 +02:00
2023-05-30 13:08:43 +02:00
exports.STORE_KEYS = STORE_KEYS;
2023-05-17 16:17:45 +02:00
/**
 * Returns today's date in the local time zone, formatted as `YYYY-MM-DD`.
 *
 * @returns {string} Calendar date with zero-padded month and day, e.g. `2023-05-17`.
 */
function currentDate() {
    // Named `now` so the local does not shadow the function's own name.
    const now = new Date();
    const year = now.getFullYear();
    // getMonth() is zero-based, hence the + 1.
    const month = String(now.getMonth() + 1).padStart(2, "0");
    const day = String(now.getDate()).padStart(2, "0");
    return `${year}-${month}-${day}`;
}
/**
 * Synchronously reads and parses a JSON file.
 *
 * @param {string} file - Path of the file WITHOUT a `.gz` suffix; when `gzipped`
 *   is true, `.gz` is appended to this path before reading.
 * @param {boolean} [gzipped=false] - Whether the file on disk is gzip-compressed.
 * @returns {*} The parsed JSON value.
 * @throws If the file cannot be read, is not valid gzip (when `gzipped`), or
 *   does not contain valid JSON.
 */
function readJSON(file, gzipped = false) {
    const path = `${file}${gzipped ? ".gz" : ""}`;
    const raw = fs.readFileSync(path);
    const bytes = gzipped ? zlib.gunzipSync(raw) : raw;
    return JSON.parse(bytes.toString("utf8"));
}
exports.readJSON = readJSON;
2023-05-17 16:17:45 +02:00
/**
 * Synchronously serializes a value as JSON and writes it to disk.
 *
 * @param {string} file - Target path WITHOUT a `.gz` suffix; when `gzipped` is
 *   true, `.gz` is appended to this path before writing.
 * @param {*} data - Value to serialize.
 * @param {boolean} [gzipped=false] - Gzip the serialized JSON before writing.
 * @param {number|string} [spacer=2] - Passed to `JSON.stringify` as its
 *   indentation argument.
 * @param {boolean} [compressData=false] - Run `data` through `compress()`
 *   before serializing.
 */
function writeJSON(file, data, gzipped = false, spacer = 2, compressData = false) {
    const payload = compressData ? compress(data) : data;
    const json = JSON.stringify(payload, null, spacer);
    const target = `${file}${gzipped ? ".gz" : ""}`;
    fs.writeFileSync(target, gzipped ? zlib.gzipSync(json) : json);
}
exports.writeJSON = writeJSON;
2023-05-17 16:17:45 +02:00
2023-05-25 13:54:28 +02:00
/**
 * Converts one store's raw items into canonical items.
 *
 * Each raw item is passed to the store module's `getCanonical(rawItem, today)`.
 * Falsy results are dropped. Every kept item gets a `store` property, which a
 * `store` property returned by `getCanonical` would override.
 *
 * @param {string} store - Key into the `stores` module map.
 * @param {Array} rawItems - Raw items as fetched or read for that store.
 * @param {string} today - Date forwarded unchanged to `getCanonical`.
 * @returns {Array<object>} Canonical items; empty if `store` is not a known key.
 */
function getCanonicalFor(store, rawItems, today) {
    // The store module does not change per item, so look it up once.
    const storeModule = stores[store];
    const canonicalItems = [];
    for (let i = 0; i < rawItems.length; i++) {
        const item = storeModule?.getCanonical(rawItems[i], today);
        if (item) {
            canonicalItems.push({ store, ...item });
        }
    }
    return canonicalItems;
}
2023-05-17 16:17:45 +02:00
/**
 * Merges the price history of a previous item list into the latest item list.
 *
 * Items are matched by `store + id`. For a matched item:
 *  - if the first price entry of the old and new history are equal, the new
 *    item takes over the old history unchanged;
 *  - otherwise all old entries are appended after the new item's entries.
 * Old items with no match in `items` are appended to `items`. Finally `items`
 * is sorted with `sortItems`.
 *
 * `items` (and the item objects in it) are mutated in place.
 *
 * @param {Array<object>|null|undefined} oldItems - Previously known items.
 * @param {Array<object>} items - Latest items; each needs a non-empty `priceHistory`.
 * @returns {Array<object>} The same `items` array. When `oldItems` is
 *   null/undefined it is returned untouched and unsorted.
 */
function mergePriceHistory(oldItems, items) {
    if (oldItems == null) return items;

    // Every loop variable is declared with `const`; leaving them undeclared
    // creates implicit globals and throws in strict mode / ES modules.
    const lookup = {};
    for (const oldItem of oldItems) {
        lookup[oldItem.store + oldItem.id] = oldItem;
    }

    for (const item of items) {
        const key = item.store + item.id;
        const oldItem = lookup[key];
        // Whatever remains in `lookup` afterwards is missing from the latest list.
        delete lookup[key];

        const currPrice = item.priceHistory[0];
        if (!oldItem) continue;

        // Loose equality kept on purpose so numeric strings and numbers still match.
        if (oldItem.priceHistory[0].price == currPrice.price) {
            item.priceHistory = oldItem.priceHistory;
            continue;
        }
        for (const oldPrice of oldItem.priceHistory) {
            item.priceHistory.push(oldPrice);
        }
    }

    const missingKeys = Object.keys(lookup);
    console.log(`${missingKeys.length} not in latest list.`);
    for (const key of missingKeys) {
        items.push(lookup[key]);
    }

    sortItems(items);
    console.log(`Items: ${items.length}`);
    return items;
}
/**
 * Sorts items in place, first by `store`, then by `name`, both ascending
 * using the `<` / `>` operators.
 *
 * @param {Array<object>} items - Items to sort; the array itself is reordered.
 */
function sortItems(items) {
    // Three-way comparison built on the relational operators.
    const order = (left, right) => (left < right ? -1 : left > right ? 1 : 0);
    // `||` falls through to the name comparison only when the stores tie.
    items.sort((a, b) => order(a.store, b.store) || order(a.name, b.name));
}
2023-05-30 10:34:25 +02:00
// Keep this in sync with utils.js:decompress
/**
 * Packs a list of items into a compact, flat structure.
 *
 * The result has `stores` (the `STORE_KEYS` array), `n` (item count) and `data`,
 * a flat array with these values per item, in order:
 *   store index in `STORE_KEYS`, id, name, number of price entries,
 *   then for every price entry: date with the dashes removed, price,
 *   then unit, quantity, isWeighted (1/0), bio (1/0),
 *   and the url with the store's `urlBase` removed (`undefined` if the item has no url).
 *
 * @param {Array<object>} items - Canonical items to pack.
 * @returns {{stores: string[], n: number, data: Array}} The packed representation.
 */
function compress(items) {
    const compressed = {
        stores: STORE_KEYS,
        n: items.length,
        data: [],
    };
    const data = compressed.data;
    // Loop variables are declared with `const`; undeclared they would become
    // implicit globals and throw in strict mode / ES modules.
    for (const item of items) {
        data.push(STORE_KEYS.indexOf(item.store));
        data.push(item.id);
        data.push(item.name);
        data.push(item.priceHistory.length);
        for (const price of item.priceHistory) {
            data.push(price.date.replaceAll("-", ""));
            data.push(price.price);
        }
        data.push(item.unit);
        data.push(item.quantity);
        data.push(item.isWeighted ? 1 : 0);
        data.push(item.bio ? 1 : 0);
        // `?.` skips the whole call (including the urlBase lookup) when url is missing.
        data.push(item.url?.replace(stores[item.store].urlBase, ""));
    }
    return compressed;
}
exports.compress = compress;
2023-05-25 13:54:28 +02:00
/// Given a directory of raw data of the form `$store-$date.json`, constructs
2023-05-17 16:17:45 +02:00
/// a canonical list of all products and their historical price data.
exports.replay = function (rawDataDir) {
2023-05-17 16:17:45 +02:00
const today = currentDate();
const files = fs
.readdirSync(rawDataDir)
.filter((file) => file.indexOf("canonical") == -1 && STORE_KEYS.some((store) => file.indexOf(`${store}-`) == 0));
2023-05-17 16:17:45 +02:00
const dateSort = (a, b) => {
const dateA = new Date(a.match(/\d{4}-\d{2}-\d{2}/)[0]);
const dateB = new Date(b.match(/\d{4}-\d{2}-\d{2}/)[0]);
return dateA - dateB;
};
2023-05-24 20:02:45 +02:00
const getFilteredFilesFor = (store) =>
files
.filter((file) => file.indexOf(`${store}-`) == 0)
.sort(dateSort)
.map((file) => rawDataDir + "/" + file);
2023-05-25 13:54:28 +02:00
const storeFiles = {};
const canonicalFiles = {};
2023-05-24 20:02:45 +02:00
for (const store of STORE_KEYS) {
2023-05-25 13:54:28 +02:00
storeFiles[store] = getFilteredFilesFor(store);
canonicalFiles[store] = storeFiles[store].map(file => getCanonicalFor(store, readJSON(file, true), file.match(/\d{4}-\d{2}-\d{2}/)[0]));
2023-05-25 13:54:28 +02:00
canonicalFiles[store].reverse();
}
2023-05-17 16:17:45 +02:00
const allFilesCanonical = [];
const len = Math.max(...Object.values(canonicalFiles).map((filesByStore) => filesByStore.length));
2023-05-18 20:30:26 +02:00
for (let i = 0; i < len; i++) {
const canonical = [];
Object.values(canonicalFiles).forEach((filesByStore) => {
2023-05-25 13:54:28 +02:00
const file = filesByStore.pop();
if (file) canonical.push(...file);
});
2023-05-26 00:34:26 +02:00
allFilesCanonical.push(canonical);
2023-05-17 16:17:45 +02:00
}
if (allFilesCanonical.length == 0) return null;
if (allFilesCanonical.length == 1) return allFilesCanonical[0];
let prev = allFilesCanonical[0];
let curr = null;
for (let i = 1; i < allFilesCanonical.length; i++) {
curr = allFilesCanonical[i];
mergePriceHistory(prev, curr);
prev = curr;
}
return curr;
};
2023-05-17 16:17:45 +02:00
exports.updateData = async function (dataDir, done) {
2023-05-17 16:17:45 +02:00
const today = currentDate();
console.log("Fetching data for date: " + today);
const storeFetchPromises = [];
for (const store of STORE_KEYS) {
2023-05-25 13:54:28 +02:00
storeFetchPromises.push(new Promise(async (resolve) => {
const start = performance.now();
try {
const storeItems = await stores[store].fetchData();
writeJSON(`${dataDir}/${store}-${today}.json`, storeItems, true);
2023-05-25 13:54:28 +02:00
const storeItemsCanonical = getCanonicalFor(store, storeItems, today);
console.log(`Fetched ${store.toUpperCase()} data, took ${(performance.now() - start) / 1000} seconds`);
resolve(storeItemsCanonical)
} catch (e) {
console.error(`Error while fetching data from ${store}, continuing after ${(performance.now() - start) / 1000} seconds...`, e);
resolve([])
}
}));
}
const items = [].concat(...(await Promise.all(storeFetchPromises)));
2023-05-25 12:28:12 +02:00
if (fs.existsSync(`${dataDir}/latest-canonical.json.gz`)) {
const oldItems = readJSON(`${dataDir}/latest-canonical.json`, true);
2023-05-17 16:17:45 +02:00
mergePriceHistory(oldItems, items);
console.log("Merged price history");
}
sortItems(items);
writeJSON(`${dataDir}/latest-canonical.json`, items, true);
2023-05-17 16:17:45 +02:00
if (done) done(items);
2023-05-17 16:17:45 +02:00
return items;
};