mirror of
https://github.com/badlogic/heissepreise.git
synced 2024-07-04 22:35:48 +02:00
dd938ba7cd
Penny now has a subcategory that leads back to the all categories page. This triggered an infinite recursion which eventually goes OOM boom.
108 lines
3.6 KiB
JavaScript
108 lines
3.6 KiB
JavaScript
const axios = require("axios");
|
|
const utils = require("./utils");
|
|
const HTMLParser = require("node-html-parser");
|
|
// Paging cap for fetchData: its loop stops once `page * hits` exceeds this value.
const MAXITEMS = 10000;
|
|
|
|
/**
 * Penny-specific unit table passed to `utils.convertUnit` by `getCanonical`.
 * Each entry pairs a two-letter code with a target `unit` and a `factor`.
 * The codes are presumably the values of an item's `volumeLabelKey`
 * (e.g. "lt" with factor 1000 and unit "ml") - confirm against the API.
 */
const units = {
    bd: { unit: "stk", factor: 1 },
    gr: { unit: "g", factor: 1 },
    lt: { unit: "ml", factor: 1000 },
    pk: { unit: "stk", factor: 1 },
    pa: { unit: "stk", factor: 1 },
    rl: { unit: "stk", factor: 1 },
    tb: { unit: "stk", factor: 1 },
};
|
|
|
|
exports.getCanonical = function (item, today) {
|
|
let quantity = item.amount;
|
|
let unit = item.volumeLabelKey;
|
|
return utils.convertUnit(
|
|
{
|
|
id: item.productId,
|
|
name: item.name,
|
|
// description: "", not available
|
|
price: item.price.regular.value / 100,
|
|
priceHistory: [{ date: today, price: item.price.regular.value / 100 }],
|
|
isWeighted: item.isWeightArticle,
|
|
unit,
|
|
quantity,
|
|
bio: item.name.toLowerCase().includes("bio") && !item.name.toLowerCase().includes("fabio"),
|
|
url: item.sku.replace("-", ""),
|
|
},
|
|
units,
|
|
"penny"
|
|
);
|
|
};
|
|
|
|
exports.fetchData = async function () {
|
|
hits = 100;
|
|
page = 0;
|
|
done = false;
|
|
result = [];
|
|
while (!done) {
|
|
const PENNY_SEARCH = `https://www.penny.at/api/products?page=${page}&pageSize=${hits}`;
|
|
data = (await axios.get(PENNY_SEARCH)).data;
|
|
done = data.count < hits || page * hits > MAXITEMS;
|
|
page++;
|
|
result = result.concat(data.results);
|
|
}
|
|
return result;
|
|
};
|
|
|
|
/**
 * Recursively collects the category tree reachable from a Penny category page.
 *
 * Fetches `url`, reads every `[data-test="category-tree-navigation-button"]`
 * element and appends `{ id, url, code: null }` to `result` for each link
 * not yet in `lookup`, then descends into that link.
 *
 * @param {string} url Category page to fetch.
 * @param {?string} parent Display path of the parent category, or a falsy
 *     value at the root. Child ids are built as `parent + " -> " + label`.
 * @param {Array} result Output list; mutated in place.
 * @param {Set<string>} lookup Links already visited; mutated in place.
 * @returns {Promise<void>}
 * @throws Whatever `axios.get` rejects with for `url` itself. Failures of
 *     nested pages are logged and skipped.
 */
async function parseCategory(url, parent, result, lookup) {
    const { data } = await axios.get(url);
    const dom = HTMLParser.parse(data);
    const categoryTitle = dom.querySelector('[data-test="category-title"]')?.textContent;

    // A sub-page titled "Alle Kategorien" leads back to the full tree; descending
    // into it recursed without end. A page without a title is not such a page,
    // so the title check must tolerate `undefined`.
    if (url !== "https://www.penny.at/kategorie" && categoryTitle?.includes("Alle Kategorien")) return;

    for (const category of dom.querySelectorAll('[data-test="category-tree-navigation-button"]')) {
        const label = category.querySelector(".subtitle-2");
        const href = category.getAttribute("href");
        // Buttons without a label or without a target cannot become a category entry.
        if (!label || !href) continue;

        // `innerText` is used as-is, so an encoded ampersand has to be decoded here.
        const name = (parent ? `${parent} -> ` : "") + label.innerText.trim().replace(/&amp;/g, "&");
        if (name.startsWith("Alle Angebote")) continue;

        const link = `https://www.penny.at${href}`;
        if (lookup.has(link)) continue;
        lookup.add(link);
        result.push({
            id: name,
            url: link,
            code: null,
        });

        try {
            await parseCategory(link, name, result, lookup);
        } catch (e) {
            // Best effort: the server sometimes responds with 502. Keep the
            // entry that was already recorded and continue with its siblings.
            console.warn(`Penny: could not parse category page ${link}: ${e?.message ?? e}`);
        }
    }
}
|
|
|
|
exports.initializeCategoryMapping = async () => {
|
|
const categories = [];
|
|
await parseCategory("https://www.penny.at/kategorie", null, categories, new Set());
|
|
utils.mergeAndSaveCategories("penny", categories);
|
|
|
|
exports.categoryLookup = {};
|
|
for (const category of categories) {
|
|
exports.categoryLookup[category.id] = category;
|
|
}
|
|
};
|
|
|
|
exports.mapCategory = (rawItem) => {
|
|
const categoryPath = rawItem.parentCategories.filter((path) => path.length > 0 && !path[0].name.includes("ngebot"))[0];
|
|
if (!categoryPath) return null;
|
|
const categoryName = categoryPath.map((path) => path.name).join(" -> ");
|
|
const category = exports.categoryLookup[categoryName];
|
|
if (category) return category.code;
|
|
return null;
|
|
};
|
|
|
|
// Base URL of Penny product pages. Presumably consumers append a canonical
// item's `url` value (the SKU-derived string from getCanonical) - TODO confirm.
exports.urlBase = "https://www.penny.at/produkte/";
|
|
|
|
// When this file is run directly (rather than required as a module), crawl
// the category tree once. A failure is reported and reflected in the exit
// code instead of being left as an unhandled promise rejection.
if (require.main === module) {
    exports.initializeCategoryMapping().catch((error) => {
        console.error(error);
        process.exitCode = 1;
    });
}
|