/**
 * File with utilities common to the crawling, to process the global,
 * comunidades and provincias input files.
 */
const fs = require("fs");
const glob = require("fast-glob");

// Env vars arrive as strings, so coerce them to numbers before doing arithmetic.
const MAX_DAYS_TO_REFRESH = Number(process.env.CRAWLER_MAX_DAYS_TO_REFRESH) || 2; // can be increased when we have many sites to scan
const MAX_TIME_TO_REFRESH_MILLIS = MAX_DAYS_TO_REFRESH * 24 * 60 * 60 * 1000;
const MAX_RESULTS = Number(process.env.CRAWLER_MAX_RESULTS) || 400;

/**
 * Gets the paths to the global, comunidades and provincias files.
 */
function getAllFiles() {
  const files = glob.sync("_data/{comunidades,provincias}/*.json");
  files.push("_data/general.json");
  return files;
}

function pathToObject(path) {
  return JSON.parse(fs.readFileSync(path));
}

/**
 * Gets all the URLs specified in the global, comunidades and provincias files.
 */
function getAllUrls() {
  const files = getAllFiles();
  return files.flatMap((file) =>
    pathToObject(file).webs.map((x) => beautify(x.url))
  );
}

/**
 * Returns the URLs of the sites that have not been refreshed
 * in the last `MAX_TIME_TO_REFRESH_MILLIS`.
 * To avoid saturating the Mozilla API, at most `MAX_RESULTS` are returned.
 *
 * For the sake of simplicity the body is synchronous for now,
 * even though the function is declared async.
 */
async function parse(limit = MAX_RESULTS) {
  const allUrls = getAllUrls();
  const outdatedUrls = allUrls.filter(outdated);
  const all = outdatedUrls
    .sort((a, b) => {
      const aInfo = siteInfo(a);
      const bInfo = siteInfo(b);
      // Sorting oldest pages first guarantees they have a chance
      // to be processed in the next run. Sites with no stored result
      // yet sort before everything else.
      if (aInfo && bInfo) {
        const aStartTime = new Date(aInfo.start_time).valueOf();
        const bStartTime = new Date(bInfo.start_time).valueOf();
        return aStartTime - bStartTime;
      } else if (aInfo) {
        return 1;
      } else if (bInfo) {
        return -1;
      } else {
        return 0;
      }
    })
    .slice(0, limit);
  console.log(
    `Total sites: ${allUrls.length}. Outdated sites: ${outdatedUrls.length}. Reported sites: ${all.length} (limit = ${limit})`
  );
  return all;
}

// Mozilla expects a hostname (without a trailing / and without the "http[s]://" protocol)
function beautify(url) {
  url = url.replace("http://", "");
  url = url.replace("https://", "");
  return new URL(`https://${url}`).hostname;
}

function filePath(site) {
  // Results are stored with dots replaced by "!" to get safe file names.
  const fileName = site.replace(/\./g, "!") + ".json";
  return `_data/results/${fileName}`;
}

const siteInfoCache = {};

function siteInfo(site) {
  if (siteInfoCache[site] !== undefined) {
    return siteInfoCache[site];
  }
  try {
    const path = filePath(site);
    const result = JSON.parse(fs.readFileSync(path));
    siteInfoCache[site] = result;
    return result;
  } catch (err) {
    console.log("\tWARN", err.message);
    // file not found (err.code === ENOENT) or an unexpected error, refresh the analysis
  }
}

function outdated(site) {
  // XXX remove these console logs, they are only here to help us debug an issue
  console.log(`Check ${site}`);
  const info = siteInfo(site);
  if (info) {
    const recent =
      new Date(info.start_time).valueOf() > Date.now() - MAX_TIME_TO_REFRESH_MILLIS;
    console.log("\tstate = " + info.state + " " + info.start_time);
    if (info.state === "FINISHED" && recent) {
      console.log("\tNo need to analyze it");
      return false;
    }
  }
  return true;
}

module.exports = {
  parse,
  beautify,
  getAllUrls,
  getAllFiles,
  pathToObject,
};
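
// Example usage (a minimal sketch; the require path "./common" is an assumption,
// adjust it to wherever this module lives in the repo):
//
//   const { parse } = require("./common");
//
//   parse(10).then((urls) => {
//     // Up to 10 hostnames whose analysis is missing or stale, oldest first.
//     urls.forEach((url) => console.log(url));
//   });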