// websegura/crawler/sites-parser.js
/**
 * Shared crawling utilities for processing the input files:
 * the global one and those for communities and provinces.
 */
const fs = require("fs");
const glob = require("fast-glob");
/**
 * Reads a numeric setting from the environment.
 *
 * Environment variables are always strings, so the value is converted
 * explicitly instead of relying on implicit coercion at each use site.
 *
 * @param {string} name - Environment variable to read.
 * @param {number} fallback - Value used when the variable is unset, blank
 *   or not parseable as a number.
 * @returns {number} The parsed value, or `fallback`.
 */
function envNumber(name, fallback) {
  const raw = process.env[name];
  if (raw === undefined || raw.trim() === "") {
    return fallback;
  }
  const parsed = Number(raw);
  // A typo in the variable would otherwise propagate as NaN and silently
  // disable the freshness check / empty the result list.
  return Number.isNaN(parsed) ? fallback : parsed;
}

// Days a stored result stays fresh; can be increased when we have many sites to scan.
const MAX_DAYS_TO_REFRESH = envNumber("CRAWLER_MAX_DAYS_TO_REFRESH", 2);
const MAX_TIME_TO_REFRESH_MILLIS = MAX_DAYS_TO_REFRESH * 24 * 60 * 60 * 1000;
// Default upper bound on the number of sites returned by parse().
const MAX_RESULTS = envNumber("CRAWLER_MAX_RESULTS", 200);
/**
 * Collects the hostnames listed in the global, community and province
 * data files.
 *
 * Reads `_data/general.json` plus every JSON file directly under
 * `_data/comunidades` and `_data/provincias`. Each file is expected to
 * contain a `webs` array whose entries have a `url` property.
 *
 * @returns {string[]} One hostname per listed web, as produced by `beautify`.
 * @throws {Error} If a file cannot be read, is not valid JSON, or a URL
 *   cannot be parsed.
 */
function getAllUrls() {
  const files = [
    ...glob.sync("_data/{comunidades,provincias}/*.json"),
    "_data/general.json",
  ];
  return files.flatMap((file) => {
    const { webs } = JSON.parse(fs.readFileSync(file, "utf8"));
    return webs.map((web) => beautify(web.url));
  });
}
/**
 * Returns the hostnames of the sites whose stored result is missing or
 * older than MAX_TIME_TO_REFRESH_MILLIS.
 *
 * At most `limit` hostnames are returned, to avoid saturating the Mozilla
 * API. The work itself is synchronous; the function is declared `async`
 * so callers always receive a Promise.
 *
 * @param {number} [limit=MAX_RESULTS] - Maximum number of hostnames to return.
 * @returns {Promise<string[]>} Hostnames that need a new analysis, in the
 *   order produced by `getAllUrls`.
 */
async function parse(limit = MAX_RESULTS) {
  const max = Math.trunc(Number(limit));
  if (Number.isNaN(max) || max < 0) {
    // Unusual limits keep the exact Array.prototype.slice semantics.
    return getAllUrls().filter(outdated).slice(0, limit);
  }

  // Stop as soon as the limit is reached: every outdated() call reads a
  // result file from disk, so checking the remaining sites is wasted work.
  const pending = [];
  for (const site of getAllUrls()) {
    if (pending.length >= max) {
      break;
    }
    if (outdated(site)) {
      pending.push(site);
    }
  }
  return pending;
}
/**
 * Reduces a site URL to its bare hostname.
 *
 * Mozilla expects a hostname: no trailing "/" and no "http[s]://" protocol.
 * Surrounding whitespace is ignored and the protocol prefix is matched
 * case-insensitively, so "HTTPS://Example.com/" yields "example.com".
 *
 * @param {string} url - Site address, with or without an http(s) protocol.
 * @returns {string} The hostname part of `url`.
 * @throws {TypeError} If no valid hostname can be parsed from `url`.
 */
function beautify(url) {
  // Anchor the match at the start so only a leading protocol is removed.
  const withoutScheme = url.trim().replace(/^(?:https?:\/\/)+/i, "");
  return new URL(`https://${withoutScheme}`).hostname;
}
/**
 * Tells whether a site needs a new analysis.
 *
 * The stored result is read from `_data/results/<site>.json`, where every
 * "." of the site name is replaced by "!". A site is considered up to date
 * only when that result has state "FINISHED" and its `start_time` is newer
 * than MAX_TIME_TO_REFRESH_MILLIS ago.
 *
 * @param {string} site - Hostname of the site.
 * @returns {boolean} `false` when a recent finished result exists, `true`
 *   otherwise (including when the result file is missing or unreadable).
 */
function outdated(site) {
  const path = `_data/results/${site.replace(/\./g, "!")}.json`;
  try {
    const { state, start_time: startTime } = JSON.parse(
      fs.readFileSync(path, "utf8")
    );
    const startedAt = new Date(startTime).valueOf();
    const isRecent = startedAt > Date.now() - MAX_TIME_TO_REFRESH_MILLIS;
    return !(state === "FINISHED" && isRecent);
  } catch (err) {
    // File not found (err.code === "ENOENT") or an unexpected error:
    // deliberately best-effort, refresh the analysis.
    return true;
  }
}
module.exports = {
2021-02-08 13:49:53 +01:00
parse: parse,
beautify: beautify,
getAllUrls: getAllUrls,
};