// websegura/crawler/sites-parser.js
/**
* Fichero con utilidades comunes al crawling, para procesar los ficheros de entrada
* global, de comunidades y de provincias.
*/
const fs = require("fs");
const glob = require("fast-glob");
/**
 * Reads a numeric setting from an environment variable.
 *
 * @param {string} name - Name of the environment variable.
 * @param {number} fallback - Value used when the variable is unset, empty or
 *   not a valid number.
 * @returns {number} The parsed value, or `fallback`.
 */
function numberFromEnv(name, fallback) {
  const raw = process.env[name];
  if (!raw) {
    return fallback;
  }
  const value = Number(raw);
  // A non-numeric value would otherwise propagate as NaN into date
  // comparisons and `slice()` bounds.
  return Number.isNaN(value) ? fallback : value;
}

// Number of days after which a site's analysis is considered stale.
// Can be increased when we have many sites to scan.
const MAX_DAYS_TO_REFRESH = numberFromEnv("CRAWLER_MAX_DAYS_TO_REFRESH", 2);
const MAX_TIME_TO_REFRESH_MILLIS = MAX_DAYS_TO_REFRESH * 24 * 60 * 60 * 1000;

// Default upper bound on the number of sites reported per run.
const MAX_RESULTS = numberFromEnv("CRAWLER_MAX_RESULTS", 400);
/**
 * Returns the paths of the input files: every JSON file under
 * `_data/comunidades` and `_data/provincias`, followed by the general file.
 *
 * @returns {string[]} Glob matches plus `_data/general.json` as last entry.
 */
function getAllFiles() {
  const regionalFiles = glob.sync("_data/{comunidades,provincias}/*.json");
  return [...regionalFiles, "_data/general.json"];
}
/**
 * Synchronously loads a file and parses its content as JSON.
 *
 * @param {string} path - Location of the JSON file to read.
 * @returns {*} The value obtained by parsing the file content.
 */
function pathToObject(path) {
  const contents = fs.readFileSync(path);
  return JSON.parse(contents);
}
/**
 * Collects every URL listed in the general, comunidades and provincias
 * files, each one normalised with `beautify`.
 *
 * @returns {string[]} One entry per item of each file's `webs` array, in
 *   file order.
 */
function getAllUrls() {
  return getAllFiles().flatMap((file) =>
    pathToObject(file).webs.map((web) => beautify(web.url))
  );
}
/**
 * Orders two sites so that the one analysed longest ago comes first.
 * Sites without stored analysis info sort before sites that have it.
 *
 * @param {string} a - First site.
 * @param {string} b - Second site.
 * @returns {number} Negative, zero or positive, as expected by `Array#sort`.
 */
function compareOldestAnalysisFirst(a, b) {
  const aInfo = siteInfo(a);
  const bInfo = siteInfo(b);

  if (aInfo && bInfo) {
    const aStartTime = new Date(aInfo.start_time).valueOf();
    const bStartTime = new Date(bInfo.start_time).valueOf();
    return aStartTime - bStartTime;
  }
  if (aInfo) {
    return 1;
  }
  if (bInfo) {
    return -1;
  }
  return 0;
}

/**
 * Returns the sites that have not been refreshed within the last
 * `MAX_TIME_TO_REFRESH_MILLIS`.
 * To avoid saturating the Mozilla API, at most `limit` sites are returned.
 *
 * All the work is done synchronously; the function is declared `async`, so
 * callers receive a promise.
 *
 * @param {number} [limit=MAX_RESULTS] - Maximum number of sites to return.
 * @returns {Promise<string[]>} Outdated sites, oldest analysis first.
 */
async function parse(limit = MAX_RESULTS) {
  const allUrls = getAllUrls();
  const outdatedUrls = allUrls.filter(outdated);

  // Sorting oldest pages first guarantees they have a chance
  // to be processed in the next run.
  const reported = [...outdatedUrls]
    .sort(compareOldestAnalysisFirst)
    .slice(0, limit);

  // "Outdated" is the count before the limit is applied, "Reported" after.
  console.log(`Total sites: ${allUrls.length}. Outdated sites: ${outdatedUrls.length}. Reported sites: ${reported.length} (limit = ${limit})`);
  return reported;
}
/**
 * Reduces a URL to its bare hostname.
 * Mozilla expects a hostname (no trailing "/" and no "http[s]://" protocol).
 *
 * @param {string} url - Site URL, with or without an http(s) scheme.
 * @returns {string} The hostname as normalised by the WHATWG `URL` parser.
 * @throws {TypeError} When the remaining text cannot be parsed as a URL.
 */
function beautify(url) {
  // Strip only a leading scheme, whatever its letter case, so that the
  // "https://" prefix added below is never doubled.
  const withoutScheme = url.trim().replace(/^https?:\/\//i, "");
  return new URL(`https://${withoutScheme}`).hostname;
}
/**
 * Builds the path of the results file for a site.
 * Every "." in the site name is replaced by "!" in the file name.
 *
 * @param {string} site - Site hostname.
 * @returns {string} Path of the form `_data/results/<name>.json`.
 */
function filePath(site) {
  const fileName = `${site.replace(/\./g, "!")}.json`;
  return `_data/results/${fileName}`;
}
// Per-process cache of results-file lookups, keyed by site. A Map is used so
// that site names matching inherited object properties (e.g. "constructor")
// are not mistaken for cache hits.
const siteInfoCache = new Map();

/**
 * Returns the stored analysis result for a site, reading its results file
 * the first time the site is requested.
 *
 * Failed lookups are cached as well, so a missing or unreadable file is
 * read (and warned about) only once per process.
 *
 * @param {string} site - Site hostname.
 * @returns {*} The parsed results file, or `undefined` when it cannot be
 *   read or parsed.
 */
function siteInfo(site) {
  if (siteInfoCache.has(site)) {
    return siteInfoCache.get(site);
  }

  let result;
  try {
    result = JSON.parse(fs.readFileSync(filePath(site)));
  } catch (err) {
    console.log('\tWARN', err.message);
    // file not found (err.code === ENOENT) or an unexpected error, refresh the analysis
    result = undefined;
  }
  siteInfoCache.set(site, result);
  return result;
}
/**
 * Tells whether a site needs to be analysed again.
 *
 * A site is up to date only when stored info exists, its state is
 * "FINISHED" and its start time is within the last
 * `MAX_TIME_TO_REFRESH_MILLIS`.
 *
 * @param {string} site - Site hostname.
 * @returns {boolean} `true` when the site should be (re)analysed.
 */
function outdated(site) {
  // XXX remove these console logs, they are only here to help us debug an issue
  console.log(`Check ${site}`);

  const info = siteInfo(site);
  if (!info) {
    return true;
  }

  const oldestAcceptedStart = Date.now() - MAX_TIME_TO_REFRESH_MILLIS;
  const recent = new Date(info.start_time).valueOf() > oldestAcceptedStart;
  console.log(`\tstate = ${info.state} ${info.start_time}`);

  if (info.state === "FINISHED" && recent) {
    console.log('\tNo need to analyze it');
    return false;
  }
  return true;
}
module.exports = {
2021-02-08 13:49:53 +01:00
parse: parse,
beautify: beautify,
getAllUrls: getAllUrls,
2021-03-05 19:07:17 +01:00
getAllFiles: getAllFiles,
pathToObject: pathToObject,
};