2021-02-04 21:35:05 +01:00
|
|
|
/**
 * Fichero con utilidades comunes al crawling, para procesar los ficheros de entrada
 * global, de comunidades y de provincias.
 */
|
2021-02-21 14:06:59 +01:00
|
|
|
const fs = require("fs");
|
|
|
|
const glob = require("fast-glob");
|
2021-02-04 21:35:05 +01:00
|
|
|
|
2021-02-22 22:00:45 +01:00
|
|
|
// Environment variables are always strings, so coerce explicitly to Number;
// unset or non-numeric values fall back to the defaults (NaN || default).
const MAX_DAYS_TO_REFRESH =
  Number(process.env.CRAWLER_MAX_DAYS_TO_REFRESH) || 2; // can be increased when we have many sites to scan
const MAX_TIME_TO_REFRESH_MILLIS = MAX_DAYS_TO_REFRESH * 24 * 60 * 60 * 1000;
// Cap on URLs returned per run, to avoid saturating the Mozilla API.
const MAX_RESULTS = Number(process.env.CRAWLER_MAX_RESULTS) || 400;
|
2021-02-04 21:35:05 +01:00
|
|
|
|
|
|
|
/**
 * Collects the paths of all input files: the per-community and per-province
 * JSON files plus the global one.
 *
 * @returns {string[]} relative paths of every input JSON file
 */
function getAllFiles() {
  const regionFiles = glob.sync("_data/{comunidades,provincias}/*.json");
  return [...regionFiles, "_data/general.json"];
}
|
|
|
|
|
|
|
|
/**
 * Reads a JSON file from disk and parses it into a plain object.
 *
 * @param {string} path - path to the JSON file
 * @returns {object} the parsed file contents
 */
function pathToObject(path) {
  const rawContents = fs.readFileSync(path);
  return JSON.parse(rawContents);
}
|
|
|
|
|
|
|
|
/**
 * Gathers every URL declared in the global, community and province files,
 * normalized to bare hostnames via beautify().
 *
 * @returns {string[]} all hostnames, one entry per declared web
 */
function getAllUrls() {
  const urls = [];
  for (const file of getAllFiles()) {
    for (const web of pathToObject(file).webs) {
      urls.push(beautify(web.url));
    }
  }
  return urls;
}
|
|
|
|
|
|
|
|
/**
 * Devuelve la URL de las webs que no se han refrescado
 * en los últimos MAX_TIME_TO_REFRESH_MILLIS, las más antiguas primero.
 * Para evitar saturar el API de Mozilla se devuelve MAX_RESULTS como máximo.
 *
 * For the sake of simplicity, this function is sync for now (the async
 * keyword is kept only for interface stability with existing callers).
 *
 * @param {number} [limit=MAX_RESULTS] - maximum number of URLs to return
 * @returns {Promise<string[]>} hostnames pending a fresh analysis
 */
async function parse(limit = MAX_RESULTS) {
  // Read each site's last analysis start time exactly once. The previous
  // comparator re-parsed both JSON files on every comparison (O(n log n)
  // file reads) and threw for sites whose results file does not exist yet
  // (those sites pass the `outdated` filter precisely because of ENOENT).
  const lastStartTime = (site) => {
    const fileName = site.replace(/\./g, "!") + ".json";
    try {
      const siteInfo = JSON.parse(fs.readFileSync(`_data/results/${fileName}`));
      // NaN (unparseable date) is mapped to 0 so the sort stays total.
      return new Date(siteInfo.start_time).valueOf() || 0;
    } catch (err) {
      // Missing/unreadable results file: treat as "never analyzed",
      // i.e. oldest possible, so the site is picked up first.
      return 0;
    }
  };

  const outdatedSites = getAllUrls().filter(outdated);
  const startTimes = new Map(
    outdatedSites.map((site) => [site, lastStartTime(site)])
  );

  const all = outdatedSites
    // Sorting oldest pages first guarantees that they will have a chance
    // to be processed. The comparator must return a NUMBER: the previous
    // boolean (`a < b`) was coerced to 0/1 — never negative — so
    // Array.prototype.sort produced a wrong order.
    .sort((a, b) => startTimes.get(a) - startTimes.get(b))
    .slice(0, limit);

  console.log(`Outdated sites found = ${all.length} (limit = ${limit})`);
  return all;
}
|
|
|
|
|
|
|
|
// Mozilla expects a bare hostname (no trailing "/" and no "http[s]://" scheme).
/**
 * Normalizes a configured URL to the bare hostname Mozilla's API expects.
 *
 * @param {string} url - URL as written in the input files, with or without scheme
 * @returns {string} the hostname only (no scheme, path, port or trailing slash)
 */
function beautify(url) {
  // Strip the scheme only when it appears at the START of the string.
  // The previous unanchored replace("http://", "") removed the first
  // occurrence anywhere, mangling URLs that embed a scheme mid-string
  // (e.g. "example.com/?u=http://other.org").
  const withoutScheme = url.replace(/^https?:\/\//, "");
  return new URL(`https://${withoutScheme}`).hostname;
}
|
|
|
|
|
|
|
|
/**
 * Decides whether a site needs a fresh analysis.
 *
 * A site is NOT outdated only when its results file exists, its state is
 * "FINISHED" and its start_time falls within the refresh window. Any read
 * or parse error (including a missing results file) means "refresh it".
 *
 * @param {string} site - hostname of the site to check
 * @returns {boolean} true when the site should be (re-)analyzed
 */
function outdated(site) {
  // Dots are encoded as "!" in results file names.
  const path = `_data/results/${site.replace(/\./g, "!")}.json`;

  try {
    // XXX remove these console logs, they are only here to help us debug an issue
    console.log(`Check ${path}`);

    const siteInfo = JSON.parse(fs.readFileSync(path));
    const oldestAcceptable = Date.now() - MAX_TIME_TO_REFRESH_MILLIS;
    const recent = new Date(siteInfo.start_time).valueOf() > oldestAcceptable;
    console.log('\tstate = ' + siteInfo.state + ' is recent = ' + recent + ' ' + siteInfo.start_time);

    if (siteInfo.state === "FINISHED" && recent) {
      console.log('\tNo need to analyze it');
      return false;
    }
  } catch (err) {
    console.log('\tERROR', err);
    // file not found (err.code === ENOENT) or an unexpected error, refresh the analysis
  }

  return true;
}
|
|
|
|
|
|
|
|
module.exports = {
|
2021-02-08 13:49:53 +01:00
|
|
|
parse: parse,
|
|
|
|
beautify: beautify,
|
|
|
|
getAllUrls: getAllUrls,
|
2021-03-05 19:07:17 +01:00
|
|
|
getAllFiles: getAllFiles,
|
|
|
|
pathToObject: pathToObject,
|
2021-02-21 14:06:59 +01:00
|
|
|
};
|