websegura/crawler/sites-parser.js

/**
 * Common crawling utilities to process the global, "comunidades" (regions)
 * and "provincias" (provinces) input files.
*/
const fs = require("fs");
const glob = require("fast-glob");

const MAX_DAYS_TO_REFRESH = process.env.CRAWLER_MAX_DAYS_TO_REFRESH || 2; // can be increased when we have many sites to scan
const MAX_TIME_TO_REFRESH_MILLIS = MAX_DAYS_TO_REFRESH * 24 * 60 * 60 * 1000;

const MAX_RESULTS = process.env.CRAWLER_MAX_RESULTS || 400;

/**
 * Returns the paths to the global, comunidades and provincias files.
 */
function getAllFiles() {
  const files = glob.sync("_data/{comunidades,provincias}/*.json");
  files.push("_data/general.json");

  return files;
}
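
/** Reads a JSON file from disk and parses it into an object (synchronously). */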
function pathToObject(path) {
  return JSON.parse(fs.readFileSync(path));
}

/**
 * Returns all the URLs specified in the global, comunidades and provincias files.
 */
function getAllUrls() {
  const files = getAllFiles();
  return files.flatMap((file) =>
    pathToObject(file).webs.map((x) => beautify(x.url))
  );
}
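
// Each input file is expected to contain a "webs" array whose entries carry
// a "url" field, e.g. (simplified sketch; other fields may exist):
//   { "webs": [{ "url": "https://www.example.es" }] }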

/**
 * Returns the URLs of the sites that have not been refreshed
 * within the last MAX_TIME_TO_REFRESH_MILLIS.
 * To avoid overloading the Mozilla API, at most MAX_RESULTS URLs
 * are returned, oldest scans first.
 *
 * For the sake of simplicity, the underlying file reads are synchronous for now.
 */
async function parse(limit = MAX_RESULTS) {
  const all = getAllUrls()
    .filter(outdated)
    .sort((a, b) => {
      // XXX resultStartTime re-reads files that outdated() has already parsed;
      // we should find a way to avoid parsing the same file multiple times.
      // Sorting oldest pages first guarantees that they will have a chance
      // to be processed.
      return resultStartTime(a) - resultStartTime(b);
    })
    .slice(0, limit);

  console.log(`Outdated sites found = ${all.length} (limit = ${limit})`);

  return all;
}
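
// Returns the start time (in milliseconds) of a site's stored scan result.
// Sites without a readable result file (e.g. never scanned) return 0, so
// they sort first as the oldest possible scans.
function resultStartTime(site) {
  // result files encode the hostname with "." replaced by "!"
  const fileName = site.replace(/\./g, "!") + ".json";
  const path = `_data/results/${fileName}`;
  try {
    const siteInfo = JSON.parse(fs.readFileSync(path));
    return new Date(siteInfo.start_time).valueOf() || 0;
  } catch (err) {
    return 0;
  }
}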

// Mozilla expects a hostname (no trailing "/" and no "http[s]://" protocol prefix)
function beautify(url) {
  url = url.replace("http://", "");
  url = url.replace("https://", "");

  return new URL(`https://${url}`).hostname;
}
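
// e.g. beautify("https://www.example.es/") === "www.example.es"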

// Decides whether a site needs to be analyzed again: true when its stored
// result is missing, not FINISHED, or older than MAX_TIME_TO_REFRESH_MILLIS.
function outdated(site) {
  const fileName = site.replace(/\./g, "!") + ".json";
  const path = `_data/results/${fileName}`;
  try {
    // XXX remove these console logs, they are only here to help us debug an issue
    console.log(`Check ${path}`);
    const siteInfo = JSON.parse(fs.readFileSync(path));
    const recent =
      new Date(siteInfo.start_time).valueOf() >
      Date.now() - MAX_TIME_TO_REFRESH_MILLIS;
    console.log(`\tstate = ${siteInfo.state} is recent = ${recent} ${siteInfo.start_time}`);
    if (siteInfo.state === "FINISHED" && recent) {
      console.log("\tNo need to analyze it");
      return false;
    }
  } catch (err) {
    console.log("\tERROR", err);
    // file not found (err.code === "ENOENT") or an unexpected error: refresh the analysis
  }
  return true;
}

module.exports = {
  parse,
  beautify,
  getAllUrls,
  getAllFiles,
  pathToObject,
};
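
// A minimal usage sketch (illustrative; the require path and the consumer
// script are assumptions, not part of this module):
//
//   const sites = require("./crawler/sites-parser");
//   sites.parse(10).then((urls) => urls.forEach((url) => console.log(url)));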