From dcb0021cdc73611e0d7e87f82601a2c18a42557f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Guido=20Garc=C3=ADa?=
Date: Mon, 8 Feb 2021 22:33:44 +0100
Subject: [PATCH] limit max results

---
 crawler/sites-parser.js | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/crawler/sites-parser.js b/crawler/sites-parser.js
index 888b08ec9b7..4eda11f44f4 100644
--- a/crawler/sites-parser.js
+++ b/crawler/sites-parser.js
@@ -6,6 +6,7 @@ const fs = require('fs');
 const glob = require('fast-glob');
 
 const MAX_TIME_TO_REFRESH_MILLIS = 3 * 24 * 60 * 60 * 1000; // 3 days, can be increased when we have many sites to scan
+const MAX_RESULTS = 100;
 
 /**
  * Gets the global file and the files for autonomous communities and provinces.
@@ -20,11 +21,14 @@ function getAllUrls() {
 /**
  * Returns the URLs of the sites that have not been refreshed
  * within the last MAX_TIME_TO_REFRESH_MILLIS.
+ * Returns at most MAX_RESULTS, to avoid overloading the Mozilla API.
 *
  * For the sake of simplicity, this function is sync for now
  */
-async function parse() {
-  return getAllUrls().filter(outdated);
+async function parse(limit = MAX_RESULTS) {
+  // XXX applying the limit during the filtering phase would
+  // be more efficient, but Array.prototype.filter cannot stop early
+  return getAllUrls().filter(outdated).slice(0, limit);
 }
 
 // Mozilla expects a hostname (no trailing "/" and no "http[s]://" protocol prefix)
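
A side note on the XXX comment in parse(): it refers to stopping the scan as
soon as the limit is reached, rather than filtering the whole list and slicing
afterwards. Below is a minimal sketch of that idea, not part of the patch; it
assumes getAllUrls() returns a plain array and outdated() is a synchronous
predicate, as the diff suggests:

async function parse(limit = MAX_RESULTS) {
  const results = [];
  // Collect outdated URLs and stop as soon as `limit` of them have been
  // found, so the remaining URLs are never tested against outdated().
  for (const url of getAllUrls()) {
    if (outdated(url)) {
      results.push(url);
      if (results.length >= limit) break;
    }
  }
  return results;
}

With 100 results out of a large site list this mostly saves calls to
outdated(); the filter-then-slice version in the patch is simpler and fine at
the current scale.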