From dcb0021cdc73611e0d7e87f82601a2c18a42557f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Guido=20Garc=C3=ADa?=
Date: Mon, 8 Feb 2021 22:33:44 +0100
Subject: [PATCH] limit max results

---
 crawler/sites-parser.js | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/crawler/sites-parser.js b/crawler/sites-parser.js
index 888b08ec9b7..4eda11f44f4 100644
--- a/crawler/sites-parser.js
+++ b/crawler/sites-parser.js
@@ -6,6 +6,7 @@ const fs = require('fs');
 const glob = require('fast-glob');
 
 const MAX_TIME_TO_REFRESH_MILLIS = 3 * 24 * 60 * 60 * 1000; // 3 days, can be increased when we have many sites to scan
+const MAX_RESULTS = 100;
 
 /**
  * Gets the global file and the files for autonomous communities and provinces.
@@ -20,11 +21,14 @@ function getAllUrls() {
 /**
  * Returns the URLs of the sites that have not been refreshed
  * within the last MAX_TIME_TO_REFRESH_MILLIS.
+ * Returns at most MAX_RESULTS, to avoid overloading the Mozilla API.
 *
  * For the sake of simplicity, this function is sync for now
  */
-async function parse() {
-  return getAllUrls().filter(outdated);
+async function parse(limit = MAX_RESULTS) {
+  // XXX applying the limit during the filtering phase would
+  // be more efficient, but Array.prototype.filter cannot stop early
+  return getAllUrls().filter(outdated).slice(0, limit);
 }
 
 // Mozilla expects a hostname (no trailing "/" and no "http[s]://" protocol prefix)
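
A side note on the XXX comment in parse(): it refers to stopping the scan as
soon as the limit is reached, rather than filtering the whole list and slicing
afterwards. Below is a minimal sketch of that idea, not part of the patch; it
assumes getAllUrls() returns a plain array and outdated() is a synchronous
predicate, as the diff suggests:

async function parse(limit = MAX_RESULTS) {
  const results = [];
  // Collect outdated URLs and stop as soon as `limit` of them have been
  // found, so the remaining URLs are never tested against outdated().
  for (const url of getAllUrls()) {
    if (outdated(url)) {
      results.push(url);
      if (results.length >= limit) break;
    }
  }
  return results;
}

With 100 results out of a large site list this mostly saves calls to
outdated(); the filter-then-slice version in the patch is simpler and fine at
the current scale.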