subvenciones/scripts/concessions.sh

111 lines
3.0 KiB
Bash
Executable File

#!/usr/bin/env bash
# errexit: stop at the first failing command.
# errtrace: functions, command substitutions and subshells inherit the ERR trap.
set -o errexit -o errtrace

# Page size sent as the "rows" query parameter.
ROWS=50000
# Total number of result pages; 0 until getNumPages overwrites it.
PAGES=0
# Value sent as the "nd" query parameter: the first 13 characters of
# epoch seconds immediately followed by nanoseconds.
ND="$(date '+%s%N' | cut -c 1-13)"
# Browser identity presented on every request.
USER_AGENT='Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/109.0'

# Output and scratch file names, all relative to the current directory.
F_CONCESSIONS='concessions.csv'
F_LEGAL='legal.csv'
F_TAR_GZ='concessions.tar.gz'
F_COOKIES='cookies.txt'

# External programs that are checked for before any work starts.
declare -a APP=(wget jq csvgrep tar gzip)
#######################################
# Requests the site's home page so that wget stores the session cookies
# the server sets.
# Globals:   F_COOKIES  (read) - cookie jar path given to --save-cookies
#            USER_AGENT (read) - value of the User-Agent request header
# Outputs:   a progress line on stdout; everything wget prints is discarded
# Returns:   wget's exit status
#######################################
function getCookies() {
  echo "Getting cookies..."
  # Only the cookies are of interest: the page body goes to /dev/null so
  # that no index.html (index.html.1, ...) is left behind in the working
  # directory on every run.
  wget -q \
    --ca-certificate=AC_Componentes_Informaticos_SHA256.crt \
    --keep-session-cookies \
    --save-cookies "$F_COOKIES" \
    --header "User-Agent: $USER_AGENT" \
    -O /dev/null \
    'https://www.infosubvenciones.es/' &> /dev/null
}
#######################################
# Downloads one page of the concessions search into page_NNN.json
# (NNN = page number zero-padded to three digits) in the current directory.
# Arguments: $1 - page number to request
#            $2 - total number of pages (used in the progress line only)
# Globals:   ROWS, ND, USER_AGENT (read) - request parameters / header
#            COOKIE (read, optional)     - if set, sent as a Cookie header
#            F_COOKIES (read)            - cookie jar loaded when COOKIE is unset
# Outputs:   a progress line on stdout
# Returns:   wget's exit status
#######################################
function fetchConcessions() {
  local outputFile
  local -a session
  printf -v outputFile 'page_%03d.json' "$1"
  echo "Downloading page $1 of $2..."
  # A manually supplied COOKIE takes precedence over the saved cookie jar.
  if [[ -z ${COOKIE+x} ]]; then
    session=(--load-cookies "$F_COOKIES")
  else
    session=(--header "Cookie: $COOKIE")
  fi
  wget -q \
    --ca-certificate=AC_Componentes_Informaticos_SHA256.crt \
    --keep-session-cookies \
    "${session[@]}" \
    --header "User-Agent: $USER_AGENT" \
    -O "$outputFile" \
    "https://www.infosubvenciones.es/bdnstrans/busqueda?type=concs&_search=false&nd=$ND&rows=$ROWS&page=$1&sidx=8&sord=asc"
}
# The function is run inside child "bash -c" processes, so both the function
# and every global it reads must be exported; otherwise the children would
# see empty values and build a broken request.
export -f fetchConcessions
export ROWS ND USER_AGENT F_COOKIES
#######################################
# Downloads the first result page and reads the paging totals from it.
# Globals:   PAGES (written) - set to the ".total" field of page 1
# Outputs:   progress lines plus the page and record totals on stdout
# Returns:   calls errorHandler when the page count is not a positive integer
#######################################
function getNumPages() {
  local page=1
  local outputFile
  local records
  printf -v outputFile 'page_%03d.json' "$page"
  echo "Getting number of pages..."
  fetchConcessions "$page" "$PAGES"
  # Declaration and assignment are kept apart so a failing jq is not hidden
  # behind the exit status of "local"; -r yields the bare value even when
  # the field is encoded as a JSON string.
  PAGES=$(jq -r '.total' "$outputFile")
  records=$(jq -r '.records' "$outputFile")
  echo "Total pages: $PAGES"
  echo "Total records: $records"
  # Reject anything that is not a plain positive integer (null, empty,
  # negative, fractional, stray text). 10# keeps zero-padded values decimal.
  if [[ ! $PAGES =~ ^[0-9]+$ ]] || (( 10#$PAGES <= 0 )); then
    errorHandler "Cannot get number of pages" "$LINENO"
  fi
}
#######################################
# Downloads pages 2..PAGES, running up to five fetchConcessions workers
# at a time (page 1 is not requested here).
# Globals:   PAGES (read) - last page number to request
# Outputs:   a progress line on stdout, plus whatever the workers print
# Returns:   exit status of the seq | xargs pipeline
# Note:      every worker is a separate "bash -c" process, so
#            fetchConcessions has to be an exported function.
#######################################
function getAllConcessions() {
  echo "Downloading all concessions..."
  # The page number and the total travel as positional parameters instead
  # of being spliced into the command string, and -I is used on its own
  # because POSIX declares -I and -n mutually exclusive.
  seq 2 "$PAGES" \
    | xargs -P 5 -I '{}' bash -c 'fetchConcessions "$1" "$2"' _ '{}' "$PAGES"
}
#######################################
# Flattens every downloaded page into one CSV file: each element of a
# page's ".rows" array becomes one CSV line.
# Globals:   F_CONCESSIONS (read) - path of the CSV file to (over)write
# Outputs:   a progress line on stdout; CSV rows go to F_CONCESSIONS
# Returns:   non-zero if jq fails on a page
#######################################
function convertJson2Csv() {
  local jsonPage
  echo "Converting JSON to CSV..."
  # Only the page_NNN.json files this script downloads are converted, so an
  # unrelated *.json lying in the directory can neither pollute the CSV nor
  # abort the run.
  for jsonPage in page_*.json; do
    jq -r '.rows[] | [.[]] | @csv' "$jsonPage"
  done > "$F_CONCESSIONS"
}
#######################################
# Copies into F_LEGAL the rows of F_CONCESSIONS whose 10th column matches
# the identifier pattern below (the input is read as having no header row).
# Globals:   F_CONCESSIONS (read) - CSV to filter
#            F_LEGAL (read)       - path of the filtered CSV to (over)write
# Outputs:   a progress line on stdout
# Returns:   csvgrep's exit status
#######################################
function getAllLegal() {
  # One letter from A-H, J, P-S, U or V, seven digits, one character from
  # 0-9 or A-J, then a space.
  # NOTE(review): presumably the tax-ID format of legal entities - confirm.
  local -a rowFilter=(
    --no-header-row
    --columns 10
    --regex '^[A-HJP-SUV]\d{7}[0-9A-J] '
  )
  echo "Extracting legal concessions..."
  csvgrep "${rowFilter[@]}" "$F_CONCESSIONS" > "$F_LEGAL"
}
#######################################
# Packs both CSV files into a gzip-compressed tar archive.
# Globals:   F_TAR_GZ (read)               - archive to create
#            F_CONCESSIONS, F_LEGAL (read) - files placed in the archive
# Outputs:   a progress line on stdout
# Returns:   tar's exit status
#######################################
function compressData() {
  local -a members=("$F_CONCESSIONS" "$F_LEGAL")
  echo "Compressing data..."
  tar -czf "$F_TAR_GZ" "${members[@]}"
}
#######################################
# Removes the scratch files created during a run: the downloaded
# page_NNN.json files and the cookie jar.
# Globals:   F_COOKIES (read) - cookie jar to delete
# Outputs:   a progress line on stdout
# Returns:   rm's exit status (missing files are not an error because of -f)
#######################################
function cleanTempFiles() {
  echo "Cleaning temporary files..."
  # Restricted to the page_*.json names the downloader writes, so JSON
  # files that merely share the directory are never deleted.
  rm -f -- ./page_*.json "$F_COOKIES"
}
#######################################
# Reports a failure, removes the temporary files and ends the script.
# Arguments: $1 - what went wrong (a message, or an exit status when
#                 invoked from the ERR trap)
#            $2 - line number to report
# Outputs:   the error line on stdout, then cleanTempFiles' output
# Returns:   never returns; exits with status 1
#######################################
function errorHandler() {
  local reason=$1
  local line=$2
  printf 'Error: (%s) occurred on line %s\n' "$reason" "$line"
  cleanTempFiles
  exit 1
}
# Any command that fails is reported with its exit status and line number.
trap 'errorHandler "$?" "$LINENO"' ERR
# Ctrl+C: tidy up, then exit with 130 (128 + SIGINT) so callers can tell an
# interrupted run from a successful one; a bare "exit" would return the
# status of cleanTempFiles, i.e. 0.
trap "echo -e '\nTerminated by Ctrl+c'; cleanTempFiles; exit 130" INT

# Refuse to start unless every required program can be resolved by the
# shell. "command -v" consults PATH exactly as the later invocations will.
for element in "${APP[@]}"; do
  if ! command -v "$element" > /dev/null 2>&1; then
    errorHandler "$element - Maybe it is not installed on the system. Sorry but I can't continue" "$LINENO"
  fi
done

# A COOKIE supplied by the caller replaces the automatic session set-up.
if [[ -z ${COOKIE+x} ]]; then
  echo "Manual cookie is unset"
  getCookies
else
  echo "Manual cookie is set. Skipping get cookie step"
fi

# Pipeline: page count -> remaining pages -> CSV -> filtered CSV -> archive,
# then remove the scratch files.
getNumPages
getAllConcessions
convertJson2Csv
getAllLegal
compressData
cleanTempFiles