#!/usr/bin/env bash
#
# Downloads every page of the concessions search from
# https://www.infosubvenciones.es, converts the JSON pages to CSV, extracts the
# rows matching the legal-entity pattern and packs both CSV files in a tarball.
#
# Environment:
#   COOKIE  Optional. When set, it is sent verbatim as the "Cookie:" header and
#           the automatic cookie retrieval step is skipped.

# -e: abort on unhandled errors; -E: let functions inherit the ERR trap.
set -eE

# Number of rows requested per page.
ROWS=50000

# Total number of pages; filled in by getNumPages.
PAGES=0

# First 13 characters of seconds+nanoseconds (sent as the "nd" query parameter).
ND=$(date +%s%N | cut -b1-13)

# User-Agent header sent with every request.
USER_AGENT='Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/109.0'

# Output and temporary file names (relative to the working directory).
F_CONCESSIONS='concessions.csv'
F_LEGAL='legal.csv'
F_TAR_GZ='concessions.tar.gz'
F_COOKIES='cookies.txt'

# External programs that must be available before starting.
APP=(wget jq csvgrep tar gzip)

#######################################
# Requests the site's front page and stores the session cookies it returns.
# Globals:   F_COOKIES (read)  - file the cookies are saved to
#            USER_AGENT (read) - value of the User-Agent header
# Outputs:   progress message on stdout
# Returns:   exit status of wget
#######################################
function getCookies() {
  echo "Getting cookies..."

  # Only the cookies matter: -O /dev/null discards the page body so that no
  # index.html is left behind in the working directory.
  # The CA certificate path is relative to the current directory.
  wget -q \
    --ca-certificate=AC_Componentes_Informaticos_SHA256.crt \
    --keep-session-cookies \
    --save-cookies "$F_COOKIES" \
    --header "User-Agent: $USER_AGENT" \
    -O /dev/null \
    'https://www.infosubvenciones.es/' &> /dev/null
}

#######################################
# Downloads one page of the concessions search into page_NNN.json.
# Globals:   COOKIE (read, optional) - if set, sent as the "Cookie:" header;
#                                      otherwise cookies come from F_COOKIES
#            F_COOKIES, USER_AGENT, ND, ROWS (read)
# Arguments: $1 - page number to download
#            $2 - total number of pages (used only in the progress message)
# Outputs:   progress message on stdout; page_NNN.json in the current directory
# Returns:   exit status of wget
#######################################
function fetchConcessions() {
  local outputFile
  local -a cookieArgs

  outputFile="page_$(printf %03d "$1").json"

  echo "Downloading page $1 of $2..."

  # ${COOKIE+x} expands to "x" only when COOKIE is set (even if empty).
  if [[ -z ${COOKIE+x} ]]; then
    cookieArgs=(--load-cookies "$F_COOKIES")
  else
    cookieArgs=(--header "Cookie: $COOKIE")
  fi

  wget -q \
    --ca-certificate=AC_Componentes_Informaticos_SHA256.crt \
    --keep-session-cookies \
    "${cookieArgs[@]}" \
    --header "User-Agent: $USER_AGENT" \
    -O "$outputFile" \
    "https://www.infosubvenciones.es/bdnstrans/busqueda?type=concs&_search=false&nd=$ND&rows=$ROWS&page=$1&sidx=8&sord=asc"
}

# Exported so that child bash processes (see the xargs call) can invoke it.
export -f fetchConcessions

#######################################
# Downloads page 1 and reads the total page and record counts from it.
# Globals:   PAGES (written) - set to the ".total" field of page 1
# Outputs:   progress and totals on stdout
# Returns:   non-zero if jq fails; calls errorHandler (which exits) when the
#            page count is not a positive integer
#######################################
function getNumPages() {
  local page=1
  local outputFile
  local records

  outputFile="page_$(printf %03d "$page").json"

  echo "Getting number of pages..."

  fetchConcessions "$page" "$PAGES"

  # Declaration and assignment are kept separate so a jq failure is not masked
  # by the exit status of `local`.
  PAGES=$(jq '.total' "$outputFile") || return
  records=$(jq '.records' "$outputFile") || return

  echo "Total pages: $PAGES"
  echo "Total records: $records"

  # Rejects empty output, "null", zero, negatives and any non-integer value.
  if [[ ! $PAGES =~ ^[1-9][0-9]*$ ]]; then
    errorHandler "Cannot get number of pages" "$LINENO"
  fi
}

#######################################
# Downloads pages 2..PAGES, running up to five downloads in parallel.
# Globals:   PAGES (read); ND, ROWS, USER_AGENT, F_COOKIES and, if set,
#            COOKIE are exported for the child processes
# Outputs:   progress messages on stdout
# Returns:   exit status of the seq | xargs pipeline
#######################################
function getAllConcessions() {
  echo "Downloading all concessions..."

  # fetchConcessions runs in separate `bash -c` processes. They receive the
  # exported function but only see variables present in the environment, so
  # everything the function reads must be exported as well.
  export ND ROWS USER_AGENT F_COOKIES
  if [[ -n ${COOKIE+x} ]]; then
    export COOKIE
  fi

  # The page number is handed over as a positional argument instead of being
  # substituted into the command string. -I already implies one invocation per
  # input line, so no -n is needed.
  seq 2 "$PAGES" \
    | xargs -I '{}' -P 5 bash -c 'fetchConcessions "$1" "$2"' _ '{}' "$PAGES"
}

#######################################
# Flattens the ".rows" array of every downloaded page into one CSV file.
# Globals:   F_CONCESSIONS (read) - CSV file to (over)write
# Outputs:   progress message on stdout; CSV rows in F_CONCESSIONS
# Returns:   exit status of the last jq invocation
#######################################
function convertJson2Csv() {
  local file

  echo "Converting JSON to CSV..."

  # Only the page_NNN.json files written by fetchConcessions are read, so any
  # unrelated JSON file in the working directory is left out of the CSV.
  for file in page_*.json; do
    jq -r '.rows[] | [.[]] | @csv' "$file"
  done > "$F_CONCESSIONS"
}

#######################################
# Filters the concessions CSV, keeping rows whose 10th column matches the
# identifier pattern below.
# NOTE(review): the pattern looks like a Spanish legal-entity tax id followed
# by a space - confirm against the data.
# Globals:   F_CONCESSIONS (read) - input CSV
#            F_LEGAL (read)       - output CSV to (over)write
# Outputs:   progress message on stdout; matching rows in F_LEGAL
# Returns:   exit status of csvgrep
#######################################
function getAllLegal() {
  echo "Extracting legal concessions..."

  csvgrep --no-header-row \
    --columns 10 \
    --regex '^[A-HJP-SUV]\d{7}[0-9A-J] ' \
    "$F_CONCESSIONS" > "$F_LEGAL"
}

#######################################
# Packs both CSV files into a gzip-compressed tar archive.
# Globals:   F_TAR_GZ (read)                - archive to create
#            F_CONCESSIONS, F_LEGAL (read)  - files to add
# Outputs:   progress message on stdout
# Returns:   exit status of tar
#######################################
function compressData() {
  echo "Compressing data..."

  tar -czf "$F_TAR_GZ" -- "$F_CONCESSIONS" "$F_LEGAL"
}

#######################################
# Removes the downloaded JSON pages and the cookie file.
# Globals:   F_COOKIES (read) - cookie file to delete
# Outputs:   progress message on stdout
# Returns:   exit status of rm
#######################################
function cleanTempFiles() {
  echo "Cleaning temporary files..."

  # Restricted to the page_NNN.json naming used for downloads so that other
  # JSON files that happen to live in the working directory are not deleted.
  rm -f -- ./page_*.json "$F_COOKIES"
}

#######################################
# Reports an error, removes temporary files and terminates the script.
# Arguments: $1 - error description (or exit status when called from the trap)
#            $2 - line number where the error was detected
# Outputs:   error message on stderr
# Returns:   never returns; exits with status 1
#######################################
function errorHandler() {
  # Diagnostics go to stderr so they are not mixed with progress output.
  echo "Error: ($1) occurred on line $2" >&2
  cleanTempFiles
  exit 1
}

# Any failing command reports its status and line, cleans up and exits.
trap 'errorHandler $? $LINENO' ERR
# Ctrl+C: clean up and exit with 130 (128 + SIGINT) rather than a success code.
trap 'printf "\nTerminated by Ctrl+c\n"; cleanTempFiles; exit 130' INT

# Refuse to start when a required program cannot be found in PATH.
for element in "${APP[@]}"; do
  if ! command -v "$element" > /dev/null 2>&1; then
    errorHandler "$element - Maybe it is not installed on the system. Sorry but I can't continue" "$LINENO"
  fi
done

# ${COOKIE+x} expands to "x" only when COOKIE is set: a manually supplied
# cookie replaces the automatic retrieval.
if [[ -z ${COOKIE+x} ]]; then
  echo "Manual cookie is unset"
  getCookies
else
  echo "Manual cookie is set. Skipping get cookie step"
fi

getNumPages
getAllConcessions
convertJson2Csv
getAllLegal
compressData
cleanTempFiles