From a38da53f8a9e09b2988cf0df2390fec2f79a676c Mon Sep 17 00:00:00 2001 From: Amab Date: Sat, 21 Jan 2023 12:27:26 +0100 Subject: [PATCH] Add concessions download script --- .gitignore | 93 ++++++++++++++- drone.yml | 25 ++++ .../AC_Componentes_Informaticos_SHA256.crt | 39 ++++++ scripts/concessions.sh | 112 ++++++++++++++++++ 4 files changed, 268 insertions(+), 1 deletion(-) create mode 100644 drone.yml create mode 100644 scripts/AC_Componentes_Informaticos_SHA256.crt create mode 100755 scripts/concessions.sh diff --git a/.gitignore b/.gitignore index 5ca0973..3d653d5 100644 --- a/.gitignore +++ b/.gitignore @@ -1,2 +1,93 @@ -.DS_Store +# Created by https://www.toptal.com/developers/gitignore/api/intellij+all +# Edit at https://www.toptal.com/developers/gitignore?templates=intellij+all +### Intellij+all ### +# Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio, WebStorm and Rider +# Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839 + +# User-specific stuff +.idea/**/workspace.xml +.idea/**/tasks.xml +.idea/**/usage.statistics.xml +.idea/**/dictionaries +.idea/**/shelf + +# AWS User-specific +.idea/**/aws.xml + +# Generated files +.idea/**/contentModel.xml + +# Sensitive or high-churn files +.idea/**/dataSources/ +.idea/**/dataSources.ids +.idea/**/dataSources.local.xml +.idea/**/sqlDataSources.xml +.idea/**/dynamic.xml +.idea/**/uiDesigner.xml +.idea/**/dbnavigator.xml + +# Gradle +.idea/**/gradle.xml +.idea/**/libraries + +# Gradle and Maven with auto-import +# When using Gradle or Maven with auto-import, you should exclude module files, +# since they will be recreated, and may cause churn. Uncomment if using +# auto-import. 
+# .idea/artifacts
+# .idea/compiler.xml
+# .idea/jarRepositories.xml
+# .idea/modules.xml
+# .idea/*.iml
+# .idea/modules
+# *.iml
+# *.ipr
+
+# CMake
+cmake-build-*/
+
+# Mongo Explorer plugin
+.idea/**/mongoSettings.xml
+
+# File-based project format
+*.iws
+
+# IntelliJ
+out/
+
+# mpeltonen/sbt-idea plugin
+.idea_modules/
+
+# JIRA plugin
+atlassian-ide-plugin.xml
+
+# Cursive Clojure plugin
+.idea/replstate.xml
+
+# SonarLint plugin
+.idea/sonarlint/
+
+# Crashlytics plugin (for Android Studio and IntelliJ)
+com_crashlytics_export_strings.xml
+crashlytics.properties
+crashlytics-build.properties
+fabric.properties
+
+# Editor-based Rest Client
+.idea/httpRequests
+
+# Android studio 3.1+ serialized cache file
+.idea/caches/build_file_checksums.ser
+
+### Intellij+all Patch ###
+# Ignore everything but code style settings and run configurations
+# that are supposed to be shared within teams.
+
+.idea/*
+
+!.idea/codeStyles
+!.idea/runConfigurations
+
+# End of https://www.toptal.com/developers/gitignore/api/intellij+all
+/.idea/codeStyles/codeStyleConfig.xml
diff --git a/drone.yml b/drone.yml
new file mode 100644
index 0000000..61e3d38
--- /dev/null
+++ b/drone.yml
@@ -0,0 +1,35 @@
+# Downloads the concessions data with scripts/concessions.sh and publishes the
+# resulting tarball as a Gitea release asset.
+kind: pipeline
+name: default
+
+steps:
+  - name: Download concessions
+    image: alpine
+    # A `commands` step receives secrets through `environment`; `settings`
+    # entries are plugin parameters and would not show up as $COOKIE.
+    environment:
+      COOKIE:
+        from_secret: cookie
+    commands:
+      # concessions.sh runs under bash and expects GNU-style tools; the alpine
+      # image only ships BusyBox, so bash, coreutils and util-linux are added.
+      - apk update && apk add bash coreutils util-linux wget jq tar gzip py3-pip
+      - pip3 install csvkit
+      # NOTE(review): concessions.sh exits when run as root and this step
+      # presumably runs as root - confirm how that check is meant to pass here.
+      - cd scripts
+      - chmod +x concessions.sh
+      - ./concessions.sh
+  - name: Release concessions data
+    image: plugins/gitea-release
+    settings:
+      api_key:
+        from_secret: api_key
+      base_url: https://git.cuernodehipnos.es
+      # The tarball is written inside scripts/ (see `cd scripts` above); this
+      # path is presumably resolved from the workspace root.
+      files: scripts/concessions.tar.gz
+trigger:
+  branch:
+    - master
diff --git a/scripts/AC_Componentes_Informaticos_SHA256.crt b/scripts/AC_Componentes_Informaticos_SHA256.crt
new file mode 100644
index 0000000..0b70867
--- /dev/null
+++ b/scripts/AC_Componentes_Informaticos_SHA256.crt
@@ -0,0 +1,39 @@
+-----BEGIN CERTIFICATE-----
+MIIG1jCCBL6gAwIBAgIQNMarBE42mRJRyCULbJTWwDANBgkqhkiG9w0BAQsFADA7 +MQswCQYDVQQGEwJFUzERMA8GA1UECgwIRk5NVC1SQ00xGTAXBgNVBAsMEEFDIFJB +SVogRk5NVC1SQ00wHhcNMTMwNjI0MTA1MjU5WhcNMjgwNjI0MTA1MjU5WjBHMQsw +CQYDVQQGEwJFUzERMA8GA1UECgwIRk5NVC1SQ00xJTAjBgNVBAsMHEFDIENvbXBv +bmVudGVzIEluZm9ybcOhdGljb3MwggEiMA0GCSqGSIb3DQEBAQUAA4IBDwAwggEK +AoIBAQCXVx8rdbF7/xY44CaSqzzGo5BhvzA8knxC/3KJYVzTf+CkOvMxMUDub8b0 +h38MDujm/RKZhBNOWbKhxF3U61ZVhcR9xOCciuS/soT80m3BByxAKcZsNka0jCA4 +XRkglDaAFxCHEZ06MOnvXsSOZDfPYahbQ3VFCVycJuhlHdAwSpmceQwcRYkR6YgX +wTiyzCNGivMKAmRS3dItqDOmDW/nxiDFq/Jd8VWY7GFkwbbAeqYId8FjN8zfvafu +nsB9SLFkUjPPMeqfmC7Bdh7HMxLpaOXROwH201cmlebiPkn0xSFxXFqwhhr6yN8U +QYZ3O/+xdHLrS6DS9+CJUF6d09ijAgMBAAGjggLIMIICxDASBgNVHRMBAf8ECDAG +AQH/AgEAMA4GA1UdDwEB/wQEAwIBBjAdBgNVHQ4EFgQUGfhYLxTWpsybBJgIDUzX +qwCng2UwgZgGCCsGAQUFBwEBBIGLMIGIMEkGCCsGAQUFBzABhj1odHRwOi8vb2Nz +cGZubXRyY21jYS5jZXJ0LmZubXQuZXMvb2NzcGZubXRyY21jYS9PY3NwUmVzcG9u +ZGVyMDsGCCsGAQUFBzAChi9odHRwOi8vd3d3LmNlcnQuZm5tdC5lcy9jZXJ0cy9B +Q1JBSVpGTk1UUkNNLmNydDAfBgNVHSMEGDAWgBT3fcX9xOiaG3dkp/UdoMy/h2Ca +bTCB6wYDVR0gBIHjMIHgMIHdBgRVHSAAMIHUMCkGCCsGAQUFBwIBFh1odHRwOi8v +d3d3LmNlcnQuZm5tdC5lcy9kcGNzLzCBpgYIKwYBBQUHAgIwgZkMgZZTdWpldG8g +YSBsYXMgY29uZGljaW9uZXMgZGUgdXNvIGV4cHVlc3RhcyBlbiBsYSBEZWNsYXJh +Y2nDs24gZGUgUHLDoWN0aWNhcyBkZSBDZXJ0aWZpY2FjacOzbiBkZSBsYSBGTk1U +LVJDTSAoIEMvIEpvcmdlIEp1YW4sIDEwNi0yODAwOS1NYWRyaWQtRXNwYcOxYSkw +gdQGA1UdHwSBzDCByTCBxqCBw6CBwIaBkGxkYXA6Ly9sZGFwZm5tdC5jZXJ0LmZu +bXQuZXMvQ049Q1JMLE9VPUFDJTIwUkFJWiUyMEZOTVQtUkNNLE89Rk5NVC1SQ00s +Qz1FUz9hdXRob3JpdHlSZXZvY2F0aW9uTGlzdDtiaW5hcnk/YmFzZT9vYmplY3Rj +bGFzcz1jUkxEaXN0cmlidXRpb25Qb2ludIYraHR0cDovL3d3dy5jZXJ0LmZubXQu +ZXMvY3Jscy9BUkxGTk1UUkNNLmNybDANBgkqhkiG9w0BAQsFAAOCAgEAo2bsQ2xL +Dcyodieqjd+uy/lfxDw/MbrAq/ZaNFkIlcypUYamOM4vrm5rz8oLjPCoLkJ48P+n +P08Gkcl5Q6q6VFcZLia+U3gfHXrkyqToQlrtViGCGH3xA4u56XtMHGXSdk9vQ0yD +nW5f7bUEkp+uvcKewrOvNcpbIAgD4eU7gdOS0w7BagcFRBgTKBw2s3z73fRZtouJ +g/atmWYtXbBsfNjph+pCh+h5sbSyZUVzO5AemyjpYYYNMWDQrTXq+7O8zIPuPaNE 
+SjEexuzn+VjHG90RlUK1LygARi+Ir0opD2w6erb/hK8Eea7MFdKQ2ASqNBGJggNo
+5vfPVvjHiL+Antmh7mQSKL+4YwFU64d4KK9k0C1mbJethDQFKcjTK1vMvnXFiups
+IuyTqwKauo7u2zMKzY4r3VYOW9TpMyLPFIY8pII5GyNzXlL0F4nscOvduTEPEYqx
+eNJfpDDPY/DO8WfxgdRTy2W3D/UoAulb+Y+nuzGGCtFQrsSMQX487R+aY0nWot/h
+ajef6BcPuxhDfQrg5IafrISVmcJAplb3tXhh0sz7RbYz6jf1bke4eU5fnrTMtGlV
+teUL2vjrfUPHW07kBJuaQ7sxORNV3bpHisOnHj+AriQzCn5vINpSHW6hTm7IfRkb
+ltu/aQrsMuUhP7HE/v+uXe5CuboV5ubZhHU=
+-----END CERTIFICATE-----
diff --git a/scripts/concessions.sh b/scripts/concessions.sh
new file mode 100755
index 0000000..dd71e0b
--- /dev/null
+++ b/scripts/concessions.sh
@@ -0,0 +1,122 @@
+#!/usr/bin/env bash
+# Downloads every concessions page from infosubvenciones.es as JSON, converts
+# the rows to CSV, filters a second CSV out of it (see getAllLegal) and packs
+# both CSV files into a tarball. A non-empty COOKIE variable is sent as the
+# Cookie header; otherwise a session cookie is requested first.
+set -eEuo pipefail
+
+ROWS=50000
+PAGES=0
+ND=$(date +%s%N | cut -b1-13)
+USER_AGENT='Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/109.0'
+# Look for the CA certificate next to this script, whatever the working directory.
+CA_CERT="$(dirname "${BASH_SOURCE[0]}")/AC_Componentes_Informaticos_SHA256.crt"
+F_CONCESSIONS='concessions.csv'
+F_LEGAL='legal.csv'
+F_TAR_GZ='concessions.tar.gz'
+F_COOKIES='cookies.txt'
+APP=(wget jq csvgrep tar gzip seq xargs)
+# fetchConcessions also runs in child shells (see getAllConcessions), which
+# only inherit exported variables and functions.
+export ROWS ND USER_AGENT CA_CERT F_COOKIES
+
+# Requests the landing page and stores the session cookies in $F_COOKIES.
+function getCookies() {
+  echo "Getting cookies..."
+  wget -qO- --ca-certificate="$CA_CERT" --keep-session-cookies \
+    --save-cookies "$F_COOKIES" --header "User-Agent: $USER_AGENT" \
+    'https://www.infosubvenciones.es/' &> /dev/null
+}
+
+# Downloads results page $1 into page_NNN.json (NNN is $1 zero-padded).
+function fetchConcessions() {
+  local outputFile
+  local -a auth=(--load-cookies "$F_COOKIES")
+  printf -v outputFile 'page_%03d.json' "$1"
+  if [[ -n "${COOKIE:-}" ]]; then auth=(--header "Cookie: $COOKIE"); fi
+  echo "Downloading page $1..."
+  wget --ca-certificate="$CA_CERT" --keep-session-cookies "${auth[@]}" \
+    --header "User-Agent: $USER_AGENT" -O "$outputFile" \
+    "https://www.infosubvenciones.es/bdnstrans/busqueda?type=concs&_search=false&nd=$ND&rows=$ROWS&page=$1&sidx=8&sord=asc"
+}
+export -f fetchConcessions
+
+# Downloads page 1 and stores the value of its "total" field in PAGES.
+function getNumPages() {
+  local firstPage='page_001.json' records
+  echo "Getting number of pages..."
+  fetchConcessions 1
+  PAGES=$(jq -r '.total' "$firstPage")
+  records=$(jq -r '.records' "$firstPage")
+  echo "Total pages: $PAGES"
+  echo "Total records: $records"
+  # Anything but a positive integer (e.g. "null") means an unusable response.
+  if [[ ! "$PAGES" =~ ^[1-9][0-9]*$ ]]; then
+    errorHandler "Cannot get number of pages" "$LINENO"
+  fi
+}
+
+# Downloads pages 2..PAGES, running up to five downloads in parallel.
+function getAllConcessions() {
+  echo "Downloading all concessions..."
+  if (( PAGES < 2 )); then return 0; fi
+  # The page number travels as $1 rather than being pasted into the command.
+  seq 2 "$PAGES" | xargs -P 5 -I '{}' bash -c 'fetchConcessions "$1"' _ '{}'
+}
+
+# Writes the "rows" of every downloaded page, in glob order, to $F_CONCESSIONS.
+function convertJson2Csv() {
+  echo "Converting JSON to CSV..."
+  jq -r '.rows[] | [.[]] | @csv' page_*.json > "$F_CONCESSIONS"
+}
+
+# Copies to $F_LEGAL the rows whose 10th column matches the regex below
+# (presumably a company tax id followed by a space - confirm).
+function getAllLegal() {
+  echo "Extracting legal concessions..."
+  csvgrep --no-header-row --columns 10 \
+    --regex '^[A-HJP-SUV]\d{7}[0-9A-J] ' \
+    "$F_CONCESSIONS" > "$F_LEGAL"
+}
+
+# Packs both CSV files into $F_TAR_GZ.
+function compressData() {
+  echo "Compressing data..."
+  tar -czf "$F_TAR_GZ" "$F_CONCESSIONS" "$F_LEGAL"
+}
+
+# Removes the downloaded pages and the cookie jar; other JSON files are kept.
+function cleanTempFiles() {
+  echo "Cleaning temporary files..."
+  rm -f -- page_*.json "$F_COOKIES"
+}
+
+# Prints message/status $1 and line $2 to stderr, cleans up and exits with 1.
+function errorHandler() {
+  echo "Error: ($1) occurred on line $2" >&2
+  cleanTempFiles
+  exit 1
+}
+
+trap 'errorHandler $? $LINENO' ERR
+trap "echo -e '\nTerminated by Ctrl+c'; cleanTempFiles; exit 130" INT
+
+if [[ "$(id -u)" == 0 ]]; then errorHandler "Don't run this script as root, please" "$LINENO"; fi
+for element in "${APP[@]}"; do
+  command -v "$element" > /dev/null \
+    || errorHandler "$element - Maybe it is not installed on the system. Sorry but I can't continue" "$LINENO"
+done
+
+if [[ -n "${COOKIE:-}" ]]; then
+  echo "Manual cookie is set. Skipping get cookie step"
+else
+  echo "Manual cookie is unset"
+  getCookies
+fi
+
+getNumPages
+getAllConcessions
+convertJson2Csv
+getAllLegal
+compressData
+cleanTempFiles