Add concessions download script
This commit is contained in:
parent
a4819c9c4b
commit
a38da53f8a
|
@ -1,2 +1,93 @@
|
||||||
.DS_Store
|
# Created by https://www.toptal.com/developers/gitignore/api/intellij+all
|
||||||
|
# Edit at https://www.toptal.com/developers/gitignore?templates=intellij+all
|
||||||
|
|
||||||
|
### Intellij+all ###
|
||||||
|
# Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio, WebStorm and Rider
|
||||||
|
# Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839
|
||||||
|
|
||||||
|
# User-specific stuff
|
||||||
|
.idea/**/workspace.xml
|
||||||
|
.idea/**/tasks.xml
|
||||||
|
.idea/**/usage.statistics.xml
|
||||||
|
.idea/**/dictionaries
|
||||||
|
.idea/**/shelf
|
||||||
|
|
||||||
|
# AWS User-specific
|
||||||
|
.idea/**/aws.xml
|
||||||
|
|
||||||
|
# Generated files
|
||||||
|
.idea/**/contentModel.xml
|
||||||
|
|
||||||
|
# Sensitive or high-churn files
|
||||||
|
.idea/**/dataSources/
|
||||||
|
.idea/**/dataSources.ids
|
||||||
|
.idea/**/dataSources.local.xml
|
||||||
|
.idea/**/sqlDataSources.xml
|
||||||
|
.idea/**/dynamic.xml
|
||||||
|
.idea/**/uiDesigner.xml
|
||||||
|
.idea/**/dbnavigator.xml
|
||||||
|
|
||||||
|
# Gradle
|
||||||
|
.idea/**/gradle.xml
|
||||||
|
.idea/**/libraries
|
||||||
|
|
||||||
|
# Gradle and Maven with auto-import
|
||||||
|
# When using Gradle or Maven with auto-import, you should exclude module files,
|
||||||
|
# since they will be recreated, and may cause churn. Uncomment if using
|
||||||
|
# auto-import.
|
||||||
|
# .idea/artifacts
|
||||||
|
# .idea/compiler.xml
|
||||||
|
# .idea/jarRepositories.xml
|
||||||
|
# .idea/modules.xml
|
||||||
|
# .idea/*.iml
|
||||||
|
# .idea/modules
|
||||||
|
# *.iml
|
||||||
|
# *.ipr
|
||||||
|
|
||||||
|
# CMake
|
||||||
|
cmake-build-*/
|
||||||
|
|
||||||
|
# Mongo Explorer plugin
|
||||||
|
.idea/**/mongoSettings.xml
|
||||||
|
|
||||||
|
# File-based project format
|
||||||
|
*.iws
|
||||||
|
|
||||||
|
# IntelliJ
|
||||||
|
out/
|
||||||
|
|
||||||
|
# mpeltonen/sbt-idea plugin
|
||||||
|
.idea_modules/
|
||||||
|
|
||||||
|
# JIRA plugin
|
||||||
|
atlassian-ide-plugin.xml
|
||||||
|
|
||||||
|
# Cursive Clojure plugin
|
||||||
|
.idea/replstate.xml
|
||||||
|
|
||||||
|
# SonarLint plugin
|
||||||
|
.idea/sonarlint/
|
||||||
|
|
||||||
|
# Crashlytics plugin (for Android Studio and IntelliJ)
|
||||||
|
com_crashlytics_export_strings.xml
|
||||||
|
crashlytics.properties
|
||||||
|
crashlytics-build.properties
|
||||||
|
fabric.properties
|
||||||
|
|
||||||
|
# Editor-based Rest Client
|
||||||
|
.idea/httpRequests
|
||||||
|
|
||||||
|
# Android studio 3.1+ serialized cache file
|
||||||
|
.idea/caches/build_file_checksums.ser
|
||||||
|
|
||||||
|
### Intellij+all Patch ###
|
||||||
|
# Ignore everything but code style settings and run configurations
|
||||||
|
# that are supposed to be shared within teams.
|
||||||
|
|
||||||
|
.idea/*
|
||||||
|
|
||||||
|
!.idea/codeStyles
|
||||||
|
!.idea/runConfigurations
|
||||||
|
|
||||||
|
# End of https://www.toptal.com/developers/gitignore/api/intellij+all
|
||||||
|
/.idea/codeStyles/codeStyleConfig.xml
|
||||||
|
|
|
@ -0,0 +1,25 @@
|
||||||
|
kind: pipeline
name: default

steps:
  - name: Download concessions
    image: alpine
    # A step that runs `commands` receives secrets through `environment`;
    # `settings` entries are only handed to plugins (as PLUGIN_* variables).
    # concessions.sh looks for a variable named COOKIE.
    environment:
      COOKIE:
        from_secret: cookie
    commands:
      - apk update && apk add wget jq tar gzip py3-pip
      - pip3 install csvkit
      - cd scripts
      - chmod +x concessions.sh
      - ./concessions.sh

  - name: Release concessions data
    image: plugins/gitea-release
    settings:
      api_key:
        from_secret: api_key
      base_url: https://git.cuernodehipnos.es
      # The script is run from inside scripts/ and writes the archive to its
      # current directory, so the path is relative to the workspace root.
      files: scripts/concessions.tar.gz

trigger:
  branch:
    - master
|
|
@ -0,0 +1,39 @@
|
||||||
|
-----BEGIN CERTIFICATE-----
|
||||||
|
MIIG1jCCBL6gAwIBAgIQNMarBE42mRJRyCULbJTWwDANBgkqhkiG9w0BAQsFADA7
|
||||||
|
MQswCQYDVQQGEwJFUzERMA8GA1UECgwIRk5NVC1SQ00xGTAXBgNVBAsMEEFDIFJB
|
||||||
|
SVogRk5NVC1SQ00wHhcNMTMwNjI0MTA1MjU5WhcNMjgwNjI0MTA1MjU5WjBHMQsw
|
||||||
|
CQYDVQQGEwJFUzERMA8GA1UECgwIRk5NVC1SQ00xJTAjBgNVBAsMHEFDIENvbXBv
|
||||||
|
bmVudGVzIEluZm9ybcOhdGljb3MwggEiMA0GCSqGSIb3DQEBAQUAA4IBDwAwggEK
|
||||||
|
AoIBAQCXVx8rdbF7/xY44CaSqzzGo5BhvzA8knxC/3KJYVzTf+CkOvMxMUDub8b0
|
||||||
|
h38MDujm/RKZhBNOWbKhxF3U61ZVhcR9xOCciuS/soT80m3BByxAKcZsNka0jCA4
|
||||||
|
XRkglDaAFxCHEZ06MOnvXsSOZDfPYahbQ3VFCVycJuhlHdAwSpmceQwcRYkR6YgX
|
||||||
|
wTiyzCNGivMKAmRS3dItqDOmDW/nxiDFq/Jd8VWY7GFkwbbAeqYId8FjN8zfvafu
|
||||||
|
nsB9SLFkUjPPMeqfmC7Bdh7HMxLpaOXROwH201cmlebiPkn0xSFxXFqwhhr6yN8U
|
||||||
|
QYZ3O/+xdHLrS6DS9+CJUF6d09ijAgMBAAGjggLIMIICxDASBgNVHRMBAf8ECDAG
|
||||||
|
AQH/AgEAMA4GA1UdDwEB/wQEAwIBBjAdBgNVHQ4EFgQUGfhYLxTWpsybBJgIDUzX
|
||||||
|
qwCng2UwgZgGCCsGAQUFBwEBBIGLMIGIMEkGCCsGAQUFBzABhj1odHRwOi8vb2Nz
|
||||||
|
cGZubXRyY21jYS5jZXJ0LmZubXQuZXMvb2NzcGZubXRyY21jYS9PY3NwUmVzcG9u
|
||||||
|
ZGVyMDsGCCsGAQUFBzAChi9odHRwOi8vd3d3LmNlcnQuZm5tdC5lcy9jZXJ0cy9B
|
||||||
|
Q1JBSVpGTk1UUkNNLmNydDAfBgNVHSMEGDAWgBT3fcX9xOiaG3dkp/UdoMy/h2Ca
|
||||||
|
bTCB6wYDVR0gBIHjMIHgMIHdBgRVHSAAMIHUMCkGCCsGAQUFBwIBFh1odHRwOi8v
|
||||||
|
d3d3LmNlcnQuZm5tdC5lcy9kcGNzLzCBpgYIKwYBBQUHAgIwgZkMgZZTdWpldG8g
|
||||||
|
YSBsYXMgY29uZGljaW9uZXMgZGUgdXNvIGV4cHVlc3RhcyBlbiBsYSBEZWNsYXJh
|
||||||
|
Y2nDs24gZGUgUHLDoWN0aWNhcyBkZSBDZXJ0aWZpY2FjacOzbiBkZSBsYSBGTk1U
|
||||||
|
LVJDTSAoIEMvIEpvcmdlIEp1YW4sIDEwNi0yODAwOS1NYWRyaWQtRXNwYcOxYSkw
|
||||||
|
gdQGA1UdHwSBzDCByTCBxqCBw6CBwIaBkGxkYXA6Ly9sZGFwZm5tdC5jZXJ0LmZu
|
||||||
|
bXQuZXMvQ049Q1JMLE9VPUFDJTIwUkFJWiUyMEZOTVQtUkNNLE89Rk5NVC1SQ00s
|
||||||
|
Qz1FUz9hdXRob3JpdHlSZXZvY2F0aW9uTGlzdDtiaW5hcnk/YmFzZT9vYmplY3Rj
|
||||||
|
bGFzcz1jUkxEaXN0cmlidXRpb25Qb2ludIYraHR0cDovL3d3dy5jZXJ0LmZubXQu
|
||||||
|
ZXMvY3Jscy9BUkxGTk1UUkNNLmNybDANBgkqhkiG9w0BAQsFAAOCAgEAo2bsQ2xL
|
||||||
|
Dcyodieqjd+uy/lfxDw/MbrAq/ZaNFkIlcypUYamOM4vrm5rz8oLjPCoLkJ48P+n
|
||||||
|
P08Gkcl5Q6q6VFcZLia+U3gfHXrkyqToQlrtViGCGH3xA4u56XtMHGXSdk9vQ0yD
|
||||||
|
nW5f7bUEkp+uvcKewrOvNcpbIAgD4eU7gdOS0w7BagcFRBgTKBw2s3z73fRZtouJ
|
||||||
|
g/atmWYtXbBsfNjph+pCh+h5sbSyZUVzO5AemyjpYYYNMWDQrTXq+7O8zIPuPaNE
|
||||||
|
SjEexuzn+VjHG90RlUK1LygARi+Ir0opD2w6erb/hK8Eea7MFdKQ2ASqNBGJggNo
|
||||||
|
5vfPVvjHiL+Antmh7mQSKL+4YwFU64d4KK9k0C1mbJethDQFKcjTK1vMvnXFiups
|
||||||
|
IuyTqwKauo7u2zMKzY4r3VYOW9TpMyLPFIY8pII5GyNzXlL0F4nscOvduTEPEYqx
|
||||||
|
eNJfpDDPY/DO8WfxgdRTy2W3D/UoAulb+Y+nuzGGCtFQrsSMQX487R+aY0nWot/h
|
||||||
|
ajef6BcPuxhDfQrg5IafrISVmcJAplb3tXhh0sz7RbYz6jf1bke4eU5fnrTMtGlV
|
||||||
|
teUL2vjrfUPHW07kBJuaQ7sxORNV3bpHisOnHj+AriQzCn5vINpSHW6hTm7IfRkb
|
||||||
|
ltu/aQrsMuUhP7HE/v+uXe5CuboV5ubZhHU=
|
||||||
|
-----END CERTIFICATE-----
|
|
@ -0,0 +1,112 @@
|
||||||
|
#!/usr/bin/env bash
#
# Downloads every page of concession results from infosubvenciones.es,
# merges the pages into one CSV, filters a second CSV out of it and packs
# both CSV files into a gzip-compressed tarball.
#
# Optional environment:
#   COOKIE  Raw value for the "Cookie" request header. When it is set the
#           script skips the request that obtains session cookies.

# -e: stop at the first unhandled failure.
# -E: functions and subshells inherit the ERR trap.
set -eE

# Value of the "rows" query parameter (rows requested per page).
ROWS=50000
# Number of result pages; overwritten by getNumPages.
PAGES=0
# Value of the "nd" query parameter: epoch time in milliseconds (13 digits).
ND=$(date +%s%N | cut -b1-13)
# Some `date` implementations do not expand %N, which leaves a non-numeric
# value; fall back to whole seconds padded to milliseconds.
if ! [[ $ND =~ ^[0-9]{13}$ ]]; then
  ND="$(date +%s)000"
fi
USER_AGENT='Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/109.0'
# Output and scratch file names, all relative to the current directory.
F_CONCESSIONS='concessions.csv'
F_LEGAL='legal.csv'
F_TAR_GZ='concessions.tar.gz'
F_COOKIES='cookies.txt'
# External commands that must be available before any work starts.
APP=(wget jq csvgrep tar gzip)
|
||||||
|
|
||||||
|
#######################################
# Requests the site's landing page and saves the cookies it sets, including
# session cookies, to the cookie jar file.
# Globals:   F_COOKIES (read) - cookie jar path
#            USER_AGENT (read) - value for the User-Agent header
# Outputs:   a progress line on stdout; the page body and anything wget
#            prints are discarded.
# Returns:   wget's exit status.
#######################################
function getCookies() {
  echo "Getting cookies..."

  wget -qO- \
    --ca-certificate=AC_Componentes_Informaticos_SHA256.crt \
    --keep-session-cookies \
    --save-cookies "$F_COOKIES" \
    --header "User-Agent: $USER_AGENT" \
    'https://www.infosubvenciones.es/' &> /dev/null
}
|
||||||
|
|
||||||
|
#######################################
# Downloads one page of concession results into page_NNN.json in the current
# directory, where NNN is the page number zero-padded to three digits.
# Globals:   COOKIE (read, optional) - sent as the Cookie header when set;
#              otherwise cookies are loaded from F_COOKIES
#            F_COOKIES, ND, ROWS, USER_AGENT (read)
# Arguments: $1 - page number
# Outputs:   a progress line on stdout, plus wget's own output.
# Returns:   wget's exit status.
#######################################
function fetchConcessions() {
  local outputFile
  outputFile="page_$(printf '%03d' "$1").json"

  echo "Downloading page $1..."

  # Build the argument list once; only the cookie source differs.
  local -a request=(
    --ca-certificate=AC_Componentes_Informaticos_SHA256.crt
    --keep-session-cookies
  )
  if [[ -z "${COOKIE+x}" ]]; then
    request+=(--load-cookies "$F_COOKIES")
  else
    request+=(--header "Cookie: $COOKIE")
  fi
  request+=(
    --header "User-Agent: $USER_AGENT"
    -O "$outputFile"
    "https://www.infosubvenciones.es/bdnstrans/busqueda?type=concs&_search=false&nd=$ND&rows=$ROWS&page=$1&sidx=8&sord=asc"
  )

  wget "${request[@]}"
}
# Exported so that child bash processes can call the function by name.
export -f fetchConcessions
|
||||||
|
|
||||||
|
#######################################
# Downloads page 1 and reads the page and record totals from it.
# Globals:   PAGES (written) - set to the ".total" field of page 1
# Outputs:   progress lines and both totals on stdout.
# Returns:   does not return when the page count is not a positive integer:
#            errorHandler is called instead.
#######################################
function getNumPages() {
  local page=1
  local outputFile
  local records

  outputFile="page_$(printf '%03d' "$page").json"

  echo "Getting number of pages..."

  fetchConcessions "$page"

  # Assignments are kept separate from `local` so a jq failure is not masked.
  # -r strips JSON quoting in case the totals are encoded as strings.
  PAGES=$(jq -r '.total' "$outputFile")
  records=$(jq -r '.records' "$outputFile")

  echo "Total pages: $PAGES"
  echo "Total records: $records"

  # Rejects 0, negatives, "null" and anything else that is not a count.
  if ! [[ $PAGES =~ ^[1-9][0-9]*$ ]]; then
    errorHandler "Cannot get number of pages" "$LINENO"
  fi
}
|
||||||
|
|
||||||
|
#######################################
# Downloads pages 2..PAGES, up to five at a time. Page 1 is not requested
# here. Does nothing beyond the progress line when PAGES is 1 or less.
# Globals:   PAGES (read)
#            ROWS, ND, USER_AGENT, F_COOKIES (marked for export)
# Outputs:   a progress line on stdout, plus the workers' output.
# Returns:   xargs' exit status.
#######################################
function getAllConcessions() {
  echo "Downloading all concessions..."

  # Every worker is a separate bash process and only sees exported names,
  # so the settings fetchConcessions reads must be in the environment.
  export ROWS ND USER_AGENT F_COOKIES

  # The page number is passed as a positional argument rather than spliced
  # into the command string. -n is omitted because -I already implies one
  # input line per invocation.
  local page
  for ((page = 2; page <= PAGES; page++)); do
    printf '%s\n' "$page"
  done | xargs -P 5 -I {} bash -c 'fetchConcessions "$1"' _ {}
}
|
||||||
|
|
||||||
|
#######################################
# Concatenates the rows of every downloaded page into one CSV file, one CSV
# line per element of each page's "rows" array.
# Globals:   F_CONCESSIONS (read) - output path, truncated first
# Outputs:   a progress line on stdout.
#######################################
function convertJson2Csv() {
  echo "Converting JSON to CSV..."

  # Only page_*.json is read: that is the naming used for downloaded pages,
  # and other JSON files in the directory are not part of the data set.
  local pageFile
  for pageFile in page_*.json; do
    jq -r '.rows[] | [.[]] | @csv' "$pageFile"
  done > "$F_CONCESSIONS"
}
|
||||||
|
|
||||||
|
#######################################
# Filters the concessions CSV down to the rows whose column 10 starts with
# one letter from A-H, J, P-S, U or V, then seven digits, then one character
# from 0-9 or A-J, then a space.
# NOTE(review): the pattern looks like a Spanish legal-entity tax id followed
# by a name - confirm against the data.
# Globals:   F_CONCESSIONS (read) - input CSV, treated as having no header
#            F_LEGAL (read) - output path, truncated first
# Outputs:   a progress line on stdout.
#######################################
function getAllLegal() {
  echo "Extracting legal concessions..."

  csvgrep --no-header-row \
    --columns 10 \
    --regex '^[A-HJP-SUV]\d{7}[0-9A-J] ' \
    "$F_CONCESSIONS" > "$F_LEGAL"
}
|
||||||
|
|
||||||
|
#######################################
# Packs both CSV files into a single gzip-compressed tar archive.
# Globals:   F_TAR_GZ (read) - archive path
#            F_CONCESSIONS, F_LEGAL (read) - files to include
# Outputs:   a progress line on stdout.
# Returns:   tar's exit status.
#######################################
function compressData() {
  echo "Compressing data..."

  tar -czf "$F_TAR_GZ" "$F_CONCESSIONS" "$F_LEGAL"
}
|
||||||
|
|
||||||
|
#######################################
# Removes the downloaded page files and the cookie jar from the current
# directory. Missing files are not an error.
# Globals:   F_COOKIES (read) - cookie jar path
# Outputs:   a progress line on stdout.
#######################################
function cleanTempFiles() {
  echo "Cleaning temporary files..."

  # Limited to page_*.json so JSON files the script did not create survive.
  rm -f -- page_*.json "$F_COOKIES"
}
|
||||||
|
|
||||||
|
#######################################
# Reports a failure, removes temporary files and terminates the script.
# Arguments: $1 - error description or exit status
#            $2 - line number where the failure was detected
# Outputs:   the error line on stderr; cleanup progress on stdout.
# Returns:   never; exits with status 1.
#######################################
function errorHandler() {
  # Disarm the ERR trap so a failure during cleanup cannot re-enter here.
  trap - ERR

  echo "Error: ($1) occurred on line $2" >&2
  cleanTempFiles
  exit 1
}
|
||||||
|
|
||||||
|
# Any unhandled failure: report its exit status and line, clean up, abort.
trap 'errorHandler $? $LINENO' ERR
# Ctrl+C: clean up and exit with 130 (128 + SIGINT) so the caller sees the
# run as interrupted rather than successful.
trap "echo -e '\nTerminated by Ctrl+c'; cleanTempFiles; exit 130" INT

# Refuse to run with uid 0.
if [[ "$(id -u)" == 0 ]]; then
  errorHandler "Don't run this script as root, please" "$LINENO"
fi

# Every required command must be resolvable before any work starts.
for element in "${APP[@]}"; do
  if ! command -v "$element" > /dev/null 2>&1; then
    errorHandler "$element - Maybe it is not installed on the system. Sorry but I can't continue" "$LINENO"
  fi
done
|
||||||
|
|
||||||
|
# Session cookies are only requested when the caller did not supply COOKIE.
# ${COOKIE+x} distinguishes "unset" from "set but empty".
if [[ -z "${COOKIE+x}" ]]; then
  echo "Manual cookie is unset"
  getCookies
else
  echo "Manual cookie is set. Skipping get cookie step"
fi

# Page 1 (which also yields the page count), then the remaining pages,
# then conversion, filtering, packaging and cleanup.
getNumPages
getAllConcessions
# Function names are case-sensitive: the function is convertJson2Csv.
convertJson2Csv
getAllLegal
compressData
cleanTempFiles
|
||||||
|
|
Loading…
Reference in New Issue