mirror of
https://github.com/josejuanmartinez/libreCatastro.git
synced 2024-06-29 20:05:46 +02:00
Pictures added. Tests fixed. README.md added.
This commit is contained in:
parent
4a28d67a4e
commit
ee90545bb6
49
README.md
Normal file
49
README.md
Normal file
|
@ -0,0 +1,49 @@
|
|||
# libreCATASTRO
|
||||
An open-source, MIT-licensed application that scrapes the official Spanish
|
||||
Cadaster registry and stores the information in Elasticsearch.
|
||||
|
||||
**Features**
|
||||
|
||||
_Scraping_
|
||||
* From XML webservices. Check http://www.catastro.meh.es/ws/Webservices_Libres.pdf
|
||||
* From HTML
|
||||
* Scrapes by zone/site and by each property within them
|
||||
* Scrapes rural and urban properties
|
||||
* Retrieves a picture of every property
|
||||
|
||||
_Storing_
|
||||
* Stores in ElasticSearch
|
||||
* Allows visualization in Kibana
|
||||
|
||||
_Visualization_
|
||||
|
||||
Includes a configured Kibana dashboard that shows:
|
||||
1) A heatmap on the map of Spain (World) showing where the properties are located
|
||||
2) All data in tables
|
||||
3) The picture of the property
|
||||
|
||||
**DoS Warning**
|
||||
|
||||
The Spanish Cadaster has set restrictions, temporarily banning IPs that make more than 10
|
||||
queries in 5 seconds. A sleep of 5 seconds has been added where needed, and can be configured
|
||||
at your own risk.
|
||||
|
||||
**Installation**
|
||||
|
||||
Having Docker and Docker-compose installed, run first:
|
||||
```
|
||||
docker-compose up -d
|
||||
```
|
||||
|
||||
Then configure ElasticSearch index:
|
||||
```
|
||||
python3 initialize_elasticsearch.py
|
||||
```
|
||||
|
||||
That simple!
|
||||
|
||||
**Execution**
|
||||
```
|
||||
python main.py [--coords] [--pictures] [--filenames filename1 filename2 ...] [--provinces province1 province2 ...] [--sleep sleep_time] [--html]
|
||||
```
|
||||
|
12
main.py
12
main.py
|
@ -9,12 +9,13 @@ from src.settings import config
|
|||
|
||||
if __name__ == "__main__":
|
||||
parser = argparse.ArgumentParser(description='Runs the Cadastro Parser')
|
||||
parser.add_argument('--coords', dest='coords', default=False, action='store_true')
|
||||
parser.add_argument('--coords', action='store_true', dest='coords', default=False)
|
||||
parser.add_argument('--filenames', action='store', nargs='+', dest='filenames', default=[])
|
||||
parser.add_argument('--provinces', action='store', nargs='+', dest='provinces', default=[])
|
||||
parser.add_argument('--sleep', action='store', dest='sleep', type=int, default=5)
|
||||
parser.add_argument('--html', dest='html', default=False, action='store_true')
|
||||
parser.add_argument('--scale', action='store', dest='scale', type=int, default=10000)
|
||||
parser.add_argument('--pictures', action='store_true', dest='pictures', default='False')
|
||||
|
||||
args = parser.parse_args(sys.argv[1:])
|
||||
|
||||
|
@ -26,7 +27,12 @@ if __name__ == "__main__":
|
|||
|
||||
scrapper = ScrapperHTML if args.html else ScrapperXML
|
||||
|
||||
filenames = args.filenames
|
||||
pictures = args.pictures
|
||||
provinces = args.provinces
|
||||
|
||||
if args.coords:
|
||||
CoordinatesInput.scrap_coordinates(args.filenames, scrapper)
|
||||
CoordinatesInput.scrap_coordinates(scrapper, filenames, pictures)
|
||||
else:
|
||||
ProvincesInput.scrap_provinces(args.provinces, scrapper)
|
||||
print(pictures)
|
||||
ProvincesInput.scrap_provinces(scrapper, provinces, pictures)
|
||||
|
|
|
@ -24,35 +24,33 @@ class CadasterEntry:
|
|||
self.location = cadaster_entry.location
|
||||
self.gsurface = cadaster_entry.gsurface
|
||||
self.constructions = cadaster_entry.constructions
|
||||
self.picture = cadaster_entry.picture
|
||||
self.timestamp = cadaster_entry.timestamp
|
||||
|
||||
def to_json(self):
|
||||
return dict(address=self.address, cadaster=self.cadaster, type=self.type, use=self.use, surface=self.surface, year=self.year, location=self.location, gsurface=self.gsurface, constructions=self.constructions, timestamp=self.timestamp)
|
||||
return dict(address=self.address, cadaster=self.cadaster, type=self.type, use=self.use, surface=self.surface, year=self.year, location=self.location, gsurface=self.gsurface, constructions=self.constructions, picture=str(self.picture), timestamp=self.timestamp)
|
||||
|
||||
def to_json_recursive(self):
|
||||
return json.dumps(self.to_json(), cls=JSONEncoder, sort_keys=True,
|
||||
indent=4, separators=(',', ': '))
|
||||
|
||||
def to_elasticsearch(self):
|
||||
es = Elasticsearch()
|
||||
res = None
|
||||
try:
|
||||
es = Elasticsearch()
|
||||
body = json.dumps(self.to_json(), cls=JSONEncoder,sort_keys=True,
|
||||
indent=4, separators=(',', ': '))
|
||||
#logger.debug("Sending to Elastic Search\n:{}".format(body))
|
||||
res = es.index(index=config['elasticsearch-index'], doc_type='cadaster_doc', id=self.cadaster, body=body)
|
||||
#logger.debug(res)
|
||||
except Exception as e:
|
||||
logger.error(e)
|
||||
finally:
|
||||
es.transport.close()
|
||||
|
||||
return res
|
||||
|
||||
def from_elasticsearch(self):
|
||||
res = None
|
||||
|
||||
es = Elasticsearch()
|
||||
try:
|
||||
es = Elasticsearch()
|
||||
query = '{"query":{"bool":{"must":[{"match":{"cadaster":"' + self.cadaster + '"}}],"must_not":[],"should":[]}},"from":0,"size":10,"sort":[],"aggs":{}}'
|
||||
res = es.search(index=config['elasticsearch-index'], body=query)
|
||||
except Exception as e:
|
||||
|
|
|
@ -22,6 +22,7 @@ class CadasterEntryHTML(CadasterEntry):
|
|||
self.location = Location(description_data[u'Longitud'], description_data[u'Latitud'])
|
||||
self.gsurface = description_data[u'Superficie gráfica'] if u'Superficie gráfica' in description_data else None
|
||||
self.constructions = [Construction(x) for x in description_data[u'Construcciones']]
|
||||
self.picture = description_data[u'GráficoParcela'] if u'GráficoParcela' in description_data else None
|
||||
self.timestamp = str(datetime.now())
|
||||
super().__init__(self)
|
||||
|
||||
|
|
|
@ -25,6 +25,7 @@ class ScrapperHTML(Scrapper):
|
|||
|
||||
'''Catastro web services parametrized'''
|
||||
URL = "http://ovc.catastro.meh.es/ovcservweb/ovcswlocalizacionrc/ovccoordenadas.asmx/Consulta_RCCOOR?SRS=EPSG:4230&Coordenada_X={}&Coordenada_Y={}"
|
||||
|
||||
URL_REF = "https://www1.sedecatastro.gob.es/CYCBienInmueble/OVCListaBienes.aspx?rc1={}&rc2={}"
|
||||
URL_REF_FULL = "https://www1.sedecatastro.gob.es/CYCBienInmueble/OVCConCiud.aspx?RefC={}&RCCompleta={}&del={}&mun={}"
|
||||
|
||||
|
@ -32,10 +33,9 @@ class ScrapperHTML(Scrapper):
|
|||
description_field_names = [u'Referencia catastral', u'Localización', u'Clase', u'Uso principal', u'Superficie construida', u'Año construcción']
|
||||
gsurface_field_names = [u'Superficie gráfica']
|
||||
|
||||
""" Coordinates scrapping calls """
|
||||
|
||||
""" Scrapping calls """
|
||||
@classmethod
|
||||
def scrap_coord(cls, x, y):
|
||||
def scrap_coord(cls, x, y, pictures=False):
|
||||
logger.debug("====Longitude: {} Latitude: {}====".format(x, y))
|
||||
url = cls.URL.format(x, y)
|
||||
logger.debug("[||| ] URL for coordinates: {}".format(url))
|
||||
|
@ -51,19 +51,19 @@ class ScrapperHTML(Scrapper):
|
|||
else:
|
||||
logger.debug("||||| ] FOUND!")
|
||||
cadaster = ''.join([pc1.text, pc2.text])
|
||||
cadaster_entries = cls.scrap_cadaster(cadaster, x, y)
|
||||
cadaster_entries = cls.scrap_cadaster(cadaster, None, None, x, y, pictures)
|
||||
for cadaster_entry in cadaster_entries:
|
||||
cadaster_entry.to_elasticsearch()
|
||||
|
||||
return cadaster_entries
|
||||
|
||||
@classmethod
|
||||
def scrap_provinces(cls, prov_list):
|
||||
def scrap_provinces(cls, prov_list, pictures=False):
|
||||
"""Scraps properties by addresses"""
|
||||
|
||||
provinces = cls.get_provinces()['consulta_provinciero']['provinciero']['prov']
|
||||
|
||||
for province in provinces:
|
||||
prov_name = province['np']
|
||||
prov_num = province['cpine']
|
||||
|
||||
if len(prov_list) > 0 and prov_name not in prov_list:
|
||||
continue
|
||||
|
@ -71,6 +71,7 @@ class ScrapperHTML(Scrapper):
|
|||
cities = cls.get_cities(prov_name)['consulta_municipiero']['municipiero']['muni']
|
||||
for city in cities:
|
||||
city_name = city['nm']
|
||||
city_num = city['locat']['cmc']
|
||||
addresses = (cls.get_addresses(prov_name, city_name)['consulta_callejero']['callejero'][
|
||||
'calle'])
|
||||
|
||||
|
@ -109,7 +110,7 @@ class ScrapperHTML(Scrapper):
|
|||
|
||||
num_scrapping_fails = 10
|
||||
|
||||
cadaster_list = cls.scrap_cadaster(cadaster_num, lon, lat)
|
||||
cadaster_list = cls.scrap_cadaster(cadaster_num, prov_num, city_num, lon, lat, pictures)
|
||||
|
||||
for cadaster in cadaster_list:
|
||||
cadaster.to_elasticsearch()
|
||||
|
@ -139,19 +140,82 @@ class ScrapperHTML(Scrapper):
|
|||
counter += 1
|
||||
sleep(config['sleep_time'])
|
||||
|
||||
@classmethod
|
||||
def scrap_cadaster_full_code(cls, full_cadaster, delimitacion, municipio, x=None, y=None, picture=None):
|
||||
url_ref = cls.URL_REF_FULL.format(full_cadaster, full_cadaster, delimitacion, municipio)
|
||||
logger.debug("-->FULL URL for cadastral data: {}".format(url_ref))
|
||||
f_ref = urlopen(url_ref)
|
||||
data_ref = f_ref.read()
|
||||
html = str(data_ref.decode('utf-8'))
|
||||
parsed_html = BeautifulSoup(html, features="html.parser")
|
||||
return ScrapperHTML.parse_html_parcela(parsed_html, x, y, picture)
|
||||
|
||||
@classmethod
|
||||
def scrap_cadaster(cls, cadaster, delimitacion=None, municipio=None, x=None, y=None, pictures=False):
|
||||
rc_1 = cadaster[0:7]
|
||||
rc_2 = cadaster[7:14]
|
||||
url_ref = cls.URL_REF.format(rc_1, rc_2)
|
||||
|
||||
logger.debug("[|||||||| ] URL for cadastral data: {}".format(url_ref))
|
||||
|
||||
f_ref = urlopen(url_ref)
|
||||
data_ref = f_ref.read()
|
||||
html = str(data_ref.decode('utf-8'))
|
||||
parsed_html = BeautifulSoup(html, features="html.parser")
|
||||
|
||||
if delimitacion is None:
|
||||
delimitacion_search = re.search(r'del=([0-9]+)&', html)
|
||||
if delimitacion_search:
|
||||
delimitacion = delimitacion_search.group(1)
|
||||
|
||||
if municipio is None:
|
||||
municipio_search = re.search(r'mun=([0-9]+)&', html)
|
||||
if municipio_search:
|
||||
municipio = municipio_search.group(1)
|
||||
|
||||
picture = None
|
||||
if pictures:
|
||||
picture = cls.scrap_site_picture(delimitacion, municipio, ''.join([rc_1, rc_2]))
|
||||
sleep(config['sleep_time'])
|
||||
|
||||
description = parsed_html.find(id='ctl00_Contenido_tblInmueble')
|
||||
|
||||
cadasters = []
|
||||
if description is None:
|
||||
logger.debug("Multiparcela found!")
|
||||
''' Multiparcela with multiple cadasters '''
|
||||
|
||||
all_cadasters = parsed_html.findAll("div", {"id": re.compile('heading[0-9]+')})
|
||||
logger.debug("->Parcelas found: {}".format(len(all_cadasters)))
|
||||
for partial_cadaster in all_cadasters:
|
||||
partial_cadaster_ref = partial_cadaster.find("b")
|
||||
logger.debug("-->Partial cadaster: {}".format(partial_cadaster_ref.text))
|
||||
partial_cadaster_text = partial_cadaster_ref.text.strip()
|
||||
cadaster = ScrapperHTML.scrap_cadaster_full_code(partial_cadaster_text, delimitacion, municipio, x, y, picture)
|
||||
cadasters.append(cadaster)
|
||||
sleep(config['sleep_time'])
|
||||
|
||||
else:
|
||||
cadaster = ScrapperHTML.parse_html_parcela(parsed_html, x, y, picture)
|
||||
|
||||
cadasters.append(cadaster)
|
||||
|
||||
logger.debug("[|||||||||||] SUCCESS!")
|
||||
sleep(config['sleep_time'])
|
||||
return cadasters
|
||||
|
||||
""" Parsing """
|
||||
|
||||
@classmethod
|
||||
def parse_html_parcela(cls, parsed_html, x=None, y=None):
|
||||
def parse_html_parcela(cls, parsed_html, x=None, y=None, picture=None):
|
||||
description = parsed_html.find(id='ctl00_Contenido_tblInmueble')
|
||||
|
||||
descriptive_data = dict()
|
||||
descriptive_data[u'Longitud'] = x
|
||||
descriptive_data[u'Latitud'] = y
|
||||
descriptive_data[u'GráficoParcela'] = picture
|
||||
descriptive_data[u'Construcciones'] = []
|
||||
|
||||
|
||||
''' Datos descriptivos and Parcela Catastral '''
|
||||
fields = description.find_all('div')
|
||||
for field in fields:
|
||||
|
@ -177,7 +241,7 @@ class ScrapperHTML(Scrapper):
|
|||
descriptive_data[field_name] = field_value.text.strip()
|
||||
|
||||
'''Constructions'''
|
||||
constructions_table = parsed_html.find(id='ctl00_Contenido_tblLocales');
|
||||
constructions_table = parsed_html.find(id='ctl00_Contenido_tblLocales')
|
||||
if constructions_table is None:
|
||||
constructions = []
|
||||
else:
|
||||
|
@ -193,62 +257,3 @@ class ScrapperHTML(Scrapper):
|
|||
|
||||
cadaster_entry = CadasterEntryHTML(descriptive_data)
|
||||
return cadaster_entry
|
||||
|
||||
@classmethod
|
||||
def scrap_cadaster_full_code(cls, full_cadaster, delimitacion, municipio, x=None, y=None):
|
||||
url_ref = cls.URL_REF_FULL.format(full_cadaster, full_cadaster, delimitacion, municipio)
|
||||
logger.debug("-->FULL URL for cadastral data: {}".format(url_ref))
|
||||
f_ref = urlopen(url_ref)
|
||||
data_ref = f_ref.read()
|
||||
html = str(data_ref.decode('utf-8'))
|
||||
parsed_html = BeautifulSoup(html, features="html.parser")
|
||||
return ScrapperHTML.parse_html_parcela(parsed_html, x, y)
|
||||
|
||||
@classmethod
|
||||
def scrap_cadaster(cls, cadaster, x=None, y=None):
|
||||
rc_1 = cadaster[0:7]
|
||||
rc_2 = cadaster[7:14]
|
||||
url_ref = cls.URL_REF.format(rc_1, rc_2)
|
||||
|
||||
logger.debug("[|||||||| ] URL for cadastral data: {}".format(url_ref))
|
||||
|
||||
f_ref = urlopen(url_ref)
|
||||
data_ref = f_ref.read()
|
||||
html = str(data_ref.decode('utf-8'))
|
||||
parsed_html = BeautifulSoup(html, features="html.parser")
|
||||
|
||||
delimitacion = ''
|
||||
delimitacion_search = re.search(r'del=([0-9]+)&', html)
|
||||
if delimitacion_search:
|
||||
delimitacion = delimitacion_search.group(1)
|
||||
|
||||
municipio = ''
|
||||
municipio_search = re.search(r'mun=([0-9]+)&', html)
|
||||
if municipio_search:
|
||||
municipio = municipio_search.group(1)
|
||||
|
||||
description = parsed_html.find(id='ctl00_Contenido_tblInmueble')
|
||||
|
||||
cadasters = []
|
||||
if description is None:
|
||||
logger.debug("Multiparcela found!")
|
||||
''' Multiparcela with multiple cadasters '''
|
||||
|
||||
all_cadasters = parsed_html.findAll("div", {"id": re.compile('heading[0-9]+')})
|
||||
logger.debug("->Parcelas found: {}".format(len(all_cadasters)))
|
||||
for partial_cadaster in all_cadasters:
|
||||
partial_cadaster_ref = partial_cadaster.find("b")
|
||||
logger.debug("-->Partial cadaster: {}".format(partial_cadaster_ref.text))
|
||||
partial_cadaster_text = partial_cadaster_ref.text.strip()
|
||||
cadaster = ScrapperHTML.scrap_cadaster_full_code(partial_cadaster_text, delimitacion, municipio, x, y)
|
||||
|
||||
sleep(config['sleep_time'])
|
||||
|
||||
cadasters.append(cadaster)
|
||||
else:
|
||||
cadaster = ScrapperHTML.parse_html_parcela(parsed_html, x, y)
|
||||
cadasters.append(cadaster)
|
||||
|
||||
logger.debug("[|||||||||||] SUCCESS!")
|
||||
sleep(config['sleep_time'])
|
||||
return cadasters
|
||||
|
|
|
@ -25,7 +25,7 @@ class ScrapperXML(Scrapper):
|
|||
""" Scrapping main calls """
|
||||
|
||||
@classmethod
|
||||
def scrap_coord(cls, x, y):
|
||||
def scrap_coord(cls, x, y, pictures=False):
|
||||
"""Scraps properties by coordinates"""
|
||||
params = {'SRS': 'EPSG:4230', 'Coordenada_X': x, 'Coordenada_Y': y}
|
||||
url = cls.URL_LOCATIONS_BASE.format("/OVCCoordenadas.asmx/Consulta_RCCOOR")
|
||||
|
@ -43,15 +43,15 @@ class ScrapperXML(Scrapper):
|
|||
pc2 = xml_dict['consulta_coordenadas']['coordenadas']['coord']['pc']['pc2'] if 'pc' in xml_dict['consulta_coordenadas']['coordenadas']['coord'] else None
|
||||
if pc1 is not None and pc2 is not None:
|
||||
logger.debug("||||| ] FOUND!")
|
||||
entry = cls.get_cadaster_entries_by_cadaster('', '', ''.join([pc1,pc2]))
|
||||
|
||||
entry = cls.get_cadaster_entries_by_cadaster('', '', ''.join([pc1,pc2]))
|
||||
cadaster_entry = CadasterEntryXML(entry, x, y)
|
||||
cadaster_entry.to_elasticsearch()
|
||||
logger.debug("[|||||||||||] SUCCESS!")
|
||||
sleep(config['sleep_time'])
|
||||
|
||||
@classmethod
|
||||
def scrap_provinces(cls, prov_list):
|
||||
def scrap_provinces(cls, prov_list, pictures=False):
|
||||
"""Scraps properties by addresses"""
|
||||
|
||||
provinces = ScrapperXML.get_provinces()['consulta_provinciero']['provinciero']['prov']
|
||||
|
|
|
@ -1,8 +1,11 @@
|
|||
import base64
|
||||
import urllib.parse
|
||||
from urllib.request import urlopen
|
||||
|
||||
import requests
|
||||
import xmltodict
|
||||
|
||||
from src.settings import config
|
||||
from src.utils.cadastro_logger import CadastroLogger
|
||||
|
||||
'''Logger'''
|
||||
|
@ -15,15 +18,17 @@ class Scrapper:
|
|||
'''Catastro web services parametrized'''
|
||||
URL_LOCATIONS_BASE = "http://ovc.catastro.meh.es/ovcservweb/OVCSWLocalizacionRC{}"
|
||||
|
||||
URL_PICTURES = "https://www1.sedecatastro.gob.es/Cartografia/GeneraGraficoParcela.aspx?del={}&mun={}&refcat={}&AnchoPixels={}&AltoPixels={}"
|
||||
|
||||
def __init__(self):
|
||||
pass
|
||||
|
||||
@classmethod
|
||||
def scrap_coords(cls, x, y):
|
||||
def scrap_coords(cls, x, y, pictures=False):
|
||||
pass
|
||||
|
||||
@classmethod
|
||||
def scrap_provinces(cls, prov_list):
|
||||
def scrap_provinces(cls, prov_list, pictures=False):
|
||||
pass
|
||||
|
||||
@classmethod
|
||||
|
@ -134,4 +139,19 @@ class Scrapper:
|
|||
|
||||
response = requests.get(url, params=params)
|
||||
xml = response.content
|
||||
return xmltodict.parse(xml, process_namespaces=False, xml_attribs=False)
|
||||
return xmltodict.parse(xml, process_namespaces=False, xml_attribs=False)
|
||||
|
||||
@classmethod
|
||||
def scrap_site_picture(cls, prov_name, city_name, cadaster):
|
||||
url_pic = cls.URL_PICTURES.format(prov_name, city_name, cadaster, config['width_px'], config['height_px'])
|
||||
|
||||
logger.debug("[|||||||| ] URL for picture data: {}".format(url_pic))
|
||||
|
||||
f_pic = urlopen(url_pic)
|
||||
|
||||
data_ref = f_pic.read()
|
||||
|
||||
b64_image = base64.b64encode(data_ref).decode('utf-8')
|
||||
|
||||
return b64_image
|
||||
|
||||
|
|
|
@ -1,11 +1,10 @@
|
|||
import os
|
||||
import time
|
||||
import urllib.error
|
||||
from random import random
|
||||
import random
|
||||
from time import sleep
|
||||
|
||||
from src.librecatastro.domain.geometry.geo_polygon import GeoPolygon
|
||||
from src.librecatastro.scrapping.format.scrapper_html import ScrapperHTML
|
||||
from src.librecatastro.scrapping.input import Input
|
||||
from src.settings import config
|
||||
from src.utils.cadastro_logger import CadastroLogger
|
||||
|
@ -20,7 +19,7 @@ class CoordinatesInput(Input):
|
|||
super().__init__()
|
||||
|
||||
@classmethod
|
||||
def scrap_coordinates(cls, filenames, scrapper=ScrapperHTML):
|
||||
def scrap_coordinates(cls, scrapper, filenames, pictures):
|
||||
for r, d, files in os.walk(config['coordinates_path']):
|
||||
for file in files:
|
||||
|
||||
|
@ -32,12 +31,12 @@ class CoordinatesInput(Input):
|
|||
|
||||
try:
|
||||
polygon = GeoPolygon(os.path.join(config['coordinates_path'], file))
|
||||
CoordinatesInput.scrap_polygon(polygon, scrapper)
|
||||
CoordinatesInput.scrap_polygon(scrapper, polygon, pictures)
|
||||
except:
|
||||
logger.error("{} is not formatted properly. Please take a look at the examples.".format(file))
|
||||
|
||||
@classmethod
|
||||
def scrap_polygon(cls, polygon, scrapper):
|
||||
def scrap_polygon(cls, scrapper, polygon, pictures):
|
||||
bb = polygon.get_bounding_box()
|
||||
lon_min = int(bb[0] * config['scale'])
|
||||
lon_max = int(bb[2] * config['scale'])
|
||||
|
@ -55,8 +54,7 @@ class CoordinatesInput(Input):
|
|||
logger.info('{},{}'.format(x_scaled, y_scaled))
|
||||
|
||||
try:
|
||||
|
||||
scrapper.scrap_coord(x_scaled, y_scaled)
|
||||
scrapper.scrap_coord(x_scaled, y_scaled, pictures)
|
||||
|
||||
except urllib.error.HTTPError as e:
|
||||
logger.error("ERROR AT LONGITUDE {} LATITUDE {}".format(x_scaled, y_scaled))
|
||||
|
@ -74,108 +72,29 @@ class CoordinatesInput(Input):
|
|||
|
||||
sleep(config['sleep_time'])
|
||||
|
||||
@staticmethod
|
||||
def scrap_results_by_time(seconds, lon_min, lon_max, lat_min, lat_max, scrapper):
|
||||
start_time = time.time()
|
||||
results = []
|
||||
@staticmethod
|
||||
def scrap_results_by_time(seconds, lon_min, lon_max, lat_min, lat_max, scrapper):
|
||||
start_time = time.time()
|
||||
results = []
|
||||
|
||||
finished = False
|
||||
for x in range(lon_min, lon_max):
|
||||
for y in range(lat_min, lat_max):
|
||||
|
||||
x_scaled = x / config['scale']
|
||||
y_scaled = y / config['scale']
|
||||
|
||||
try:
|
||||
result = scrapper.scrap_coord(x_scaled, y_scaled)
|
||||
|
||||
if result is not None:
|
||||
results.append(result)
|
||||
now = time.time()
|
||||
elapsed_time = now - start_time
|
||||
if elapsed_time > seconds:
|
||||
finished = True
|
||||
break
|
||||
|
||||
except urllib.error.HTTPError as e:
|
||||
logger.error("ERROR AT LONGITUDE {} LATITUDE {}".format(x_scaled, y_scaled))
|
||||
logger.error("=============================================")
|
||||
logger.error(e, exc_info=True)
|
||||
logger.error("=============================================")
|
||||
''' Could be a service Unavailable or denegation of service'''
|
||||
sleep(config['sleep_dos_time'])
|
||||
except Exception as e:
|
||||
logger.error("ERROR AT LONGITUDE {} LATITUDE {}".format(x_scaled, y_scaled))
|
||||
logger.error("=============================================")
|
||||
logger.error(e, exc_info=True)
|
||||
logger.error("=============================================")
|
||||
sleep(config['sleep_time'])
|
||||
|
||||
if finished:
|
||||
break
|
||||
return ListUtils.flat(results)
|
||||
|
||||
@staticmethod
|
||||
def scrap_results_linear_x_times(times, lon_min, lon_max, lat_min, lat_max, scrapper):
|
||||
results = []
|
||||
counter = times
|
||||
|
||||
finished = False
|
||||
for x in range(lon_min, lon_max):
|
||||
for y in range(lat_min, lat_max):
|
||||
|
||||
x_scaled = x / config['scale']
|
||||
y_scaled = y / config['scale']
|
||||
|
||||
try:
|
||||
|
||||
result = scrapper.scrap_coord(x_scaled, y_scaled)
|
||||
|
||||
if result is not None:
|
||||
results.append(result)
|
||||
counter -= 1
|
||||
if counter == 0:
|
||||
finished = True
|
||||
break
|
||||
|
||||
except urllib.error.HTTPError as e:
|
||||
logger.error("ERROR AT LONGITUDE {} LATITUDE {}".format(x_scaled, y_scaled))
|
||||
logger.error("=============================================")
|
||||
logger.error(e, exc_info=True)
|
||||
logger.error("=============================================")
|
||||
''' Could be a service Unavailable or denegation of service'''
|
||||
sleep(config['sleep_dos_time'])
|
||||
except Exception as e:
|
||||
logger.error("ERROR AT LONGITUDE {} LATITUDE {}".format(x_scaled, y_scaled))
|
||||
logger.error("=============================================")
|
||||
logger.error(e, exc_info=True)
|
||||
logger.error("=============================================")
|
||||
sleep(config['sleep_time'])
|
||||
|
||||
if finished:
|
||||
break
|
||||
|
||||
return ListUtils.flat(results)
|
||||
|
||||
@staticmethod
|
||||
def scrap_results_random_x_times(times, lon_min, lon_max, lat_min, lat_max, scrapper):
|
||||
results = []
|
||||
counter = times
|
||||
while counter > 0:
|
||||
x = random.randrange(lon_min, lon_max)
|
||||
y = random.randrange(lat_min, lat_max)
|
||||
finished = False
|
||||
for x in range(lon_min, lon_max):
|
||||
for y in range(lat_min, lat_max):
|
||||
|
||||
x_scaled = x / config['scale']
|
||||
y_scaled = y / config['scale']
|
||||
|
||||
try:
|
||||
cadaster_entry = scrapper.scrap_coord(x_scaled, y_scaled)
|
||||
result = scrapper.scrap_coord(x_scaled, y_scaled)
|
||||
|
||||
if len(cadaster_entry) > 0:
|
||||
results.append(cadaster_entry)
|
||||
counter -= 1
|
||||
if counter == 0:
|
||||
if result is not None:
|
||||
results.append(result)
|
||||
now = time.time()
|
||||
elapsed_time = now - start_time
|
||||
if elapsed_time > seconds:
|
||||
finished = True
|
||||
break
|
||||
|
||||
except urllib.error.HTTPError as e:
|
||||
logger.error("ERROR AT LONGITUDE {} LATITUDE {}".format(x_scaled, y_scaled))
|
||||
logger.error("=============================================")
|
||||
|
@ -190,6 +109,85 @@ class CoordinatesInput(Input):
|
|||
logger.error("=============================================")
|
||||
sleep(config['sleep_time'])
|
||||
|
||||
logger.debug("====PROCESSING FINISHED====")
|
||||
logger.debug("Results found: {}".format(times))
|
||||
return ListUtils.flat(results)
|
||||
if finished:
|
||||
break
|
||||
return ListUtils.flat(results)
|
||||
|
||||
@staticmethod
|
||||
def scrap_results_linear_x_times(times, lon_min, lon_max, lat_min, lat_max, scrapper):
|
||||
results = []
|
||||
counter = times
|
||||
|
||||
finished = False
|
||||
for x in range(lon_min, lon_max):
|
||||
for y in range(lat_min, lat_max):
|
||||
|
||||
x_scaled = x / config['scale']
|
||||
y_scaled = y / config['scale']
|
||||
|
||||
try:
|
||||
|
||||
result = scrapper.scrap_coord(x_scaled, y_scaled)
|
||||
|
||||
if result is not None:
|
||||
results.append(result)
|
||||
counter -= 1
|
||||
if counter == 0:
|
||||
finished = True
|
||||
break
|
||||
|
||||
except urllib.error.HTTPError as e:
|
||||
logger.error("ERROR AT LONGITUDE {} LATITUDE {}".format(x_scaled, y_scaled))
|
||||
logger.error("=============================================")
|
||||
logger.error(e, exc_info=True)
|
||||
logger.error("=============================================")
|
||||
''' Could be a service Unavailable or denegation of service'''
|
||||
sleep(config['sleep_dos_time'])
|
||||
except Exception as e:
|
||||
logger.error("ERROR AT LONGITUDE {} LATITUDE {}".format(x_scaled, y_scaled))
|
||||
logger.error("=============================================")
|
||||
logger.error(e, exc_info=True)
|
||||
logger.error("=============================================")
|
||||
sleep(config['sleep_time'])
|
||||
|
||||
if finished:
|
||||
break
|
||||
|
||||
return ListUtils.flat(results)
|
||||
|
||||
@staticmethod
|
||||
def scrap_results_random_x_times(times, lon_min, lon_max, lat_min, lat_max, scrapper):
|
||||
results = []
|
||||
counter = times
|
||||
while counter > 0:
|
||||
x = random.randrange(lon_min, lon_max)
|
||||
y = random.randrange(lat_min, lat_max)
|
||||
|
||||
x_scaled = x / config['scale']
|
||||
y_scaled = y / config['scale']
|
||||
|
||||
try:
|
||||
cadaster_entry = scrapper.scrap_coord(x_scaled, y_scaled)
|
||||
|
||||
if len(cadaster_entry) > 0:
|
||||
results.append(cadaster_entry)
|
||||
counter -= 1
|
||||
if counter == 0:
|
||||
break
|
||||
except urllib.error.HTTPError as e:
|
||||
logger.error("ERROR AT LONGITUDE {} LATITUDE {}".format(x_scaled, y_scaled))
|
||||
logger.error("=============================================")
|
||||
logger.error(e, exc_info=True)
|
||||
logger.error("=============================================")
|
||||
''' Could be a service Unavailable or denegation of service'''
|
||||
sleep(config['sleep_dos_time'])
|
||||
except Exception as e:
|
||||
logger.error("ERROR AT LONGITUDE {} LATITUDE {}".format(x_scaled, y_scaled))
|
||||
logger.error("=============================================")
|
||||
logger.error(e, exc_info=True)
|
||||
logger.error("=============================================")
|
||||
sleep(config['sleep_time'])
|
||||
|
||||
logger.debug("====PROCESSING FINISHED====")
|
||||
logger.debug("Results found: {}".format(times))
|
||||
return ListUtils.flat(results)
|
||||
|
|
|
@ -1,4 +1,3 @@
|
|||
from src.librecatastro.scrapping.format.scrapper_xml import ScrapperXML
|
||||
from src.librecatastro.scrapping.input import Input
|
||||
|
||||
|
||||
|
@ -7,5 +6,5 @@ class ProvincesInput(Input):
|
|||
super().__init__()
|
||||
|
||||
@classmethod
|
||||
def scrap_provinces(cls, prov_list, scrapper=ScrapperXML):
|
||||
scrapper.scrap_provinces(prov_list)
|
||||
def scrap_provinces(cls, scrapper, prov_list, pictures=False):
|
||||
scrapper.scrap_provinces(prov_list, pictures)
|
||||
|
|
|
@ -11,5 +11,7 @@ config = {
|
|||
"coordinates_path": os.path.join(root_path, 'coordinates'),
|
||||
"not_available_via_XML": "(Not available via XML)",
|
||||
"sleep_time": 5,
|
||||
"sleep_dos_time": 300
|
||||
"sleep_dos_time": 300,
|
||||
"width_px": 120,
|
||||
"height_px": 120
|
||||
}
|
||||
|
|
|
@ -3,6 +3,7 @@ import unittest
|
|||
|
||||
from src.librecatastro.domain.geometry.geo_polygon import GeoPolygon
|
||||
from src.librecatastro.scrapping.format.scrapper_html import ScrapperHTML
|
||||
from src.librecatastro.scrapping.source.coordinates_input import CoordinatesInput
|
||||
from src.settings import config
|
||||
from src.utils.elasticsearch_utils import ElasticSearchUtils
|
||||
|
||||
|
@ -23,7 +24,7 @@ class ScrapperHTMLTests(unittest.TestCase):
|
|||
cadaster = cadaster_list[0]
|
||||
self.assertEqual(cadaster.cadaster, '2302909VK4820A0001GK')
|
||||
|
||||
def test_coordinate_multiparcela_creates_cadaster_2(self):
|
||||
def test_coordinate_multiparcela_creates_cadaster(self):
|
||||
cadaster_list = ScrapperHTML.scrap_coord(-0.33, 39.47)
|
||||
self.assertTrue(len(cadaster_list) > 1)
|
||||
|
||||
|
@ -88,7 +89,7 @@ class ScrapperHTMLTests(unittest.TestCase):
|
|||
def scrap_random_until_x_times_found(self, times):
|
||||
polygon = GeoPolygon(os.path.join(config['coordinates_path'], 'spain_polygon.json'))
|
||||
coord = polygon.get_bounding_box()
|
||||
cadaster_list = ScrapperHTML.scrap_results_random_x_times(times, int(coord[0]*config['scale']), int(coord[2]*config['scale']), int(coord[1]*config['scale']), int(coord[3]*config['scale']))
|
||||
cadaster_list = CoordinatesInput.scrap_results_random_x_times(times, int(coord[0]*config['scale']), int(coord[2]*config['scale']), int(coord[1]*config['scale']), int(coord[3]*config['scale']), ScrapperHTML)
|
||||
self.assertTrue(len(cadaster_list) >= times)
|
||||
return cadaster_list
|
||||
|
||||
|
@ -119,6 +120,10 @@ class ScrapperHTMLTests(unittest.TestCase):
|
|||
polygon = GeoPolygon(os.path.join(config['coordinates_path'], 'spain_polygon.json'))
|
||||
self.assertIsNotNone(polygon.get_bounding_box())
|
||||
|
||||
def test_if_pictures_enabled_picture_is_set(self):
|
||||
cadaster_list = ScrapperHTML.scrap_cadaster('06145A00500028', pictures=True)
|
||||
self.assertIsNotNone(cadaster_list[0].picture)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
unittest.main()
|
||||
|
|
|
@ -8,142 +8,26 @@ from src.settings import config
|
|||
|
||||
class ScrapperXMLTests(unittest.TestCase):
|
||||
def test_scrapper_retrieves_dict_provinces(self):
|
||||
provinces = ScrapperXML.get_provinces()['consulta_provinciero']['provinciero']['prov']
|
||||
self.assertTrue(len(provinces) == 48)
|
||||
self.assertEqual(ScrapperXML.get_provinces()['consulta_provinciero']['control']['cuprov'], '48')
|
||||
|
||||
def test_scrapper_retrieves_dict_cities(self):
|
||||
provinces = ScrapperXML.get_provinces()['consulta_provinciero']['provinciero']['prov']
|
||||
for province in provinces:
|
||||
prov_name = province['np']
|
||||
cities = ScrapperXML.get_cities(prov_name)
|
||||
self.assertTrue(len(cities['consulta_municipiero']['municipiero']['muni']) > 0)
|
||||
self.assertEqual(ScrapperXML.get_cities('ALACANT')['consulta_municipiero']['control']['cumun'],'141')
|
||||
sleep(config['sleep_time'])
|
||||
|
||||
def test_scrapper_retrieves_dict_addresses(self):
|
||||
provinces = ScrapperXML.get_provinces()['consulta_provinciero']['provinciero']['prov']
|
||||
for province in provinces:
|
||||
prov_name = province['np']
|
||||
cities = ScrapperXML.get_cities(prov_name)['consulta_municipiero']['municipiero']['muni']
|
||||
for city in cities:
|
||||
city_name = city['nm']
|
||||
addresses = ScrapperXML.get_addresses(prov_name, city_name)
|
||||
self.assertTrue(len(addresses['consulta_callejero']['callejero']['calle']) > 0)
|
||||
return
|
||||
|
||||
def test_scrapper_retrieves_dict_properties(self):
|
||||
provinces = ScrapperXML.get_provinces()['consulta_provinciero']['provinciero']['prov']
|
||||
for province in provinces:
|
||||
prov_name = province['np']
|
||||
cities = ScrapperXML.get_cities(prov_name)['consulta_municipiero']['municipiero']['muni']
|
||||
for city in cities:
|
||||
city_name = city['nm']
|
||||
addresses = ScrapperXML.get_addresses(prov_name, city_name)['consulta_callejero']['callejero']['calle']
|
||||
for address in addresses:
|
||||
address_dir = address['dir']
|
||||
tv = address_dir['tv']
|
||||
nv = address_dir['nv']
|
||||
|
||||
num_scrapping_fails = 10
|
||||
counter = 1
|
||||
matches = 0
|
||||
while num_scrapping_fails > 0:
|
||||
cadaster = ScrapperXML.get_cadaster_by_address(prov_name, city_name, tv, nv, counter)
|
||||
if 'lerr' in cadaster['consulta_numerero'] and \
|
||||
'err' in cadaster['consulta_numerero']['lerr'] and \
|
||||
'cod' in cadaster['consulta_numerero']['lerr']['err'] and\
|
||||
cadaster['consulta_numerero']['lerr']['err']['cod'] == '43':
|
||||
num_scrapping_fails -= 1
|
||||
else:
|
||||
num_scrapping_fails = 10
|
||||
matches += 1
|
||||
|
||||
counter += 1
|
||||
sleep(5)
|
||||
|
||||
self.assertTrue(matches > 0)
|
||||
return
|
||||
self.assertEqual(ScrapperXML.get_addresses('ALACANT','AGOST')['consulta_callejero']['control']['cuca'], '117')
|
||||
sleep(config['sleep_time'])
|
||||
|
||||
def test_scrapper_creates_cadaster_entry(self):
|
||||
results = []
|
||||
provinces = ScrapperXML.get_provinces()['consulta_provinciero']['provinciero']['prov']
|
||||
for province in provinces:
|
||||
prov_name = province['np']
|
||||
cities = ScrapperXML.get_cities(prov_name)['consulta_municipiero']['municipiero']['muni']
|
||||
for city in cities:
|
||||
city_name = city['nm']
|
||||
addresses = ScrapperXML.get_addresses(prov_name, city_name)['consulta_callejero']['callejero'][
|
||||
'calle']
|
||||
for address in addresses:
|
||||
address_dir = address['dir']
|
||||
tv = address_dir['tv']
|
||||
nv = address_dir['nv']
|
||||
|
||||
num_scrapping_fails = 10
|
||||
counter = 1
|
||||
while num_scrapping_fails > 0:
|
||||
cadaster = ScrapperXML.get_cadaster_by_address(prov_name, city_name, tv, nv, counter)
|
||||
if 'lerr' in cadaster['consulta_numerero'] and \
|
||||
'err' in cadaster['consulta_numerero']['lerr'] and \
|
||||
'cod' in cadaster['consulta_numerero']['lerr']['err'] and \
|
||||
cadaster['consulta_numerero']['lerr']['err']['cod'] == '43':
|
||||
num_scrapping_fails -= 1
|
||||
else:
|
||||
num = cadaster['consulta_numerero']['numerero']['nump']['num']['pnp']
|
||||
cadaster_num = cadaster['consulta_numerero']['numerero']['nump']['pc']['pc1'] + \
|
||||
cadaster['consulta_numerero']['numerero']['nump']['pc']['pc2']
|
||||
|
||||
coords = ScrapperXML.get_coords_from_cadaster(prov_name, city_name, cadaster_num)
|
||||
lon = coords['consulta_coordenadas']['coordenadas']['coord']['geo']['xcen']
|
||||
lat = coords['consulta_coordenadas']['coordenadas']['coord']['geo']['ycen']
|
||||
num_scrapping_fails = 10
|
||||
|
||||
entry = ScrapperXML.get_cadaster_entries_by_address(prov_name, city_name, tv, nv, num)
|
||||
cadaster_entry = CadasterEntryXML(entry, lon, lat)
|
||||
results.append(cadaster_entry)
|
||||
|
||||
counter += 1
|
||||
sleep(5)
|
||||
|
||||
self.assertTrue(len(results) > 1)
|
||||
return
|
||||
print(ScrapperXML.get_cadaster_entries_by_cadaster('','', '6375620YH0567S0001GW'))
|
||||
sleep(config['sleep_time'])
|
||||
|
||||
def test_scrapper_creates_cadaster_entry_and_stores_in_elasticsearch(self):
|
||||
provinces = ScrapperXML.get_provinces()['consulta_provinciero']['provinciero']['prov']
|
||||
for province in provinces:
|
||||
prov_name = province['np']
|
||||
cities = ScrapperXML.get_cities(prov_name)['consulta_municipiero']['municipiero']['muni']
|
||||
for city in cities:
|
||||
city_name = city['nm']
|
||||
addresses = ScrapperXML.get_addresses(prov_name, city_name)['consulta_callejero']['callejero'][
|
||||
'calle']
|
||||
for address in addresses:
|
||||
address_dir = address['dir']
|
||||
tv = address_dir['tv']
|
||||
nv = address_dir['nv']
|
||||
|
||||
num_scrapping_fails = 10
|
||||
counter = 1
|
||||
while num_scrapping_fails > 0:
|
||||
cadaster = ScrapperXML.get_cadaster_by_address(prov_name, city_name, tv, nv, counter)
|
||||
if 'lerr' in cadaster['consulta_numerero'] and \
|
||||
'err' in cadaster['consulta_numerero']['lerr'] and \
|
||||
'cod' in cadaster['consulta_numerero']['lerr']['err'] and \
|
||||
cadaster['consulta_numerero']['lerr']['err']['cod'] == '43':
|
||||
num_scrapping_fails -= 1
|
||||
else:
|
||||
num = cadaster['consulta_numerero']['numerero']['nump']['num']['pnp']
|
||||
cadaster_num = cadaster['consulta_numerero']['numerero']['nump']['pc']['pc1'] + \
|
||||
cadaster['consulta_numerero']['numerero']['nump']['pc']['pc2']
|
||||
|
||||
coords = ScrapperXML.get_coords_from_cadaster(prov_name, city_name, cadaster_num)
|
||||
lon = coords['consulta_coordenadas']['coordenadas']['coord']['geo']['xcen']
|
||||
lat = coords['consulta_coordenadas']['coordenadas']['coord']['geo']['ycen']
|
||||
|
||||
entry = ScrapperXML.get_cadaster_entries_by_address(prov_name, city_name, tv, nv, num)
|
||||
cadaster_entry = CadasterEntryXML(entry, lon, lat)
|
||||
cadaster_entry.to_elasticsearch()
|
||||
|
||||
self.assertIsNotNone(cadaster_entry.from_elasticsearch())
|
||||
return
|
||||
entry = ScrapperXML.get_cadaster_entries_by_cadaster('', '', '6375620YH0567S0001GW')
|
||||
cadaster_entry = CadasterEntryXML(entry, None, None)
|
||||
cadaster_entry.to_elasticsearch()
|
||||
self.assertIsNotNone(cadaster_entry.from_elasticsearch())
|
||||
sleep(config['sleep_time'])
|
||||
|
||||
def test_multiparcela_creates_n_entries_in_elasticsearch(self):
|
||||
prov_name = u'A CORUÑA'
|
||||
|
|
|
@ -27,8 +27,23 @@ class ElasticSearchUtils:
|
|||
"location": {
|
||||
"type": "geo_point"
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"dynamic_templates": [
|
||||
{
|
||||
"strings": {
|
||||
"match_mapping_type": "string",
|
||||
"mapping": {
|
||||
"type": "text",
|
||||
"fields": {
|
||||
"keyword": {
|
||||
"type": "keyword"
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
]
|
||||
},
|
||||
}
|
||||
}
|
||||
logger.debug("Creating 'cadaster' index...")
|
||||
|
|
Loading…
Reference in New Issue
Block a user