Adds XML multiparcela. Fixes several bugs.

This commit is contained in:
J 2019-09-20 19:15:32 +02:00
parent ee90545bb6
commit d5b280f6eb
8 changed files with 431 additions and 243 deletions

View File

@ -34,5 +34,4 @@ if __name__ == "__main__":
if args.coords:
CoordinatesInput.scrap_coordinates(scrapper, filenames, pictures)
else:
print(pictures)
ProvincesInput.scrap_provinces(scrapper, provinces, pictures)

View File

@ -1,3 +1,4 @@
dotmap
shapely
beautifulsoup4==4.8.0
elasticsearch>=6.0.0,<7.0.0

View File

@ -26,9 +26,10 @@ class CadasterEntry:
self.constructions = cadaster_entry.constructions
self.picture = cadaster_entry.picture
self.timestamp = cadaster_entry.timestamp
logger.debug(self.to_json_recursive())
def to_json(self):
return dict(address=self.address, cadaster=self.cadaster, type=self.type, use=self.use, surface=self.surface, year=self.year, location=self.location, gsurface=self.gsurface, constructions=self.constructions, picture=str(self.picture), timestamp=self.timestamp)
return dict(address=self.address, cadaster=self.cadaster, type=self.type, use=self.use, surface=self.surface, year=self.year, location=self.location, gsurface=self.gsurface, constructions=self.constructions, picture=str(self.picture) if self.picture is not None else None, timestamp=self.timestamp)
def to_json_recursive(self):
return json.dumps(self.to_json(), cls=JSONEncoder, sort_keys=True,
@ -43,8 +44,9 @@ class CadasterEntry:
res = es.index(index=config['elasticsearch-index'], doc_type='cadaster_doc', id=self.cadaster, body=body)
except Exception as e:
logger.error(e)
finally:
es.transport.close()
es.transport.close()
return res
def from_elasticsearch(self):
@ -55,7 +57,7 @@ class CadasterEntry:
res = es.search(index=config['elasticsearch-index'], body=query)
except Exception as e:
logger.error(e)
finally:
es.transport.close()
es.transport.close()
return res

View File

@ -1,6 +1,8 @@
import json
from datetime import datetime
from dotmap import DotMap
from src.librecatastro.domain.address import Address
from src.librecatastro.domain.cadaster_entry.cadaster_entry import CadasterEntry
from src.librecatastro.domain.construction import Construction
@ -14,33 +16,48 @@ logger = CadastroLogger(__name__).logger
class CadasterEntryXML(CadasterEntry):
"""Cadaster class, that stores all the information about a surface and its properties"""
def __init__(self, xml, lon, lat):
def __init__(self, xml, lon=None, lat=None, picture=None):
self.address = None
if xml.consulta_dnp.bico.bi.ldt != DotMap():
self.address = Address(xml.consulta_dnp.bico.bi.ldt)
self.address = Address(xml['consulta_dnp']['bico']['bi']['ldt'])
self.cadaster = xml.consulta_dnp.bico.bi.idbi.rc.pc1 if xml.consulta_dnp.bico.bi.idbi.rc.pc1 != DotMap() else ''
self.cadaster += xml.consulta_dnp.bico.bi.idbi.rc.pc2 if xml.consulta_dnp.bico.bi.idbi.rc.pc2 != DotMap() else ''
self.cadaster += xml.consulta_dnp.bico.bi.idbi.rc.car if xml.consulta_dnp.bico.bi.idbi.rc.car != DotMap() else ''
self.cadaster += xml.consulta_dnp.bico.bi.idbi.rc.cc1 if xml.consulta_dnp.bico.bi.idbi.rc.cc1 != DotMap() else ''
self.cadaster += xml.consulta_dnp.bico.bi.idbi.rc.cc2 if xml.consulta_dnp.bico.bi.idbi.rc.cc2 != DotMap() else ''
self.cadaster = xml['consulta_dnp']['bico']['bi']['idbi']['rc']['pc1'] + \
xml['consulta_dnp']['bico']['bi']['idbi']['rc']['pc2'] + \
xml['consulta_dnp']['bico']['bi']['idbi']['rc']['car'] + \
xml['consulta_dnp']['bico']['bi']['idbi']['rc']['cc1'] + \
xml['consulta_dnp']['bico']['bi']['idbi']['rc']['cc2']
self.year = None
if xml.consulta_dnp.bico.bi.debi is not None:
self.year = xml.consulta_dnp.bico.bi.debi.ant
if self.year == DotMap():
self.year = None
self.year = xml['consulta_dnp']['bico']['bi']['debi']['ant'] \
if 'debi' in xml['consulta_dnp']['bico']['bi'] and\
'ant' in xml['consulta_dnp']['bico']['bi']['debi'] else None
self.type = xml.consulta_dnp.bico.bi.idbi.cn
if self.type != DotMap() and self.type == 'UR':
self.type = u'Urbano'
else:
self.type = u'Rústico'
self.type = xml['consulta_dnp']['bico']['bi']['idbi']['cn'] if 'cn' in xml['consulta_dnp']['bico']['bi']['idbi'] else None
if self.type is not None:
self.type = u'Urbano' if self.type == 'UR' else u'Rústico'
self.use = None
if xml.consulta_dnp.bico.bi.debi is not None:
self.use = xml.consulta_dnp.bico.bi.debi.luso
if self.use == DotMap():
self.use = None
self.surface = None
if xml.consulta_dnp.bico.bi.debi is not None:
self.surface = xml.consulta_dnp.bico.bi.debi.sfc + 'm2'
if self.surface == DotMap():
self.surface = None
self.use = xml['consulta_dnp']['bico']['bi']['debi']['luso'] if 'luso' in xml['consulta_dnp']['bico']['bi']['debi'] else None
self.surface = xml['consulta_dnp']['bico']['bi']['debi']['sfc'] + 'm2' if 'sfc' in xml['consulta_dnp']['bico']['bi']['debi'] else None
self.location = Location(lon, lat)
self.gsurface = config['not_available_via_XML']
self.constructions = []
constructions = []
if 'lcons' in xml['consulta_dnp']['bico']:
constructions = xml['consulta_dnp']['bico']['lcons']['cons']
if xml.consulta_dnp.bico.lcons.cons != DotMap():
constructions = xml.consulta_dnp.bico.lcons.cons
''' Bad XML design: when there is a single element, the service returns
the element itself instead of a one-element list '''
@ -48,11 +65,26 @@ class CadasterEntryXML(CadasterEntry):
constructions = [constructions]
for construction in constructions:
use = construction['lcd'] if 'lcd' in construction else None
doorway = construction['dt']['lourb']['loint']['es'] if 'dt' in construction else None
floor = construction['dt']['lourb']['loint']['pt'] if 'dt' in construction else None
door = construction['dt']['lourb']['loint']['pu'] if 'dt' in construction else None
surface = construction['dfcons']['stl'] if 'dfcons' in construction and 'stl' in construction['dfcons'] else None
use = construction.lcd
if use == DotMap():
use = None
doorway = construction.dt.lourb.loint.es
if doorway == DotMap():
doorway = None
floor = construction.dt.lourb.loint.pt
if floor == DotMap():
floor = None
door = construction.dt.lourb.loint.pu
if door == DotMap():
door = None
surface = construction.dfcons.stl
if surface == DotMap():
surface = None
reform_type = config['not_available_via_XML']
reform_date = config['not_available_via_XML']
@ -60,5 +92,6 @@ class CadasterEntryXML(CadasterEntry):
dict(uso=use, escalera=doorway, planta=floor, puerta=door, superficie=surface, tipo=reform_type,
fecha=reform_date)))
self.picture = picture
self.timestamp = str(datetime.now())
super().__init__(self)
super().__init__(self)

View File

@ -5,6 +5,7 @@ from time import sleep
from urllib.request import urlopen
from xml.etree import ElementTree
from bs4 import BeautifulSoup
from dotmap import DotMap
from src.librecatastro.domain.cadaster_entry.cadaster_entry_html import CadasterEntryHTML
from src.librecatastro.scrapping.scrapper import Scrapper
@ -12,7 +13,6 @@ from src.settings import config
from src.utils.cadastro_logger import CadastroLogger
'''Logger'''
logger = CadastroLogger(__name__).logger
@ -30,15 +30,17 @@ class ScrapperHTML(Scrapper):
URL_REF_FULL = "https://www1.sedecatastro.gob.es/CYCBienInmueble/OVCConCiud.aspx?RefC={}&RCCompleta={}&del={}&mun={}"
'''Information to scrap from HTML'''
description_field_names = [u'Referencia catastral', u'Localización', u'Clase', u'Uso principal', u'Superficie construida', u'Año construcción']
description_field_names = [u'Referencia catastral', u'Localización', u'Clase', u'Uso principal',
u'Superficie construida', u'Año construcción']
gsurface_field_names = [u'Superficie gráfica']
""" Scrapping calls """
@classmethod
def scrap_coord(cls, x, y, pictures=False):
logger.debug("====Longitude: {} Latitude: {}====".format(x, y))
url = cls.URL.format(x, y)
logger.debug("[||| ] URL for coordinates: {}".format(url))
logger.debug("URL for coordinates: {}".format(url))
f = urlopen(url)
data = f.read()
root = ElementTree.fromstring(data)
@ -46,100 +48,99 @@ class ScrapperHTML(Scrapper):
"{http://www.catastro.meh.es/}coordenadas//{http://www.catastro.meh.es/}coord//{http://www.catastro.meh.es/}pc//{http://www.catastro.meh.es/}pc1")
pc2 = root.find(
"{http://www.catastro.meh.es/}coordenadas//{http://www.catastro.meh.es/}coord//{http://www.catastro.meh.es/}pc//{http://www.catastro.meh.es/}pc2")
if pc1 is None or pc2 is None:
return []
else:
logger.debug("||||| ] FOUND!")
results = []
if pc1 is not None and pc2 is not None:
cadaster = ''.join([pc1.text, pc2.text])
cadaster_entries = cls.scrap_cadaster(cadaster, None, None, x, y, pictures)
for cadaster_entry in cadaster_entries:
cadaster_entry.to_elasticsearch()
return cadaster_entries
results.append(cadaster_entry)
return results
@classmethod
def scrap_provinces(cls, prov_list, pictures=False):
"""Scraps properties by addresses"""
provinces = cls.get_provinces()['consulta_provinciero']['provinciero']['prov']
for province in provinces:
prov_name = province['np']
prov_num = province['cpine']
for prov_name, prov_num, city_name, city_num, address, tv, nv in cls.get_address_iter(prov_list):
if len(prov_list) > 0 and prov_name not in prov_list:
if tv == DotMap() or nv == DotMap():
continue
cities = cls.get_cities(prov_name)['consulta_municipiero']['municipiero']['muni']
for city in cities:
city_name = city['nm']
city_num = city['locat']['cmc']
addresses = (cls.get_addresses(prov_name, city_name)['consulta_callejero']['callejero'][
'calle'])
num_scrapping_fails = 10
counter = 1
while num_scrapping_fails > 0:
try:
numerero_map = cls.get_cadaster_by_address(prov_name, city_name, tv, nv, counter)
if numerero_map.consulta_numerero.lerr.err.cod != DotMap():
num_scrapping_fails -= 1
else:
for address in addresses:
address_dir = address['dir']
tv = address_dir['tv']
nv = address_dir['nv']
numps = numerero_map.consulta_numerero.numerero.nump
num_scrapping_fails = 10
counter = 1
while num_scrapping_fails > 0:
try:
cadaster = cls.get_cadaster_by_address(prov_name, city_name, tv, nv, counter)
if 'lerr' in cadaster['consulta_numerero'] and \
'err' in cadaster['consulta_numerero']['lerr'] and \
'cod' in cadaster['consulta_numerero']['lerr']['err'] and \
cadaster['consulta_numerero']['lerr']['err']['cod'] == '43':
num_scrapping_fails -= 1
else:
logger.debug("||||| ] FOUND!")
numps = cadaster['consulta_numerero']['numerero']['nump']
if not isinstance(numps, list):
numps = [numps]
if not isinstance(numps, list):
numps = [numps]
for nump in numps:
if nump.num.pnp == DotMap():
continue
for nump in numps:
num = nump['num']['pnp']
cadaster_num = nump['pc']['pc1'] + nump['pc']['pc2']
num = nump.num.pnp
coords = cls.get_coords_from_cadaster(prov_name, city_name,cadaster_num)
lon = coords['consulta_coordenadas']['coordenadas']['coord']['geo']['xcen']
lat = coords['consulta_coordenadas']['coordenadas']['coord']['geo']['ycen']
if nump.pc == DotMap():
continue
''' Adding to tracking file'''
logger.info('{},{}'.format(lon, lat))
if nump.pc.pc1 == DotMap() or nump.pc.pc2 == DotMap():
continue
num_scrapping_fails = 10
cadaster_num = nump.pc.pc1 + nump.pc.pc2
cadaster_list = cls.scrap_cadaster(cadaster_num, prov_num, city_num, lon, lat, pictures)
coords_map = cls.get_coords_from_cadaster(prov_name, city_name, cadaster_num)
for cadaster in cadaster_list:
cadaster.to_elasticsearch()
lon = coords_map.consulta_coordenadas.coordenadas.coord.geo.xcen
if lon == DotMap():
lon = None
counter += 1
sleep(config['sleep_time'])
lat = coords_map.consulta_coordenadas.coordenadas.coord.geo.ycen
if lat == DotMap():
lat = None
except urllib.error.HTTPError as e:
logger.error(
"ERROR AT ADDRESS {} {} {} {} {}".format(tv, nv, num, prov_name, city_name))
logger.error("=============================================")
logger.error(e, exc_info=True)
logger.error("...sleeping...")
logger.error("=============================================")
''' Could be a Service Unavailable or a denial of service '''
num_scrapping_fails -= 1
sleep(config['sleep_dos_time'])
''' Adding to tracking file'''
logger.info('{},{}'.format(lon, lat))
except Exception as e:
logger.error(
"ERROR AT ADDRESS {} {} {} {} {}".format(tv, nv, num, prov_name, city_name))
logger.error("=============================================")
logger.error(e, exc_info=True)
logger.error("=============================================")
num_scrapping_fails -= 1
num_scrapping_fails = 10
cadaster_list = cls.scrap_cadaster(cadaster_num, prov_num, city_num, lon, lat, pictures)
for cadaster in cadaster_list:
cadaster.to_elasticsearch()
counter += 1
sleep(config['sleep_time'])
except urllib.error.HTTPError as e:
logger.error(
"ERROR AT ADDRESS {} {} {} {} {}".format(tv, nv, num, prov_name, city_name))
logger.error("=============================================")
logger.error(e, exc_info=True)
logger.error("...sleeping...")
logger.error("=============================================")
''' Could be a Service Unavailable or a denial of service '''
num_scrapping_fails -= 1
sleep(config['sleep_dos_time'])
except Exception as e:
logger.error(
"ERROR AT ADDRESS {} {} {} {} {}".format(tv, nv, num, prov_name, city_name))
logger.error("=============================================")
logger.error(e, exc_info=True)
logger.error("=============================================")
num_scrapping_fails -= 1
counter += 1
sleep(config['sleep_time'])
@classmethod
def scrap_cadaster_full_code(cls, full_cadaster, delimitacion, municipio, x=None, y=None, picture=None):
url_ref = cls.URL_REF_FULL.format(full_cadaster, full_cadaster, delimitacion, municipio)
@ -150,13 +151,14 @@ class ScrapperHTML(Scrapper):
parsed_html = BeautifulSoup(html, features="html.parser")
return ScrapperHTML.parse_html_parcela(parsed_html, x, y, picture)
@classmethod
def scrap_cadaster(cls, cadaster, delimitacion=None, municipio=None, x=None, y=None, pictures=False):
rc_1 = cadaster[0:7]
rc_2 = cadaster[7:14]
url_ref = cls.URL_REF.format(rc_1, rc_2)
logger.debug("[|||||||| ] URL for cadastral data: {}".format(url_ref))
logger.debug("URL for cadastral data: {}".format(url_ref))
f_ref = urlopen(url_ref)
data_ref = f_ref.read()
@ -191,7 +193,8 @@ class ScrapperHTML(Scrapper):
partial_cadaster_ref = partial_cadaster.find("b")
logger.debug("-->Partial cadaster: {}".format(partial_cadaster_ref.text))
partial_cadaster_text = partial_cadaster_ref.text.strip()
cadaster = ScrapperHTML.scrap_cadaster_full_code(partial_cadaster_text, delimitacion, municipio, x, y, picture)
cadaster = ScrapperHTML.scrap_cadaster_full_code(partial_cadaster_text, delimitacion, municipio, x, y,
picture)
cadasters.append(cadaster)
sleep(config['sleep_time'])
@ -200,12 +203,10 @@ class ScrapperHTML(Scrapper):
cadasters.append(cadaster)
logger.debug("[|||||||||||] SUCCESS!")
sleep(config['sleep_time'])
return cadasters
""" Parsing """
@classmethod
def parse_html_parcela(cls, parsed_html, x=None, y=None, picture=None):
description = parsed_html.find(id='ctl00_Contenido_tblInmueble')
@ -229,7 +230,7 @@ class ScrapperHTML(Scrapper):
descriptive_data[field_name] = descriptive_data[field_name].split(' ')[0]
descriptive_data[field_name] = descriptive_data[field_name].split('\xa0')[0]
elif field_header.text == u'Localización':
descriptive_data[field_name] = field_value.encode_contents().decode('utf-8').replace('<br/>', config['separator']).replace('<br>', config['separator'])
descriptive_data[field_name] = field_value.encode_contents().decode('utf-8').replace('<br/>',config['separator']).replace('<br>', config['separator'])
'''Graphical Surface'''
fields = parsed_html.find(id='ctl00_Contenido_tblFinca').find_all('div')
@ -253,7 +254,9 @@ class ScrapperHTML(Scrapper):
continue
columns = construction.find_all('span')
descriptive_data[u'Construcciones'].append(dict(uso=columns[0].text, escalera=columns[1].text, planta=columns[2].text, puerta=columns[3].text, superficie=columns[4].text, tipo=columns[5].text, fecha=columns[6].text))
descriptive_data[u'Construcciones'].append(
dict(uso=columns[0].text, escalera=columns[1].text, planta=columns[2].text, puerta=columns[3].text,
superficie=columns[4].text, tipo=columns[5].text, fecha=columns[6].text))
cadaster_entry = CadasterEntryHTML(descriptive_data)
return cadaster_entry

View File

@ -1,4 +1,3 @@
import json
import urllib.parse
from urllib import error
@ -12,6 +11,8 @@ from src.librecatastro.scrapping.scrapper import Scrapper
from src.settings import config
from src.utils.cadastro_logger import CadastroLogger
from dotmap import DotMap
'''Logger'''
logger = CadastroLogger(__name__).logger
@ -20,133 +21,191 @@ class ScrapperXML(Scrapper):
"""Scrapper class for Catastro XML"""
def __init__(self):
pass
super().__init__()
""" Scrapping main calls """
@classmethod
def scrap_coord(cls, x, y, pictures=False):
"""Scraps properties by coordinates"""
results = []
params = {'SRS': 'EPSG:4230', 'Coordenada_X': x, 'Coordenada_Y': y}
url = cls.URL_LOCATIONS_BASE.format("/OVCCoordenadas.asmx/Consulta_RCCOOR")
response = requests.get(url, params=params)
logger.debug("====Longitude: {} Latitude: {}====".format(x, y))
logger.debug("[||| ] URL for coordinates: {}".format(url + '?' + urllib.parse.urlencode(params)))
logger.debug("URL for coordinates: {}".format(url + '?' + urllib.parse.urlencode(params)))
xml = response.content
xml_dict = xmltodict.parse(xml, process_namespaces=False, xml_attribs=False)
xml_dict_map = DotMap(xmltodict.parse(response.content, process_namespaces=False, xml_attribs=False))
pc1 = None
pc2 = None
if 'coordenadas' in xml_dict['consulta_coordenadas'] and 'coord' in xml_dict['consulta_coordenadas']['coordenadas']:
pc1 = xml_dict['consulta_coordenadas']['coordenadas']['coord']['pc']['pc1'] if 'pc' in xml_dict['consulta_coordenadas']['coordenadas']['coord'] else None
pc2 = xml_dict['consulta_coordenadas']['coordenadas']['coord']['pc']['pc2'] if 'pc' in xml_dict['consulta_coordenadas']['coordenadas']['coord'] else None
if pc1 is not None and pc2 is not None:
logger.debug("||||| ] FOUND!")
if xml_dict_map.consulta_coordenadas.coordenadas.coord.pc != DotMap():
pc1 = xml_dict_map.consulta_coordenadas.coordenadas.coord.pc.pc1
if pc1 == DotMap():
pc1 = None
entry = cls.get_cadaster_entries_by_cadaster('', '', ''.join([pc1,pc2]))
cadaster_entry = CadasterEntryXML(entry, x, y)
cadaster_entry.to_elasticsearch()
logger.debug("[|||||||||||] SUCCESS!")
sleep(config['sleep_time'])
pc2 = xml_dict_map.consulta_coordenadas.coordenadas.coord.pc.pc2
if pc2 == DotMap():
pc2 = None
if pc1 is not None and pc2 is not None:
entry = cls.get_cadaster_entries_by_cadaster('', '', ''.join([pc1, pc2]))
picture = None
if entry.consulta_dnp.bico.bi.dt.loine != DotMap():
# Parcela
if pictures:
prov_num = entry.consulta_dnp.bico.bi.dt.loine.cp
city_num = entry.consulta_dnp.bico.bi.dt.cmc
if prov_num != DotMap() and city_num != DotMap():
picture = cls.scrap_site_picture(prov_num, city_num, ''.join([pc1, pc2]))
cadaster_entry = CadasterEntryXML.create_from_bico(entry, x, y, picture)
cadaster_entry.to_elasticsearch()
sleep(config['sleep_time'])
results.append(cadaster_entry)
elif entry.consulta_dnp.lrcdnp.rcdnp != DotMap():
# Multiparcela
parcelas = entry.consulta_dnp.lrcdnp.rcdnp
if not isinstance(parcelas, list):
parcelas = [parcelas]
for parcela in parcelas:
if pictures:
prov_num = parcela.dt.loine.cp
city_num = parcela.dt.cmc
if prov_num != DotMap() and city_num != DotMap():
picture = cls.scrap_site_picture(prov_num, city_num, ''.join([pc1, pc2]))
cadaster = parcela.rc.pc1 if parcela.rc.pc1 != DotMap() else ''
cadaster += parcela.rc.pc2 if parcela.rc.pc2 != DotMap() else ''
cadaster += parcela.rc.car if parcela.rc.car != DotMap() else ''
cadaster += parcela.rc.cc1 if parcela.rc.cc1 != DotMap() else ''
cadaster += parcela.rc.cc2 if parcela.rc.cc2 != DotMap() else ''
parcela = cls.get_cadaster_entries_by_cadaster('', '', cadaster)
cadaster_entry = CadasterEntryXML(parcela, x, y, picture)
cadaster_entry.to_elasticsearch()
results.append(cadaster_entry)
sleep(config['sleep_time'])
return results
@classmethod
def scrap_provinces(cls, prov_list, pictures=False):
"""Scraps properties by addresses"""
provinces = ScrapperXML.get_provinces()['consulta_provinciero']['provinciero']['prov']
for province in provinces:
prov_name = province['np']
if len(prov_list) > 0 and prov_name not in prov_list:
for prov_name, prov_num, city_name, city_num, address, tv, nv in cls.get_address_iter(prov_list):
if tv == DotMap() or nv == DotMap():
continue
cities = ScrapperXML.get_cities(prov_name)['consulta_municipiero']['municipiero']['muni']
for city in cities:
city_name = city['nm']
addresses = ScrapperXML.get_addresses(prov_name, city_name)['consulta_callejero']['callejero'][
'calle']
for address in addresses:
address_dir = address['dir']
tv = address_dir['tv']
nv = address_dir['nv']
num_scrapping_fails = 10
counter = 1
while num_scrapping_fails > 0:
try:
cadaster = cls.get_cadaster_by_address(prov_name, city_name, tv, nv, counter)
res = cls.process_xml_by_address(cadaster, prov_name, city_name, tv, nv)
if len(res) < 1:
num_scrapping_fails -= 1
else:
num_scrapping_fails = 10
sleep(config['sleep_time'])
num_scrapping_fails = 10
counter = 1
while num_scrapping_fails > 0:
try:
cadaster = cls.get_cadaster_by_address(prov_name, city_name, tv, nv, counter)
if 'lerr' in cadaster['consulta_numerero'] and \
'err' in cadaster['consulta_numerero']['lerr'] and \
'cod' in cadaster['consulta_numerero']['lerr']['err'] and \
cadaster['consulta_numerero']['lerr']['err']['cod'] == '43':
num_scrapping_fails -= 1
else:
logger.debug("||||| ] FOUND!")
except urllib.error.HTTPError as e:
logger.error(
"ERROR AT ADDRESS {} {} {} {} {}".format(tv, nv, counter, prov_name, city_name))
logger.error("=============================================")
logger.error(e, exc_info=True)
logger.error("...sleeping...")
logger.error("=============================================")
''' Could be a Service Unavailable or a denial of service '''
num_scrapping_fails -= 1
sleep(config['sleep_dos_time'])
numps = cadaster['consulta_numerero']['numerero']['nump']
except Exception as e:
logger.error(
"ERROR AT ADDRESS {} {} {} {} {}".format(tv, nv, counter, prov_name, city_name))
logger.error("=============================================")
logger.error(e, exc_info=True)
logger.error("=============================================")
num_scrapping_fails -= 1
if not isinstance(numps, list):
numps = [numps]
counter += 1
sleep(config['sleep_time'])
for nump in numps:
num = nump['num']['pnp']
cadaster_num = nump['pc']['pc1'] + nump['pc']['pc2']
@classmethod
def process_xml_by_address(cls, numerero_map, prov_name, city_name, tv, nv):
results = []
if numerero_map.consulta_numerero.lerr.err.cod != DotMap():
return results
coords = cls.get_coords_from_cadaster(prov_name, city_name,
cadaster_num)
lon = coords['consulta_coordenadas']['coordenadas']['coord']['geo']['xcen']
lat = coords['consulta_coordenadas']['coordenadas']['coord']['geo']['ycen']
numps = numerero_map.consulta_numerero.numerero.nump
''' Adding to tracking file'''
logger.info('{},{}'.format(lon, lat))
if not isinstance(numps, list):
numps = [numps]
num_scrapping_fails = 10
for nump in numps:
if nump.num.pnp == DotMap():
continue
entry = cls.get_cadaster_entries_by_address(prov_name, city_name, tv,
nv, num)
num = nump.num.pnp
if 'bico' in entry['consulta_dnp']:
# Parcela
cadaster_entry = CadasterEntryXML(entry, lon, lat)
cadaster_entry.to_elasticsearch()
elif 'lrcdnp' in entry['consulta_dnp']:
# Multiparcela
for site in entry['consulta_dnp']['lrcdnp']['rcdnp']:
cadaster = site['rc']['pc1'] + \
site['rc']['pc2'] + \
site['rc']['car'] + \
site['rc']['cc1'] + \
site['rc']['cc2']
sub_entry = cls.get_cadaster_entries_by_cadaster(prov_name,
city_name,
cadaster)
cadaster_entry = CadasterEntryXML(sub_entry, lon, lat)
cadaster_entry.to_elasticsearch()
sleep(config['sleep_time'])
if nump.pc == DotMap():
continue
logger.debug("[|||||||||||] SUCCESS!")
sleep(config['sleep_time'])
if nump.pc.pc1 == DotMap() or nump.pc.pc2 == DotMap():
continue
except urllib.error.HTTPError as e:
logger.error(
"ERROR AT ADDRESS {} {} {} {} {}".format(tv, nv, num, prov_name, city_name))
logger.error("=============================================")
logger.error(e, exc_info=True)
logger.error("...sleeping...")
logger.error("=============================================")
''' Could be a Service Unavailable or a denial of service '''
num_scrapping_fails -= 1
sleep(config['sleep_dos_time'])
cadaster_num = nump.pc.pc1 + nump.pc.pc2
except Exception as e:
logger.error(
"ERROR AT ADDRESS {} {} {} {} {}".format(tv, nv, num, prov_name, city_name))
logger.error("=============================================")
logger.error(e, exc_info=True)
logger.error("=============================================")
num_scrapping_fails -= 1
coords_map = cls.get_coords_from_cadaster(prov_name, city_name, cadaster_num)
counter += 1
sleep(config['sleep_time'])
lon = coords_map.consulta_coordenadas.coordenadas.coord.geo.xcen
if lon == DotMap():
lon = None
lat = coords_map.consulta_coordenadas.coordenadas.coord.geo.ycen
if lat == DotMap():
lat = None
''' Adding to tracking file'''
logger.info('{},{}'.format(lon, lat))
entry_map = cls.get_cadaster_entries_by_address(prov_name, city_name, tv, nv, num)
picture = None
if entry_map.consulta_dnp.bico != DotMap():
prov_num = entry_map.consulta_dnp.bico.bi.dt.loine.cp
city_num = entry_map.consulta_dnp.bico.bi.dt.loine.cm
if prov_num != DotMap() and city_num != DotMap():
picture = cls.scrap_site_picture(prov_num, city_num, cadaster_num)
# Parcela
cadaster_entry = CadasterEntryXML(entry_map, lon, lat, picture)
results.append(cadaster_entry)
cadaster_entry.to_elasticsearch()
sleep(config['sleep_time'])
elif entry_map.consulta_dnp.lrcdnp.rcdnp != DotMap():
# Multiparcela
for site in entry_map.consulta_dnp.lrcdnp.rcdnp:
site_map = DotMap(site)
if site_map.rc == DotMap():
continue
cadaster = site_map.rc.pc1 + site_map.rc.pc2 + site_map.rc.car + site_map.rc.cc1 + site_map.rc.cc2
sub_entry = cls.get_cadaster_entries_by_cadaster(prov_name, city_name, cadaster)
prov_num = entry_map.consulta_dnp.lrcdnp.rcdnp.loine.cp
city_num = entry_map.consulta_dnp.lrcdnp.rcdnp.loine.cm
if prov_num != DotMap() and city_num != DotMap():
picture = cls.scrap_site_picture(prov_num, city_num, cadaster)
cadaster_entry = CadasterEntryXML(sub_entry, lon, lat, picture)
results.append(cadaster_entry)
cadaster_entry.to_elasticsearch()
sleep(config['sleep_time'])
return results

View File

@ -4,6 +4,7 @@ from urllib.request import urlopen
import requests
import xmltodict
from dotmap import DotMap
from src.settings import config
from src.utils.cadastro_logger import CadastroLogger
@ -36,7 +37,7 @@ class Scrapper:
url = cls.URL_LOCATIONS_BASE.format("/OVCCallejero.asmx/ConsultaProvincia")
response = requests.get(url)
xml = response.content
return xmltodict.parse(xml, process_namespaces=False, xml_attribs=False)
return DotMap(xmltodict.parse(xml, process_namespaces=False, xml_attribs=False))
@classmethod
def get_cities(cls, provincia, municipio=None):
@ -48,7 +49,7 @@ class Scrapper:
url = cls.URL_LOCATIONS_BASE.format("/OVCCallejero.asmx/ConsultaMunicipio")
response = requests.get(url, params=params)
xml = response.content
return xmltodict.parse(xml, process_namespaces=False, xml_attribs=False)
return DotMap(xmltodict.parse(xml, process_namespaces=False, xml_attribs=False))
@classmethod
def get_addresses(cls, provincia, municipio, tipovia=None, nombrevia=None):
@ -66,7 +67,56 @@ class Scrapper:
url = cls.URL_LOCATIONS_BASE.format("/OVCCallejero.asmx/ConsultaVia")
response = requests.get(url, params=params)
xml = response.content
return xmltodict.parse(xml, process_namespaces=False, xml_attribs=False)
return DotMap(xmltodict.parse(xml, process_namespaces=False, xml_attribs=False))
@classmethod
def get_address_iter(cls, prov_list=None):
"""Scraps properties by addresses"""
if prov_list is None:
prov_list = []
provinces = cls.get_provinces().consulta_provinciero.provinciero.prov
if provinces == DotMap():
logger.error("No provinces available right now (Service is down?)")
yield None
for province in provinces:
prov_name = province.np
prov_num = province.cpine
if prov_name == DotMap() or prov_num == DotMap():
continue
if len(prov_list) > 0 and prov_name not in prov_list:
continue
cities = cls.get_cities(prov_name).consulta_municipiero.municipiero.muni
if cities == DotMap():
logger.error("No cities available right now (Service is down?)")
return
for city in cities:
city_name = city.nm
city_num = city.locat.cmc
if city_name == DotMap() or city_num == DotMap():
continue
addresses = cls.get_addresses(prov_name, city_name).consulta_callejero.callejero.calle
if addresses == DotMap():
logger.error("No addresses available right now (Service is down?)")
return
for address in addresses:
address_dir = address.dir
tv = address_dir.tv
nv = address_dir.nv
if tv == DotMap() or nv == DotMap():
continue
else:
yield (prov_name, prov_num, city_name, city_num, address_dir, tv, nv)
@classmethod
def get_cadaster_by_address(cls, provincia, municipio, tipovia, nombrevia, numero):
@ -79,11 +129,11 @@ class Scrapper:
url = cls.URL_LOCATIONS_BASE.format("/OVCCallejero.asmx/ConsultaNumero")
logger.debug("====Dir: {} {} {} {} {}====".format(tipovia, nombrevia, numero, municipio, provincia))
logger.debug("[||| ] URL for address: {}".format(url + '?' + urllib.parse.urlencode(params)))
logger.debug("URL for address: {}".format(url + '?' + urllib.parse.urlencode(params)))
response = requests.get(url, params=params)
xml = response.content
return xmltodict.parse(xml, process_namespaces=False, xml_attribs=False)
return DotMap(xmltodict.parse(xml, process_namespaces=False, xml_attribs=False))
@classmethod
def get_cadaster_entries_by_address(cls, provincia, municipio, sigla, calle, numero, bloque=None, escalera=None,
@ -111,11 +161,11 @@ class Scrapper:
params['Puerta'] = ''
url = cls.URL_LOCATIONS_BASE.format("/OVCCallejero.asmx/Consulta_DNPLOC")
logger.debug("[|||||||||| ] URL for entry: {}".format(url + '?' + urllib.parse.urlencode(params)))
logger.debug("URL for entry: {}".format(url + '?' + urllib.parse.urlencode(params)))
response = requests.get(url, params=params)
xml = response.content
return xmltodict.parse(xml, process_namespaces=False, xml_attribs=False)
return DotMap(xmltodict.parse(xml, process_namespaces=False, xml_attribs=False))
@classmethod
def get_cadaster_entries_by_cadaster(cls, provincia, municipio, rc):
@ -125,27 +175,28 @@ class Scrapper:
"RC": rc}
url = cls.URL_LOCATIONS_BASE.format("/OVCCallejero.asmx/Consulta_DNPRC")
logger.debug("[|||||||||| ] URL for entry: {}".format(url + '?' + urllib.parse.urlencode(params)))
logger.debug("URL for entry: {}".format(url + '?' + urllib.parse.urlencode(params)))
response = requests.get(url, params=params)
xml = response.content
return xmltodict.parse(xml, process_namespaces=False, xml_attribs=False)
return DotMap(xmltodict.parse(xml, process_namespaces=False, xml_attribs=False))
@classmethod
def get_coords_from_cadaster(cls, provincia, municipio, cadaster):
params = {'Provincia': provincia, 'Municipio': municipio, 'SRS': 'EPSG:4230', 'RC': cadaster}
url = cls.URL_LOCATIONS_BASE.format("/OVCCoordenadas.asmx/Consulta_CPMRC")
logger.debug("[|||||||| ] URL for coords: {}".format(url + '?' + urllib.parse.urlencode(params)))
logger.debug("URL for coords: {}".format(url + '?' + urllib.parse.urlencode(params)))
response = requests.get(url, params=params)
xml = response.content
return xmltodict.parse(xml, process_namespaces=False, xml_attribs=False)
return DotMap(xmltodict.parse(xml, process_namespaces=False, xml_attribs=False))
@classmethod
def scrap_site_picture(cls, prov_name, city_name, cadaster):
url_pic = cls.URL_PICTURES.format(prov_name, city_name, cadaster, config['width_px'], config['height_px'])
def scrap_site_picture(cls, prov_num, city_num, cadaster):
logger.debug("[|||||||| ] URL for picture data: {}".format(url_pic))
url_pic = cls.URL_PICTURES.format(prov_num, city_num, cadaster, config['width_px'], config['height_px'])
logger.debug("URL for picture data: {}".format(url_pic))
f_pic = urlopen(url_pic)

View File

@ -1,6 +1,9 @@
import unittest
from time import sleep
from dotmap import DotMap
from src.librecatastro.domain.cadaster_entry.cadaster_entry_xml import CadasterEntryXML
from src.librecatastro.scrapping.format.scrapper_xml import ScrapperXML
from src.settings import config
@ -8,23 +11,32 @@ from src.settings import config
class ScrapperXMLTests(unittest.TestCase):
def test_scrapper_retrieves_dict_provinces(self):
    """The provinces listing control field (cuprov) should report 48 provinces."""
    self.assertEqual(ScrapperXML.get_provinces().consulta_provinciero.control.cuprov, '48')
    sleep(config['sleep_time'])  # throttle requests against the Catastro service
def test_scrapper_retrieves_dict_cities(self):
    """The ALACANT municipalities control field (cumun) should report 141 cities."""
    self.assertEqual(ScrapperXML.get_cities('ALACANT').consulta_municipiero.control.cumun, '141')
    sleep(config['sleep_time'])  # throttle requests against the Catastro service
def test_scrapper_retrieves_dict_addresses(self):
    """The AGOST (ALACANT) street listing control field (cuca) should report 117."""
    self.assertEqual(ScrapperXML.get_addresses('ALACANT', 'AGOST').consulta_callejero.control.cuca, '117')
    sleep(config['sleep_time'])  # throttle requests against the Catastro service
def test_scrapper_retrieves_dict_addresses_iter(self):
    """The address iterator's first item should carry province/city codes 15 and 7."""
    addr_iter = ScrapperXML.get_address_iter()
    first_address = next(addr_iter)
    # Positions 1 and 3 hold the province and municipality codes respectively
    # — presumably; confirm against get_address_iter's tuple layout.
    self.assertEqual(first_address[1], '15')
    self.assertEqual(first_address[3], '7')
    sleep(config['sleep_time'])  # throttle requests against the Catastro service
def test_scrapper_creates_cadaster_entry(self):
    """Looking up a known RC should return a non-empty parsed response."""
    dotmap_res = ScrapperXML.get_cadaster_entries_by_cadaster('', '', '6375620YH0567S0001GW')
    # An empty DotMap would mean the service returned nothing parseable.
    self.assertNotEqual(dotmap_res, DotMap())
    sleep(config['sleep_time'])  # throttle requests against the Catastro service
def test_scrapper_creates_cadaster_entry_and_stores_in_elasticsearch(self):
    """An entry built from a known RC should round-trip through Elasticsearch."""
    entry = ScrapperXML.get_cadaster_entries_by_cadaster('', '', '6375620YH0567S0001GW')
    cadaster_entry = CadasterEntryXML(entry)
    cadaster_entry.to_elasticsearch()
    # from_elasticsearch() returning a result proves the document was indexed.
    self.assertIsNotNone(cadaster_entry.from_elasticsearch())
    sleep(config['sleep_time'])  # throttle requests against the Catastro service
@ -36,17 +48,20 @@ class ScrapperXMLTests(unittest.TestCase):
nv = u'ARZÓN'
num = 21
entry = ScrapperXML.get_cadaster_entries_by_address(prov_name, city_name, tv, nv, num)
for site in entry['consulta_dnp']['lrcdnp']['rcdnp']:
cadaster = site['rc']['pc1'] + \
site['rc']['pc2'] + \
site['rc']['car'] + \
site['rc']['cc1'] + \
site['rc']['cc2']
counter = 0
for site in entry.consulta_dnp.lrcdnp.rcdnp:
cadaster = site.rc.pc1 + \
site.rc.pc2 + \
site.rc.car + \
site.rc.cc1 + \
site.rc.cc2
sub_entry = ScrapperXML.get_cadaster_entries_by_cadaster(prov_name, city_name, cadaster)
cadaster_entry = CadasterEntryXML(sub_entry, None, None)
cadaster_entry = CadasterEntryXML(sub_entry)
cadaster_entry.to_elasticsearch()
self.assertIsNotNone(cadaster_entry.from_elasticsearch())
sleep(config['sleep_time'])
counter += 1
self.assertEqual(counter, 2)
def test_no_use_creates_entry_in_elasticsearch(self):
prov_name = u'A CORUÑA'
@ -55,14 +70,14 @@ class ScrapperXMLTests(unittest.TestCase):
nv = u'BARCALA'
num = 5
entry = ScrapperXML.get_cadaster_entries_by_address(prov_name, city_name, tv, nv, num)
for site in entry['consulta_dnp']['lrcdnp']['rcdnp']:
cadaster = site['rc']['pc1'] + \
site['rc']['pc2'] + \
site['rc']['car'] + \
site['rc']['cc1'] + \
site['rc']['cc2']
for site in entry.consulta_dnp.lrcdnp.rcdnp:
cadaster = site.rc.pc1 + \
site.rc.pc2 + \
site.rc.car + \
site.rc.cc1 + \
site.rc.cc2
sub_entry = ScrapperXML.get_cadaster_entries_by_cadaster(prov_name, city_name, cadaster)
cadaster_entry = CadasterEntryXML(sub_entry, None, None)
cadaster_entry = CadasterEntryXML(sub_entry)
cadaster_entry.to_elasticsearch()
self.assertIsNotNone(cadaster_entry.from_elasticsearch())
sleep(config['sleep_time'])
@ -74,18 +89,43 @@ class ScrapperXMLTests(unittest.TestCase):
nv = u'CASTELAO'
num = 1
entry = ScrapperXML.get_cadaster_entries_by_address(prov_name, city_name, tv, nv, num)
for site in entry['consulta_dnp']['lrcdnp']['rcdnp']:
cadaster = site['rc']['pc1'] + \
site['rc']['pc2'] + \
site['rc']['car'] + \
site['rc']['cc1'] + \
site['rc']['cc2']
for site in entry.consulta_dnp.lrcdnp.rcdnp:
cadaster = site.rc.pc1 + \
site.rc.pc2 + \
site.rc.car + \
site.rc.cc1 + \
site.rc.cc2
sub_entry = ScrapperXML.get_cadaster_entries_by_cadaster(prov_name, city_name, cadaster)
cadaster_entry = CadasterEntryXML(sub_entry, None, None)
cadaster_entry = CadasterEntryXML(sub_entry)
cadaster_entry.to_elasticsearch()
self.assertIsNotNone(cadaster_entry.from_elasticsearch())
sleep(config['sleep_time'])
def test_no_es_pt_pu_creates_entry_in_elasticsearch_2(self):
    """Every parcel at CL BEATAS 4 (Alcala de Henares, Madrid) should index cleanly."""
    # CL BEATAS 4 MADRID ALCALA DE HENARES
    prov_name = u'MADRID'
    city_name = u'ALCALA DE HENARES'
    tv = u'CL'
    nv = u'BEATAS'
    num = 4
    entry = ScrapperXML.get_cadaster_entries_by_address(prov_name, city_name, tv, nv, num)
    for parcel in entry.consulta_dnp.lrcdnp.rcdnp:
        # The full cadastral reference is the concatenation of its five RC parts.
        rc_parts = parcel.rc
        cadaster = ''.join([rc_parts.pc1, rc_parts.pc2, rc_parts.car, rc_parts.cc1, rc_parts.cc2])
        sub_entry = ScrapperXML.get_cadaster_entries_by_cadaster(prov_name, city_name, cadaster)
        cadaster_entry = CadasterEntryXML(sub_entry)
        cadaster_entry.to_elasticsearch()
        self.assertIsNotNone(cadaster_entry.from_elasticsearch())
        sleep(config['sleep_time'])  # throttle between per-parcel requests
def test_multiparcela_creates_n_entries(self):
    """A coordinate covering a multi-parcela should yield one entry per parcel (2 here)."""
    coord_lon, coord_lat = -9.2503, 42.9723
    entries = ScrapperXML.scrap_coord(coord_lon, coord_lat, True)
    self.assertEqual(len(entries), 2)
# Allow running this test module directly (e.g. `python <module>.py`).
if __name__ == '__main__':
    unittest.main()