Adds XML multiparcela. Fixes several bugs.

This commit is contained in:
J 2019-09-20 19:15:32 +02:00
parent ee90545bb6
commit d5b280f6eb
8 changed files with 431 additions and 243 deletions

View File

@ -34,5 +34,4 @@ if __name__ == "__main__":
if args.coords: if args.coords:
CoordinatesInput.scrap_coordinates(scrapper, filenames, pictures) CoordinatesInput.scrap_coordinates(scrapper, filenames, pictures)
else: else:
print(pictures)
ProvincesInput.scrap_provinces(scrapper, provinces, pictures) ProvincesInput.scrap_provinces(scrapper, provinces, pictures)

View File

@ -1,3 +1,4 @@
dotmap
shapely shapely
beautifulsoup4==4.8.0 beautifulsoup4==4.8.0
elasticsearch>=6.0.0,<7.0.0 elasticsearch>=6.0.0,<7.0.0

View File

@ -26,9 +26,10 @@ class CadasterEntry:
self.constructions = cadaster_entry.constructions self.constructions = cadaster_entry.constructions
self.picture = cadaster_entry.picture self.picture = cadaster_entry.picture
self.timestamp = cadaster_entry.timestamp self.timestamp = cadaster_entry.timestamp
logger.debug(self.to_json_recursive())
def to_json(self): def to_json(self):
return dict(address=self.address, cadaster=self.cadaster, type=self.type, use=self.use, surface=self.surface, year=self.year, location=self.location, gsurface=self.gsurface, constructions=self.constructions, picture=str(self.picture), timestamp=self.timestamp) return dict(address=self.address, cadaster=self.cadaster, type=self.type, use=self.use, surface=self.surface, year=self.year, location=self.location, gsurface=self.gsurface, constructions=self.constructions, picture=str(self.picture) if self.picture is not None else None, timestamp=self.timestamp)
def to_json_recursive(self): def to_json_recursive(self):
return json.dumps(self.to_json(), cls=JSONEncoder, sort_keys=True, return json.dumps(self.to_json(), cls=JSONEncoder, sort_keys=True,
@ -43,8 +44,9 @@ class CadasterEntry:
res = es.index(index=config['elasticsearch-index'], doc_type='cadaster_doc', id=self.cadaster, body=body) res = es.index(index=config['elasticsearch-index'], doc_type='cadaster_doc', id=self.cadaster, body=body)
except Exception as e: except Exception as e:
logger.error(e) logger.error(e)
finally:
es.transport.close() es.transport.close()
return res return res
def from_elasticsearch(self): def from_elasticsearch(self):
@ -55,7 +57,7 @@ class CadasterEntry:
res = es.search(index=config['elasticsearch-index'], body=query) res = es.search(index=config['elasticsearch-index'], body=query)
except Exception as e: except Exception as e:
logger.error(e) logger.error(e)
finally:
es.transport.close() es.transport.close()
return res return res

View File

@ -1,6 +1,8 @@
import json import json
from datetime import datetime from datetime import datetime
from dotmap import DotMap
from src.librecatastro.domain.address import Address from src.librecatastro.domain.address import Address
from src.librecatastro.domain.cadaster_entry.cadaster_entry import CadasterEntry from src.librecatastro.domain.cadaster_entry.cadaster_entry import CadasterEntry
from src.librecatastro.domain.construction import Construction from src.librecatastro.domain.construction import Construction
@ -14,33 +16,48 @@ logger = CadastroLogger(__name__).logger
class CadasterEntryXML(CadasterEntry): class CadasterEntryXML(CadasterEntry):
"""Cadaster class, that stores all the information about a surface and its properties""" """Cadaster class, that stores all the information about a surface and its properties"""
def __init__(self, xml, lon, lat): def __init__(self, xml, lon=None, lat=None, picture=None):
self.address = None
if xml.consulta_dnp.bico.bi.ldt != DotMap():
self.address = Address(xml.consulta_dnp.bico.bi.ldt)
self.address = Address(xml['consulta_dnp']['bico']['bi']['ldt']) self.cadaster = xml.consulta_dnp.bico.bi.idbi.rc.pc1 if xml.consulta_dnp.bico.bi.idbi.rc.pc1 != DotMap() else ''
self.cadaster += xml.consulta_dnp.bico.bi.idbi.rc.pc2 if xml.consulta_dnp.bico.bi.idbi.rc.pc2 != DotMap() else ''
self.cadaster += xml.consulta_dnp.bico.bi.idbi.rc.car if xml.consulta_dnp.bico.bi.idbi.rc.car != DotMap() else ''
self.cadaster += xml.consulta_dnp.bico.bi.idbi.rc.cc1 if xml.consulta_dnp.bico.bi.idbi.rc.cc1 != DotMap() else ''
self.cadaster += xml.consulta_dnp.bico.bi.idbi.rc.cc2 if xml.consulta_dnp.bico.bi.idbi.rc.cc2 != DotMap() else ''
self.cadaster = xml['consulta_dnp']['bico']['bi']['idbi']['rc']['pc1'] + \ self.year = None
xml['consulta_dnp']['bico']['bi']['idbi']['rc']['pc2'] + \ if xml.consulta_dnp.bico.bi.debi is not None:
xml['consulta_dnp']['bico']['bi']['idbi']['rc']['car'] + \ self.year = xml.consulta_dnp.bico.bi.debi.ant
xml['consulta_dnp']['bico']['bi']['idbi']['rc']['cc1'] + \ if self.year == DotMap():
xml['consulta_dnp']['bico']['bi']['idbi']['rc']['cc2'] self.year = None
self.year = xml['consulta_dnp']['bico']['bi']['debi']['ant'] \ self.type = xml.consulta_dnp.bico.bi.idbi.cn
if 'debi' in xml['consulta_dnp']['bico']['bi'] and\ if self.type != DotMap() and self.type == 'UR':
'ant' in xml['consulta_dnp']['bico']['bi']['debi'] else None self.type = u'Urbano'
else:
self.type = u'Rústico'
self.type = xml['consulta_dnp']['bico']['bi']['idbi']['cn'] if 'cn' in xml['consulta_dnp']['bico']['bi']['idbi'] else None self.use = None
if self.type is not None: if xml.consulta_dnp.bico.bi.debi is not None:
self.type = u'Urbano' if self.type == 'UR' else u'Rústico' self.use = xml.consulta_dnp.bico.bi.debi.luso
if self.use == DotMap():
self.use = None
self.surface = None
if xml.consulta_dnp.bico.bi.debi is not None:
self.surface = xml.consulta_dnp.bico.bi.debi.sfc + 'm2'
if self.surface == DotMap():
self.surface = None
self.use = xml['consulta_dnp']['bico']['bi']['debi']['luso'] if 'luso' in xml['consulta_dnp']['bico']['bi']['debi'] else None
self.surface = xml['consulta_dnp']['bico']['bi']['debi']['sfc'] + 'm2' if 'sfc' in xml['consulta_dnp']['bico']['bi']['debi'] else None
self.location = Location(lon, lat) self.location = Location(lon, lat)
self.gsurface = config['not_available_via_XML'] self.gsurface = config['not_available_via_XML']
self.constructions = [] self.constructions = []
constructions = [] constructions = []
if 'lcons' in xml['consulta_dnp']['bico']: if xml.consulta_dnp.bico.lcons.cons != DotMap():
constructions = xml['consulta_dnp']['bico']['lcons']['cons'] constructions = xml.consulta_dnp.bico.lcons.cons
''' Bad XML design, instead of returning a list with 1 element, it returns ''' Bad XML design, instead of returning a list with 1 element, it returns
the element''' the element'''
@ -48,11 +65,26 @@ class CadasterEntryXML(CadasterEntry):
constructions = [constructions] constructions = [constructions]
for construction in constructions: for construction in constructions:
use = construction['lcd'] if 'lcd' in construction else None use = construction.lcd
doorway = construction['dt']['lourb']['loint']['es'] if 'dt' in construction else None if use == DotMap():
floor = construction['dt']['lourb']['loint']['pt'] if 'dt' in construction else None use = None
door = construction['dt']['lourb']['loint']['pu'] if 'dt' in construction else None
surface = construction['dfcons']['stl'] if 'dfcons' in construction and 'stl' in construction['dfcons'] else None doorway = construction.dt.lourb.loint.es
if doorway == DotMap():
doorway = None
floor = construction.dt.lourb.loint.pt
if floor == DotMap():
floor = None
door = construction.dt.lourb.loint.pu
if door == DotMap():
door = None
surface = construction.dfcons.stl
if surface == DotMap():
surface = None
reform_type = config['not_available_via_XML'] reform_type = config['not_available_via_XML']
reform_date = config['not_available_via_XML'] reform_date = config['not_available_via_XML']
@ -60,5 +92,6 @@ class CadasterEntryXML(CadasterEntry):
dict(uso=use, escalera=doorway, planta=floor, puerta=door, superficie=surface, tipo=reform_type, dict(uso=use, escalera=doorway, planta=floor, puerta=door, superficie=surface, tipo=reform_type,
fecha=reform_date))) fecha=reform_date)))
self.picture = picture
self.timestamp = str(datetime.now()) self.timestamp = str(datetime.now())
super().__init__(self) super().__init__(self)

View File

@ -5,6 +5,7 @@ from time import sleep
from urllib.request import urlopen from urllib.request import urlopen
from xml.etree import ElementTree from xml.etree import ElementTree
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
from dotmap import DotMap
from src.librecatastro.domain.cadaster_entry.cadaster_entry_html import CadasterEntryHTML from src.librecatastro.domain.cadaster_entry.cadaster_entry_html import CadasterEntryHTML
from src.librecatastro.scrapping.scrapper import Scrapper from src.librecatastro.scrapping.scrapper import Scrapper
@ -12,7 +13,6 @@ from src.settings import config
from src.utils.cadastro_logger import CadastroLogger from src.utils.cadastro_logger import CadastroLogger
'''Logger''' '''Logger'''
logger = CadastroLogger(__name__).logger logger = CadastroLogger(__name__).logger
@ -30,15 +30,17 @@ class ScrapperHTML(Scrapper):
URL_REF_FULL = "https://www1.sedecatastro.gob.es/CYCBienInmueble/OVCConCiud.aspx?RefC={}&RCCompleta={}&del={}&mun={}" URL_REF_FULL = "https://www1.sedecatastro.gob.es/CYCBienInmueble/OVCConCiud.aspx?RefC={}&RCCompleta={}&del={}&mun={}"
'''Information to scrap from HTML''' '''Information to scrap from HTML'''
description_field_names = [u'Referencia catastral', u'Localización', u'Clase', u'Uso principal', u'Superficie construida', u'Año construcción'] description_field_names = [u'Referencia catastral', u'Localización', u'Clase', u'Uso principal',
u'Superficie construida', u'Año construcción']
gsurface_field_names = [u'Superficie gráfica'] gsurface_field_names = [u'Superficie gráfica']
""" Scrapping calls """ """ Scrapping calls """
@classmethod @classmethod
def scrap_coord(cls, x, y, pictures=False): def scrap_coord(cls, x, y, pictures=False):
logger.debug("====Longitude: {} Latitude: {}====".format(x, y)) logger.debug("====Longitude: {} Latitude: {}====".format(x, y))
url = cls.URL.format(x, y) url = cls.URL.format(x, y)
logger.debug("[||| ] URL for coordinates: {}".format(url)) logger.debug("URL for coordinates: {}".format(url))
f = urlopen(url) f = urlopen(url)
data = f.read() data = f.read()
root = ElementTree.fromstring(data) root = ElementTree.fromstring(data)
@ -46,100 +48,99 @@ class ScrapperHTML(Scrapper):
"{http://www.catastro.meh.es/}coordenadas//{http://www.catastro.meh.es/}coord//{http://www.catastro.meh.es/}pc//{http://www.catastro.meh.es/}pc1") "{http://www.catastro.meh.es/}coordenadas//{http://www.catastro.meh.es/}coord//{http://www.catastro.meh.es/}pc//{http://www.catastro.meh.es/}pc1")
pc2 = root.find( pc2 = root.find(
"{http://www.catastro.meh.es/}coordenadas//{http://www.catastro.meh.es/}coord//{http://www.catastro.meh.es/}pc//{http://www.catastro.meh.es/}pc2") "{http://www.catastro.meh.es/}coordenadas//{http://www.catastro.meh.es/}coord//{http://www.catastro.meh.es/}pc//{http://www.catastro.meh.es/}pc2")
if pc1 is None or pc2 is None:
return [] results = []
else: if pc1 is not None and pc2 is not None:
logger.debug("||||| ] FOUND!")
cadaster = ''.join([pc1.text, pc2.text]) cadaster = ''.join([pc1.text, pc2.text])
cadaster_entries = cls.scrap_cadaster(cadaster, None, None, x, y, pictures) cadaster_entries = cls.scrap_cadaster(cadaster, None, None, x, y, pictures)
for cadaster_entry in cadaster_entries: for cadaster_entry in cadaster_entries:
cadaster_entry.to_elasticsearch() cadaster_entry.to_elasticsearch()
return cadaster_entries results.append(cadaster_entry)
return results
@classmethod @classmethod
def scrap_provinces(cls, prov_list, pictures=False): def scrap_provinces(cls, prov_list, pictures=False):
"""Scraps properties by addresses"""
provinces = cls.get_provinces()['consulta_provinciero']['provinciero']['prov']
for province in provinces: for prov_name, prov_num, city_name, city_num, address, tv, nv in cls.get_address_iter(prov_list):
prov_name = province['np']
prov_num = province['cpine']
if len(prov_list) > 0 and prov_name not in prov_list: if tv == DotMap() or nv == DotMap():
continue continue
cities = cls.get_cities(prov_name)['consulta_municipiero']['municipiero']['muni'] num_scrapping_fails = 10
for city in cities: counter = 1
city_name = city['nm'] while num_scrapping_fails > 0:
city_num = city['locat']['cmc'] try:
addresses = (cls.get_addresses(prov_name, city_name)['consulta_callejero']['callejero'][ numerero_map = cls.get_cadaster_by_address(prov_name, city_name, tv, nv, counter)
'calle']) if numerero_map.consulta_numerero.lerr.err.cod != DotMap():
num_scrapping_fails -= 1
else:
for address in addresses: numps = numerero_map.consulta_numerero.numerero.nump
address_dir = address['dir']
tv = address_dir['tv']
nv = address_dir['nv']
num_scrapping_fails = 10 if not isinstance(numps, list):
counter = 1 numps = [numps]
while num_scrapping_fails > 0:
try:
cadaster = cls.get_cadaster_by_address(prov_name, city_name, tv, nv, counter)
if 'lerr' in cadaster['consulta_numerero'] and \
'err' in cadaster['consulta_numerero']['lerr'] and \
'cod' in cadaster['consulta_numerero']['lerr']['err'] and \
cadaster['consulta_numerero']['lerr']['err']['cod'] == '43':
num_scrapping_fails -= 1
else:
logger.debug("||||| ] FOUND!")
numps = cadaster['consulta_numerero']['numerero']['nump']
if not isinstance(numps, list): for nump in numps:
numps = [numps] if nump.num.pnp == DotMap():
continue
for nump in numps: num = nump.num.pnp
num = nump['num']['pnp']
cadaster_num = nump['pc']['pc1'] + nump['pc']['pc2']
coords = cls.get_coords_from_cadaster(prov_name, city_name,cadaster_num) if nump.pc == DotMap():
lon = coords['consulta_coordenadas']['coordenadas']['coord']['geo']['xcen'] continue
lat = coords['consulta_coordenadas']['coordenadas']['coord']['geo']['ycen']
''' Adding to tracking file''' if nump.pc.pc1 == DotMap() or nump.pc.pc2 == DotMap():
logger.info('{},{}'.format(lon, lat)) continue
num_scrapping_fails = 10 cadaster_num = nump.pc.pc1 + nump.pc.pc2
cadaster_list = cls.scrap_cadaster(cadaster_num, prov_num, city_num, lon, lat, pictures) coords_map = cls.get_coords_from_cadaster(prov_name, city_name, cadaster_num)
for cadaster in cadaster_list: lon = coords_map.consulta_coordenadas.coordenadas.coord.geo.xcen
cadaster.to_elasticsearch() if lon == DotMap():
lon = None
counter += 1 lat = coords_map.consulta_coordenadas.coordenadas.coord.geo.ycen
sleep(config['sleep_time']) if lat == DotMap():
lat = None
except urllib.error.HTTPError as e: ''' Adding to tracking file'''
logger.error( logger.info('{},{}'.format(lon, lat))
"ERROR AT ADDRESS {} {} {} {} {}".format(tv, nv, num, prov_name, city_name))
logger.error("=============================================")
logger.error(e, exc_info=True)
logger.error("...sleeping...")
logger.error("=============================================")
''' Could be a service Unavailable or denegation of service'''
num_scrapping_fails -= 1
sleep(config['sleep_dos_time'])
except Exception as e: num_scrapping_fails = 10
logger.error(
"ERROR AT ADDRESS {} {} {} {} {}".format(tv, nv, num, prov_name, city_name)) cadaster_list = cls.scrap_cadaster(cadaster_num, prov_num, city_num, lon, lat, pictures)
logger.error("=============================================")
logger.error(e, exc_info=True) for cadaster in cadaster_list:
logger.error("=============================================") cadaster.to_elasticsearch()
num_scrapping_fails -= 1
counter += 1 counter += 1
sleep(config['sleep_time']) sleep(config['sleep_time'])
except urllib.error.HTTPError as e:
logger.error(
"ERROR AT ADDRESS {} {} {} {} {}".format(tv, nv, num, prov_name, city_name))
logger.error("=============================================")
logger.error(e, exc_info=True)
logger.error("...sleeping...")
logger.error("=============================================")
''' Could be a service Unavailable or denegation of service'''
num_scrapping_fails -= 1
sleep(config['sleep_dos_time'])
except Exception as e:
logger.error(
"ERROR AT ADDRESS {} {} {} {} {}".format(tv, nv, num, prov_name, city_name))
logger.error("=============================================")
logger.error(e, exc_info=True)
logger.error("=============================================")
num_scrapping_fails -= 1
counter += 1
sleep(config['sleep_time'])
@classmethod @classmethod
def scrap_cadaster_full_code(cls, full_cadaster, delimitacion, municipio, x=None, y=None, picture=None): def scrap_cadaster_full_code(cls, full_cadaster, delimitacion, municipio, x=None, y=None, picture=None):
url_ref = cls.URL_REF_FULL.format(full_cadaster, full_cadaster, delimitacion, municipio) url_ref = cls.URL_REF_FULL.format(full_cadaster, full_cadaster, delimitacion, municipio)
@ -150,13 +151,14 @@ class ScrapperHTML(Scrapper):
parsed_html = BeautifulSoup(html, features="html.parser") parsed_html = BeautifulSoup(html, features="html.parser")
return ScrapperHTML.parse_html_parcela(parsed_html, x, y, picture) return ScrapperHTML.parse_html_parcela(parsed_html, x, y, picture)
@classmethod @classmethod
def scrap_cadaster(cls, cadaster, delimitacion=None, municipio=None, x=None, y=None, pictures=False): def scrap_cadaster(cls, cadaster, delimitacion=None, municipio=None, x=None, y=None, pictures=False):
rc_1 = cadaster[0:7] rc_1 = cadaster[0:7]
rc_2 = cadaster[7:14] rc_2 = cadaster[7:14]
url_ref = cls.URL_REF.format(rc_1, rc_2) url_ref = cls.URL_REF.format(rc_1, rc_2)
logger.debug("[|||||||| ] URL for cadastral data: {}".format(url_ref)) logger.debug("URL for cadastral data: {}".format(url_ref))
f_ref = urlopen(url_ref) f_ref = urlopen(url_ref)
data_ref = f_ref.read() data_ref = f_ref.read()
@ -191,7 +193,8 @@ class ScrapperHTML(Scrapper):
partial_cadaster_ref = partial_cadaster.find("b") partial_cadaster_ref = partial_cadaster.find("b")
logger.debug("-->Partial cadaster: {}".format(partial_cadaster_ref.text)) logger.debug("-->Partial cadaster: {}".format(partial_cadaster_ref.text))
partial_cadaster_text = partial_cadaster_ref.text.strip() partial_cadaster_text = partial_cadaster_ref.text.strip()
cadaster = ScrapperHTML.scrap_cadaster_full_code(partial_cadaster_text, delimitacion, municipio, x, y, picture) cadaster = ScrapperHTML.scrap_cadaster_full_code(partial_cadaster_text, delimitacion, municipio, x, y,
picture)
cadasters.append(cadaster) cadasters.append(cadaster)
sleep(config['sleep_time']) sleep(config['sleep_time'])
@ -200,12 +203,10 @@ class ScrapperHTML(Scrapper):
cadasters.append(cadaster) cadasters.append(cadaster)
logger.debug("[|||||||||||] SUCCESS!")
sleep(config['sleep_time']) sleep(config['sleep_time'])
return cadasters return cadasters
""" Parsing """ """ Parsing """
@classmethod @classmethod
def parse_html_parcela(cls, parsed_html, x=None, y=None, picture=None): def parse_html_parcela(cls, parsed_html, x=None, y=None, picture=None):
description = parsed_html.find(id='ctl00_Contenido_tblInmueble') description = parsed_html.find(id='ctl00_Contenido_tblInmueble')
@ -229,7 +230,7 @@ class ScrapperHTML(Scrapper):
descriptive_data[field_name] = descriptive_data[field_name].split(' ')[0] descriptive_data[field_name] = descriptive_data[field_name].split(' ')[0]
descriptive_data[field_name] = descriptive_data[field_name].split('\xa0')[0] descriptive_data[field_name] = descriptive_data[field_name].split('\xa0')[0]
elif field_header.text == u'Localización': elif field_header.text == u'Localización':
descriptive_data[field_name] = field_value.encode_contents().decode('utf-8').replace('<br/>', config['separator']).replace('<br>', config['separator']) descriptive_data[field_name] = field_value.encode_contents().decode('utf-8').replace('<br/>',config['separator']).replace('<br>', config['separator'])
'''Graphical Surface''' '''Graphical Surface'''
fields = parsed_html.find(id='ctl00_Contenido_tblFinca').find_all('div') fields = parsed_html.find(id='ctl00_Contenido_tblFinca').find_all('div')
@ -253,7 +254,9 @@ class ScrapperHTML(Scrapper):
continue continue
columns = construction.find_all('span') columns = construction.find_all('span')
descriptive_data[u'Construcciones'].append(dict(uso=columns[0].text, escalera=columns[1].text, planta=columns[2].text, puerta=columns[3].text, superficie=columns[4].text, tipo=columns[5].text, fecha=columns[6].text)) descriptive_data[u'Construcciones'].append(
dict(uso=columns[0].text, escalera=columns[1].text, planta=columns[2].text, puerta=columns[3].text,
superficie=columns[4].text, tipo=columns[5].text, fecha=columns[6].text))
cadaster_entry = CadasterEntryHTML(descriptive_data) cadaster_entry = CadasterEntryHTML(descriptive_data)
return cadaster_entry return cadaster_entry

View File

@ -1,4 +1,3 @@
import json
import urllib.parse import urllib.parse
from urllib import error from urllib import error
@ -12,6 +11,8 @@ from src.librecatastro.scrapping.scrapper import Scrapper
from src.settings import config from src.settings import config
from src.utils.cadastro_logger import CadastroLogger from src.utils.cadastro_logger import CadastroLogger
from dotmap import DotMap
'''Logger''' '''Logger'''
logger = CadastroLogger(__name__).logger logger = CadastroLogger(__name__).logger
@ -20,133 +21,191 @@ class ScrapperXML(Scrapper):
"""Scrapper class for Catastro XML""" """Scrapper class for Catastro XML"""
def __init__(self): def __init__(self):
pass super().__init__()
""" Scrapping main calls """ """ Scrapping main calls """
@classmethod @classmethod
def scrap_coord(cls, x, y, pictures=False): def scrap_coord(cls, x, y, pictures=False):
"""Scraps properties by coordinates""" """Scraps properties by coordinates"""
results = []
params = {'SRS': 'EPSG:4230', 'Coordenada_X': x, 'Coordenada_Y': y} params = {'SRS': 'EPSG:4230', 'Coordenada_X': x, 'Coordenada_Y': y}
url = cls.URL_LOCATIONS_BASE.format("/OVCCoordenadas.asmx/Consulta_RCCOOR") url = cls.URL_LOCATIONS_BASE.format("/OVCCoordenadas.asmx/Consulta_RCCOOR")
response = requests.get(url, params=params) response = requests.get(url, params=params)
logger.debug("====Longitude: {} Latitude: {}====".format(x, y)) logger.debug("====Longitude: {} Latitude: {}====".format(x, y))
logger.debug("[||| ] URL for coordinates: {}".format(url + '?' + urllib.parse.urlencode(params))) logger.debug("URL for coordinates: {}".format(url + '?' + urllib.parse.urlencode(params)))
xml = response.content xml_dict_map = DotMap(xmltodict.parse(response.content, process_namespaces=False, xml_attribs=False))
xml_dict = xmltodict.parse(xml, process_namespaces=False, xml_attribs=False)
pc1 = None pc1 = None
pc2 = None pc2 = None
if 'coordenadas' in xml_dict['consulta_coordenadas'] and 'coord' in xml_dict['consulta_coordenadas']['coordenadas']: if xml_dict_map.consulta_coordenadas.coordenadas.coord.pc != DotMap():
pc1 = xml_dict['consulta_coordenadas']['coordenadas']['coord']['pc']['pc1'] if 'pc' in xml_dict['consulta_coordenadas']['coordenadas']['coord'] else None pc1 = xml_dict_map.consulta_coordenadas.coordenadas.coord.pc.pc1
pc2 = xml_dict['consulta_coordenadas']['coordenadas']['coord']['pc']['pc2'] if 'pc' in xml_dict['consulta_coordenadas']['coordenadas']['coord'] else None if pc1 == DotMap():
if pc1 is not None and pc2 is not None: pc1 = None
logger.debug("||||| ] FOUND!")
entry = cls.get_cadaster_entries_by_cadaster('', '', ''.join([pc1,pc2])) pc2 = xml_dict_map.consulta_coordenadas.coordenadas.coord.pc.pc2
cadaster_entry = CadasterEntryXML(entry, x, y) if pc2 == DotMap():
cadaster_entry.to_elasticsearch() pc2 = None
logger.debug("[|||||||||||] SUCCESS!")
sleep(config['sleep_time']) if pc1 is not None and pc2 is not None:
entry = cls.get_cadaster_entries_by_cadaster('', '', ''.join([pc1, pc2]))
picture = None
if entry.consulta_dnp.bico.bi.dt.loine != DotMap():
# Parcela
if pictures:
prov_num = entry.consulta_dnp.bico.bi.dt.loine.cp
city_num = entry.consulta_dnp.bico.bi.dt.cmc
if prov_num != DotMap() and city_num != DotMap():
picture = cls.scrap_site_picture(prov_num, city_num, ''.join([pc1, pc2]))
cadaster_entry = CadasterEntryXML.create_from_bico(entry, x, y, picture)
cadaster_entry.to_elasticsearch()
sleep(config['sleep_time'])
results.append(cadaster_entry)
elif entry.consulta_dnp.lrcdnp.rcdnp != DotMap():
# Multiparcela
parcelas = entry.consulta_dnp.lrcdnp.rcdnp
if not isinstance(parcelas, list):
parcelas = [parcelas]
for parcela in parcelas:
if pictures:
prov_num = parcela.dt.loine.cp
city_num = parcela.dt.cmc
if prov_num != DotMap() and city_num != DotMap():
picture = cls.scrap_site_picture(prov_num, city_num, ''.join([pc1, pc2]))
cadaster = parcela.rc.pc1 if parcela.rc.pc1 != DotMap() else ''
cadaster += parcela.rc.pc2 if parcela.rc.pc2 != DotMap() else ''
cadaster += parcela.rc.car if parcela.rc.car != DotMap() else ''
cadaster += parcela.rc.cc1 if parcela.rc.cc1 != DotMap() else ''
cadaster += parcela.rc.cc2 if parcela.rc.cc2 != DotMap() else ''
parcela = cls.get_cadaster_entries_by_cadaster('', '', cadaster)
cadaster_entry = CadasterEntryXML(parcela, x, y, picture)
cadaster_entry.to_elasticsearch()
results.append(cadaster_entry)
sleep(config['sleep_time'])
return results
@classmethod @classmethod
def scrap_provinces(cls, prov_list, pictures=False): def scrap_provinces(cls, prov_list, pictures=False):
"""Scraps properties by addresses""" for prov_name, prov_num, city_name, city_num, address, tv, nv in cls.get_address_iter(prov_list):
if tv == DotMap() or nv == DotMap():
provinces = ScrapperXML.get_provinces()['consulta_provinciero']['provinciero']['prov']
for province in provinces:
prov_name = province['np']
if len(prov_list) > 0 and prov_name not in prov_list:
continue continue
cities = ScrapperXML.get_cities(prov_name)['consulta_municipiero']['municipiero']['muni'] num_scrapping_fails = 10
for city in cities: counter = 1
city_name = city['nm'] while num_scrapping_fails > 0:
addresses = ScrapperXML.get_addresses(prov_name, city_name)['consulta_callejero']['callejero'][ try:
'calle'] cadaster = cls.get_cadaster_by_address(prov_name, city_name, tv, nv, counter)
for address in addresses: res = cls.process_xml_by_address(cadaster, prov_name, city_name, tv, nv)
address_dir = address['dir'] if len(res) < 1:
tv = address_dir['tv'] num_scrapping_fails -= 1
nv = address_dir['nv'] else:
num_scrapping_fails = 10
sleep(config['sleep_time'])
num_scrapping_fails = 10 except urllib.error.HTTPError as e:
counter = 1 logger.error(
while num_scrapping_fails > 0: "ERROR AT ADDRESS {} {} {} {} {}".format(tv, nv, counter, prov_name, city_name))
try: logger.error("=============================================")
cadaster = cls.get_cadaster_by_address(prov_name, city_name, tv, nv, counter) logger.error(e, exc_info=True)
if 'lerr' in cadaster['consulta_numerero'] and \ logger.error("...sleeping...")
'err' in cadaster['consulta_numerero']['lerr'] and \ logger.error("=============================================")
'cod' in cadaster['consulta_numerero']['lerr']['err'] and \ ''' Could be a service Unavailable or denegation of service'''
cadaster['consulta_numerero']['lerr']['err']['cod'] == '43': num_scrapping_fails -= 1
num_scrapping_fails -= 1 sleep(config['sleep_dos_time'])
else:
logger.debug("||||| ] FOUND!")
numps = cadaster['consulta_numerero']['numerero']['nump'] except Exception as e:
logger.error(
"ERROR AT ADDRESS {} {} {} {} {}".format(tv, nv, counter, prov_name, city_name))
logger.error("=============================================")
logger.error(e, exc_info=True)
logger.error("=============================================")
num_scrapping_fails -= 1
if not isinstance(numps, list): counter += 1
numps = [numps] sleep(config['sleep_time'])
for nump in numps: @classmethod
num = nump['num']['pnp'] def process_xml_by_address(cls, numerero_map, prov_name, city_name, tv, nv):
cadaster_num = nump['pc']['pc1'] + nump['pc']['pc2'] results = []
if numerero_map.consulta_numerero.lerr.err.cod != DotMap():
return results
coords = cls.get_coords_from_cadaster(prov_name, city_name, numps = numerero_map.consulta_numerero.numerero.nump
cadaster_num)
lon = coords['consulta_coordenadas']['coordenadas']['coord']['geo']['xcen']
lat = coords['consulta_coordenadas']['coordenadas']['coord']['geo']['ycen']
''' Adding to tracking file''' if not isinstance(numps, list):
logger.info('{},{}'.format(lon, lat)) numps = [numps]
num_scrapping_fails = 10 for nump in numps:
if nump.num.pnp == DotMap():
continue
entry = cls.get_cadaster_entries_by_address(prov_name, city_name, tv, num = nump.num.pnp
nv, num)
if 'bico' in entry['consulta_dnp']: if nump.pc == DotMap():
# Parcela continue
cadaster_entry = CadasterEntryXML(entry, lon, lat)
cadaster_entry.to_elasticsearch()
elif 'lrcdnp' in entry['consulta_dnp']:
# Multiparcela
for site in entry['consulta_dnp']['lrcdnp']['rcdnp']:
cadaster = site['rc']['pc1'] + \
site['rc']['pc2'] + \
site['rc']['car'] + \
site['rc']['cc1'] + \
site['rc']['cc2']
sub_entry = cls.get_cadaster_entries_by_cadaster(prov_name,
city_name,
cadaster)
cadaster_entry = CadasterEntryXML(sub_entry, lon, lat)
cadaster_entry.to_elasticsearch()
sleep(config['sleep_time'])
logger.debug("[|||||||||||] SUCCESS!") if nump.pc.pc1 == DotMap() or nump.pc.pc2 == DotMap():
sleep(config['sleep_time']) continue
except urllib.error.HTTPError as e: cadaster_num = nump.pc.pc1 + nump.pc.pc2
logger.error(
"ERROR AT ADDRESS {} {} {} {} {}".format(tv, nv, num, prov_name, city_name))
logger.error("=============================================")
logger.error(e, exc_info=True)
logger.error("...sleeping...")
logger.error("=============================================")
''' Could be a service Unavailable or denegation of service'''
num_scrapping_fails -= 1
sleep(config['sleep_dos_time'])
except Exception as e: coords_map = cls.get_coords_from_cadaster(prov_name, city_name, cadaster_num)
logger.error(
"ERROR AT ADDRESS {} {} {} {} {}".format(tv, nv, num, prov_name, city_name))
logger.error("=============================================")
logger.error(e, exc_info=True)
logger.error("=============================================")
num_scrapping_fails -= 1
counter += 1 lon = coords_map.consulta_coordenadas.coordenadas.coord.geo.xcen
sleep(config['sleep_time']) if lon == DotMap():
lon = None
lat = coords_map.consulta_coordenadas.coordenadas.coord.geo.ycen
if lat == DotMap():
lat = None
''' Adding to tracking file'''
logger.info('{},{}'.format(lon, lat))
entry_map = cls.get_cadaster_entries_by_address(prov_name, city_name, tv, nv, num)
picture = None
if entry_map.consulta_dnp.bico != DotMap():
prov_num = entry_map.consulta_dnp.bico.bi.dt.loine.cp
city_num = entry_map.consulta_dnp.bico.bi.dt.loine.cm
if prov_num != DotMap() and city_num != DotMap():
picture = cls.scrap_site_picture(prov_num, city_num, cadaster_num)
# Parcela
cadaster_entry = CadasterEntryXML(entry_map, lon, lat, picture)
results.append(cadaster_entry)
cadaster_entry.to_elasticsearch()
sleep(config['sleep_time'])
elif entry_map.consulta_dnp.lrcdnp.rcdnp != DotMap():
# Multiparcela
for site in entry_map.consulta_dnp.lrcdnp.rcdnp:
site_map = DotMap(site)
if site_map.rc == DotMap():
continue
cadaster = site_map.rc.pc1 + site_map.rc.pc2 + site_map.rc.car + site_map.rc.cc1 + site_map.rc.cc2
sub_entry = cls.get_cadaster_entries_by_cadaster(prov_name, city_name, cadaster)
prov_num = entry_map.consulta_dnp.lrcdnp.rcdnp.loine.cp
city_num = entry_map.consulta_dnp.lrcdnp.rcdnp.loine.cm
if prov_num != DotMap() and city_num != DotMap():
picture = cls.scrap_site_picture(prov_num, city_num, cadaster)
cadaster_entry = CadasterEntryXML(sub_entry, lon, lat, picture)
results.append(cadaster_entry)
cadaster_entry.to_elasticsearch()
sleep(config['sleep_time'])
return results

View File

@ -4,6 +4,7 @@ from urllib.request import urlopen
import requests import requests
import xmltodict import xmltodict
from dotmap import DotMap
from src.settings import config from src.settings import config
from src.utils.cadastro_logger import CadastroLogger from src.utils.cadastro_logger import CadastroLogger
@ -36,7 +37,7 @@ class Scrapper:
url = cls.URL_LOCATIONS_BASE.format("/OVCCallejero.asmx/ConsultaProvincia") url = cls.URL_LOCATIONS_BASE.format("/OVCCallejero.asmx/ConsultaProvincia")
response = requests.get(url) response = requests.get(url)
xml = response.content xml = response.content
return xmltodict.parse(xml, process_namespaces=False, xml_attribs=False) return DotMap(xmltodict.parse(xml, process_namespaces=False, xml_attribs=False))
@classmethod @classmethod
def get_cities(cls, provincia, municipio=None): def get_cities(cls, provincia, municipio=None):
@ -48,7 +49,7 @@ class Scrapper:
url = cls.URL_LOCATIONS_BASE.format("/OVCCallejero.asmx/ConsultaMunicipio") url = cls.URL_LOCATIONS_BASE.format("/OVCCallejero.asmx/ConsultaMunicipio")
response = requests.get(url, params=params) response = requests.get(url, params=params)
xml = response.content xml = response.content
return xmltodict.parse(xml, process_namespaces=False, xml_attribs=False) return DotMap(xmltodict.parse(xml, process_namespaces=False, xml_attribs=False))
@classmethod @classmethod
def get_addresses(cls, provincia, municipio, tipovia=None, nombrevia=None): def get_addresses(cls, provincia, municipio, tipovia=None, nombrevia=None):
@ -66,7 +67,56 @@ class Scrapper:
url = cls.URL_LOCATIONS_BASE.format("/OVCCallejero.asmx/ConsultaVia") url = cls.URL_LOCATIONS_BASE.format("/OVCCallejero.asmx/ConsultaVia")
response = requests.get(url, params=params) response = requests.get(url, params=params)
xml = response.content xml = response.content
return xmltodict.parse(xml, process_namespaces=False, xml_attribs=False) return DotMap(xmltodict.parse(xml, process_namespaces=False, xml_attribs=False))
@classmethod
def get_address_iter(cls, prov_list=None):
    """Iterate over the street addresses published by the remote service.

    Walks provinces -> cities -> streets through ``get_provinces``,
    ``get_cities`` and ``get_addresses`` and yields one tuple per street
    that has both a street type (``tv``) and a street name (``nv``).

    :param prov_list: optional collection of province names. When it is
        non-empty, provinces whose name is not in it are skipped.
    :return: generator of ``(prov_name, prov_num, city_name, city_num,
        address_dir, tv, nv)`` tuples. The generator ends without yielding
        anything further as soon as a province, city or street listing
        comes back empty (an error is logged in that case).
    """

    def is_missing(node):
        # A key that was never present reads back as an empty DotMap;
        # xmltodict parses an element with no content as None.
        return node is None or node == DotMap()

    def as_list(node):
        # xmltodict only produces a list for repeated sibling elements; a
        # single child arrives as a bare mapping, and iterating that would
        # walk its keys instead of the record itself.
        return node if isinstance(node, list) else [node]

    if prov_list is None:
        prov_list = []

    provinces = cls.get_provinces().consulta_provinciero.provinciero.prov
    if is_missing(provinces):
        logger.error("No provinces available right now (Service is down?)")
        # End the generator instead of yielding None: consumers expect
        # 7-tuples, and the city/street branches below also just stop.
        return

    for province in as_list(provinces):
        prov_name = province.np
        prov_num = province.cpine
        if prov_name == DotMap() or prov_num == DotMap():
            continue
        if prov_list and prov_name not in prov_list:
            continue

        cities = cls.get_cities(prov_name).consulta_municipiero.municipiero.muni
        if is_missing(cities):
            logger.error("No cities available right now (Service is down?)")
            return

        for city in as_list(cities):
            city_name = city.nm
            city_num = city.locat.cmc
            if city_name == DotMap() or city_num == DotMap():
                continue

            addresses = cls.get_addresses(prov_name, city_name).consulta_callejero.callejero.calle
            if is_missing(addresses):
                logger.error("No addresses available right now (Service is down?)")
                return

            for address in as_list(addresses):
                address_dir = address.dir
                tv = address_dir.tv
                nv = address_dir.nv
                if tv == DotMap() or nv == DotMap():
                    continue
                yield (prov_name, prov_num, city_name, city_num, address_dir, tv, nv)
@classmethod @classmethod
def get_cadaster_by_address(cls, provincia, municipio, tipovia, nombrevia, numero): def get_cadaster_by_address(cls, provincia, municipio, tipovia, nombrevia, numero):
@ -79,11 +129,11 @@ class Scrapper:
url = cls.URL_LOCATIONS_BASE.format("/OVCCallejero.asmx/ConsultaNumero") url = cls.URL_LOCATIONS_BASE.format("/OVCCallejero.asmx/ConsultaNumero")
logger.debug("====Dir: {} {} {} {} {}====".format(tipovia, nombrevia, numero, municipio, provincia)) logger.debug("====Dir: {} {} {} {} {}====".format(tipovia, nombrevia, numero, municipio, provincia))
logger.debug("[||| ] URL for address: {}".format(url + '?' + urllib.parse.urlencode(params))) logger.debug("URL for address: {}".format(url + '?' + urllib.parse.urlencode(params)))
response = requests.get(url, params=params) response = requests.get(url, params=params)
xml = response.content xml = response.content
return xmltodict.parse(xml, process_namespaces=False, xml_attribs=False) return DotMap(xmltodict.parse(xml, process_namespaces=False, xml_attribs=False))
@classmethod @classmethod
def get_cadaster_entries_by_address(cls, provincia, municipio, sigla, calle, numero, bloque=None, escalera=None, def get_cadaster_entries_by_address(cls, provincia, municipio, sigla, calle, numero, bloque=None, escalera=None,
@ -111,11 +161,11 @@ class Scrapper:
params['Puerta'] = '' params['Puerta'] = ''
url = cls.URL_LOCATIONS_BASE.format("/OVCCallejero.asmx/Consulta_DNPLOC") url = cls.URL_LOCATIONS_BASE.format("/OVCCallejero.asmx/Consulta_DNPLOC")
logger.debug("[|||||||||| ] URL for entry: {}".format(url + '?' + urllib.parse.urlencode(params))) logger.debug("URL for entry: {}".format(url + '?' + urllib.parse.urlencode(params)))
response = requests.get(url, params=params) response = requests.get(url, params=params)
xml = response.content xml = response.content
return xmltodict.parse(xml, process_namespaces=False, xml_attribs=False) return DotMap(xmltodict.parse(xml, process_namespaces=False, xml_attribs=False))
@classmethod @classmethod
def get_cadaster_entries_by_cadaster(cls, provincia, municipio, rc): def get_cadaster_entries_by_cadaster(cls, provincia, municipio, rc):
@ -125,27 +175,28 @@ class Scrapper:
"RC": rc} "RC": rc}
url = cls.URL_LOCATIONS_BASE.format("/OVCCallejero.asmx/Consulta_DNPRC") url = cls.URL_LOCATIONS_BASE.format("/OVCCallejero.asmx/Consulta_DNPRC")
logger.debug("[|||||||||| ] URL for entry: {}".format(url + '?' + urllib.parse.urlencode(params))) logger.debug("URL for entry: {}".format(url + '?' + urllib.parse.urlencode(params)))
response = requests.get(url, params=params) response = requests.get(url, params=params)
xml = response.content xml = response.content
return xmltodict.parse(xml, process_namespaces=False, xml_attribs=False) return DotMap(xmltodict.parse(xml, process_namespaces=False, xml_attribs=False))
@classmethod @classmethod
def get_coords_from_cadaster(cls, provincia, municipio, cadaster): def get_coords_from_cadaster(cls, provincia, municipio, cadaster):
params = {'Provincia': provincia, 'Municipio': municipio, 'SRS': 'EPSG:4230', 'RC': cadaster} params = {'Provincia': provincia, 'Municipio': municipio, 'SRS': 'EPSG:4230', 'RC': cadaster}
url = cls.URL_LOCATIONS_BASE.format("/OVCCoordenadas.asmx/Consulta_CPMRC") url = cls.URL_LOCATIONS_BASE.format("/OVCCoordenadas.asmx/Consulta_CPMRC")
logger.debug("[|||||||| ] URL for coords: {}".format(url + '?' + urllib.parse.urlencode(params))) logger.debug("URL for coords: {}".format(url + '?' + urllib.parse.urlencode(params)))
response = requests.get(url, params=params) response = requests.get(url, params=params)
xml = response.content xml = response.content
return xmltodict.parse(xml, process_namespaces=False, xml_attribs=False) return DotMap(xmltodict.parse(xml, process_namespaces=False, xml_attribs=False))
@classmethod @classmethod
def scrap_site_picture(cls, prov_name, city_name, cadaster): def scrap_site_picture(cls, prov_num, city_num, cadaster):
url_pic = cls.URL_PICTURES.format(prov_name, city_name, cadaster, config['width_px'], config['height_px'])
logger.debug("[|||||||| ] URL for picture data: {}".format(url_pic)) url_pic = cls.URL_PICTURES.format(prov_num, city_num, cadaster, config['width_px'], config['height_px'])
logger.debug("URL for picture data: {}".format(url_pic))
f_pic = urlopen(url_pic) f_pic = urlopen(url_pic)

View File

@ -1,6 +1,9 @@
import unittest import unittest
from time import sleep from time import sleep
from dotmap import DotMap
from src.librecatastro.domain.cadaster_entry.cadaster_entry_xml import CadasterEntryXML from src.librecatastro.domain.cadaster_entry.cadaster_entry_xml import CadasterEntryXML
from src.librecatastro.scrapping.format.scrapper_xml import ScrapperXML from src.librecatastro.scrapping.format.scrapper_xml import ScrapperXML
from src.settings import config from src.settings import config
@ -8,23 +11,32 @@ from src.settings import config
class ScrapperXMLTests(unittest.TestCase): class ScrapperXMLTests(unittest.TestCase):
def test_scrapper_retrieves_dict_provinces(self): def test_scrapper_retrieves_dict_provinces(self):
self.assertEqual(ScrapperXML.get_provinces()['consulta_provinciero']['control']['cuprov'], '48') self.assertEqual(ScrapperXML.get_provinces().consulta_provinciero.control.cuprov, '48')
sleep(config['sleep_time'])
def test_scrapper_retrieves_dict_cities(self): def test_scrapper_retrieves_dict_cities(self):
self.assertEqual(ScrapperXML.get_cities('ALACANT')['consulta_municipiero']['control']['cumun'],'141') self.assertEqual(ScrapperXML.get_cities('ALACANT').consulta_municipiero.control.cumun, '141')
sleep(config['sleep_time']) sleep(config['sleep_time'])
def test_scrapper_retrieves_dict_addresses(self): def test_scrapper_retrieves_dict_addresses(self):
self.assertEqual(ScrapperXML.get_addresses('ALACANT','AGOST')['consulta_callejero']['control']['cuca'], '117') self.assertEqual(ScrapperXML.get_addresses('ALACANT','AGOST').consulta_callejero.control.cuca, '117')
sleep(config['sleep_time'])
def test_scrapper_retrieves_dict_addresses_iter(self):
iterator = ScrapperXML.get_address_iter()
address = iterator.__next__()
self.assertEqual(address[1], '15')
self.assertEqual(address[3], '7')
sleep(config['sleep_time']) sleep(config['sleep_time'])
def test_scrapper_creates_cadaster_entry(self): def test_scrapper_creates_cadaster_entry(self):
print(ScrapperXML.get_cadaster_entries_by_cadaster('','', '6375620YH0567S0001GW')) dotmap_res = ScrapperXML.get_cadaster_entries_by_cadaster('', '', '6375620YH0567S0001GW')
self.assertNotEqual(dotmap_res, DotMap())
sleep(config['sleep_time']) sleep(config['sleep_time'])
def test_scrapper_creates_cadaster_entry_and_stores_in_elasticsearch(self): def test_scrapper_creates_cadaster_entry_and_stores_in_elasticsearch(self):
entry = ScrapperXML.get_cadaster_entries_by_cadaster('', '', '6375620YH0567S0001GW') entry = ScrapperXML.get_cadaster_entries_by_cadaster('', '', '6375620YH0567S0001GW')
cadaster_entry = CadasterEntryXML(entry, None, None) cadaster_entry = CadasterEntryXML(entry)
cadaster_entry.to_elasticsearch() cadaster_entry.to_elasticsearch()
self.assertIsNotNone(cadaster_entry.from_elasticsearch()) self.assertIsNotNone(cadaster_entry.from_elasticsearch())
sleep(config['sleep_time']) sleep(config['sleep_time'])
@ -36,17 +48,20 @@ class ScrapperXMLTests(unittest.TestCase):
nv = u'ARZÓN' nv = u'ARZÓN'
num = 21 num = 21
entry = ScrapperXML.get_cadaster_entries_by_address(prov_name, city_name, tv, nv, num) entry = ScrapperXML.get_cadaster_entries_by_address(prov_name, city_name, tv, nv, num)
for site in entry['consulta_dnp']['lrcdnp']['rcdnp']: counter = 0
cadaster = site['rc']['pc1'] + \ for site in entry.consulta_dnp.lrcdnp.rcdnp:
site['rc']['pc2'] + \ cadaster = site.rc.pc1 + \
site['rc']['car'] + \ site.rc.pc2 + \
site['rc']['cc1'] + \ site.rc.car + \
site['rc']['cc2'] site.rc.cc1 + \
site.rc.cc2
sub_entry = ScrapperXML.get_cadaster_entries_by_cadaster(prov_name, city_name, cadaster) sub_entry = ScrapperXML.get_cadaster_entries_by_cadaster(prov_name, city_name, cadaster)
cadaster_entry = CadasterEntryXML(sub_entry, None, None) cadaster_entry = CadasterEntryXML(sub_entry)
cadaster_entry.to_elasticsearch() cadaster_entry.to_elasticsearch()
self.assertIsNotNone(cadaster_entry.from_elasticsearch()) self.assertIsNotNone(cadaster_entry.from_elasticsearch())
sleep(config['sleep_time']) sleep(config['sleep_time'])
counter += 1
self.assertEqual(counter, 2)
def test_no_use_creates_entry_in_elasticsearch(self): def test_no_use_creates_entry_in_elasticsearch(self):
prov_name = u'A CORUÑA' prov_name = u'A CORUÑA'
@ -55,14 +70,14 @@ class ScrapperXMLTests(unittest.TestCase):
nv = u'BARCALA' nv = u'BARCALA'
num = 5 num = 5
entry = ScrapperXML.get_cadaster_entries_by_address(prov_name, city_name, tv, nv, num) entry = ScrapperXML.get_cadaster_entries_by_address(prov_name, city_name, tv, nv, num)
for site in entry['consulta_dnp']['lrcdnp']['rcdnp']: for site in entry.consulta_dnp.lrcdnp.rcdnp:
cadaster = site['rc']['pc1'] + \ cadaster = site.rc.pc1 + \
site['rc']['pc2'] + \ site.rc.pc2 + \
site['rc']['car'] + \ site.rc.car + \
site['rc']['cc1'] + \ site.rc.cc1 + \
site['rc']['cc2'] site.rc.cc2
sub_entry = ScrapperXML.get_cadaster_entries_by_cadaster(prov_name, city_name, cadaster) sub_entry = ScrapperXML.get_cadaster_entries_by_cadaster(prov_name, city_name, cadaster)
cadaster_entry = CadasterEntryXML(sub_entry, None, None) cadaster_entry = CadasterEntryXML(sub_entry)
cadaster_entry.to_elasticsearch() cadaster_entry.to_elasticsearch()
self.assertIsNotNone(cadaster_entry.from_elasticsearch()) self.assertIsNotNone(cadaster_entry.from_elasticsearch())
sleep(config['sleep_time']) sleep(config['sleep_time'])
@ -74,18 +89,43 @@ class ScrapperXMLTests(unittest.TestCase):
nv = u'CASTELAO' nv = u'CASTELAO'
num = 1 num = 1
entry = ScrapperXML.get_cadaster_entries_by_address(prov_name, city_name, tv, nv, num) entry = ScrapperXML.get_cadaster_entries_by_address(prov_name, city_name, tv, nv, num)
for site in entry['consulta_dnp']['lrcdnp']['rcdnp']: for site in entry.consulta_dnp.lrcdnp.rcdnp:
cadaster = site['rc']['pc1'] + \ cadaster = site.rc.pc1 + \
site['rc']['pc2'] + \ site.rc.pc2 + \
site['rc']['car'] + \ site.rc.car + \
site['rc']['cc1'] + \ site.rc.cc1 + \
site['rc']['cc2'] site.rc.cc2
sub_entry = ScrapperXML.get_cadaster_entries_by_cadaster(prov_name, city_name, cadaster) sub_entry = ScrapperXML.get_cadaster_entries_by_cadaster(prov_name, city_name, cadaster)
cadaster_entry = CadasterEntryXML(sub_entry, None, None) cadaster_entry = CadasterEntryXML(sub_entry)
cadaster_entry.to_elasticsearch() cadaster_entry.to_elasticsearch()
self.assertIsNotNone(cadaster_entry.from_elasticsearch()) self.assertIsNotNone(cadaster_entry.from_elasticsearch())
sleep(config['sleep_time']) sleep(config['sleep_time'])
def test_no_es_pt_pu_creates_entry_in_elasticsearch_2(self):
    """Index every parcel listed for CL BEATAS 4, ALCALA DE HENARES (MADRID).

    Each cadastral reference returned for the address is looked up on its
    own, stored in Elasticsearch and then read back.
    """
    province, city = u'MADRID', u'ALCALA DE HENARES'
    street_type, street_name, number = u'CL', u'BEATAS', 4

    lookup = ScrapperXML.get_cadaster_entries_by_address(province, city, street_type, street_name, number)
    for parcel in lookup.consulta_dnp.lrcdnp.rcdnp:
        rc = parcel.rc
        # The full cadastral reference is the concatenation of its five parts.
        reference = rc.pc1 + rc.pc2 + rc.car + rc.cc1 + rc.cc2
        details = ScrapperXML.get_cadaster_entries_by_cadaster(province, city, reference)
        stored = CadasterEntryXML(details)
        stored.to_elasticsearch()
        self.assertIsNotNone(stored.from_elasticsearch())
        # Pause between parcels using the configured delay.
        sleep(config['sleep_time'])
def test_multiparcela_creates_n_entries(self):
    """Scrapping the fixed coordinate below must return exactly two entries."""
    lon, lat = -9.2503, 42.9723
    entries = ScrapperXML.scrap_coord(lon, lat, True)
    self.assertEqual(len(entries), 2)
if __name__ == '__main__': if __name__ == '__main__':
unittest.main() unittest.main()