diff --git a/README.md b/README.md index 7741c51..c0927a5 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@ #libreCATASTRO An opensource, MIT-licensed application that scraps the official Spanish -Cadaster registry and stores information in Elastic Search. +Cadaster registry and stores information in Elasticsearch. **Features** diff --git a/libreCadastro.py b/libreCadastro.py index 2655b61..79cb8ee 100644 --- a/libreCadastro.py +++ b/libreCadastro.py @@ -4,10 +4,10 @@ import sys import argparse -from src.librecatastro.scrapping.format.scrapper_html import ScrapperHTML -from src.librecatastro.scrapping.format.scrapper_xml import ScrapperXML -from src.librecatastro.scrapping.searchers.coordinates_search import CoordinatesSearch -from src.librecatastro.scrapping.searchers.provinces_search import ProvincesSearch +from src.librecatastro.scrapping.parsers.parser_html import ParserHTML +from src.librecatastro.scrapping.parsers.parser_xml import ParserXML +from src.librecatastro.scrapping.searchers.coordinates_searcher import CoordinatesSearcher +from src.librecatastro.scrapping.searchers.provinces_searcher import ProvincesSearcher from src.settings import config if __name__ == "__main__": @@ -31,7 +31,7 @@ if __name__ == "__main__": if args.scale: config['scale'] = args.scale - scrapper = ScrapperHTML if args.html else ScrapperXML + scrapper = ParserHTML if args.html else ParserXML filenames = args.filenames pictures = args.pictures @@ -39,14 +39,14 @@ startcity = args.startcity if args.listprovinces: - ProvincesSearch.list_provinces() + ProvincesSearcher.list_provinces() exit(0) if len(args.listcities) == 1: - ProvincesSearch.list_cities(args.listcities[0]) + ProvincesSearcher.list_cities(args.listcities[0]) exit(0) if args.coords: - CoordinatesSearch.scrap_coordinates(scrapper, filenames, pictures) + CoordinatesSearcher.search_by_coordinates(scrapper, filenames, pictures) else: - ProvincesSearch.scrap_provinces(scrapper, provinces,
pictures, startcity) + ProvincesSearcher.search_by_provinces(scrapper, provinces, pictures, startcity) diff --git a/src/librecatastro/domain/address.py b/src/librecatastro/domain/address.py index ce8b0be..0753df7 100644 --- a/src/librecatastro/domain/address.py +++ b/src/librecatastro/domain/address.py @@ -11,7 +11,7 @@ logger = CadastroLogger(__name__).logger class Address: - """ Domain class for storing Address in Catastro format""" + """ Domain class for storing Address in Catastro parsers""" def __init__(self, address): self.full_address = address.strip() diff --git a/src/librecatastro/scrapping/parser.py b/src/librecatastro/scrapping/parser.py new file mode 100644 index 0000000..cb2fa8d --- /dev/null +++ b/src/librecatastro/scrapping/parser.py @@ -0,0 +1,23 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +from src.utils.cadastro_logger import CadastroLogger + +'''Logger''' +logger = CadastroLogger(__name__).logger + + +class Parser: + """Generic Parser class""" + + def __init__(self): + pass + + ''' Processing signatures''' + @classmethod + def process_search_by_coordinates(cls, x, y, pictures=False): + pass + + @classmethod + def process_search_by_provinces(cls, prov_list, pictures=False): + pass \ No newline at end of file diff --git a/src/librecatastro/scrapping/format/__init__.py b/src/librecatastro/scrapping/parsers/__init__.py similarity index 100% rename from src/librecatastro/scrapping/format/__init__.py rename to src/librecatastro/scrapping/parsers/__init__.py diff --git a/src/librecatastro/scrapping/format/scrapper_html.py b/src/librecatastro/scrapping/parsers/parser_html.py similarity index 60% rename from src/librecatastro/scrapping/format/scrapper_html.py rename to src/librecatastro/scrapping/parsers/parser_html.py index a06ca4e..4b3ef4e 100644 --- a/src/librecatastro/scrapping/format/scrapper_html.py +++ b/src/librecatastro/scrapping/parsers/parser_html.py @@ -1,17 +1,16 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- -import re import 
urllib.error from time import sleep -from urllib.request import urlopen from xml.etree import ElementTree -from bs4 import BeautifulSoup from dotmap import DotMap from src.librecatastro.domain.cadaster_entry.cadaster_entry_html import CadasterEntryHTML +from src.librecatastro.scrapping.parser import Parser from src.librecatastro.scrapping.scrapper import Scrapper +from src.librecatastro.scrapping.scrappers.scrapper_html import ScrapperHTML from src.settings import config from src.utils.cadastro_logger import CadastroLogger @@ -20,32 +19,23 @@ from src.utils.cadastro_logger import CadastroLogger logger = CadastroLogger(__name__).logger -class ScrapperHTML(Scrapper): - """Scrapper class for Catastro HTML""" +class ParserHTML(Parser): + """Parser class for Catastro HTML""" def __init__(self): super().__init__() - '''Catastro web services parametrized''' - URL = "http://ovc.catastro.meh.es/ovcservweb/ovcswlocalizacionrc/ovccoordenadas.asmx/Consulta_RCCOOR?SRS=EPSG:4230&Coordenada_X={}&Coordenada_Y={}" - - URL_REF = "https://www1.sedecatastro.gob.es/CYCBienInmueble/OVCListaBienes.aspx?rc1={}&rc2={}" - URL_REF_FULL = "https://www1.sedecatastro.gob.es/CYCBienInmueble/OVCConCiud.aspx?RefC={}&RCCompleta={}&del={}&mun={}" - '''Information to scrap from HTML''' description_field_names = [u'Referencia catastral', u'Localización', u'Clase', u'Uso principal', u'Superficie construida', u'Año construcción'] + gsurface_field_names = [u'Superficie gráfica'] - """ Scrapping calls """ - + """ Processing """ @classmethod - def scrap_coord(cls, x, y, pictures=False): - logger.debug("====Longitude: {} Latitude: {}====".format(x, y)) - url = cls.URL.format(x, y) - logger.debug("URL for coordinates: {}".format(url)) - f = urlopen(url) - data = f.read() + def process_search_by_coordinates(cls, x, y, pictures=False): + data = ScrapperHTML.scrap_coord(x, y) + root = ElementTree.fromstring(data) pc1 = root.find( 
"{http://www.catastro.meh.es/}coordenadas//{http://www.catastro.meh.es/}coord//{http://www.catastro.meh.es/}pc//{http://www.catastro.meh.es/}pc1") @@ -55,17 +45,19 @@ class ScrapperHTML(Scrapper): results = [] if pc1 is not None and pc2 is not None: cadaster = ''.join([pc1.text, pc2.text]) - cadaster_entries = cls.scrap_cadaster(cadaster, None, None, x, y, pictures) - for cadaster_entry in cadaster_entries: + htmls = ScrapperHTML.scrap_cadaster(cadaster, None, None, pictures) + for html, picture in htmls: + cadaster_entry = cls.parse_html_parcela(html, x, y, picture) cadaster_entry.to_elasticsearch() results.append(cadaster_entry) return results @classmethod - def scrap_provinces(cls, prov_list, pictures=False, start_from=''): + def process_search_by_provinces(cls, prov_list, pictures=False, start_from=''): - for prov_name, prov_num, city_name, city_num, address, tv, nv in cls.get_address_iter(prov_list, start_from): + num = '' + for prov_name, prov_num, city_name, city_num, address, tv, nv in Scrapper.get_address_iter(prov_list, start_from): if tv == DotMap() or nv == DotMap(): continue @@ -74,7 +66,7 @@ class ScrapperHTML(Scrapper): counter = 1 while num_scrapping_fails > 0: try: - numerero_map = cls.get_cadaster_by_address(prov_name, city_name, tv, nv, counter) + numerero_map = Scrapper.get_cadaster_by_address(prov_name, city_name, tv, nv, counter) if numerero_map.consulta_numerero.lerr.err.cod != DotMap(): num_scrapping_fails -= 1 else: @@ -98,7 +90,7 @@ class ScrapperHTML(Scrapper): cadaster_num = nump.pc.pc1 + nump.pc.pc2 - coords_map = cls.get_coords_from_cadaster(prov_name, city_name, cadaster_num) + coords_map = Scrapper.get_coords_from_cadaster(prov_name, city_name, cadaster_num) lon = coords_map.consulta_coordenadas.coordenadas.coord.geo.xcen if lon == DotMap(): @@ -113,13 +105,13 @@ class ScrapperHTML(Scrapper): num_scrapping_fails = 10 - cadaster_list = cls.scrap_cadaster(cadaster_num, prov_num, city_num, lon, lat, pictures) + htmls =
ScrapperHTML.scrap_cadaster(cadaster_num, prov_num, city_num, pictures) - for cadaster in cadaster_list: - cadaster.to_elasticsearch() + for html, picture in htmls: + cadaster_entry = cls.parse_html_parcela(html, lon, lat, picture) + cadaster_entry.to_elasticsearch() counter += 1 - sleep(config['sleep_time']) except urllib.error.HTTPError as e: logger.error( @@ -141,72 +133,6 @@ class ScrapperHTML(Scrapper): num_scrapping_fails -= 1 counter += 1 - sleep(config['sleep_time']) - - - @classmethod - def scrap_cadaster_full_code(cls, full_cadaster, delimitacion, municipio, x=None, y=None, picture=None): - url_ref = cls.URL_REF_FULL.format(full_cadaster, full_cadaster, delimitacion, municipio) - logger.debug("-->FULL URL for cadastral data: {}".format(url_ref)) - f_ref = urlopen(url_ref) - data_ref = f_ref.read() - html = str(data_ref.decode('utf-8')) - parsed_html = BeautifulSoup(html, features="html.parser") - return ScrapperHTML.parse_html_parcela(parsed_html, x, y, picture) - - @classmethod - def scrap_cadaster(cls, cadaster, delimitacion=None, municipio=None, x=None, y=None, pictures=False): - rc_1 = cadaster[0:7] - rc_2 = cadaster[7:14] - url_ref = cls.URL_REF.format(rc_1, rc_2) - - logger.debug("URL for cadastral data: {}".format(url_ref)) - - f_ref = urlopen(url_ref) - data_ref = f_ref.read() - html = str(data_ref.decode('utf-8')) - parsed_html = BeautifulSoup(html, features="html.parser") - - if delimitacion is None: - delimitacion_search = re.search(r'del=([0-9]+)&', html) - if delimitacion_search: - delimitacion = delimitacion_search.group(1) - - if municipio is None: - municipio_search = re.search(r'mun=([0-9]+)&', html) - if municipio_search: - municipio = municipio_search.group(1) - - picture = None - if pictures: - picture = cls.scrap_site_picture(delimitacion, municipio, ''.join([rc_1, rc_2])) - sleep(config['sleep_time']) - - description = parsed_html.find(id='ctl00_Contenido_tblInmueble') - - cadasters = [] - if description is None: - 
logger.debug("Multiparcela found!") - ''' Multiparcela with multiple cadasters ''' - - all_cadasters = parsed_html.findAll("div", {"id": re.compile('heading[0-9]+')}) - logger.debug("->Parcelas found: {}".format(len(all_cadasters))) - for partial_cadaster in all_cadasters: - partial_cadaster_ref = partial_cadaster.find("b") - logger.debug("-->Partial cadaster: {}".format(partial_cadaster_ref.text)) - partial_cadaster_text = partial_cadaster_ref.text.strip() - cadaster = ScrapperHTML.scrap_cadaster_full_code(partial_cadaster_text, delimitacion, municipio, x, y, - picture) - cadasters.append(cadaster) - sleep(config['sleep_time']) - - else: - cadaster = ScrapperHTML.parse_html_parcela(parsed_html, x, y, picture) - - cadasters.append(cadaster) - - sleep(config['sleep_time']) - return cadasters """ Parsing """ @classmethod @@ -260,5 +186,6 @@ class ScrapperHTML(Scrapper): dict(uso=columns[0].text, escalera=columns[1].text, planta=columns[2].text, puerta=columns[3].text, superficie=columns[4].text, tipo=columns[5].text, fecha=columns[6].text)) + descriptive_data[u'GráficoParcela']=picture cadaster_entry = CadasterEntryHTML(descriptive_data) return cadaster_entry diff --git a/src/librecatastro/scrapping/format/scrapper_xml.py b/src/librecatastro/scrapping/parsers/parser_xml.py similarity index 57% rename from src/librecatastro/scrapping/format/scrapper_xml.py rename to src/librecatastro/scrapping/parsers/parser_xml.py index ae1c5e3..af79a9e 100644 --- a/src/librecatastro/scrapping/format/scrapper_xml.py +++ b/src/librecatastro/scrapping/parsers/parser_xml.py @@ -4,14 +4,13 @@ import urllib.parse from urllib import error -from time import sleep - import requests import xmltodict from src.librecatastro.domain.cadaster_entry.cadaster_entry_xml import CadasterEntryXML +from src.librecatastro.scrapping.parser import Parser from src.librecatastro.scrapping.scrapper import Scrapper -from src.settings import config +from src.librecatastro.scrapping.scrappers.scrapper_xml import 
ScrapperXML from src.utils.cadastro_logger import CadastroLogger from dotmap import DotMap @@ -20,28 +19,20 @@ from dotmap import DotMap logger = CadastroLogger(__name__).logger -class ScrapperXML(Scrapper): - """Scrapper class for Catastro XML""" +class ParserXML(Parser): + """Parser class for Catastro XML""" def __init__(self): super().__init__() - """ Scrapping main calls """ - + ''' Processing calls ''' @classmethod - def scrap_coord(cls, x, y, pictures=False): + def process_search_by_coordinates(cls, x, y, pictures=False): """Scraps properties by coordinates""" results = [] - params = {'SRS': 'EPSG:4230', 'Coordenada_X': x, 'Coordenada_Y': y} - url = cls.URL_LOCATIONS_BASE.format("/OVCCoordenadas.asmx/Consulta_RCCOOR") - response = requests.get(url, params=params) - - logger.debug("====Longitude: {} Latitude: {}====".format(x, y)) - logger.debug("URL for coordinates: {}".format(url + '?' + urllib.parse.urlencode(params))) - - xml_dict_map = DotMap(xmltodict.parse(response.content, process_namespaces=False, xml_attribs=False)) + xml_dict_map = ScrapperXML.get_coord(x, y) pc1 = None pc2 = None if xml_dict_map.consulta_coordenadas.coordenadas.coord.pc != DotMap(): @@ -55,7 +46,7 @@ class ScrapperXML(Scrapper): if pc1 is not None and pc2 is not None: - entry = cls.get_cadaster_entries_by_cadaster('', '', ''.join([pc1, pc2])) + entry = ScrapperXML.get_cadaster_entries_by_cadaster('', '', ''.join([pc1, pc2])) picture = None if entry.consulta_dnp.bico.bi.dt.loine != DotMap(): # Parcela @@ -63,42 +54,78 @@ class ScrapperXML(Scrapper): prov_num = entry.consulta_dnp.bico.bi.dt.loine.cp city_num = entry.consulta_dnp.bico.bi.dt.cmc if prov_num != DotMap() and city_num != DotMap(): - picture = cls.scrap_site_picture(prov_num, city_num, ''.join([pc1, pc2])) - cadaster_entry = CadasterEntryXML.create_from_bico(entry, x, y, picture) + picture = Scrapper.scrap_site_picture(prov_num, city_num, ''.join([pc1, pc2])) + cadaster_entry = CadasterEntryXML(entry, x, y, picture) 
cadaster_entry.to_elasticsearch() - sleep(config['sleep_time']) results.append(cadaster_entry) elif entry.consulta_dnp.lrcdnp.rcdnp != DotMap(): # Multiparcela parcelas = entry.consulta_dnp.lrcdnp.rcdnp if not isinstance(parcelas, list): parcelas = [parcelas] + for parcela in parcelas: + prov_num = parcela.dt.loine.cp + city_num = parcela.dt.cmc + cadaster = parcela.rc.pc1 if parcela.rc.pc1 != DotMap() else '' cadaster += parcela.rc.pc2 if parcela.rc.pc2 != DotMap() else '' cadaster += parcela.rc.car if parcela.rc.car != DotMap() else '' cadaster += parcela.rc.cc1 if parcela.rc.cc1 != DotMap() else '' cadaster += parcela.rc.cc2 if parcela.rc.cc2 != DotMap() else '' - if pictures: - prov_num = parcela.dt.loine.cp - city_num = parcela.dt.cmc - if prov_num != DotMap() and city_num != DotMap(): - picture = cls.scrap_site_picture(prov_num, city_num, cadaster) + if pictures and prov_num != DotMap() and city_num != DotMap(): + picture = Scrapper.scrap_site_picture(prov_num, city_num, cadaster) - parcela = cls.get_cadaster_entries_by_cadaster('', '', cadaster) - cadaster_entry = CadasterEntryXML(parcela, x, y, picture) + try: + # Try to get info by complete cadaster num + sub_entry = ScrapperXML.get_cadaster_entries_by_cadaster(prov_num, city_num, cadaster) + except: + # Cadastro did not return anything by cadaster entry (error? bug?) 
+ # Try to get it by complete address + prov_name = parcela.dt.np + if prov_name is DotMap(): + continue + city_name = parcela.dt.nm + if city_name is DotMap(): + continue + tv = parcela.dt.locs.lous.lourb.dir.tv + if tv is DotMap(): + tv = '' + nv = parcela.dt.locs.lous.lourb.dir.nv + if nv is DotMap(): + nv = '' + num = parcela.dt.locs.lous.lourb.dir.pnp + if num is DotMap(): + num = '' + + loint = parcela.dt.locs.lous.lourb.loint + if loint is DotMap(): + continue + bl = loint.bl + if bl == DotMap(): + bl = '' + es = loint.es + if es == DotMap(): + es = '' + pt = loint.pt + if pt == DotMap(): + pt = '' + pu = loint.pu + if pu == DotMap(): + pu = '' + sub_entry = ScrapperXML.get_cadaster_entries_by_address(prov_name, city_name, tv, nv, num, bl, es, pt, pu) + + cadaster_entry = CadasterEntryXML(sub_entry, x, y, picture) + cadaster_entry.to_elasticsearch() results.append(cadaster_entry) - - sleep(config['sleep_time']) return results @classmethod - def scrap_provinces(cls, prov_list, pictures=False, start_from=''): - for prov_name, prov_num, city_name, city_num, address, tv, nv in cls.get_address_iter(prov_list, start_from): + def process_search_by_provinces(cls, prov_list, pictures=False, start_from=''): + for prov_name, prov_num, city_name, city_num, address, tv, nv in Scrapper.get_address_iter(prov_list, start_from): if tv == DotMap() or nv == DotMap(): continue @@ -106,13 +133,12 @@ class ScrapperXML(Scrapper): counter = 1 while num_scrapping_fails > 0: try: - cadaster = cls.get_cadaster_by_address(prov_name, city_name, tv, nv, counter) + cadaster = ScrapperXML.get_cadaster_by_address(prov_name, city_name, tv, nv, counter) res = cls.process_xml_by_address(cadaster, prov_name, city_name, tv, nv, counter, pictures) if len(res) < 1: num_scrapping_fails -= 1 else: num_scrapping_fails = 10 - sleep(config['sleep_time']) except urllib.error.HTTPError as e: logger.error( @@ -123,7 +149,6 @@ class ScrapperXML(Scrapper):
logger.error("=============================================") ''' Could be a service Unavailable or denegation of service''' num_scrapping_fails -= 1 - sleep(config['sleep_dos_time']) except Exception as e: logger.error( @@ -134,7 +159,8 @@ class ScrapperXML(Scrapper): num_scrapping_fails -= 1 counter += 1 - sleep(config['sleep_time']) + + ''' Parsing calls ''' @classmethod def process_xml_by_address(cls, numerero_map, prov_name, city_name, tv, nv, num, pictures=False): @@ -161,7 +187,7 @@ class ScrapperXML(Scrapper): cadaster_num = nump.pc.pc1 + nump.pc.pc2 - coords_map = cls.get_coords_from_cadaster(prov_name, city_name, cadaster_num) + coords_map = ScrapperXML.get_coords_from_cadaster(prov_name, city_name, cadaster_num) lon = coords_map.consulta_coordenadas.coordenadas.coord.geo.xcen if lon == DotMap(): lon = None @@ -173,7 +199,7 @@ class ScrapperXML(Scrapper): ''' Adding to tracking file''' logger.info('{},{}'.format(lon, lat)) - entry_map = cls.get_cadaster_entries_by_address(prov_name, city_name, tv, nv, num) + entry_map = ScrapperXML.get_cadaster_entries_by_address(prov_name, city_name, tv, nv, num) picture = None if entry_map.consulta_dnp.bico != DotMap(): @@ -181,14 +207,13 @@ class ScrapperXML(Scrapper): city_num = entry_map.consulta_dnp.bico.bi.dt.loine.cm if pictures and prov_num != DotMap() and city_num != DotMap(): - picture = cls.scrap_site_picture(prov_num, city_num, cadaster_num) + picture = Scrapper.scrap_site_picture(prov_num, city_num, cadaster_num) # Parcela cadaster_entry = CadasterEntryXML(entry_map, lon, lat, picture) results.append(cadaster_entry) cadaster_entry.to_elasticsearch() - sleep(config['sleep_time']) elif entry_map.consulta_dnp.lrcdnp.rcdnp != DotMap(): # Multiparcela for site in entry_map.consulta_dnp.lrcdnp.rcdnp: @@ -208,18 +233,38 @@ class ScrapperXML(Scrapper): cadaster += parcela.rc.cc1 if parcela.rc.cc1 != DotMap() else '' cadaster += parcela.rc.cc2 if parcela.rc.cc2 != DotMap() else '' - if pictures: - prov_num = 
parcela.dt.loine.cp - city_num = parcela.dt.cmc - if prov_num != DotMap() and city_num != DotMap(): - picture = cls.scrap_site_picture(prov_num, city_num, cadaster) + prov_num = parcela.dt.loine.cp + city_num = parcela.dt.cmc - parcela = cls.get_cadaster_entries_by_cadaster('', '', cadaster) - cadaster_entry = CadasterEntryXML(parcela, lon, lat, picture) + if pictures and prov_num != DotMap() and city_num != DotMap(): + picture = Scrapper.scrap_site_picture(prov_num, city_num, cadaster) + + try: + # Try to get info by complete cadaster num + sub_entry = ScrapperXML.get_cadaster_entries_by_cadaster(prov_num, city_num, cadaster) + except: + # Cadastro did not return anything by cadaster entry (error? bug?) + # Try to get it by complete address + loint = parcela.dt.locs.lous.lourb.loint + if loint is DotMap(): + continue + bl = loint.bl + if bl == DotMap(): + bl = '' + es = loint.es + if es == DotMap(): + es = '' + pt = loint.pt + if pt == DotMap(): + pt = '' + pu = loint.pu + if pu == DotMap(): + pu = '' + sub_entry = ScrapperXML.get_cadaster_entries_by_address(prov_name, city_name, tv, nv, num, bl, es, pt, pu) + + cadaster_entry = CadasterEntryXML(sub_entry, lon, lat, picture) cadaster_entry.to_elasticsearch() results.append(cadaster_entry) - sleep(config['sleep_time']) - - return results diff --git a/src/librecatastro/scrapping/scrapper.py b/src/librecatastro/scrapping/scrapper.py index e9a4aa5..82c820c 100644 --- a/src/librecatastro/scrapping/scrapper.py +++ b/src/librecatastro/scrapping/scrapper.py @@ -1,9 +1,7 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- - import base64 -import urllib.parse +from time import sleep from urllib.request import urlopen +import urllib.parse import requests import xmltodict @@ -17,29 +15,21 @@ logger = CadastroLogger(__name__).logger class Scrapper: - """Generic Scrapper class""" - - '''Catastro web services parametrized''' - URL_LOCATIONS_BASE = "http://ovc.catastro.meh.es/ovcservweb/OVCSWLocalizacionRC{}" + """Catastro web
services parametrized""" URL_PICTURES = "https://www1.sedecatastro.gob.es/Cartografia/GeneraGraficoParcela.aspx?del={}&mun={}&refcat={}&AnchoPixels={}&AltoPixels={}" + URL_LOCATIONS_BASE = "http://ovc.catastro.meh.es/ovcservweb/OVCSWLocalizacionRC{}" def __init__(self): pass - @classmethod - def scrap_coords(cls, x, y, pictures=False): - pass - - @classmethod - def scrap_provinces(cls, prov_list, pictures=False): - pass - @classmethod def get_provinces(cls): url = cls.URL_LOCATIONS_BASE.format("/OVCCallejero.asmx/ConsultaProvincia") response = requests.get(url) xml = response.content + + sleep(config['sleep_time']) return DotMap(xmltodict.parse(xml, process_namespaces=False, xml_attribs=False)) @classmethod @@ -52,6 +42,8 @@ class Scrapper: url = cls.URL_LOCATIONS_BASE.format("/OVCCallejero.asmx/ConsultaMunicipio") response = requests.get(url, params=params) xml = response.content + + sleep(config['sleep_time']) return DotMap(xmltodict.parse(xml, process_namespaces=False, xml_attribs=False)) @classmethod @@ -70,6 +62,8 @@ class Scrapper: url = cls.URL_LOCATIONS_BASE.format("/OVCCallejero.asmx/ConsultaVia") response = requests.get(url, params=params) xml = response.content + + sleep(config['sleep_time']) return DotMap(xmltodict.parse(xml, process_namespaces=False, xml_attribs=False)) @classmethod @@ -125,6 +119,22 @@ class Scrapper: else: yield (prov_name, prov_num, city_name, city_num, address_dir, tv, nv) + @classmethod + def scrap_site_picture(cls, prov_num, city_num, cadaster): + + url_pic = cls.URL_PICTURES.format(prov_num, city_num, cadaster, config['width_px'], config['height_px']) + + logger.debug("URL for picture data: {}".format(url_pic)) + + f_pic = urlopen(url_pic) + + data_ref = f_pic.read() + + b64_image = base64.b64encode(data_ref).decode('utf-8') + + sleep(config['sleep_time']) + return b64_image + @classmethod def get_cadaster_by_address(cls, provincia, municipio, tipovia, nombrevia, numero): params = {'Provincia': provincia, @@ -140,77 +150,20 @@ 
class Scrapper: response = requests.get(url, params=params) xml = response.content - return DotMap(xmltodict.parse(xml, process_namespaces=False, xml_attribs=False)) - @classmethod - def get_cadaster_entries_by_address(cls, provincia, municipio, sigla, calle, numero, bloque=None, escalera=None, - planta=None,puerta=None): - params = {'Provincia': provincia, - 'Municipio': municipio, - 'Sigla': sigla, - 'Calle': calle, - 'Numero': str(numero)} - if bloque: - params['Bloque'] = str(bloque) - else: - params['Bloque'] = '' - if escalera: - params['Escalera'] = escalera - else: - params['Escalera'] = '' - if planta: - params['Planta'] = str(planta) - else: - params['Planta'] = '' - if puerta: - params['Puerta'] = str(puerta) - else: - params['Puerta'] = '' - - url = cls.URL_LOCATIONS_BASE.format("/OVCCallejero.asmx/Consulta_DNPLOC") - logger.debug("URL for entry: {}".format(url + '?' + urllib.parse.urlencode(params))) - - response = requests.get(url, params=params) - xml = response.content - - return DotMap(xmltodict.parse(xml, process_namespaces=False, xml_attribs=False)) - - @classmethod - def get_cadaster_entries_by_cadaster(cls, provincia, municipio, rc): - """ provincia and municipio are optional and can be set to ''""" - params = {"Provincia": provincia, - "Municipio": municipio, - "RC": rc} - - url = cls.URL_LOCATIONS_BASE.format("/OVCCallejero.asmx/Consulta_DNPRC") - logger.debug("URL for entry: {}".format(url + '?' 
+ urllib.parse.urlencode(params))) - response = requests.get(url, params=params) - xml = response.content + sleep(config['sleep_time']) return DotMap(xmltodict.parse(xml, process_namespaces=False, xml_attribs=False)) @classmethod def get_coords_from_cadaster(cls, provincia, municipio, cadaster): - params = {'Provincia': provincia, 'Municipio': municipio, 'SRS': 'EPSG:4230', 'RC': cadaster} + params = {'Provincia': provincia, 'Municipio': municipio, 'SRS': 'EPSG:4326', 'RC': cadaster} url = cls.URL_LOCATIONS_BASE.format("/OVCCoordenadas.asmx/Consulta_CPMRC") - logger.debug("URL for coords: {}".format(url + '?' + urllib.parse.urlencode(params))) + logger.debug("URL for coordinates: {}".format(url + '?' + urllib.parse.urlencode(params))) response = requests.get(url, params=params) xml = response.content + + sleep(config['sleep_time']) return DotMap(xmltodict.parse(xml, process_namespaces=False, xml_attribs=False)) - @classmethod - def scrap_site_picture(cls, prov_num, city_num, cadaster): - - url_pic = cls.URL_PICTURES.format(prov_num, city_num, cadaster, config['width_px'], config['height_px']) - - logger.debug("URL for picture data: {}".format(url_pic)) - - f_pic = urlopen(url_pic) - - data_ref = f_pic.read() - - b64_image = base64.b64encode(data_ref).decode('utf-8') - - return b64_image - diff --git a/src/templates/__init__.py b/src/librecatastro/scrapping/scrappers/__init__.py similarity index 100% rename from src/templates/__init__.py rename to src/librecatastro/scrapping/scrappers/__init__.py diff --git a/src/librecatastro/scrapping/scrappers/scrapper_html.py b/src/librecatastro/scrapping/scrappers/scrapper_html.py new file mode 100644 index 0000000..d8ac481 --- /dev/null +++ b/src/librecatastro/scrapping/scrappers/scrapper_html.py @@ -0,0 +1,99 @@ +import re +from time import sleep +from urllib.request import urlopen + +from bs4 import BeautifulSoup + +from src.librecatastro.scrapping.scrapper import Scrapper +from src.settings import config +from 
src.utils.cadastro_logger import CadastroLogger + +'''Logger''' +logger = CadastroLogger(__name__).logger + + +class ScrapperHTML(Scrapper): + """HTML Catastro Scrapper""" + + URL = "http://ovc.catastro.meh.es/ovcservweb/ovcswlocalizacionrc/ovccoordenadas.asmx/Consulta_RCCOOR?SRS=EPSG:4230&Coordenada_X={}&Coordenada_Y={}" + URL_REF = "https://www1.sedecatastro.gob.es/CYCBienInmueble/OVCListaBienes.aspx?rc1={}&rc2={}" + URL_REF_FULL = "https://www1.sedecatastro.gob.es/CYCBienInmueble/OVCConCiud.aspx?RefC={}&RCCompleta={}&del={}&mun={}" + + def __init__(self): + super().__init__() + + @classmethod + def scrap_coord(cls, x, y): + logger.debug("====Longitude: {} Latitude: {}====".format(x, y)) + url = cls.URL.format(x, y) + logger.debug("URL for coordinates: {}".format(url)) + f = urlopen(url) + + sleep(config['sleep_time']) + return f.read() + + @classmethod + def scrap_cadaster_full_code(cls, full_cadaster, delimitacion, municipio): + url_ref = cls.URL_REF_FULL.format(full_cadaster, full_cadaster, delimitacion, municipio) + logger.debug("-->FULL URL for cadastral data: {}".format(url_ref)) + f_ref = urlopen(url_ref) + data_ref = f_ref.read() + html = str(data_ref.decode('utf-8')) + parsed_html = BeautifulSoup(html, features="html.parser") + + sleep(config['sleep_time']) + return parsed_html + + @classmethod + def scrap_cadaster(cls, cadaster, delimitacion=None, municipio=None, pictures=False): + rc_1 = cadaster[0:7] + rc_2 = cadaster[7:14] + url_ref = cls.URL_REF.format(rc_1, rc_2) + + logger.debug("URL for cadastral data: {}".format(url_ref)) + + f_ref = urlopen(url_ref) + data_ref = f_ref.read() + sleep(config['sleep_time']) + + html = str(data_ref.decode('utf-8')) + parsed_html = BeautifulSoup(html, features="html.parser") + + if delimitacion is None: + delimitacion_search = re.search(r'del=([0-9]+)&', html) + if delimitacion_search: + delimitacion = delimitacion_search.group(1) + + if municipio is None: + municipio_search = re.search(r'mun=([0-9]+)&', html) + if
municipio_search: + municipio = municipio_search.group(1) + + description = parsed_html.find(id='ctl00_Contenido_tblInmueble') + + picture = None + if pictures: + picture = cls.scrap_site_picture(delimitacion, municipio, ''.join([rc_1, rc_2])) + sleep(config['sleep_time']) + + htmls = [] + if description is None: + # Multiparcela + logger.debug("Multiparcela found!") + ''' Multiparcela with multiple cadasters ''' + + all_cadasters = parsed_html.findAll("div", {"id": re.compile('heading[0-9]+')}) + logger.debug("->Parcelas found: {}".format(len(all_cadasters))) + for partial_cadaster in all_cadasters: + partial_cadaster_ref = partial_cadaster.find("b") + logger.debug("-->Partial cadaster: {}".format(partial_cadaster_ref.text)) + partial_cadaster_text = partial_cadaster_ref.text.strip() + html = ScrapperHTML.scrap_cadaster_full_code(partial_cadaster_text, delimitacion, municipio) + htmls.append((html, picture)) + sleep(config['sleep_time']) + else: + # Parcela + htmls.append((parsed_html, picture)) + + return htmls + diff --git a/src/librecatastro/scrapping/scrappers/scrapper_xml.py b/src/librecatastro/scrapping/scrappers/scrapper_xml.py new file mode 100644 index 0000000..4fc66e4 --- /dev/null +++ b/src/librecatastro/scrapping/scrappers/scrapper_xml.py @@ -0,0 +1,83 @@ +import urllib.parse +from time import sleep + +import requests +import xmltodict +from dotmap import DotMap + +from src.librecatastro.scrapping.scrapper import Scrapper +from src.settings import config +from src.utils.cadastro_logger import CadastroLogger + +'''Logger''' +logger = CadastroLogger(__name__).logger + + +class ScrapperXML(Scrapper): + + def __init__(self): + super().__init__() + + @classmethod + def get_coord(cls,x, y): + params = {'SRS': 'EPSG:4230', 'Coordenada_X': x, 'Coordenada_Y': y} + url = cls.URL_LOCATIONS_BASE.format("/OVCCoordenadas.asmx/Consulta_RCCOOR") + response = requests.get(url, params=params) + + logger.debug("====Longitude: {} Latitude: {}====".format(x, y)) +
logger.debug("URL for coordinates: {}".format(url + '?' + urllib.parse.urlencode(params))) + + xml_dict_map = DotMap(xmltodict.parse(response.content, process_namespaces=False, xml_attribs=False)) + + sleep(config['sleep_time']) + return xml_dict_map + + @classmethod + def get_cadaster_entries_by_cadaster(cls, provincia, municipio, rc): + """ provincia and municipio are optional and can be set to '' """ + + params = {"Provincia": provincia, + "Municipio": municipio, + "RC": rc} + + url = cls.URL_LOCATIONS_BASE.format("/OVCCallejero.asmx/Consulta_DNPRC") + logger.debug("URL for entry: {}".format(url + '?' + urllib.parse.urlencode(params))) + response = requests.get(url, params=params) + xml = response.content + + sleep(config['sleep_time']) + return DotMap(xmltodict.parse(xml, process_namespaces=False, xml_attribs=False)) + + @classmethod + def get_cadaster_entries_by_address(cls, provincia, municipio, sigla, calle, numero, bloque=None, escalera=None, + planta=None,puerta=None): + params = {'Provincia': provincia, + 'Municipio': municipio, + 'Sigla': sigla, + 'Calle': calle, + 'Numero': str(numero)} + if bloque: + params['Bloque'] = str(bloque) + else: + params['Bloque'] = '' + if escalera: + params['Escalera'] = escalera + else: + params['Escalera'] = '' + if planta: + params['Planta'] = str(planta) + else: + params['Planta'] = '' + if puerta: + params['Puerta'] = str(puerta) + else: + params['Puerta'] = '' + + url = cls.URL_LOCATIONS_BASE.format("/OVCCallejero.asmx/Consulta_DNPLOC") + logger.debug("URL for entry: {}".format(url + '?' 
+ urllib.parse.urlencode(params))) + + response = requests.get(url, params=params) + xml = response.content + + sleep(config['sleep_time']) + return DotMap(xmltodict.parse(xml, process_namespaces=False, xml_attribs=False)) \ No newline at end of file diff --git a/src/librecatastro/scrapping/search.py b/src/librecatastro/scrapping/searcher.py similarity index 83% rename from src/librecatastro/scrapping/search.py rename to src/librecatastro/scrapping/searcher.py index a071379..b3bfec1 100644 --- a/src/librecatastro/scrapping/search.py +++ b/src/librecatastro/scrapping/searcher.py @@ -1,6 +1,7 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- -class Search: + +class Searcher: def __init__(self): pass diff --git a/src/librecatastro/scrapping/searchers/coordinates_search.py b/src/librecatastro/scrapping/searchers/coordinates_searcher.py similarity index 87% rename from src/librecatastro/scrapping/searchers/coordinates_search.py rename to src/librecatastro/scrapping/searchers/coordinates_searcher.py index 0999e27..cf57354 100644 --- a/src/librecatastro/scrapping/searchers/coordinates_search.py +++ b/src/librecatastro/scrapping/searchers/coordinates_searcher.py @@ -8,7 +8,7 @@ import random from time import sleep from src.librecatastro.domain.geometry.geo_polygon import GeoPolygon -from src.librecatastro.scrapping.search import Search +from src.librecatastro.scrapping.searcher import Searcher from src.settings import config from src.utils.cadastro_logger import CadastroLogger from src.utils.list_utils import ListUtils @@ -17,12 +17,12 @@ from src.utils.list_utils import ListUtils logger = CadastroLogger(__name__).logger -class CoordinatesSearch(Search): +class CoordinatesSearcher(Searcher): def __init__(self): super().__init__() @classmethod - def scrap_coordinates(cls, scrapper, filenames, pictures=False): + def search_by_coordinates(cls, scrapper, filenames, pictures=False): for r, d, files in os.walk(config['coordinates_path']): for file in files: @@ -34,12 +34,12 @@ 
class CoordinatesSearch(Search): try: polygon = GeoPolygon(os.path.join(config['coordinates_path'], file)) - CoordinatesSearch.scrap_polygon(scrapper, polygon, pictures) + CoordinatesSearcher.search_in_polygon(scrapper, polygon, pictures) except: logger.error("{} is not formatted properly. Please take a look at the examples.".format(file)) @classmethod - def scrap_polygon(cls, scrapper, polygon, pictures=False): + def search_in_polygon(cls, scrapper, polygon, pictures=False): bb = polygon.get_bounding_box() lon_min = int(bb[0] * config['scale']) lon_max = int(bb[2] * config['scale']) @@ -57,7 +57,7 @@ class CoordinatesSearch(Search): logger.info('{},{}'.format(x_scaled, y_scaled)) try: - scrapper.scrap_coord(x_scaled, y_scaled, pictures) + scrapper.process_search_by_coordinates(x_scaled, y_scaled, pictures) except urllib.error.HTTPError as e: logger.error("ERROR AT LONGITUDE {} LATITUDE {}".format(x_scaled, y_scaled)) @@ -76,7 +76,7 @@ class CoordinatesSearch(Search): sleep(config['sleep_time']) @staticmethod - def scrap_results_by_time(seconds, lon_min, lon_max, lat_min, lat_max, scrapper): + def search_by_coordinates_max_time(seconds, lon_min, lon_max, lat_min, lat_max, scrapper): start_time = time.time() results = [] @@ -88,7 +88,7 @@ class CoordinatesSearch(Search): y_scaled = y / config['scale'] try: - result = scrapper.scrap_coord(x_scaled, y_scaled) + result = scrapper.process_search_by_coordinates(x_scaled, y_scaled) if result is not None: results.append(result) @@ -117,9 +117,9 @@ class CoordinatesSearch(Search): return ListUtils.flat(results) @staticmethod - def scrap_results_linear_x_times(times, lon_min, lon_max, lat_min, lat_max, scrapper): + def search_by_coordinates_linear_max_n_matches(matches, lon_min, lon_max, lat_min, lat_max, scrapper): results = [] - counter = times + counter = matches finished = False for x in range(lon_min, lon_max): @@ -130,7 +130,7 @@ class CoordinatesSearch(Search): try: - result = scrapper.scrap_coord(x_scaled, y_scaled) 
+ result = scrapper.process_search_by_coordinates(x_scaled, y_scaled) if result is not None: results.append(result) @@ -159,7 +159,7 @@ class CoordinatesSearch(Search): return ListUtils.flat(results) @staticmethod - def scrap_results_random_x_times(times, lon_min, lon_max, lat_min, lat_max, scrapper): + def search_by_coordinates_random_max_n_matches(times, lon_min, lon_max, lat_min, lat_max, scrapper): results = [] counter = times while counter > 0: @@ -170,7 +170,7 @@ class CoordinatesSearch(Search): y_scaled = y / config['scale'] try: - cadaster_entry = scrapper.scrap_coord(x_scaled, y_scaled) + cadaster_entry = scrapper.process_search_by_coordinates(x_scaled, y_scaled) if len(cadaster_entry) > 0: results.append(cadaster_entry) diff --git a/src/librecatastro/scrapping/searchers/provinces_search.py b/src/librecatastro/scrapping/searchers/provinces_search.py deleted file mode 100644 index d335b94..0000000 --- a/src/librecatastro/scrapping/searchers/provinces_search.py +++ /dev/null @@ -1,30 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- - -from dotmap import DotMap - -from src.librecatastro.scrapping.scrapper import Scrapper -from src.librecatastro.scrapping.search import Search -from src.utils.cadastro_logger import CadastroLogger - -'''Logger''' -logger = CadastroLogger(__name__).logger - - -class ProvincesSearch(Search): - def __init__(self): - super().__init__() - - @classmethod - def scrap_provinces(cls, scrapper, prov_list, pictures=False, start_from=''): - scrapper.scrap_provinces(prov_list, pictures, start_from) - - @classmethod - def list_provinces(cls): - logger.debug(DotMap.pprint(Scrapper.get_provinces())) - return - - @classmethod - def list_cities(cls, prov_name): - logger.debug(DotMap.pprint(Scrapper.get_cities(prov_name))) - return diff --git a/src/librecatastro/scrapping/searchers/provinces_searcher.py b/src/librecatastro/scrapping/searchers/provinces_searcher.py new file mode 100644 index 0000000..1bf1200 --- /dev/null +++ 
b/src/librecatastro/scrapping/searchers/provinces_searcher.py @@ -0,0 +1,35 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +from dotmap import DotMap + +from src.librecatastro.scrapping.scrapper import Scrapper +from src.librecatastro.scrapping.searcher import Searcher +from src.utils.cadastro_logger import CadastroLogger + +'''Logger''' +logger = CadastroLogger(__name__).logger + + +class ProvincesSearcher(Searcher): + def __init__(self): + super().__init__() + + @classmethod + def search_by_provinces(cls, scrapper, prov_list, pictures=False, start_from=''): + scrapper.process_search_by_provinces(prov_list, pictures, start_from) + + @classmethod + def list_provinces(cls): + dotmap = Scrapper.get_provinces() + provinces = dotmap.consulta_provinciero.provinciero.prov + for province in provinces: + logger.debug(province.np) + + @classmethod + def list_cities(cls, prov_name): + dotmap = Scrapper.get_cities(prov_name) + cities = dotmap.consulta_municipiero.municipiero.muni + for city in cities: + logger.debug(city.nm) + return diff --git a/src/settings.py b/src/settings.py index 1444c06..b4c660a 100644 --- a/src/settings.py +++ b/src/settings.py @@ -16,5 +16,9 @@ config = { "sleep_time": 5, "sleep_dos_time": 300, "width_px": 120, - "height_px": 120 + "height_px": 120, + "servers_down_message": "Some of the Cadastro servers are down. " + "Maintenance is usually carried out during the night or the weekends. Please, retry later. " + "As an alternative, your IP address may have been banned. 
Try to change your public IP" + } diff --git a/src/templates/individual_address.xml b/src/templates/individual_address.xml deleted file mode 100644 index cd8d8bc..0000000 --- a/src/templates/individual_address.xml +++ /dev/null @@ -1,5 +0,0 @@ - - - ####ADDRESS#### - - \ No newline at end of file diff --git a/src/templates/individual_cadaster.xml b/src/templates/individual_cadaster.xml deleted file mode 100644 index 90d502f..0000000 --- a/src/templates/individual_cadaster.xml +++ /dev/null @@ -1,4 +0,0 @@ - - - ####CADASTER#### - \ No newline at end of file diff --git a/src/templates/individual_city.xml b/src/templates/individual_city.xml deleted file mode 100644 index 800f280..0000000 --- a/src/templates/individual_city.xml +++ /dev/null @@ -1,5 +0,0 @@ - - - ####CITY#### - - \ No newline at end of file diff --git a/src/templates/individual_coord.xml b/src/templates/individual_coord.xml deleted file mode 100644 index b0cc4df..0000000 --- a/src/templates/individual_coord.xml +++ /dev/null @@ -1,5 +0,0 @@ - - - ####COORDINATES#### - - \ No newline at end of file diff --git a/src/templates/individual_province.xml b/src/templates/individual_province.xml deleted file mode 100644 index 9a2b92a..0000000 --- a/src/templates/individual_province.xml +++ /dev/null @@ -1,5 +0,0 @@ - - - ####PROVINCE#### - - \ No newline at end of file diff --git a/src/templates/ontology.owl b/src/templates/ontology.owl deleted file mode 100644 index f2c5dd0..0000000 --- a/src/templates/ontology.owl +++ /dev/null @@ -1,96 +0,0 @@ - - - - - - - - - - - - Thing - - - - - CadasterEntry - - - - - Address - - - - - Province - - - - - City - - - - - Geographical Coordinates - - - - - - ####INDIVIDUALS#### - - - - - - - - - - - - - - - - - - - - - - - \ No newline at end of file diff --git a/src/tests/scrappers/__init__.py b/src/tests/scrappers/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/tests/scrapper_html_tests.py b/src/tests/scrappers/scrapper_html_tests.py similarity 
index 90% rename from src/tests/scrapper_html_tests.py rename to src/tests/scrappers/scrapper_html_tests.py index 1714b6f..198f7b5 100644 --- a/src/tests/scrapper_html_tests.py +++ b/src/tests/scrappers/scrapper_html_tests.py @@ -5,8 +5,8 @@ import os import unittest from src.librecatastro.domain.geometry.geo_polygon import GeoPolygon -from src.librecatastro.scrapping.format.scrapper_html import ScrapperHTML -from src.librecatastro.scrapping.searchers.coordinates_search import CoordinatesSearch +from src.librecatastro.scrapping.parsers.parser_html import ScrapperHTML +from src.librecatastro.scrapping.searchers.coordinates_searcher import CoordinatesSearcher from src.settings import config from src.utils.elasticsearch_utils import ElasticSearchUtils @@ -22,17 +22,17 @@ class ScrapperHTMLTests(unittest.TestCase): assert True def test_coordinate_creates_cadaster(self): - cadaster_list = ScrapperHTML.scrap_coord(-3.68, 40.47) + cadaster_list = ScrapperHTML.parse_coord(-3.68, 40.47) self.assertEqual(len(cadaster_list), 1) cadaster = cadaster_list[0] self.assertEqual(cadaster.cadaster, '2302909VK4820A0001GK') def test_coordinate_multiparcela_creates_cadaster(self): - cadaster_list = ScrapperHTML.scrap_coord(-0.33, 39.47) + cadaster_list = ScrapperHTML.parse_coord(-0.33, 39.47) self.assertTrue(len(cadaster_list) > 1) def test_coordinate_creates_cadaster_and_stores_in_elasticsearch(self): - cadaster_list = ScrapperHTML.scrap_coord(-3.68, 40.47) + cadaster_list = ScrapperHTML.parse_coord(-3.68, 40.47) self.assertEqual(len(cadaster_list), 1) cadaster = cadaster_list[0] cadaster.to_elasticsearch() @@ -92,7 +92,7 @@ class ScrapperHTMLTests(unittest.TestCase): def scrap_random_until_x_times_found(self, times): polygon = GeoPolygon(os.path.join(config['coordinates_path'], 'spain_polygon.json')) coord = polygon.get_bounding_box() - cadaster_list = CoordinatesSearch.scrap_results_random_x_times(times, int(coord[0] * config['scale']), int(coord[2] * config['scale']), int(coord[1] * 
config['scale']), int(coord[3] * config['scale']), ScrapperHTML) + cadaster_list = CoordinatesSearcher.search_by_coordinates_random_max_n_matches(times, int(coord[0] * config['scale']), int(coord[2] * config['scale']), int(coord[1] * config['scale']), int(coord[3] * config['scale']), ScrapperHTML) self.assertTrue(len(cadaster_list) >= times) return cadaster_list diff --git a/src/tests/scrapper_xml_tests.py b/src/tests/scrappers/scrapper_xml_tests.py similarity index 62% rename from src/tests/scrapper_xml_tests.py rename to src/tests/scrappers/scrapper_xml_tests.py index fca7405..33fa3d0 100644 --- a/src/tests/scrapper_xml_tests.py +++ b/src/tests/scrappers/scrapper_xml_tests.py @@ -5,44 +5,48 @@ import unittest from time import sleep -from dotmap import DotMap - from src.librecatastro.domain.cadaster_entry.cadaster_entry_xml import CadasterEntryXML -from src.librecatastro.scrapping.format.scrapper_xml import ScrapperXML +from src.librecatastro.scrapping.parsers.parser_xml import ScrapperXML, ParserXML +from src.librecatastro.scrapping.scrappers.scrapper_xml import ScrapperXML from src.settings import config class ScrapperXMLTests(unittest.TestCase): def test_scrapper_retrieves_dict_provinces(self): - self.assertEqual(ScrapperXML.get_provinces().consulta_provinciero.control.cuprov, '48') - sleep(config['sleep_time']) + try: + self.assertEqual(ScrapperXML.get_provinces().consulta_provinciero.control.cuprov, '48') + except: + self.assertFalse(config['servers_down_message']) + exit(-1) def test_scrapper_retrieves_dict_cities(self): - self.assertEqual(ScrapperXML.get_cities('ALACANT').consulta_municipiero.control.cumun, '141') - sleep(config['sleep_time']) + try: + self.assertEqual(ScrapperXML.get_cities('ALACANT').consulta_municipiero.control.cumun, '141') + except: + self.assertFalse(config['servers_down_message']) + exit(-1) def test_scrapper_retrieves_dict_addresses(self): - 
self.assertEqual(ScrapperXML.get_addresses('ALACANT','AGOST').consulta_callejero.control.cuca, '117') - sleep(config['sleep_time']) + try: + self.assertEqual(ScrapperXML.get_addresses('ALACANT', 'AGOST').consulta_callejero.control.cuca, '117') + except: + self.assertFalse(config['servers_down_message']) + exit(-1) + + def test_get_cadaster_entries_by_cadaster_is_up(self): + cadasters = ['2503906VK4820D0001MX'] + try: + for cadaster in cadasters: + ScrapperXML.get_cadaster_entries_by_cadaster('', '', cadaster) + except: + self.assertFalse(config['servers_down_message']) + exit(-1) def test_scrapper_retrieves_dict_addresses_iter(self): iterator = ScrapperXML.get_address_iter() address = iterator.__next__() self.assertEqual(address[1], '15') self.assertEqual(address[3], '7') - sleep(config['sleep_time']) - - def test_scrapper_creates_cadaster_entry(self): - dotmap_res = ScrapperXML.get_cadaster_entries_by_cadaster('', '', '6375620YH0567S0001GW') - self.assertNotEqual(dotmap_res, DotMap()) - sleep(config['sleep_time']) - - def test_scrapper_creates_cadaster_entry_and_stores_in_elasticsearch(self): - entry = ScrapperXML.get_cadaster_entries_by_cadaster('', '', '6375620YH0567S0001GW') - cadaster_entry = CadasterEntryXML(entry) - cadaster_entry.to_elasticsearch() - self.assertIsNotNone(cadaster_entry.from_elasticsearch()) - sleep(config['sleep_time']) def test_multiparcela_creates_n_entries_in_elasticsearch(self): prov_name = u'A CORUÑA' @@ -127,7 +131,7 @@ class ScrapperXMLTests(unittest.TestCase): def test_multiparcela_coord_creates_n_entries(self): lon = -9.2503 lat = 42.9723 - self.assertEqual(len(ScrapperXML.scrap_coord(lon, lat, True)), 2) + self.assertEqual(len(ParserXML.process_search_by_coordinates(lon, lat, True)), 2) def test_multiparcela_address_creates_n_entries(self): prov_name = u'MADRID' @@ -136,7 +140,7 @@ class ScrapperXMLTests(unittest.TestCase): nv = u'CANARIAS' num = 7 cadaster = ScrapperXML.get_cadaster_by_address(prov_name, city_name, tv, nv, num) - 
self.assertEqual(len(ScrapperXML.process_xml_by_address(cadaster, prov_name, city_name, tv, nv, False)), 8) + self.assertEqual(len(ParserXML.process_xml_by_address(cadaster, prov_name, city_name, tv, nv, False)), 8) def test_multiparcela_address_creates_n_entries_2(self): prov_name = u'MADRID' @@ -145,7 +149,39 @@ class ScrapperXMLTests(unittest.TestCase): nv = u'CALVARIO' num = 38 cadaster = ScrapperXML.get_cadaster_by_address(prov_name, city_name, tv, nv, num) - self.assertEqual(len(ScrapperXML.process_xml_by_address(cadaster, prov_name, city_name, tv, nv, False)), 8) + self.assertEqual(len(ParserXML.process_xml_by_address(cadaster, prov_name, city_name, tv, nv, False)), 8) + + def test_poligono_or_rural_creates_entry(self): + tv = 'CL' + nv = 'TORREJON' + num = 30 + prov_name = 'MADRID' + city_name = 'AJALVIR' + cadaster = ScrapperXML.get_cadaster_by_address(prov_name, city_name, tv, nv, num) + self.assertEqual(len(ParserXML.process_xml_by_address(cadaster, prov_name, city_name, tv, nv, False)), 16) + + def test_coordinates_are_in_good_format(self): + tv = 'CL' + nv = 'DE BENICARLO' + num = 1 + prov_name = 'MADRID' + city_name = 'GALAPAGAR' + xml = ScrapperXML.get_cadaster_by_address(prov_name, city_name, tv, nv, num) + cadaster_entry = ParserXML.process_xml_by_address(xml, prov_name, city_name, tv, nv, False) + self.assertEqual(cadaster_entry[0].location.lat, 40.6249762551374) + self.assertEqual(cadaster_entry[0].location.lon, -4.02755522611211) + + def test_multiparcela_coordinates_are_in_good_format(self): + tv = 'CL' + nv = 'SAN VICENTE' + num = 26 + prov_name = 'ALACANT' + city_name = 'ALICANTE/ALACANT' + xml = ScrapperXML.get_cadaster_by_address(prov_name, city_name, tv, nv, num) + cadaster_entries = ParserXML.process_xml_by_address(xml, prov_name, city_name, tv, nv, False) + for cadaster_entry in cadaster_entries: + self.assertEqual(cadaster_entry.location.lat, 38.3495195831056) + self.assertEqual(cadaster_entry.location.lon, -0.484612452235845) if 
__name__ == '__main__': diff --git a/src/utils/elasticsearch_utils.py b/src/utils/elasticsearch_utils.py index 1d8924e..d930c52 100644 --- a/src/utils/elasticsearch_utils.py +++ b/src/utils/elasticsearch_utils.py @@ -6,7 +6,7 @@ logger = CadastroLogger(__name__).logger class ElasticSearchUtils: - """Custom class for managing Elastic Search queries""" + """Custom class for managing Elastic Search queries""" def __init__(self): pass diff --git a/src/utils/ontology_converter.py b/src/utils/ontology_converter.py deleted file mode 100644 index 95128d7..0000000 --- a/src/utils/ontology_converter.py +++ /dev/null @@ -1,77 +0,0 @@ -import copy -import re - - -class OntologyConverter: - - def __init__(self): - - with open("../templates/ontology.owl") as ont_f, \ - open("../templates/individual_city.xml") as ind_city_f, \ - open("../templates/individual_province.xml") as ind_province_f, \ - open("../templates/individual_coord.xml") as ind_coord_f, \ - open("../templates/individual_address.xml") as ind_address_f, \ - open("../templates/individual_cadaster.xml") as ind_cadaster_f: - - self.ont_template = ont_f.read() - self.city_template = ind_city_f.read() - self.province_template = ind_province_f.read() - self.coord_template = ind_coord_f.read() - self.address_template = ind_address_f.read() - self.cadaster_template = ind_cadaster_f.read() - - def cadastro_dict_to_ontology(self, cadastro_list): - - ont = copy.deepcopy(self.ont_template) - - for cadastro_entry in cadastro_list: - ont = ont.replace("####INDIVIDUALS####", ''.join(["####INDIVIDUALS####", - self.instantiate_individual(cadastro_entry)])) - - ont = ont.replace("####INDIVIDUALS####", '') - - return ont - - def instantiate_individual(self, cadastro_entry): - individuals = '' - - cadaster = '' - for header, value in cadastro_entry.items(): - if header == 'Referencia catastral': - txt = copy.deepcopy(self.cadaster_template) - txt = txt.replace("####CADASTER####", value) - individuals = ''.join([individuals, txt]) - 
cadaster = value - elif header == 'Localización': - city_txt = copy.deepcopy(self.city_template) - province_txt = copy.deepcopy(self.province_template) - address_txt = copy.deepcopy(self.address_template) - - cp = re.search(r'[0-9]{5}', value) - cp_span = cp.span() - cp_span_end = cp_span[1] - - city_text = value[cp_span_end:] - province = re.search(r'\(([^\)]+)\)', city_text) - province_span = province.span() - province_start = province_span[0] - province_end = province_span[1] - province_text = value[province_start:province_end] - - province_txt = province_txt.replace("####CADASTER####", cadaster) - province_txt = province_txt.replace("####PROVINCE####", province_text) - - city_txt = city_txt.replace("####CITY####", city_text) - city_txt = city_txt.replace("####PROVINCE####", province_text) - - address_txt = address_txt.replace("####ADDRESS####", value) - address_txt = address_txt.replace("####CITY####", city_text) - - individuals = ''.join([individuals, province_txt, city_txt, address_txt]) - - #print(individuals) - return individuals - - - -