Refactors and separates scrapping and parsing into different classes for maintainability

This commit is contained in:
J 2019-09-21 15:11:32 +02:00
parent fef84a9f95
commit 7cf208a4c2
27 changed files with 485 additions and 506 deletions

View File

@@ -1,6 +1,6 @@
 #libreCATASTRO
 An opensource, MIT-licensed application that scraps the official Spanish
-Cadaster registry and stores information in Elastic Search.
+Cadaster registry and stores information in Elastic Searcher.
 **Features**

View File

@@ -4,10 +4,10 @@
 import sys
 import argparse

-from src.librecatastro.scrapping.format.scrapper_html import ScrapperHTML
-from src.librecatastro.scrapping.format.scrapper_xml import ScrapperXML
-from src.librecatastro.scrapping.searchers.coordinates_search import CoordinatesSearch
-from src.librecatastro.scrapping.searchers.provinces_search import ProvincesSearch
+from src.librecatastro.scrapping.parsers.parser_html import ScrapperHTML
+from src.librecatastro.scrapping.parsers.parser_xml import ParserXML
+from src.librecatastro.scrapping.searchers.coordinates_searcher import CoordinatesSearcher
+from src.librecatastro.scrapping.searchers.provinces_searcher import ProvincesSearcher
 from src.settings import config

 if __name__ == "__main__":
@@ -31,7 +31,7 @@ if __name__ == "__main__":
     if args.scale:
         config['scale'] = args.scale

-    scrapper = ScrapperHTML if args.html else ScrapperXML
+    scrapper = ScrapperHTML if args.html else ParserXML

     filenames = args.filenames
     pictures = args.pictures
@@ -39,14 +39,14 @@ if __name__ == "__main__":
     startcity = args.startcity

     if args.listprovinces:
-        ProvincesSearch.list_provinces()
+        ProvincesSearcher.list_provinces()
         exit(0)

     if len(args.listcities) == 1:
-        ProvincesSearch.list_cities(args.listcities[0])
+        ProvincesSearcher.list_cities(args.listcities[0])
         exit(0)

     if args.coords:
-        CoordinatesSearch.scrap_coordinates(scrapper, filenames, pictures)
+        CoordinatesSearcher.search_by_coordinates(scrapper, filenames, pictures)
     else:
-        ProvincesSearch.scrap_provinces(scrapper, provinces, pictures, startcity)
+        ProvincesSearcher.search_by_provinces(scrapper, provinces, pictures, startcity)

View File

@@ -11,7 +11,7 @@ logger = CadastroLogger(__name__).logger
 class Address:
-    """ Domain class for storing Address in Catastro format"""
+    """ Domain class for storing Address in Catastro parsers"""

     def __init__(self, address):
         self.full_address = address.strip()

View File

@@ -0,0 +1,23 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+from src.utils.cadastro_logger import CadastroLogger
+
+'''Logger'''
+logger = CadastroLogger(__name__).logger
+
+
+class Parser:
+    """Generic Parser class"""
+
+    def __init__(self):
+        pass
+
+    ''' Processing signatures'''
+
+    @classmethod
+    def process_search_by_coordinates(cls, x, y, pictures=False):
+        pass
+
+    @classmethod
+    def process_search_by_provinces(cls, prov_list, pictures=False):
+        pass
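The new Parser base class only pins down the two processing entry points; fetching stays in the Scrapper hierarchy. A minimal, self-contained sketch of how the pieces are meant to interlock after this refactor (the classes ending in "Demo" are hypothetical, not part of the repo):

class ScrapperDemo:
    """Stands in for ScrapperHTML/ScrapperXML: network I/O only."""
    @classmethod
    def scrap_coord(cls, x, y):
        return "<raw data for ({}, {})>".format(x, y)


class ParserDemo:
    """Stands in for ParserHTML/ParserXML: turns raw data into entries."""
    @classmethod
    def process_search_by_coordinates(cls, x, y, pictures=False):
        data = ScrapperDemo.scrap_coord(x, y)  # fetch stage, delegated
        return [data]  # the real parsers build CadasterEntry objects here


print(ParserDemo.process_search_by_coordinates(-3.68, 40.47))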

View File

@@ -1,17 +1,16 @@
 #!/usr/bin/env python
 # -*- coding: utf-8 -*-
-import re
 import urllib.error
 from time import sleep
-from urllib.request import urlopen
 from xml.etree import ElementTree

-from bs4 import BeautifulSoup
 from dotmap import DotMap

 from src.librecatastro.domain.cadaster_entry.cadaster_entry_html import CadasterEntryHTML
+from src.librecatastro.scrapping.parser import Parser
 from src.librecatastro.scrapping.scrapper import Scrapper
+from src.librecatastro.scrapping.scrappers.scrapper_html import ScrapperHTML
 from src.settings import config
 from src.utils.cadastro_logger import CadastroLogger
@@ -20,32 +19,23 @@ from src.utils.cadastro_logger import CadastroLogger
 logger = CadastroLogger(__name__).logger

-class ScrapperHTML(Scrapper):
-    """Scrapper class for Catastro HTML"""
+class ParserHTML(Parser):
+    """Parser class for Catastro HTML"""

     def __init__(self):
         super().__init__()

-    '''Catastro web services parametrized'''
-    URL = "http://ovc.catastro.meh.es/ovcservweb/ovcswlocalizacionrc/ovccoordenadas.asmx/Consulta_RCCOOR?SRS=EPSG:4230&Coordenada_X={}&Coordenada_Y={}"
-    URL_REF = "https://www1.sedecatastro.gob.es/CYCBienInmueble/OVCListaBienes.aspx?rc1={}&rc2={}"
-    URL_REF_FULL = "https://www1.sedecatastro.gob.es/CYCBienInmueble/OVCConCiud.aspx?RefC={}&RCCompleta={}&del={}&mun={}"
-
     '''Information to scrap from HTML'''
     description_field_names = [u'Referencia catastral', u'Localización', u'Clase', u'Uso principal',
                                u'Superficie construida', u'Año construcción']
     gsurface_field_names = [u'Superficie gráfica']

-    """ Scrapping calls """
+    """ Processing """

     @classmethod
-    def scrap_coord(cls, x, y, pictures=False):
-        logger.debug("====Longitude: {} Latitude: {}====".format(x, y))
-        url = cls.URL.format(x, y)
-        logger.debug("URL for coordinates: {}".format(url))
-        f = urlopen(url)
-        data = f.read()
+    def process_search_by_coordinates(cls, x, y, pictures=False):
+        data = ScrapperHTML.scrap_coord(x, y)
         root = ElementTree.fromstring(data)
         pc1 = root.find(
             "{http://www.catastro.meh.es/}coordenadas//{http://www.catastro.meh.es/}coord//{http://www.catastro.meh.es/}pc//{http://www.catastro.meh.es/}pc1")
@@ -55,17 +45,19 @@ class ScrapperHTML(Scrapper):
         results = []
         if pc1 is not None and pc2 is not None:
             cadaster = ''.join([pc1.text, pc2.text])
-            cadaster_entries = cls.scrap_cadaster(cadaster, None, None, x, y, pictures)
-            for cadaster_entry in cadaster_entries:
+            htmls = ScrapperHTML.scrap_cadaster(cadaster, None, None, pictures)
+            for html, picture in htmls:
+                cadaster_entry = cls.parse_html_parcela(html, x, y, picture)
                 cadaster_entry.to_elasticsearch()
                 results.append(cadaster_entry)
         return results

     @classmethod
-    def scrap_provinces(cls, prov_list, pictures=False, start_from=''):
-        for prov_name, prov_num, city_name, city_num, address, tv, nv in cls.get_address_iter(prov_list, start_from):
+    def process_search_by_provinces(cls, prov_list, pictures=False, start_from=''):
+        num = ''
+        for prov_name, prov_num, city_name, city_num, address, tv, nv in Scrapper.get_address_iter(prov_list, start_from):
             if tv == DotMap() or nv == DotMap():
                 continue
@@ -74,7 +66,7 @@ class ScrapperHTML(Scrapper):
             counter = 1
             while num_scrapping_fails > 0:
                 try:
-                    numerero_map = cls.get_cadaster_by_address(prov_name, city_name, tv, nv, counter)
+                    numerero_map = Scrapper.get_cadaster_by_address(prov_name, city_name, tv, nv, counter)
                     if numerero_map.consulta_numerero.lerr.err.cod != DotMap():
                         num_scrapping_fails -= 1
                     else:
@@ -98,7 +90,7 @@ class ScrapperHTML(Scrapper):
                         cadaster_num = nump.pc.pc1 + nump.pc.pc2

-                        coords_map = cls.get_coords_from_cadaster(prov_name, city_name, cadaster_num)
+                        coords_map = Scrapper.get_coords_from_cadaster(prov_name, city_name, cadaster_num)
                         lon = coords_map.consulta_coordenadas.coordenadas.coord.geo.xcen
                         if lon == DotMap():
@@ -113,13 +105,13 @@ class ScrapperHTML(Scrapper):
                         num_scrapping_fails = 10

-                        cadaster_list = cls.scrap_cadaster(cadaster_num, prov_num, city_num, lon, lat, pictures)
-                        for cadaster in cadaster_list:
-                            cadaster.to_elasticsearch()
+                        htmls = ScrapperHTML.scrap_cadaster(cadaster_num, prov_num, city_num, pictures)
+                        for html, picture in htmls:
+                            cadaster_entry = cls.parse_html_parcela(html, lon, lat, picture)
+                            cadaster_entry.to_elasticsearch()

                         counter += 1
-                        sleep(config['sleep_time'])

                 except urllib.error.HTTPError as e:
                     logger.error(
@@ -141,72 +133,6 @@ class ScrapperHTML(Scrapper):
                     num_scrapping_fails -= 1
                     counter += 1
-                    sleep(config['sleep_time'])
-
-    @classmethod
-    def scrap_cadaster_full_code(cls, full_cadaster, delimitacion, municipio, x=None, y=None, picture=None):
-        url_ref = cls.URL_REF_FULL.format(full_cadaster, full_cadaster, delimitacion, municipio)
-        logger.debug("-->FULL URL for cadastral data: {}".format(url_ref))
-        f_ref = urlopen(url_ref)
-        data_ref = f_ref.read()
-        html = str(data_ref.decode('utf-8'))
-        parsed_html = BeautifulSoup(html, features="html.parser")
-        return ScrapperHTML.parse_html_parcela(parsed_html, x, y, picture)
-
-    @classmethod
-    def scrap_cadaster(cls, cadaster, delimitacion=None, municipio=None, x=None, y=None, pictures=False):
-        rc_1 = cadaster[0:7]
-        rc_2 = cadaster[7:14]
-        url_ref = cls.URL_REF.format(rc_1, rc_2)
-        logger.debug("URL for cadastral data: {}".format(url_ref))
-        f_ref = urlopen(url_ref)
-        data_ref = f_ref.read()
-        html = str(data_ref.decode('utf-8'))
-        parsed_html = BeautifulSoup(html, features="html.parser")
-        if delimitacion is None:
-            delimitacion_search = re.search(r'del=([0-9]+)&', html)
-            if delimitacion_search:
-                delimitacion = delimitacion_search.group(1)
-        if municipio is None:
-            municipio_search = re.search(r'mun=([0-9]+)&', html)
-            if municipio_search:
-                municipio = municipio_search.group(1)
-        picture = None
-        if pictures:
-            picture = cls.scrap_site_picture(delimitacion, municipio, ''.join([rc_1, rc_2]))
-            sleep(config['sleep_time'])
-        description = parsed_html.find(id='ctl00_Contenido_tblInmueble')
-        cadasters = []
-        if description is None:
-            logger.debug("Multiparcela found!")
-            ''' Multiparcela with multiple cadasters '''
-            all_cadasters = parsed_html.findAll("div", {"id": re.compile('heading[0-9]+')})
-            logger.debug("->Parcelas found: {}".format(len(all_cadasters)))
-            for partial_cadaster in all_cadasters:
-                partial_cadaster_ref = partial_cadaster.find("b")
-                logger.debug("-->Partial cadaster: {}".format(partial_cadaster_ref.text))
-                partial_cadaster_text = partial_cadaster_ref.text.strip()
-                cadaster = ScrapperHTML.scrap_cadaster_full_code(partial_cadaster_text, delimitacion, municipio, x, y,
-                                                                 picture)
-                cadasters.append(cadaster)
-                sleep(config['sleep_time'])
-        else:
-            cadaster = ScrapperHTML.parse_html_parcela(parsed_html, x, y, picture)
-            cadasters.append(cadaster)
-            sleep(config['sleep_time'])
-        return cadasters
""" Parsing """ """ Parsing """
@classmethod @classmethod
@ -260,5 +186,6 @@ class ScrapperHTML(Scrapper):
dict(uso=columns[0].text, escalera=columns[1].text, planta=columns[2].text, puerta=columns[3].text, dict(uso=columns[0].text, escalera=columns[1].text, planta=columns[2].text, puerta=columns[3].text,
superficie=columns[4].text, tipo=columns[5].text, fecha=columns[6].text)) superficie=columns[4].text, tipo=columns[5].text, fecha=columns[6].text))
descriptive_data[u'GráficoParcela']=picture
cadaster_entry = CadasterEntryHTML(descriptive_data) cadaster_entry = CadasterEntryHTML(descriptive_data)
return cadaster_entry return cadaster_entry
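Seen end to end, ParserHTML now receives raw pages from ScrapperHTML as (html, picture) pairs and extracts the labelled fields itself. A self-contained sketch of that extraction step with BeautifulSoup (the sample HTML and helper are illustrative, not repo code):

from bs4 import BeautifulSoup

SAMPLE = """
<div id="ctl00_Contenido_tblInmueble">
  <span>Referencia catastral</span><b>2302909VK4820A0001GK</b>
  <span>Clase</span><b>Urbano</b>
</div>
"""

def parse_fields(html, field_names):
    """Collect label/value pairs the way a parse_html_parcela-style parser would."""
    soup = BeautifulSoup(html, features="html.parser")
    data = {}
    for span in soup.find_all("span"):
        label = span.text.strip()
        if label in field_names and span.find_next("b"):
            data[label] = span.find_next("b").text.strip()
    return data

print(parse_fields(SAMPLE, [u'Referencia catastral', u'Clase']))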

View File

@@ -4,14 +4,13 @@
 import urllib.parse
 from urllib import error
-from time import sleep

 import requests
 import xmltodict

 from src.librecatastro.domain.cadaster_entry.cadaster_entry_xml import CadasterEntryXML
+from src.librecatastro.scrapping.parser import Parser
 from src.librecatastro.scrapping.scrapper import Scrapper
-from src.settings import config
+from src.librecatastro.scrapping.scrappers.scrapper_xml import ScrapperXML
 from src.utils.cadastro_logger import CadastroLogger

 from dotmap import DotMap
@@ -20,28 +19,20 @@ from dotmap import DotMap
 logger = CadastroLogger(__name__).logger

-class ScrapperXML(Scrapper):
-    """Scrapper class for Catastro XML"""
+class ParserXML(Parser):
+    """Parser class for Catastro XML"""

     def __init__(self):
         super().__init__()

-    """ Scrapping main calls """
+    ''' Processing calls '''

     @classmethod
-    def scrap_coord(cls, x, y, pictures=False):
+    def process_search_by_coordinates(cls, x, y, pictures=False):
         """Scraps properties by coordinates"""
         results = []
-        params = {'SRS': 'EPSG:4230', 'Coordenada_X': x, 'Coordenada_Y': y}
-        url = cls.URL_LOCATIONS_BASE.format("/OVCCoordenadas.asmx/Consulta_RCCOOR")
-        response = requests.get(url, params=params)
-        logger.debug("====Longitude: {} Latitude: {}====".format(x, y))
-        logger.debug("URL for coordinates: {}".format(url + '?' + urllib.parse.urlencode(params)))
-        xml_dict_map = DotMap(xmltodict.parse(response.content, process_namespaces=False, xml_attribs=False))
+        xml_dict_map = ScrapperXML.get_coord(x, y)
         pc1 = None
         pc2 = None
         if xml_dict_map.consulta_coordenadas.coordenadas.coord.pc != DotMap():
@@ -55,7 +46,7 @@ class ScrapperXML(Scrapper):
         if pc1 is not None and pc2 is not None:
-            entry = cls.get_cadaster_entries_by_cadaster('', '', ''.join([pc1, pc2]))
+            entry = ScrapperXML.get_cadaster_entries_by_cadaster('', '', ''.join([pc1, pc2]))
             picture = None
             if entry.consulta_dnp.bico.bi.dt.loine != DotMap():
                 # Parcela
@@ -63,42 +54,78 @@ class ScrapperXML(Scrapper):
                 prov_num = entry.consulta_dnp.bico.bi.dt.loine.cp
                 city_num = entry.consulta_dnp.bico.bi.dt.cmc
                 if prov_num != DotMap() and city_num != DotMap():
-                    picture = cls.scrap_site_picture(prov_num, city_num, ''.join([pc1, pc2]))
+                    picture = Scrapper.scrap_site_picture(prov_num, city_num, ''.join([pc1, pc2]))

-                cadaster_entry = CadasterEntryXML.create_from_bico(entry, x, y, picture)
+                cadaster_entry = CadasterEntryXML(entry, x, y, picture)
                 cadaster_entry.to_elasticsearch()
-                sleep(config['sleep_time'])
                 results.append(cadaster_entry)
             elif entry.consulta_dnp.lrcdnp.rcdnp != DotMap():
                 # Multiparcela
                 parcelas = entry.consulta_dnp.lrcdnp.rcdnp
                 if not isinstance(parcelas, list):
                     parcelas = [parcelas]
                 for parcela in parcelas:
+                    prov_num = parcela.dt.loine.cp
+                    city_num = parcela.dt.cmc
                     cadaster = parcela.rc.pc1 if parcela.rc.pc1 != DotMap() else ''
                     cadaster += parcela.rc.pc2 if parcela.rc.pc2 != DotMap() else ''
                     cadaster += parcela.rc.car if parcela.rc.car != DotMap() else ''
                     cadaster += parcela.rc.cc1 if parcela.rc.cc1 != DotMap() else ''
                     cadaster += parcela.rc.cc2 if parcela.rc.cc2 != DotMap() else ''
-                    if pictures:
-                        prov_num = parcela.dt.loine.cp
-                        city_num = parcela.dt.cmc
-                        if prov_num != DotMap() and city_num != DotMap():
-                            picture = cls.scrap_site_picture(prov_num, city_num, cadaster)
+                    if pictures and prov_num != DotMap() and city_num != DotMap():
+                        picture = Scrapper.scrap_site_picture(prov_num, city_num, cadaster)

-                    parcela = cls.get_cadaster_entries_by_cadaster('', '', cadaster)
-                    cadaster_entry = CadasterEntryXML(parcela, x, y, picture)
+                    try:
+                        # Try to get info by complete cadaster num
+                        sub_entry = ScrapperXML.get_cadaster_entries_by_cadaster(prov_num, city_num, cadaster)
+                    except:
+                        # Cadastro did not return anything by cadaster entry (error? bug?)
+                        # Try to get it by complete address
+                        prov_name = parcela.dt.np
+                        if prov_name == DotMap():
+                            continue
+                        city_name = parcela.dt.nm
+                        if city_name == DotMap():
+                            continue
+                        tv = parcela.dt.locs.lous.lourb.dir.tv
+                        if tv == DotMap():
+                            tv = ''
+                        nv = parcela.dt.locs.lous.lourb.dir.nv
+                        if nv == DotMap():
+                            nv = ''
+                        num = parcela.dt.locs.lous.lourb.dir.pnp
+                        if num == DotMap():
+                            num = ''
+                        loint = parcela.dt.locs.lous.lourb.loint
+                        if loint == DotMap():
+                            continue
+                        bl = loint.bl
+                        if bl == DotMap():
+                            bl = ''
+                        es = loint.es
+                        if es == DotMap():
+                            es = ''
+                        pt = loint.pt
+                        if pt == DotMap():
+                            pt = ''
+                        pu = loint.pu
+                        if pu == DotMap():
+                            pu = ''
+                        sub_entry = ScrapperXML.get_cadaster_entries_by_address(prov_name, city_name, tv, nv, num, bl, es, pt, pu)
+
+                    cadaster_entry = CadasterEntryXML(sub_entry, x, y, picture)
                     cadaster_entry.to_elasticsearch()
                     results.append(cadaster_entry)
-                    sleep(config['sleep_time'])
         return results
     @classmethod
-    def scrap_provinces(cls, prov_list, pictures=False, start_from=''):
-        for prov_name, prov_num, city_name, city_num, address, tv, nv in cls.get_address_iter(prov_list, start_from):
+    def process_search_by_provinces(cls, prov_list, pictures=False, start_from=''):
+        for prov_name, prov_num, city_name, city_num, address, tv, nv in Scrapper.get_address_iter(prov_list, start_from):
             if tv == DotMap() or nv == DotMap():
                 continue
@@ -106,13 +133,12 @@ class ScrapperXML(Scrapper):
             counter = 1
             while num_scrapping_fails > 0:
                 try:
-                    cadaster = cls.get_cadaster_by_address(prov_name, city_name, tv, nv, counter)
+                    cadaster = ScrapperXML.get_cadaster_by_address(prov_name, city_name, tv, nv, counter)
                     res = cls.process_xml_by_address(cadaster, prov_name, city_name, tv, nv, counter, pictures)
                     if len(res) < 1:
                         num_scrapping_fails -= 1
                     else:
                         num_scrapping_fails = 10
-                    sleep(config['sleep_time'])

                 except urllib.error.HTTPError as e:
                     logger.error(
@@ -123,7 +149,6 @@ class ScrapperXML(Scrapper):
                     logger.error("=============================================")
                     ''' Could be a service Unavailable or denegation of service'''
                     num_scrapping_fails -= 1
-                    sleep(config['sleep_dos_time'])

                 except Exception as e:
                     logger.error(
@@ -134,7 +159,8 @@ class ScrapperXML(Scrapper):
                     num_scrapping_fails -= 1
                 counter += 1
-                sleep(config['sleep_time'])

+    ''' Parsing calls '''
     @classmethod
     def process_xml_by_address(cls, numerero_map, prov_name, city_name, tv, nv, num, pictures=False):
@@ -161,7 +187,7 @@ class ScrapperXML(Scrapper):
                 cadaster_num = nump.pc.pc1 + nump.pc.pc2

-                coords_map = cls.get_coords_from_cadaster(prov_name, city_name, cadaster_num)
+                coords_map = ScrapperXML.get_coords_from_cadaster(prov_name, city_name, cadaster_num)
                 lon = coords_map.consulta_coordenadas.coordenadas.coord.geo.xcen
                 if lon == DotMap():
                     lon = None
@@ -173,7 +199,7 @@ class ScrapperXML(Scrapper):
                 ''' Adding to tracking file'''
                 logger.info('{},{}'.format(lon, lat))

-                entry_map = cls.get_cadaster_entries_by_address(prov_name, city_name, tv, nv, num)
+                entry_map = ScrapperXML.get_cadaster_entries_by_address(prov_name, city_name, tv, nv, num)
                 picture = None
                 if entry_map.consulta_dnp.bico != DotMap():
@@ -181,14 +207,13 @@ class ScrapperXML(Scrapper):
                     city_num = entry_map.consulta_dnp.bico.bi.dt.loine.cm
                     if pictures and prov_num != DotMap() and city_num != DotMap():
-                        picture = cls.scrap_site_picture(prov_num, city_num, cadaster_num)
+                        picture = Scrapper.scrap_site_picture(prov_num, city_num, cadaster_num)

                     # Parcela
                     cadaster_entry = CadasterEntryXML(entry_map, lon, lat, picture)
                     results.append(cadaster_entry)
                     cadaster_entry.to_elasticsearch()
-                    sleep(config['sleep_time'])
                 elif entry_map.consulta_dnp.lrcdnp.rcdnp != DotMap():
                     # Multiparcela
                     for site in entry_map.consulta_dnp.lrcdnp.rcdnp:
@@ -208,18 +233,38 @@ class ScrapperXML(Scrapper):
                         cadaster += parcela.rc.cc1 if parcela.rc.cc1 != DotMap() else ''
                         cadaster += parcela.rc.cc2 if parcela.rc.cc2 != DotMap() else ''

-                        if pictures:
-                            prov_num = parcela.dt.loine.cp
-                            city_num = parcela.dt.cmc
-                            if prov_num != DotMap() and city_num != DotMap():
-                                picture = cls.scrap_site_picture(prov_num, city_num, cadaster)
+                        prov_num = parcela.dt.loine.cp
+                        city_num = parcela.dt.cmc

-                        parcela = cls.get_cadaster_entries_by_cadaster('', '', cadaster)
-                        cadaster_entry = CadasterEntryXML(parcela, lon, lat, picture)
+                        if pictures and prov_num != DotMap() and city_num != DotMap():
+                            picture = Scrapper.scrap_site_picture(prov_num, city_num, cadaster)
+
+                        try:
+                            # Try to get info by complete cadaster num
+                            sub_entry = ScrapperXML.get_cadaster_entries_by_cadaster(prov_num, city_num, cadaster)
+                        except:
+                            # Cadastro did not return anything by cadaster entry (error? bug?)
+                            # Try to get it by complete address
+                            loint = parcela.dt.locs.lous.lourb.loint
+                            if loint == DotMap():
+                                continue
+                            bl = loint.bl
+                            if bl == DotMap():
+                                bl = ''
+                            es = loint.es
+                            if es == DotMap():
+                                es = ''
+                            pt = loint.pt
+                            if pt == DotMap():
+                                pt = ''
+                            pu = loint.pu
+                            if pu == DotMap():
+                                pu = ''
+                            sub_entry = ScrapperXML.get_cadaster_entries_by_address(prov_name, city_name, tv, nv, num, bl, es, pt, pu)
+
+                        cadaster_entry = CadasterEntryXML(sub_entry, lon, lat, picture)
                         cadaster_entry.to_elasticsearch()
                         results.append(cadaster_entry)
-                        sleep(config['sleep_time'])
         return results

View File

@@ -1,9 +1,7 @@
 #!/usr/bin/env python
 # -*- coding: utf-8 -*-
 import base64
-import urllib.parse
+from time import sleep
 from urllib.request import urlopen
+import urllib.parse

 import requests
 import xmltodict
@@ -17,29 +15,21 @@ logger = CadastroLogger(__name__).logger
 class Scrapper:
-    """Generic Scrapper class"""
+    """Catastro web services parametrized"""

-    '''Catastro web services parametrized'''
-    URL_LOCATIONS_BASE = "http://ovc.catastro.meh.es/ovcservweb/OVCSWLocalizacionRC{}"
     URL_PICTURES = "https://www1.sedecatastro.gob.es/Cartografia/GeneraGraficoParcela.aspx?del={}&mun={}&refcat={}&AnchoPixels={}&AltoPixels={}"
+    URL_LOCATIONS_BASE = "http://ovc.catastro.meh.es/ovcservweb/OVCSWLocalizacionRC{}"

     def __init__(self):
         pass

-    @classmethod
-    def scrap_coords(cls, x, y, pictures=False):
-        pass
-
-    @classmethod
-    def scrap_provinces(cls, prov_list, pictures=False):
-        pass
-
     @classmethod
     def get_provinces(cls):
         url = cls.URL_LOCATIONS_BASE.format("/OVCCallejero.asmx/ConsultaProvincia")
         response = requests.get(url)
         xml = response.content
+        sleep(config['sleep_time'])
         return DotMap(xmltodict.parse(xml, process_namespaces=False, xml_attribs=False))
     @classmethod
@@ -52,6 +42,8 @@ class Scrapper:
         url = cls.URL_LOCATIONS_BASE.format("/OVCCallejero.asmx/ConsultaMunicipio")
         response = requests.get(url, params=params)
         xml = response.content
+        sleep(config['sleep_time'])
+
         return DotMap(xmltodict.parse(xml, process_namespaces=False, xml_attribs=False))

     @classmethod
@@ -70,6 +62,8 @@ class Scrapper:
         url = cls.URL_LOCATIONS_BASE.format("/OVCCallejero.asmx/ConsultaVia")
         response = requests.get(url, params=params)
         xml = response.content
+        sleep(config['sleep_time'])
+
         return DotMap(xmltodict.parse(xml, process_namespaces=False, xml_attribs=False))
     @classmethod
@@ -125,6 +119,22 @@ class Scrapper:
             else:
                 yield (prov_name, prov_num, city_name, city_num, address_dir, tv, nv)

+    @classmethod
+    def scrap_site_picture(cls, prov_num, city_num, cadaster):
+        url_pic = cls.URL_PICTURES.format(prov_num, city_num, cadaster, config['width_px'], config['height_px'])
+        logger.debug("URL for picture data: {}".format(url_pic))
+
+        f_pic = urlopen(url_pic)
+        data_ref = f_pic.read()
+
+        b64_image = base64.b64encode(data_ref).decode('utf-8')
+
+        sleep(config['sleep_time'])
+        return b64_image
+
     @classmethod
     def get_cadaster_by_address(cls, provincia, municipio, tipovia, nombrevia, numero):
         params = {'Provincia': provincia,
@@ -140,77 +150,20 @@ class Scrapper:
         response = requests.get(url, params=params)
         xml = response.content
-        return DotMap(xmltodict.parse(xml, process_namespaces=False, xml_attribs=False))
-
-    @classmethod
-    def get_cadaster_entries_by_address(cls, provincia, municipio, sigla, calle, numero, bloque=None, escalera=None,
-                                        planta=None, puerta=None):
-        params = {'Provincia': provincia,
-                  'Municipio': municipio,
-                  'Sigla': sigla,
-                  'Calle': calle,
-                  'Numero': str(numero)}
-        if bloque:
-            params['Bloque'] = str(bloque)
-        else:
-            params['Bloque'] = ''
-        if escalera:
-            params['Escalera'] = escalera
-        else:
-            params['Escalera'] = ''
-        if planta:
-            params['Planta'] = str(planta)
-        else:
-            params['Planta'] = ''
-        if puerta:
-            params['Puerta'] = str(puerta)
-        else:
-            params['Puerta'] = ''
-        url = cls.URL_LOCATIONS_BASE.format("/OVCCallejero.asmx/Consulta_DNPLOC")
-        logger.debug("URL for entry: {}".format(url + '?' + urllib.parse.urlencode(params)))
-        response = requests.get(url, params=params)
-        xml = response.content
-        return DotMap(xmltodict.parse(xml, process_namespaces=False, xml_attribs=False))
-
-    @classmethod
-    def get_cadaster_entries_by_cadaster(cls, provincia, municipio, rc):
-        """ provincia and municipio are optional and can be set to ''"""
-        params = {"Provincia": provincia,
-                  "Municipio": municipio,
-                  "RC": rc}
-        url = cls.URL_LOCATIONS_BASE.format("/OVCCallejero.asmx/Consulta_DNPRC")
-        logger.debug("URL for entry: {}".format(url + '?' + urllib.parse.urlencode(params)))
-        response = requests.get(url, params=params)
-        xml = response.content
+        sleep(config['sleep_time'])
         return DotMap(xmltodict.parse(xml, process_namespaces=False, xml_attribs=False))

     @classmethod
     def get_coords_from_cadaster(cls, provincia, municipio, cadaster):
-        params = {'Provincia': provincia, 'Municipio': municipio, 'SRS': 'EPSG:4230', 'RC': cadaster}
+        params = {'Provincia': provincia, 'Municipio': municipio, 'SRS': 'EPSG:4326', 'RC': cadaster}
         url = cls.URL_LOCATIONS_BASE.format("/OVCCoordenadas.asmx/Consulta_CPMRC")
-        logger.debug("URL for coords: {}".format(url + '?' + urllib.parse.urlencode(params)))
+        logger.debug("URL for coordinates: {}".format(url + '?' + urllib.parse.urlencode(params)))
         response = requests.get(url, params=params)
         xml = response.content
+        sleep(config['sleep_time'])
         return DotMap(xmltodict.parse(xml, process_namespaces=False, xml_attribs=False))
-
-    @classmethod
-    def scrap_site_picture(cls, prov_num, city_num, cadaster):
-        url_pic = cls.URL_PICTURES.format(prov_num, city_num, cadaster, config['width_px'], config['height_px'])
-        logger.debug("URL for picture data: {}".format(url_pic))
-        f_pic = urlopen(url_pic)
-        data_ref = f_pic.read()
-        b64_image = base64.b64encode(data_ref).decode('utf-8')
-        return b64_image
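Note the pattern running through this file: every outbound call now ends with sleep(config['sleep_time']) before the response is returned, so throttling lives in the Scrapper layer instead of being scattered through the parsers. A hedged sketch of the same idea as a reusable helper (the names are illustrative, not repo code):

from time import sleep

import requests

SLEEP_TIME = 5  # stands in for config['sleep_time']

def polite_get(url, params=None):
    """GET a URL, then pause so the next call cannot hammer the server."""
    response = requests.get(url, params=params)
    sleep(SLEEP_TIME)  # the throttle sits right next to the I/O, as in Scrapper
    return response.content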

View File

@@ -0,0 +1,99 @@
+import re
+from time import sleep
+from urllib.request import urlopen
+
+from bs4 import BeautifulSoup
+
+from src.librecatastro.scrapping.scrapper import Scrapper
+from src.settings import config
+from src.utils.cadastro_logger import CadastroLogger
+
+'''Logger'''
+logger = CadastroLogger(__name__).logger
+
+
+class ScrapperHTML(Scrapper):
+    """HTML Catastro Scrapper"""
+
+    URL = "http://ovc.catastro.meh.es/ovcservweb/ovcswlocalizacionrc/ovccoordenadas.asmx/Consulta_RCCOOR?SRS=EPSG:4230&Coordenada_X={}&Coordenada_Y={}"
+    URL_REF = "https://www1.sedecatastro.gob.es/CYCBienInmueble/OVCListaBienes.aspx?rc1={}&rc2={}"
+    URL_REF_FULL = "https://www1.sedecatastro.gob.es/CYCBienInmueble/OVCConCiud.aspx?RefC={}&RCCompleta={}&del={}&mun={}"
+
+    def __init__(self):
+        super().__init__()
+
+    @classmethod
+    def scrap_coord(cls, x, y):
+        logger.debug("====Longitude: {} Latitude: {}====".format(x, y))
+        url = cls.URL.format(x, y)
+        logger.debug("URL for coordinates: {}".format(url))
+        f = urlopen(url)
+        sleep(config['sleep_time'])
+        return f.read()
+
+    @classmethod
+    def scrap_cadaster_full_code(cls, full_cadaster, delimitacion, municipio):
+        url_ref = cls.URL_REF_FULL.format(full_cadaster, full_cadaster, delimitacion, municipio)
+        logger.debug("-->FULL URL for cadastral data: {}".format(url_ref))
+
+        f_ref = urlopen(url_ref)
+        data_ref = f_ref.read()
+        html = str(data_ref.decode('utf-8'))
+        parsed_html = BeautifulSoup(html, features="html.parser")
+        sleep(config['sleep_time'])
+        return parsed_html
+
+    @classmethod
+    def scrap_cadaster(cls, cadaster, delimitacion=None, municipio=None, pictures=False):
+        rc_1 = cadaster[0:7]
+        rc_2 = cadaster[7:14]
+        url_ref = cls.URL_REF.format(rc_1, rc_2)
+        logger.debug("URL for cadastral data: {}".format(url_ref))
+
+        f_ref = urlopen(url_ref)
+        data_ref = f_ref.read()
+        sleep(config['sleep_time'])
+
+        html = str(data_ref.decode('utf-8'))
+        parsed_html = BeautifulSoup(html, features="html.parser")
+
+        if delimitacion is None:
+            delimitacion_search = re.search(r'del=([0-9]+)&', html)
+            if delimitacion_search:
+                delimitacion = delimitacion_search.group(1)
+
+        if municipio is None:
+            municipio_search = re.search(r'mun=([0-9]+)&', html)
+            if municipio_search:
+                municipio = municipio_search.group(1)
+
+        description = parsed_html.find(id='ctl00_Contenido_tblInmueble')
+
+        picture = None
+        if pictures:
+            picture = cls.scrap_site_picture(delimitacion, municipio, ''.join([rc_1, rc_2]))
+            sleep(config['sleep_time'])
+
+        htmls = []
+        if description is None:
+            # Multiparcela
+            logger.debug("Multiparcela found!")
+            ''' Multiparcela with multiple cadasters '''
+            all_cadasters = parsed_html.findAll("div", {"id": re.compile('heading[0-9]+')})
+            logger.debug("->Parcelas found: {}".format(len(all_cadasters)))
+            for partial_cadaster in all_cadasters:
+                partial_cadaster_ref = partial_cadaster.find("b")
+                logger.debug("-->Partial cadaster: {}".format(partial_cadaster_ref.text))
+                partial_cadaster_text = partial_cadaster_ref.text.strip()
+                html = ScrapperHTML.scrap_cadaster_full_code(partial_cadaster_text, delimitacion, municipio)
+                htmls.append((html, picture))
+                sleep(config['sleep_time'])
+        else:
+            # Parcela
+            htmls.append((parsed_html, picture))
+        return htmls

View File

@@ -0,0 +1,83 @@
+import urllib.parse
+from time import sleep
+
+import requests
+import xmltodict
+from dotmap import DotMap
+
+from src.librecatastro.scrapping.scrapper import Scrapper
+from src.settings import config
+from src.utils.cadastro_logger import CadastroLogger
+
+'''Logger'''
+logger = CadastroLogger(__name__).logger
+
+
+class ScrapperXML(Scrapper):
+
+    def __init__(self):
+        super().__init__()
+
+    @classmethod
+    def get_coord(cls, x, y):
+        params = {'SRS': 'EPSG:4230', 'Coordenada_X': x, 'Coordenada_Y': y}
+        url = cls.URL_LOCATIONS_BASE.format("/OVCCoordenadas.asmx/Consulta_RCCOOR")
+        response = requests.get(url, params=params)
+        logger.debug("====Longitude: {} Latitude: {}====".format(x, y))
+        logger.debug("URL for coordinates: {}".format(url + '?' + urllib.parse.urlencode(params)))
+        xml_dict_map = DotMap(xmltodict.parse(response.content, process_namespaces=False, xml_attribs=False))
+        sleep(config['sleep_time'])
+        return xml_dict_map
+
+    @classmethod
+    def get_cadaster_entries_by_cadaster(cls, provincia, municipio, rc):
+        """ provincia and municipio are optional and can be set to '' """
+        params = {"Provincia": provincia,
+                  "Municipio": municipio,
+                  "RC": rc}
+        url = cls.URL_LOCATIONS_BASE.format("/OVCCallejero.asmx/Consulta_DNPRC")
+        logger.debug("URL for entry: {}".format(url + '?' + urllib.parse.urlencode(params)))
+        response = requests.get(url, params=params)
+        xml = response.content
+        sleep(config['sleep_time'])
+        return DotMap(xmltodict.parse(xml, process_namespaces=False, xml_attribs=False))
+
+    @classmethod
+    def get_cadaster_entries_by_address(cls, provincia, municipio, sigla, calle, numero, bloque=None, escalera=None,
+                                        planta=None, puerta=None):
+        params = {'Provincia': provincia,
+                  'Municipio': municipio,
+                  'Sigla': sigla,
+                  'Calle': calle,
+                  'Numero': str(numero)}
+        if bloque:
+            params['Bloque'] = str(bloque)
+        else:
+            params['Bloque'] = ''
+        if escalera:
+            params['Escalera'] = escalera
+        else:
+            params['Escalera'] = ''
+        if planta:
+            params['Planta'] = str(planta)
+        else:
+            params['Planta'] = ''
+        if puerta:
+            params['Puerta'] = str(puerta)
+        else:
+            params['Puerta'] = ''
+        url = cls.URL_LOCATIONS_BASE.format("/OVCCallejero.asmx/Consulta_DNPLOC")
+        logger.debug("URL for entry: {}".format(url + '?' + urllib.parse.urlencode(params)))
+        response = requests.get(url, params=params)
+        xml = response.content
+        sleep(config['sleep_time'])
+        return DotMap(xmltodict.parse(xml, process_namespaces=False, xml_attribs=False))
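Both scrapper classes lean on the same idiom: xmltodict turns the service's XML into nested dicts, DotMap wraps them, and a missing node then compares equal to an empty DotMap() instead of raising. A stand-alone illustration of that idiom (the XML sample is made up):

import xmltodict
from dotmap import DotMap

# Made-up response: the 'pc' node is present, the 'geo' node is absent.
SAMPLE = "<consulta><coord><pc><pc1>2302909</pc1></pc></coord></consulta>"

doc = DotMap(xmltodict.parse(SAMPLE, process_namespaces=False, xml_attribs=False))
print(doc.consulta.coord.pc.pc1)           # '2302909'
print(doc.consulta.coord.geo == DotMap())  # True: absent nodes are empty DotMaps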

View File

@@ -1,6 +1,7 @@
 #!/usr/bin/env python
 # -*- coding: utf-8 -*-

-class Search:
+
+class Searcher:
     def __init__(self):
         pass

View File

@@ -8,7 +8,7 @@ import random
 from time import sleep

 from src.librecatastro.domain.geometry.geo_polygon import GeoPolygon
-from src.librecatastro.scrapping.search import Search
+from src.librecatastro.scrapping.searcher import Searcher
 from src.settings import config
 from src.utils.cadastro_logger import CadastroLogger
 from src.utils.list_utils import ListUtils
@@ -17,12 +17,12 @@ from src.utils.list_utils import ListUtils
 logger = CadastroLogger(__name__).logger


-class CoordinatesSearch(Search):
+class CoordinatesSearcher(Searcher):
     def __init__(self):
         super().__init__()

     @classmethod
-    def scrap_coordinates(cls, scrapper, filenames, pictures=False):
+    def search_by_coordinates(cls, scrapper, filenames, pictures=False):
         for r, d, files in os.walk(config['coordinates_path']):
             for file in files:
@@ -34,12 +34,12 @@ class CoordinatesSearch(Search):
                 try:
                     polygon = GeoPolygon(os.path.join(config['coordinates_path'], file))
-                    CoordinatesSearch.scrap_polygon(scrapper, polygon, pictures)
+                    CoordinatesSearcher.search_in_polygon(scrapper, polygon, pictures)
                 except:
                     logger.error("{} is not formatted properly. Please take a look at the examples.".format(file))

     @classmethod
-    def scrap_polygon(cls, scrapper, polygon, pictures=False):
+    def search_in_polygon(cls, scrapper, polygon, pictures=False):
         bb = polygon.get_bounding_box()
         lon_min = int(bb[0] * config['scale'])
         lon_max = int(bb[2] * config['scale'])
@@ -57,7 +57,7 @@ class CoordinatesSearch(Search):
                 logger.info('{},{}'.format(x_scaled, y_scaled))
                 try:
-                    scrapper.scrap_coord(x_scaled, y_scaled, pictures)
+                    scrapper.process_search_by_coordinates(x_scaled, y_scaled, pictures)
                 except urllib.error.HTTPError as e:
                     logger.error("ERROR AT LONGITUDE {} LATITUDE {}".format(x_scaled, y_scaled))
@@ -76,7 +76,7 @@ class CoordinatesSearch(Search):
                     sleep(config['sleep_time'])

     @staticmethod
-    def scrap_results_by_time(seconds, lon_min, lon_max, lat_min, lat_max, scrapper):
+    def search_by_coordinates_max_time(seconds, lon_min, lon_max, lat_min, lat_max, scrapper):
         start_time = time.time()
         results = []
@@ -88,7 +88,7 @@ class CoordinatesSearch(Search):
                 y_scaled = y / config['scale']
                 try:
-                    result = scrapper.scrap_coord(x_scaled, y_scaled)
+                    result = scrapper.process_search_by_coordinates(x_scaled, y_scaled)
                     if result is not None:
                         results.append(result)
@@ -117,9 +117,9 @@ class CoordinatesSearch(Search):
         return ListUtils.flat(results)

     @staticmethod
-    def scrap_results_linear_x_times(times, lon_min, lon_max, lat_min, lat_max, scrapper):
+    def search_by_coordinates_linear_max_n_matches(matches, lon_min, lon_max, lat_min, lat_max, scrapper):
         results = []
-        counter = times
+        counter = matches
         finished = False
         for x in range(lon_min, lon_max):
@@ -130,7 +130,7 @@ class CoordinatesSearch(Search):
                 try:
-                    result = scrapper.scrap_coord(x_scaled, y_scaled)
+                    result = scrapper.process_search_by_coordinates(x_scaled, y_scaled)
                     if result is not None:
                         results.append(result)
@@ -159,7 +159,7 @@ class CoordinatesSearch(Search):
         return ListUtils.flat(results)

     @staticmethod
-    def scrap_results_random_x_times(times, lon_min, lon_max, lat_min, lat_max, scrapper):
+    def search_by_coordinates_random_max_n_matches(times, lon_min, lon_max, lat_min, lat_max, scrapper):
         results = []
         counter = times
         while counter > 0:
@@ -170,7 +170,7 @@ class CoordinatesSearch(Search):
                 y_scaled = y / config['scale']
                 try:
-                    cadaster_entry = scrapper.scrap_coord(x_scaled, y_scaled)
+                    cadaster_entry = scrapper.process_search_by_coordinates(x_scaled, y_scaled)
                     if len(cadaster_entry) > 0:
                         results.append(cadaster_entry)
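The coordinate searchers all sweep an integer grid: bounding-box values are multiplied by config['scale'], iterated with range(), and un-scaled before each query. A miniature of that sweep under assumed values (SCALE and the step are illustrative, not the repo's settings):

SCALE = 10000000  # stands in for config['scale']

def sweep(lon_min, lon_max, lat_min, lat_max, step=100000):
    """Yield real-valued coordinates by walking an integer grid."""
    for x in range(int(lon_min * SCALE), int(lon_max * SCALE), step):
        for y in range(int(lat_min * SCALE), int(lat_max * SCALE), step):
            yield x / SCALE, y / SCALE  # un-scale before querying

for lon, lat in sweep(-3.70, -3.68, 40.45, 40.47):
    print(lon, lat)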

View File

@@ -1,30 +0,0 @@
-#!/usr/bin/env python
-# -*- coding: utf-8 -*-
-
-from dotmap import DotMap
-
-from src.librecatastro.scrapping.scrapper import Scrapper
-from src.librecatastro.scrapping.search import Search
-from src.utils.cadastro_logger import CadastroLogger
-
-'''Logger'''
-logger = CadastroLogger(__name__).logger
-
-
-class ProvincesSearch(Search):
-
-    def __init__(self):
-        super().__init__()
-
-    @classmethod
-    def scrap_provinces(cls, scrapper, prov_list, pictures=False, start_from=''):
-        scrapper.scrap_provinces(prov_list, pictures, start_from)
-
-    @classmethod
-    def list_provinces(cls):
-        logger.debug(DotMap.pprint(Scrapper.get_provinces()))
-        return
-
-    @classmethod
-    def list_cities(cls, prov_name):
-        logger.debug(DotMap.pprint(Scrapper.get_cities(prov_name)))
-        return

View File

@@ -0,0 +1,35 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+from dotmap import DotMap
+
+from src.librecatastro.scrapping.scrapper import Scrapper
+from src.librecatastro.scrapping.searcher import Searcher
+from src.utils.cadastro_logger import CadastroLogger
+
+'''Logger'''
+logger = CadastroLogger(__name__).logger
+
+
+class ProvincesSearcher(Searcher):
+
+    def __init__(self):
+        super().__init__()
+
+    @classmethod
+    def search_by_provinces(cls, scrapper, prov_list, pictures=False, start_from=''):
+        scrapper.process_search_by_provinces(prov_list, pictures, start_from)
+
+    @classmethod
+    def list_provinces(cls):
+        dotmap = Scrapper.get_provinces()
+        provinces = dotmap.consulta_provinciero.provinciero.prov
+        for province in provinces:
+            logger.debug(province.np)
+
+    @classmethod
+    def list_cities(cls, prov_name):
+        dotmap = Scrapper.get_cities(prov_name)
+        cities = dotmap.consulta_municipiero.municipiero.muni
+        for city in cities:
+            logger.debug(city.nm)
+        return

View File

@@ -16,5 +16,9 @@ config = {
     "sleep_time": 5,
     "sleep_dos_time": 300,
     "width_px": 120,
-    "height_px": 120
+    "height_px": 120,
+    "servers_down_message": "Some of the Cadastro servers are down. "
+                            "Maintenance is usually carried out during the night or the weekends. Please, retry later. "
+                            "As an alternative, your IP address may have been banned. Try to change your public IP"
 }

View File

@@ -1,5 +0,0 @@
-<owl:NamedIndividual rdf:about="http://semantic-datahub.taiger.io/ontologies/Address/####ADDRESS####">
-    <rdf:type rdf:resource="http://semantic-datahub.taiger.io/ontologies/Address"/>
-    <rdfs:label>####ADDRESS####</rdfs:label>
-    <cadaster:located_in rdf:resource="####CITY####"/>
-</owl:NamedIndividual>

View File

@@ -1,4 +0,0 @@
-<owl:NamedIndividual rdf:about="http://semantic-datahub.taiger.io/ontologies/Cadaster/####CADASTER####">
-    <rdf:type rdf:resource="http://semantic-datahub.taiger.io/ontologies/Cadaster"/>
-    <rdfs:label>####CADASTER####</rdfs:label>
-</owl:NamedIndividual>

View File

@@ -1,5 +0,0 @@
-<owl:NamedIndividual rdf:about="http://semantic-datahub.taiger.io/ontologies/City/####CITY####">
-    <rdf:type rdf:resource="http://semantic-datahub.taiger.io/ontologies/City"/>
-    <rdfs:label>####CITY####</rdfs:label>
-    <cadaster:located_in rdf:resource="####PROVINCE####"/>
-</owl:NamedIndividual>

View File

@@ -1,5 +0,0 @@
-<owl:NamedIndividual rdf:about="http://semantic-datahub.taiger.io/ontologies/GeoCoordinates/####COORDINATES####">
-    <rdf:type rdf:resource="http://semantic-datahub.taiger.io/ontologies/GeoCoordinates"/>
-    <rdfs:label>####COORDINATES####</rdfs:label>
-    <cadaster:located_in rdf:resource="####ADDRESS####"/>
-</owl:NamedIndividual>

View File

@@ -1,5 +0,0 @@
-<owl:NamedIndividual rdf:about="http://semantic-datahub.taiger.io/ontologies/Province/####PROVINCE####">
-    <rdf:type rdf:resource="http://semantic-datahub.taiger.io/ontologies/Province"/>
-    <rdfs:label>####PROVINCE####</rdfs:label>
-    <cadaster:mentioned_in rdf:resource="http://semantic-datahub.taiger.io/ontologies/Cadaster/####CADASTER####"/>
-</owl:NamedIndividual>

View File

@@ -1,96 +0,0 @@
-<?xml version="1.0"?>
-<rdf:RDF xmlns:owl="http://www.w3.org/2002/07/owl#"
-         xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
-         xmlns:xml="http://www.w3.org/XML/1998/namespace"
-         xmlns:xsd="http://www.w3.org/2001/XMLSchema#"
-         xmlns:rdfs="http://www.w3.org/2000/01/rdf-schema#"
-         xmlns:skos="http://www.w3.org/2004/02/skos/core#"
-         xmlns:terms="http://purl.org/dc/terms/">
-
-    <owl:Ontology rdf:about="http://semantic-datahub.taiger.io/ontologies/cadaster">
-    </owl:Ontology>
-
-    <!--
-    ///////////////////////////////////////////////////////////////////////////////////////
-    //
-    // Classes
-    //
-    ///////////////////////////////////////////////////////////////////////////////////////
-    -->
-
-    <!-- OUR TOP CLASSES -->
-    <owl:Class rdf:about="http://semantic-datahub.taiger.io/ontologies/Thing">
-        <rdfs:label>Thing</rdfs:label>
-    </owl:Class>
-
-    <owl:Class rdf:about="http://semantic-datahub.taiger.io/ontologies/CadasterEntry">
-        <rdfs:subClassOf rdf:resource="http://semantic-datahub.taiger.io/ontologies/Thing"/>
-        <rdfs:label>CadasterEntry</rdfs:label>
-    </owl:Class>
-
-    <owl:Class rdf:about="http://semantic-datahub.taiger.io/ontologies/Address">
-        <rdfs:subClassOf rdf:resource="http://semantic-datahub.taiger.io/ontologies/Thing"/>
-        <rdfs:label>Address</rdfs:label>
-    </owl:Class>
-
-    <owl:Class rdf:about="http://semantic-datahub.taiger.io/ontologies/Province">
-        <rdfs:subClassOf rdf:resource="http://semantic-datahub.taiger.io/ontologies/Thing"/>
-        <rdfs:label>Province</rdfs:label>
-    </owl:Class>
-
-    <owl:Class rdf:about="http://semantic-datahub.taiger.io/ontologies/City">
-        <rdfs:subClassOf rdf:resource="http://semantic-datahub.taiger.io/ontologies/Thing"/>
-        <rdfs:label>City</rdfs:label>
-    </owl:Class>
-
-    <owl:Class rdf:about="http://semantic-datahub.taiger.io/ontologies/GeoCoordinates">
-        <rdfs:subClassOf rdf:resource="http://semantic-datahub.taiger.io/ontologies/Thing"/>
-        <rdfs:label>Geographical Coordinates</rdfs:label>
-    </owl:Class>
-
-    <!--
-    ///////////////////////////////////////////////////////////////////////////////////////
-    //
-    // Individuals
-    //
-    ///////////////////////////////////////////////////////////////////////////////////////
-    -->
-
-    ####INDIVIDUALS####
-
-    <!--
-    ///////////////////////////////////////////////////////////////////////////////////////
-    //
-    // Annotation properties
-    //
-    ///////////////////////////////////////////////////////////////////////////////////////
-    -->
-
-    <!-- Left empty -->
-
-    <!--
-    ///////////////////////////////////////////////////////////////////////////////////////
-    //
-    // Object Properties
-    //
-    ///////////////////////////////////////////////////////////////////////////////////////
-    -->
-
-    <owl:ObjectProperty rdf:about="http://semantic-datahub.taiger.io/ontologies/mentioned_in">
-    </owl:ObjectProperty>
-
-    <owl:ObjectProperty rdf:about="http://semantic-datahub.taiger.io/ontologies/located_in">
-    </owl:ObjectProperty>
-
-    <owl:ObjectProperty rdf:about="http://semantic-datahub.taiger.io/ontologies/registers">
-    </owl:ObjectProperty>
-
-    <!-- Here, for each field of the document, if it has a parent... -->
-
-</rdf:RDF>

View File

View File

@@ -5,8 +5,8 @@ import os
 import unittest

 from src.librecatastro.domain.geometry.geo_polygon import GeoPolygon
-from src.librecatastro.scrapping.format.scrapper_html import ScrapperHTML
-from src.librecatastro.scrapping.searchers.coordinates_search import CoordinatesSearch
+from src.librecatastro.scrapping.parsers.parser_html import ScrapperHTML
+from src.librecatastro.scrapping.searchers.coordinates_searcher import CoordinatesSearcher
 from src.settings import config
 from src.utils.elasticsearch_utils import ElasticSearchUtils
@@ -22,17 +22,17 @@ class ScrapperHTMLTests(unittest.TestCase):
         assert True

     def test_coordinate_creates_cadaster(self):
-        cadaster_list = ScrapperHTML.scrap_coord(-3.68, 40.47)
+        cadaster_list = ScrapperHTML.parse_coord(-3.68, 40.47)
         self.assertEqual(len(cadaster_list), 1)
         cadaster = cadaster_list[0]
         self.assertEqual(cadaster.cadaster, '2302909VK4820A0001GK')

     def test_coordinate_multiparcela_creates_cadaster(self):
-        cadaster_list = ScrapperHTML.scrap_coord(-0.33, 39.47)
+        cadaster_list = ScrapperHTML.parse_coord(-0.33, 39.47)
         self.assertTrue(len(cadaster_list) > 1)

     def test_coordinate_creates_cadaster_and_stores_in_elasticsearch(self):
-        cadaster_list = ScrapperHTML.scrap_coord(-3.68, 40.47)
+        cadaster_list = ScrapperHTML.parse_coord(-3.68, 40.47)
         self.assertEqual(len(cadaster_list), 1)
         cadaster = cadaster_list[0]
         cadaster.to_elasticsearch()
@@ -92,7 +92,7 @@ class ScrapperHTMLTests(unittest.TestCase):
     def scrap_random_until_x_times_found(self, times):
         polygon = GeoPolygon(os.path.join(config['coordinates_path'], 'spain_polygon.json'))
         coord = polygon.get_bounding_box()
-        cadaster_list = CoordinatesSearch.scrap_results_random_x_times(times, int(coord[0] * config['scale']), int(coord[2] * config['scale']), int(coord[1] * config['scale']), int(coord[3] * config['scale']), ScrapperHTML)
+        cadaster_list = CoordinatesSearcher.search_by_coordinates_random_max_n_matches(times, int(coord[0] * config['scale']), int(coord[2] * config['scale']), int(coord[1] * config['scale']), int(coord[3] * config['scale']), ScrapperHTML)
         self.assertTrue(len(cadaster_list) >= times)
         return cadaster_list

View File

@@ -5,44 +5,48 @@ import unittest
 from time import sleep

-from dotmap import DotMap
-
 from src.librecatastro.domain.cadaster_entry.cadaster_entry_xml import CadasterEntryXML
-from src.librecatastro.scrapping.format.scrapper_xml import ScrapperXML
+from src.librecatastro.scrapping.parsers.parser_xml import ScrapperXML, ParserXML
+from src.librecatastro.scrapping.scrappers.scrapper_xml import ScrapperXML
 from src.settings import config


 class ScrapperXMLTests(unittest.TestCase):
     def test_scrapper_retrieves_dict_provinces(self):
-        self.assertEqual(ScrapperXML.get_provinces().consulta_provinciero.control.cuprov, '48')
-        sleep(config['sleep_time'])
+        try:
+            self.assertEqual(ScrapperXML.get_provinces().consulta_provinciero.control.cuprov, '48')
+        except:
+            self.assertFalse(config['servers_down_message'])
+            exit(-1)

     def test_scrapper_retrieves_dict_cities(self):
-        self.assertEqual(ScrapperXML.get_cities('ALACANT').consulta_municipiero.control.cumun, '141')
-        sleep(config['sleep_time'])
+        try:
+            self.assertEqual(ScrapperXML.get_cities('ALACANT').consulta_municipiero.control.cumun, '141')
+        except:
+            self.assertFalse(config['servers_down_message'])
+            exit(-1)

     def test_scrapper_retrieves_dict_addresses(self):
-        self.assertEqual(ScrapperXML.get_addresses('ALACANT','AGOST').consulta_callejero.control.cuca, '117')
-        sleep(config['sleep_time'])
+        try:
+            self.assertEqual(ScrapperXML.get_addresses('ALACANT', 'AGOST').consulta_callejero.control.cuca, '117')
+        except:
+            self.assertFalse(config['servers_down_message'])
+            exit(-1)
+
+    def test_get_cadaster_entries_by_cadaster_is_up(self):
+        cadasters = ['2503906VK4820D0001MX']
+        try:
+            for cadaster in cadasters:
+                ScrapperXML.get_cadaster_entries_by_cadaster('', '', cadaster)
+        except:
+            self.assertFalse(config['servers_down_message'])
+            exit(-1)

     def test_scrapper_retrieves_dict_addresses_iter(self):
         iterator = ScrapperXML.get_address_iter()
         address = iterator.__next__()
         self.assertEqual(address[1], '15')
         self.assertEqual(address[3], '7')
-        sleep(config['sleep_time'])
-
-    def test_scrapper_creates_cadaster_entry(self):
-        dotmap_res = ScrapperXML.get_cadaster_entries_by_cadaster('', '', '6375620YH0567S0001GW')
-        self.assertNotEqual(dotmap_res, DotMap())
-        sleep(config['sleep_time'])
-
-    def test_scrapper_creates_cadaster_entry_and_stores_in_elasticsearch(self):
-        entry = ScrapperXML.get_cadaster_entries_by_cadaster('', '', '6375620YH0567S0001GW')
-        cadaster_entry = CadasterEntryXML(entry)
-        cadaster_entry.to_elasticsearch()
-        self.assertIsNotNone(cadaster_entry.from_elasticsearch())
-        sleep(config['sleep_time'])

     def test_multiparcela_creates_n_entries_in_elasticsearch(self):
         prov_name = u'A CORUÑA'
@@ -127,7 +131,7 @@ class ScrapperXMLTests(unittest.TestCase):
     def test_multiparcela_coord_creates_n_entries(self):
         lon = -9.2503
         lat = 42.9723
-        self.assertEqual(len(ScrapperXML.scrap_coord(lon, lat, True)), 2)
+        self.assertEqual(len(ParserXML.process_search_by_coordinates(lon, lat, True)), 2)

     def test_multiparcela_address_creates_n_entries(self):
         prov_name = u'MADRID'
@@ -136,7 +140,7 @@ class ScrapperXMLTests(unittest.TestCase):
         nv = u'CANARIAS'
         num = 7
         cadaster = ScrapperXML.get_cadaster_by_address(prov_name, city_name, tv, nv, num)
-        self.assertEqual(len(ScrapperXML.process_xml_by_address(cadaster, prov_name, city_name, tv, nv, False)), 8)
+        self.assertEqual(len(ParserXML.process_xml_by_address(cadaster, prov_name, city_name, tv, nv, False)), 8)

     def test_multiparcela_address_creates_n_entries_2(self):
         prov_name = u'MADRID'
@@ -145,7 +149,39 @@ class ScrapperXMLTests(unittest.TestCase):
         nv = u'CALVARIO'
         num = 38
         cadaster = ScrapperXML.get_cadaster_by_address(prov_name, city_name, tv, nv, num)
-        self.assertEqual(len(ScrapperXML.process_xml_by_address(cadaster, prov_name, city_name, tv, nv, False)), 8)
+        self.assertEqual(len(ParserXML.process_xml_by_address(cadaster, prov_name, city_name, tv, nv, False)), 8)
+
+    def test_poligono_or_rural_creates_entry(self):
+        tv = 'CL'
+        nv = 'TORREJON'
+        num = 30
+        prov_name = 'MADRID'
+        city_name = 'AJALVIR'
+        cadaster = ScrapperXML.get_cadaster_by_address(prov_name, city_name, tv, nv, num)
+        self.assertEqual(len(ParserXML.process_xml_by_address(cadaster, prov_name, city_name, tv, nv, False)), 16)
+
+    def test_coordinates_are_in_good_format(self):
+        tv = 'CL'
+        nv = 'DE BENICARLO'
+        num = 1
+        prov_name = 'MADRID'
+        city_name = 'GALAPAGAR'
+        xml = ScrapperXML.get_cadaster_by_address(prov_name, city_name, tv, nv, num)
+        cadaster_entry = ParserXML.process_xml_by_address(xml, prov_name, city_name, tv, nv, False)
+        self.assertEqual(cadaster_entry[0].location.lat, 40.6249762551374)
+        self.assertEqual(cadaster_entry[0].location.lon, -4.02755522611211)
+
+    def test_multiparcela_coordinates_are_in_good_format(self):
+        tv = 'CL'
+        nv = 'SAN VICENTE'
+        num = 26
+        prov_name = 'ALACANT'
+        city_name = 'ALICANTE/ALACANT'
+        xml = ScrapperXML.get_cadaster_by_address(prov_name, city_name, tv, nv, num)
+        cadaster_entries = ParserXML.process_xml_by_address(xml, prov_name, city_name, tv, nv, False)
+        for cadaster_entry in cadaster_entries:
+            self.assertEqual(cadaster_entry.location.lat, 38.3495195831056)
+            self.assertEqual(cadaster_entry.location.lon, -0.484612452235845)

 if __name__ == '__main__':
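The rewritten live-service tests share one failure idiom: call Catastro inside a try block, and on any exception run self.assertFalse(config['servers_down_message']), which fails the test while printing that message in the report. A stand-alone sketch of the idiom with the outage simulated (the constant is a stand-in for the config entry):

import unittest

SERVERS_DOWN = "Some of the Cadastro servers are down. Retry later."  # stands in for config['servers_down_message']

class LiveServiceTests(unittest.TestCase):
    def test_service_is_up(self):
        try:
            raise IOError("simulated outage")  # stands in for a live Catastro call
        except Exception:
            # assertFalse on a non-empty string always fails, so the
            # explanatory message surfaces in the test output.
            self.assertFalse(SERVERS_DOWN)

if __name__ == "__main__":
    unittest.main()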

View File

@@ -6,7 +6,7 @@ logger = CadastroLogger(__name__).logger
 class ElasticSearchUtils:
-    """Custom class for managing Elastic Search queries"""
+    """Custom class for managing Elastic Searcher queries"""

     def __init__(self):
         pass

View File

@@ -1,77 +0,0 @@
-import copy
-import re
-
-
-class OntologyConverter:
-    def __init__(self):
-        with open("../templates/ontology.owl") as ont_f, \
-                open("../templates/individual_city.xml") as ind_city_f, \
-                open("../templates/individual_province.xml") as ind_province_f, \
-                open("../templates/individual_coord.xml") as ind_coord_f, \
-                open("../templates/individual_address.xml") as ind_address_f, \
-                open("../templates/individual_cadaster.xml") as ind_cadaster_f:
-            self.ont_template = ont_f.read()
-            self.city_template = ind_city_f.read()
-            self.province_template = ind_province_f.read()
-            self.coord_template = ind_coord_f.read()
-            self.address_template = ind_address_f.read()
-            self.cadaster_template = ind_cadaster_f.read()
-
-    def cadastro_dict_to_ontology(self, cadastro_list):
-        ont = copy.deepcopy(self.ont_template)
-        for cadastro_entry in cadastro_list:
-            ont = ont.replace("####INDIVIDUALS####", ''.join(["####INDIVIDUALS####",
-                                                              self.instantiate_individual(cadastro_entry)]))
-        ont = ont.replace("####INDIVIDUALS####", '')
-        return ont
-
-    def instantiate_individual(self, cadastro_entry):
-        individuals = ''
-        cadaster = ''
-        for header, value in cadastro_entry.items():
-            if header == 'Referencia catastral':
-                txt = copy.deepcopy(self.cadaster_template)
-                txt = txt.replace("####CADASTER####", value)
-                individuals = ''.join([individuals, txt])
-                cadaster = value
-            elif header == 'Localización':
-                city_txt = copy.deepcopy(self.city_template)
-                province_txt = copy.deepcopy(self.province_template)
-                address_txt = copy.deepcopy(self.address_template)
-                cp = re.search(r'[0-9]{5}', value)
-                cp_span = cp.span()
-                cp_span_end = cp_span[1]
-                city_text = value[cp_span_end:]
-                province = re.search(r'\(([^\)]+)\)', city_text)
-                province_span = province.span()
-                province_start = province_span[0]
-                province_end = province_span[1]
-                province_text = value[province_start:province_end]
-                province_txt = province_txt.replace("####CADASTER####", cadaster)
-                province_txt = province_txt.replace("####PROVINCE####", province_text)
-                city_txt = city_txt.replace("####CITY####", city_text)
-                city_txt = city_txt.replace("####PROVINCE####", province_text)
-                address_txt = address_txt.replace("####ADDRESS####", value)
-                address_txt = address_txt.replace("####CITY####", city_text)
-                individuals = ''.join([individuals, province_txt, city_txt, address_txt])
-        #print(individuals)
-        return individuals