Mirror of https://github.com/josejuanmartinez/libreCatastro.git (synced 2024-06-13 12:14:10 +02:00)

Commit 7cf208a4c2 (parent fef84a9f95)

Refactors and separates scraping and parsing into different classes for maintainability
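The commit's shape is easier to see before diving into the hunks: fetching raw data (Scrapper, ScrapperHTML, ScrapperXML), interpreting it (Parser, ParserHTML, ParserXML) and driving the iteration (Searcher, CoordinatesSearcher, ProvincesSearcher) become three separate class families. A minimal sketch of how they compose after this commit, using only names that appear in the diff below (the argument values are illustrative):

```python
from src.librecatastro.scrapping.parsers.parser_xml import ParserXML
from src.librecatastro.scrapping.searchers.coordinates_searcher import CoordinatesSearcher
from src.librecatastro.scrapping.searchers.provinces_searcher import ProvincesSearcher

# A searcher drives the iteration and hands each lookup to a parser class;
# the parser in turn calls the Scrapper* classes for the raw web-service fetches.
CoordinatesSearcher.search_by_coordinates(ParserXML, filenames=[], pictures=False)
ProvincesSearcher.search_by_provinces(ParserXML, ['MADRID'], pictures=False, start_from='')
```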
@@ -1,6 +1,6 @@
 #libreCATASTRO
 An opensource, MIT-licensed application that scraps the official Spanish
-Cadaster registry and stores information in Elastic Search.
+Cadaster registry and stores information in Elastic Searcher.
 
 **Features**
 
@@ -4,10 +4,10 @@
 import sys
 import argparse
 
-from src.librecatastro.scrapping.format.scrapper_html import ScrapperHTML
-from src.librecatastro.scrapping.format.scrapper_xml import ScrapperXML
-from src.librecatastro.scrapping.searchers.coordinates_search import CoordinatesSearch
-from src.librecatastro.scrapping.searchers.provinces_search import ProvincesSearch
+from src.librecatastro.scrapping.parsers.parser_html import ScrapperHTML
+from src.librecatastro.scrapping.parsers.parser_xml import ParserXML
+from src.librecatastro.scrapping.searchers.coordinates_searcher import CoordinatesSearcher
+from src.librecatastro.scrapping.searchers.provinces_searcher import ProvincesSearcher
 from src.settings import config
 
 if __name__ == "__main__":
@@ -31,7 +31,7 @@ if __name__ == "__main__":
     if args.scale:
         config['scale'] = args.scale
 
-    scrapper = ScrapperHTML if args.html else ScrapperXML
+    scrapper = ScrapperHTML if args.html else ParserXML
 
     filenames = args.filenames
     pictures = args.pictures
@@ -39,14 +39,14 @@ if __name__ == "__main__":
     startcity = args.startcity
 
     if args.listprovinces:
-        ProvincesSearch.list_provinces()
+        ProvincesSearcher.list_provinces()
         exit(0)
 
     if len(args.listcities) == 1:
-        ProvincesSearch.list_cities(args.listcities[0])
+        ProvincesSearcher.list_cities(args.listcities[0])
         exit(0)
 
     if args.coords:
-        CoordinatesSearch.scrap_coordinates(scrapper, filenames, pictures)
+        CoordinatesSearcher.search_by_coordinates(scrapper, filenames, pictures)
     else:
-        ProvincesSearch.scrap_provinces(scrapper, provinces, pictures, startcity)
+        ProvincesSearcher.search_by_provinces(scrapper, provinces, pictures, startcity)
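The `scrapper = ScrapperHTML if args.html else ParserXML` line relies on Python classes being first-class values: the searchers later invoke classmethods through whichever class object they were handed. A self-contained illustration of that dispatch style (toy classes, not repo code):

```python
class XmlPipeline:
    @classmethod
    def process_search_by_coordinates(cls, x, y, pictures=False):
        return ('xml', x, y)

class HtmlPipeline:
    @classmethod
    def process_search_by_coordinates(cls, x, y, pictures=False):
        return ('html', x, y)

use_html = False
worker = HtmlPipeline if use_html else XmlPipeline  # a class object, not an instance
print(worker.process_search_by_coordinates(-3.70, 40.42))  # ('xml', -3.7, 40.42)
```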
@@ -11,7 +11,7 @@ logger = CadastroLogger(__name__).logger
 
 
 class Address:
-    """ Domain class for storing Address in Catastro format"""
+    """ Domain class for storing Address in Catastro parsers"""
     def __init__(self, address):
         self.full_address = address.strip()
 
src/librecatastro/scrapping/parser.py (new file, 23 lines)
@@ -0,0 +1,23 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+from src.utils.cadastro_logger import CadastroLogger
+
+'''Logger'''
+logger = CadastroLogger(__name__).logger
+
+
+class Parser:
+    """Generic Parser class"""
+
+    def __init__(self):
+        pass
+
+    ''' Processing signatures'''
+    @classmethod
+    def process_search_by_coordinates(cls, x, y, pictures=False):
+        pass
+
+    @classmethod
+    def process_search_by_provinces(cls, prov_list, pictures=False):
+        pass
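Parser is a pure interface: both classmethods are stubs that ParserHTML and ParserXML (in the following hunks) override. A toy subclass showing the contract the searchers depend on (the body is illustrative, not repo code):

```python
from src.librecatastro.scrapping.parser import Parser

class ParserStub(Parser):
    """Hypothetical parser, shown only to illustrate the interface."""

    @classmethod
    def process_search_by_coordinates(cls, x, y, pictures=False):
        # A real parser fetches via a Scrapper*, builds CadasterEntry objects,
        # sends them to Elasticsearch and returns them as a list.
        return []

    @classmethod
    def process_search_by_provinces(cls, prov_list, pictures=False):
        return []
```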
@@ -1,17 +1,16 @@
 #!/usr/bin/env python
 # -*- coding: utf-8 -*-
 
-import re
 import urllib.error
 from time import sleep
 
-from urllib.request import urlopen
 from xml.etree import ElementTree
-from bs4 import BeautifulSoup
 from dotmap import DotMap
 
 from src.librecatastro.domain.cadaster_entry.cadaster_entry_html import CadasterEntryHTML
+from src.librecatastro.scrapping.parser import Parser
 from src.librecatastro.scrapping.scrapper import Scrapper
+from src.librecatastro.scrapping.scrappers.scrapper_html import ScrapperHTML
 from src.settings import config
 
 from src.utils.cadastro_logger import CadastroLogger
@@ -20,32 +19,23 @@ from src.utils.cadastro_logger import CadastroLogger
 logger = CadastroLogger(__name__).logger
 
 
-class ScrapperHTML(Scrapper):
-    """Scrapper class for Catastro HTML"""
+class ParserHTML(Parser):
+    """Parser class for Catastro HTML"""
 
     def __init__(self):
         super().__init__()
 
-    '''Catastro web services parametrized'''
-    URL = "http://ovc.catastro.meh.es/ovcservweb/ovcswlocalizacionrc/ovccoordenadas.asmx/Consulta_RCCOOR?SRS=EPSG:4230&Coordenada_X={}&Coordenada_Y={}"
-
-    URL_REF = "https://www1.sedecatastro.gob.es/CYCBienInmueble/OVCListaBienes.aspx?rc1={}&rc2={}"
-    URL_REF_FULL = "https://www1.sedecatastro.gob.es/CYCBienInmueble/OVCConCiud.aspx?RefC={}&RCCompleta={}&del={}&mun={}"
-
     '''Information to scrap from HTML'''
     description_field_names = [u'Referencia catastral', u'Localización', u'Clase', u'Uso principal',
                                u'Superficie construida', u'Año construcción']
 
     gsurface_field_names = [u'Superficie gráfica']
 
-    """ Scrapping calls """
+    """ Processing """
 
     @classmethod
-    def scrap_coord(cls, x, y, pictures=False):
-        logger.debug("====Longitude: {} Latitude: {}====".format(x, y))
-        url = cls.URL.format(x, y)
-        logger.debug("URL for coordinates: {}".format(url))
-        f = urlopen(url)
-        data = f.read()
+    def process_search_by_coordinates(cls, x, y, pictures=False):
+        data = ScrapperHTML.scrap_coord(x, y)
         root = ElementTree.fromstring(data)
         pc1 = root.find(
             "{http://www.catastro.meh.es/}coordenadas//{http://www.catastro.meh.es/}coord//{http://www.catastro.meh.es/}pc//{http://www.catastro.meh.es/}pc1")
@@ -55,17 +45,19 @@ class ScrapperHTML(Scrapper):
         results = []
         if pc1 is not None and pc2 is not None:
             cadaster = ''.join([pc1.text, pc2.text])
-            cadaster_entries = cls.scrap_cadaster(cadaster, None, None, x, y, pictures)
-            for cadaster_entry in cadaster_entries:
+            htmls = ScrapperHTML.scrap_cadaster(cadaster, None, None, pictures)
+            for html, picture in htmls.items():
+                cadaster_entry = cls.parse_html_parcela(html, x, y, picture)
                 cadaster_entry.to_elasticsearch()
                 results.append(cadaster_entry)
 
         return results
 
     @classmethod
-    def scrap_provinces(cls, prov_list, pictures=False, start_from=''):
-        for prov_name, prov_num, city_name, city_num, address, tv, nv in cls.get_address_iter(prov_list, start_from):
+    def process_search_by_provinces(cls, prov_list, pictures=False, start_from=''):
+        num = ''
+        for prov_name, prov_num, city_name, city_num, address, tv, nv in Scrapper.get_address_iter(prov_list, start_from):
 
             if tv == DotMap() or nv == DotMap():
                 continue
@@ -74,7 +66,7 @@ class ScrapperHTML(Scrapper):
             counter = 1
             while num_scrapping_fails > 0:
                 try:
-                    numerero_map = cls.get_cadaster_by_address(prov_name, city_name, tv, nv, counter)
+                    numerero_map = Scrapper.get_cadaster_by_address(prov_name, city_name, tv, nv, counter)
                     if numerero_map.consulta_numerero.lerr.err.cod != DotMap():
                         num_scrapping_fails -= 1
                     else:
@@ -98,7 +90,7 @@ class ScrapperHTML(Scrapper):
 
                         cadaster_num = nump.pc.pc1 + nump.pc.pc2
 
-                        coords_map = cls.get_coords_from_cadaster(prov_name, city_name, cadaster_num)
+                        coords_map = Scrapper.get_coords_from_cadaster(prov_name, city_name, cadaster_num)
 
                         lon = coords_map.consulta_coordenadas.coordenadas.coord.geo.xcen
                         if lon == DotMap():
@@ -113,13 +105,13 @@ class ScrapperHTML(Scrapper):
 
                         num_scrapping_fails = 10
 
-                        cadaster_list = cls.scrap_cadaster(cadaster_num, prov_num, city_num, lon, lat, pictures)
+                        htmls = ScrapperHTML.scrap_cadaster(cadaster_num, prov_num, city_num, pictures)
 
-                        for cadaster in cadaster_list:
-                            cadaster.to_elasticsearch()
+                        for html, picture in htmls:
+                            cadaster_entry = cls.parse_html_parcela(html, lon, lat, picture)
+                            cadaster_entry.to_elasticsearch()
 
                         counter += 1
-                        sleep(config['sleep_time'])
 
                 except urllib.error.HTTPError as e:
                     logger.error(
@@ -141,72 +133,6 @@ class ScrapperHTML(Scrapper):
                     num_scrapping_fails -= 1
 
                     counter += 1
-                    sleep(config['sleep_time'])
 
-    @classmethod
-    def scrap_cadaster_full_code(cls, full_cadaster, delimitacion, municipio, x=None, y=None, picture=None):
-        url_ref = cls.URL_REF_FULL.format(full_cadaster, full_cadaster, delimitacion, municipio)
-        logger.debug("-->FULL URL for cadastral data: {}".format(url_ref))
-        f_ref = urlopen(url_ref)
-        data_ref = f_ref.read()
-        html = str(data_ref.decode('utf-8'))
-        parsed_html = BeautifulSoup(html, features="html.parser")
-        return ScrapperHTML.parse_html_parcela(parsed_html, x, y, picture)
-
-    @classmethod
-    def scrap_cadaster(cls, cadaster, delimitacion=None, municipio=None, x=None, y=None, pictures=False):
-        rc_1 = cadaster[0:7]
-        rc_2 = cadaster[7:14]
-        url_ref = cls.URL_REF.format(rc_1, rc_2)
-
-        logger.debug("URL for cadastral data: {}".format(url_ref))
-
-        f_ref = urlopen(url_ref)
-        data_ref = f_ref.read()
-        html = str(data_ref.decode('utf-8'))
-        parsed_html = BeautifulSoup(html, features="html.parser")
-
-        if delimitacion is None:
-            delimitacion_search = re.search(r'del=([0-9]+)&', html)
-            if delimitacion_search:
-                delimitacion = delimitacion_search.group(1)
-
-        if municipio is None:
-            municipio_search = re.search(r'mun=([0-9]+)&', html)
-            if municipio_search:
-                municipio = municipio_search.group(1)
-
-        picture = None
-        if pictures:
-            picture = cls.scrap_site_picture(delimitacion, municipio, ''.join([rc_1, rc_2]))
-            sleep(config['sleep_time'])
-
-        description = parsed_html.find(id='ctl00_Contenido_tblInmueble')
-
-        cadasters = []
-        if description is None:
-            logger.debug("Multiparcela found!")
-            ''' Multiparcela with multiple cadasters '''
-
-            all_cadasters = parsed_html.findAll("div", {"id": re.compile('heading[0-9]+')})
-            logger.debug("->Parcelas found: {}".format(len(all_cadasters)))
-            for partial_cadaster in all_cadasters:
-                partial_cadaster_ref = partial_cadaster.find("b")
-                logger.debug("-->Partial cadaster: {}".format(partial_cadaster_ref.text))
-                partial_cadaster_text = partial_cadaster_ref.text.strip()
-                cadaster = ScrapperHTML.scrap_cadaster_full_code(partial_cadaster_text, delimitacion, municipio, x, y,
-                                                                 picture)
-                cadasters.append(cadaster)
-                sleep(config['sleep_time'])
-        else:
-            cadaster = ScrapperHTML.parse_html_parcela(parsed_html, x, y, picture)
-            cadasters.append(cadaster)
-
-        sleep(config['sleep_time'])
-        return cadasters
-
     """ Parsing """
     @classmethod
@@ -260,5 +186,6 @@ class ScrapperHTML(Scrapper):
                 dict(uso=columns[0].text, escalera=columns[1].text, planta=columns[2].text, puerta=columns[3].text,
                      superficie=columns[4].text, tipo=columns[5].text, fecha=columns[6].text))
 
+        descriptive_data[u'GráficoParcela']=picture
         cadaster_entry = CadasterEntryHTML(descriptive_data)
         return cadaster_entry
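The net effect of these hunks: network I/O moves into ScrapperHTML.scrap_* while ParserHTML keeps only interpretation and indexing. A hedged sketch of the resulting flow (the 14-character cadastral reference is made up; scrap_cadaster returns (html, picture) pairs per the new file further down):

```python
from src.librecatastro.scrapping.parsers.parser_html import ParserHTML
from src.librecatastro.scrapping.scrappers.scrapper_html import ScrapperHTML

htmls = ScrapperHTML.scrap_cadaster('1234567ABCDEFG', None, None, pictures=False)
for html, picture in htmls:
    entry = ParserHTML.parse_html_parcela(html, None, None, picture)  # x, y unknown here
    entry.to_elasticsearch()
```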
@@ -4,14 +4,13 @@
 import urllib.parse
 from urllib import error
 
-from time import sleep
-
 import requests
 import xmltodict
 
 from src.librecatastro.domain.cadaster_entry.cadaster_entry_xml import CadasterEntryXML
+from src.librecatastro.scrapping.parser import Parser
 from src.librecatastro.scrapping.scrapper import Scrapper
-from src.settings import config
+from src.librecatastro.scrapping.scrappers.scrapper_xml import ScrapperXML
 from src.utils.cadastro_logger import CadastroLogger
 
 from dotmap import DotMap
@@ -20,28 +19,20 @@ from dotmap import DotMap
 logger = CadastroLogger(__name__).logger
 
 
-class ScrapperXML(Scrapper):
-    """Scrapper class for Catastro XML"""
+class ParserXML(Parser):
+    """Parser class for Catastro XML"""
 
     def __init__(self):
         super().__init__()
 
-    """ Scrapping main calls """
+    ''' Processing calls '''
 
     @classmethod
-    def scrap_coord(cls, x, y, pictures=False):
+    def process_search_by_coordinates(cls, x, y, pictures=False):
         """Scraps properties by coordinates"""
 
         results = []
 
-        params = {'SRS': 'EPSG:4230', 'Coordenada_X': x, 'Coordenada_Y': y}
-        url = cls.URL_LOCATIONS_BASE.format("/OVCCoordenadas.asmx/Consulta_RCCOOR")
-        response = requests.get(url, params=params)
-
-        logger.debug("====Longitude: {} Latitude: {}====".format(x, y))
-        logger.debug("URL for coordinates: {}".format(url + '?' + urllib.parse.urlencode(params)))
-
-        xml_dict_map = DotMap(xmltodict.parse(response.content, process_namespaces=False, xml_attribs=False))
+        xml_dict_map = ScrapperXML.get_coord(x, y)
         pc1 = None
         pc2 = None
         if xml_dict_map.consulta_coordenadas.coordenadas.coord.pc != DotMap():
@@ -55,7 +46,7 @@ class ScrapperXML(Scrapper):
 
         if pc1 is not None and pc2 is not None:
 
-            entry = cls.get_cadaster_entries_by_cadaster('', '', ''.join([pc1, pc2]))
+            entry = ScrapperXML.get_cadaster_entries_by_cadaster('', '', ''.join([pc1, pc2]))
             picture = None
             if entry.consulta_dnp.bico.bi.dt.loine != DotMap():
                 # Parcela
@@ -63,42 +54,78 @@ class ScrapperXML(Scrapper):
                 prov_num = entry.consulta_dnp.bico.bi.dt.loine.cp
                 city_num = entry.consulta_dnp.bico.bi.dt.cmc
                 if prov_num != DotMap() and city_num != DotMap():
-                    picture = cls.scrap_site_picture(prov_num, city_num, ''.join([pc1, pc2]))
-                cadaster_entry = CadasterEntryXML.create_from_bico(entry, x, y, picture)
+                    picture = Scrapper.scrap_site_picture(prov_num, city_num, ''.join([pc1, pc2]))
+                cadaster_entry = CadasterEntryXML(entry, x, y, picture)
                 cadaster_entry.to_elasticsearch()
-                sleep(config['sleep_time'])
                 results.append(cadaster_entry)
             elif entry.consulta_dnp.lrcdnp.rcdnp != DotMap():
                 # Multiparcela
                 parcelas = entry.consulta_dnp.lrcdnp.rcdnp
                 if not isinstance(parcelas, list):
                     parcelas = [parcelas]
 
                 for parcela in parcelas:
 
+                    prov_num = parcela.dt.loine.cp
+                    city_num = parcela.dt.cmc
+
                     cadaster = parcela.rc.pc1 if parcela.rc.pc1 != DotMap() else ''
                     cadaster += parcela.rc.pc2 if parcela.rc.pc2 != DotMap() else ''
                     cadaster += parcela.rc.car if parcela.rc.car != DotMap() else ''
                     cadaster += parcela.rc.cc1 if parcela.rc.cc1 != DotMap() else ''
                     cadaster += parcela.rc.cc2 if parcela.rc.cc2 != DotMap() else ''
 
-                    if pictures:
-                        prov_num = parcela.dt.loine.cp
-                        city_num = parcela.dt.cmc
-                        if prov_num != DotMap() and city_num != DotMap():
-                            picture = cls.scrap_site_picture(prov_num, city_num, cadaster)
+                    if pictures and prov_num != DotMap() and city_num != DotMap():
+                        picture = Scrapper.scrap_site_picture(prov_num, city_num, cadaster)
 
-                    parcela = cls.get_cadaster_entries_by_cadaster('', '', cadaster)
-                    cadaster_entry = CadasterEntryXML(parcela, x, y, picture)
+                    try:
+                        # Try to get info by complete cadaster num
+                        sub_entry = ScrapperXML.get_cadaster_entries_by_cadaster(prov_num, city_num, cadaster)
+                    except:
+                        # Cadastro did not return anything by cadaster entry (error? bug?)
+                        # Try to get it by complete address
+                        prov_name = parcela.dt.np
+                        if prov_name is DotMap():
+                            continue
+                        city_name = parcela.dt.np
+                        if city_name is DotMap():
+                            continue
+                        tv = parcela.ldt.locs.lous.lourb.dir.tv
+                        if tv is DotMap():
+                            tv = ''
+                        nv = parcela.ldt.locs.lous.lourb.dir.nv
+                        if nv is DotMap():
+                            nv = ''
+                        num = parcela.ldt.locs.lous.lourb.dir.pnp
+                        if num is DotMap():
+                            num = ''
+
+                        loint = parcela.dt.locs.lous.lourb.loint
+                        if loint is DotMap():
+                            continue
+                        bl = loint.bl
+                        if bl == DotMap():
+                            bl = ''
+                        es = loint.es
+                        if es == DotMap():
+                            es = ''
+                        pt = loint.pt
+                        if es == DotMap():
+                            pt = ''
+                        pu = loint.pu
+                        if es == DotMap():
+                            pu = ''
+                        sub_entry = ScrapperXML.get_cadaster_entries_by_address(prov_name, city_name, tv, nv, num, bl, es, pt, pu)
 
+                    cadaster_entry = CadasterEntryXML(sub_entry, x, y, picture)
                     cadaster_entry.to_elasticsearch()
 
                     results.append(cadaster_entry)
 
-                    sleep(config['sleep_time'])
         return results
 
     @classmethod
-    def scrap_provinces(cls, prov_list, pictures=False, start_from=''):
-        for prov_name, prov_num, city_name, city_num, address, tv, nv in cls.get_address_iter(prov_list, start_from):
+    def process_search_by_provinces(cls, prov_list, pictures=False, start_from=''):
+        for prov_name, prov_num, city_name, city_num, address, tv, nv in Scrapper.get_address_iter(prov_list, start_from):
             if tv == DotMap() or nv == DotMap():
                 continue
 
@@ -106,13 +133,12 @@ class ScrapperXML(Scrapper):
             counter = 1
             while num_scrapping_fails > 0:
                 try:
-                    cadaster = cls.get_cadaster_by_address(prov_name, city_name, tv, nv, counter)
+                    cadaster = ScrapperXML.get_cadaster_by_address(prov_name, city_name, tv, nv, counter)
                     res = cls.process_xml_by_address(cadaster, prov_name, city_name, tv, nv, counter, pictures)
                     if len(res) < 1:
                         num_scrapping_fails -= 1
                     else:
                         num_scrapping_fails = 10
-                        sleep(config['sleep_time'])
 
                 except urllib.error.HTTPError as e:
                     logger.error(
@@ -123,7 +149,6 @@ class ScrapperXML(Scrapper):
                     logger.error("=============================================")
                     ''' Could be a service Unavailable or denegation of service'''
                     num_scrapping_fails -= 1
-                    sleep(config['sleep_dos_time'])
 
                 except Exception as e:
                     logger.error(
@@ -134,7 +159,8 @@ class ScrapperXML(Scrapper):
                     num_scrapping_fails -= 1
 
                 counter += 1
-                sleep(config['sleep_time'])
 
+    ''' Parsing calls '''
 
     @classmethod
     def process_xml_by_address(cls, numerero_map, prov_name, city_name, tv, nv, num, pictures=False):
@@ -161,7 +187,7 @@ class ScrapperXML(Scrapper):
 
             cadaster_num = nump.pc.pc1 + nump.pc.pc2
 
-            coords_map = cls.get_coords_from_cadaster(prov_name, city_name, cadaster_num)
+            coords_map = ScrapperXML.get_coords_from_cadaster(prov_name, city_name, cadaster_num)
             lon = coords_map.consulta_coordenadas.coordenadas.coord.geo.xcen
             if lon == DotMap():
                 lon = None
@@ -173,7 +199,7 @@ class ScrapperXML(Scrapper):
             ''' Adding to tracking file'''
             logger.info('{},{}'.format(lon, lat))
 
-            entry_map = cls.get_cadaster_entries_by_address(prov_name, city_name, tv, nv, num)
+            entry_map = ScrapperXML.get_cadaster_entries_by_address(prov_name, city_name, tv, nv, num)
             picture = None
             if entry_map.consulta_dnp.bico != DotMap():
 
@@ -181,14 +207,13 @@ class ScrapperXML(Scrapper):
                 city_num = entry_map.consulta_dnp.bico.bi.dt.loine.cm
 
                 if pictures and prov_num != DotMap() and city_num != DotMap():
-                    picture = cls.scrap_site_picture(prov_num, city_num, cadaster_num)
+                    picture = Scrapper.scrap_site_picture(prov_num, city_num, cadaster_num)
 
                 # Parcela
                 cadaster_entry = CadasterEntryXML(entry_map, lon, lat, picture)
                 results.append(cadaster_entry)
                 cadaster_entry.to_elasticsearch()
 
-                sleep(config['sleep_time'])
             elif entry_map.consulta_dnp.lrcdnp.rcdnp != DotMap():
                 # Multiparcela
                 for site in entry_map.consulta_dnp.lrcdnp.rcdnp:
@@ -208,18 +233,38 @@ class ScrapperXML(Scrapper):
                     cadaster += parcela.rc.cc1 if parcela.rc.cc1 != DotMap() else ''
                     cadaster += parcela.rc.cc2 if parcela.rc.cc2 != DotMap() else ''
 
-                    if pictures:
-                        prov_num = parcela.dt.loine.cp
-                        city_num = parcela.dt.cmc
-                        if prov_num != DotMap() and city_num != DotMap():
-                            picture = cls.scrap_site_picture(prov_num, city_num, cadaster)
+                    prov_num = parcela.dt.loine.cp
+                    city_num = parcela.dt.cmc
 
-                    parcela = cls.get_cadaster_entries_by_cadaster('', '', cadaster)
-                    cadaster_entry = CadasterEntryXML(parcela, lon, lat, picture)
+                    if pictures and prov_num != DotMap() and city_num != DotMap():
+                        picture = Scrapper.scrap_site_picture(prov_num, city_num, cadaster)
 
+                    try:
+                        # Try to get info by complete cadaster num
+                        sub_entry = ScrapperXML.get_cadaster_entries_by_cadaster(prov_num, city_num, cadaster)
+                    except:
+                        # Cadastro did not return anything by cadaster entry (error? bug?)
+                        # Try to get it by complete address
+                        loint = parcela.dt.locs.lous.lourb.loint
+                        if loint is DotMap():
+                            continue
+                        bl = loint.bl
+                        if bl == DotMap():
+                            bl = ''
+                        es = loint.es
+                        if es == DotMap():
+                            es = ''
+                        pt = loint.pt
+                        if es == DotMap():
+                            pt = ''
+                        pu = loint.pu
+                        if es == DotMap():
+                            pu = ''
+                        sub_entry = ScrapperXML.get_cadaster_entries_by_address(prov_name, city_name, tv, nv, num, bl, es, pt, pu)
 
+                    cadaster_entry = CadasterEntryXML(sub_entry, lon, lat, picture)
                    cadaster_entry.to_elasticsearch()
 
                     results.append(cadaster_entry)
 
-                    sleep(config['sleep_time'])
 
         return results
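Both parsers lean on one DotMap behaviour throughout these hunks: reading a missing attribute on a dynamic DotMap silently yields another empty DotMap instead of raising, so `node != DotMap()` works as an existence test on arbitrarily deep XML paths. A standalone illustration, assuming only the xmltodict and dotmap packages:

```python
import xmltodict
from dotmap import DotMap

xml = '<consulta><coord><pc1>1234567</pc1></coord></consulta>'
doc = DotMap(xmltodict.parse(xml, process_namespaces=False, xml_attribs=False))

print(doc.consulta.coord.pc1)                 # '1234567' -- node is present
print(doc.consulta.no.such.node == DotMap())  # True -- absent path, no exception
```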
@@ -1,9 +1,7 @@
-#!/usr/bin/env python
-# -*- coding: utf-8 -*-
-
 import base64
-import urllib.parse
+from time import sleep
 from urllib.request import urlopen
+import urllib.parse
 
 import requests
 import xmltodict
@@ -17,29 +15,21 @@ logger = CadastroLogger(__name__).logger
 
 
 class Scrapper:
-    """Generic Scrapper class"""
+    """Catastro web services parametrized"""
 
-    '''Catastro web services parametrized'''
-    URL_LOCATIONS_BASE = "http://ovc.catastro.meh.es/ovcservweb/OVCSWLocalizacionRC{}"
-
     URL_PICTURES = "https://www1.sedecatastro.gob.es/Cartografia/GeneraGraficoParcela.aspx?del={}&mun={}&refcat={}&AnchoPixels={}&AltoPixels={}"
+    URL_LOCATIONS_BASE = "http://ovc.catastro.meh.es/ovcservweb/OVCSWLocalizacionRC{}"
 
     def __init__(self):
        pass
 
-    @classmethod
-    def scrap_coords(cls, x, y, pictures=False):
-        pass
-
-    @classmethod
-    def scrap_provinces(cls, prov_list, pictures=False):
-        pass
-
     @classmethod
     def get_provinces(cls):
         url = cls.URL_LOCATIONS_BASE.format("/OVCCallejero.asmx/ConsultaProvincia")
         response = requests.get(url)
         xml = response.content
 
+        sleep(config['sleep_time'])
         return DotMap(xmltodict.parse(xml, process_namespaces=False, xml_attribs=False))
 
     @classmethod
@@ -52,6 +42,8 @@ class Scrapper:
         url = cls.URL_LOCATIONS_BASE.format("/OVCCallejero.asmx/ConsultaMunicipio")
         response = requests.get(url, params=params)
         xml = response.content
 
+        sleep(config['sleep_time'])
         return DotMap(xmltodict.parse(xml, process_namespaces=False, xml_attribs=False))
 
     @classmethod
@@ -70,6 +62,8 @@ class Scrapper:
         url = cls.URL_LOCATIONS_BASE.format("/OVCCallejero.asmx/ConsultaVia")
         response = requests.get(url, params=params)
         xml = response.content
 
+        sleep(config['sleep_time'])
         return DotMap(xmltodict.parse(xml, process_namespaces=False, xml_attribs=False))
 
     @classmethod
@@ -125,6 +119,22 @@ class Scrapper:
         else:
             yield (prov_name, prov_num, city_name, city_num, address_dir, tv, nv)
 
+    @classmethod
+    def scrap_site_picture(cls, prov_num, city_num, cadaster):
+
+        url_pic = cls.URL_PICTURES.format(prov_num, city_num, cadaster, config['width_px'], config['height_px'])
+
+        logger.debug("URL for picture data: {}".format(url_pic))
+
+        f_pic = urlopen(url_pic)
+
+        data_ref = f_pic.read()
+
+        b64_image = base64.b64encode(data_ref).decode('utf-8')
+
+        sleep(config['sleep_time'])
+        return b64_image
+
     @classmethod
     def get_cadaster_by_address(cls, provincia, municipio, tipovia, nombrevia, numero):
         params = {'Provincia': provincia,
@@ -140,77 +150,20 @@ class Scrapper:
 
         response = requests.get(url, params=params)
         xml = response.content
-        return DotMap(xmltodict.parse(xml, process_namespaces=False, xml_attribs=False))
 
-    @classmethod
-    def get_cadaster_entries_by_address(cls, provincia, municipio, sigla, calle, numero, bloque=None, escalera=None,
-                                        planta=None,puerta=None):
-        params = {'Provincia': provincia,
-                  'Municipio': municipio,
-                  'Sigla': sigla,
-                  'Calle': calle,
-                  'Numero': str(numero)}
-        if bloque:
-            params['Bloque'] = str(bloque)
-        else:
-            params['Bloque'] = ''
-        if escalera:
-            params['Escalera'] = escalera
-        else:
-            params['Escalera'] = ''
-        if planta:
-            params['Planta'] = str(planta)
-        else:
-            params['Planta'] = ''
-        if puerta:
-            params['Puerta'] = str(puerta)
-        else:
-            params['Puerta'] = ''
-
-        url = cls.URL_LOCATIONS_BASE.format("/OVCCallejero.asmx/Consulta_DNPLOC")
-        logger.debug("URL for entry: {}".format(url + '?' + urllib.parse.urlencode(params)))
-
-        response = requests.get(url, params=params)
-        xml = response.content
-
-        return DotMap(xmltodict.parse(xml, process_namespaces=False, xml_attribs=False))
-
-    @classmethod
-    def get_cadaster_entries_by_cadaster(cls, provincia, municipio, rc):
-        """ provincia and municipio are optional and can be set to ''"""
-        params = {"Provincia": provincia,
-                  "Municipio": municipio,
-                  "RC": rc}
-
-        url = cls.URL_LOCATIONS_BASE.format("/OVCCallejero.asmx/Consulta_DNPRC")
-        logger.debug("URL for entry: {}".format(url + '?' + urllib.parse.urlencode(params)))
-        response = requests.get(url, params=params)
-        xml = response.content
+        sleep(config['sleep_time'])
         return DotMap(xmltodict.parse(xml, process_namespaces=False, xml_attribs=False))
 
     @classmethod
     def get_coords_from_cadaster(cls, provincia, municipio, cadaster):
-        params = {'Provincia': provincia, 'Municipio': municipio, 'SRS': 'EPSG:4230', 'RC': cadaster}
+        params = {'Provincia': provincia, 'Municipio': municipio, 'SRS': 'EPSG:4326', 'RC': cadaster}
         url = cls.URL_LOCATIONS_BASE.format("/OVCCoordenadas.asmx/Consulta_CPMRC")
 
-        logger.debug("URL for coords: {}".format(url + '?' + urllib.parse.urlencode(params)))
+        logger.debug("URL for coordinates: {}".format(url + '?' + urllib.parse.urlencode(params)))
 
         response = requests.get(url, params=params)
         xml = response.content
 
+        sleep(config['sleep_time'])
         return DotMap(xmltodict.parse(xml, process_namespaces=False, xml_attribs=False))
-
-    @classmethod
-    def scrap_site_picture(cls, prov_num, city_num, cadaster):
-
-        url_pic = cls.URL_PICTURES.format(prov_num, city_num, cadaster, config['width_px'], config['height_px'])
-
-        logger.debug("URL for picture data: {}".format(url_pic))
-
-        f_pic = urlopen(url_pic)
-
-        data_ref = f_pic.read()
-
-        b64_image = base64.b64encode(data_ref).decode('utf-8')
-
-        return b64_image
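A pattern worth naming in these scrapper.py hunks: every remote call now ends with sleep(config['sleep_time']) just before returning, so throttling lives next to the I/O instead of being repeated in every caller (the parser hunks above drop their own sleep calls accordingly). A generic decorator formulation of the same idea — an alternative sketch, not how the repo does it:

```python
import time
from functools import wraps

SLEEP_SECONDS = 5  # same default as "sleep_time" in the settings hunk below

def throttled(fn):
    """Sleep after each remote call so every caller is rate-limited."""
    @wraps(fn)
    def wrapper(*args, **kwargs):
        result = fn(*args, **kwargs)
        time.sleep(SLEEP_SECONDS)
        return result
    return wrapper

@throttled
def fetch_provinces_xml():
    return b'<consulta_provinciero/>'  # stand-in for requests.get(...).content
```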
src/librecatastro/scrapping/scrappers/scrapper_html.py (new file, 99 lines)
@@ -0,0 +1,99 @@
+import re
+from time import sleep
+from urllib.request import urlopen
+
+from bs4 import BeautifulSoup
+
+from src.librecatastro.scrapping.scrapper import Scrapper
+from src.settings import config
+from src.utils.cadastro_logger import CadastroLogger
+
+'''Logger'''
+logger = CadastroLogger(__name__).logger
+
+
+class ScrapperHTML(Scrapper):
+    """HTML Catastro Scrapper"""
+
+    URL = "http://ovc.catastro.meh.es/ovcservweb/ovcswlocalizacionrc/ovccoordenadas.asmx/Consulta_RCCOOR?SRS=EPSG:4226&Coordenada_X={}&Coordenada_Y={}"
+    URL_REF = "https://www1.sedecatastro.gob.es/CYCBienInmueble/OVCListaBienes.aspx?rc1={}&rc2={}"
+    URL_REF_FULL = "https://www1.sedecatastro.gob.es/CYCBienInmueble/OVCConCiud.aspx?RefC={}&RCCompleta={}&del={}&mun={}"
+
+    def __init__(self):
+        super().__init__()
+
+    @classmethod
+    def scrap_coord(cls, x, y):
+        logger.debug("====Longitude: {} Latitude: {}====".format(x, y))
+        url = cls.URL.format(x, y)
+        logger.debug("URL for coordinates: {}".format(url))
+        f = urlopen(url)
+
+        sleep(config['sleep_time'])
+        return f.read()
+
+    @classmethod
+    def scrap_cadaster_full_code(cls, full_cadaster, delimitacion, municipio):
+        url_ref = cls.URL_REF_FULL.format(full_cadaster, full_cadaster, delimitacion, municipio)
+        logger.debug("-->FULL URL for cadastral data: {}".format(url_ref))
+        f_ref = urlopen(url_ref)
+        data_ref = f_ref.read()
+        html = str(data_ref.decode('utf-8'))
+        parsed_html = BeautifulSoup(html, features="html.parser")
+
+        sleep(config['sleep_time'])
+        return parsed_html
+
+    @classmethod
+    def scrap_cadaster(cls, cadaster, delimitacion=None, municipio=None, pictures=False):
+        rc_1 = cadaster[0:7]
+        rc_2 = cadaster[7:14]
+        url_ref = cls.URL_REF.format(rc_1, rc_2)
+
+        logger.debug("URL for cadastral data: {}".format(url_ref))
+
+        f_ref = urlopen(url_ref)
+        data_ref = f_ref.read()
+        sleep(config['sleep_time'])
+
+        html = str(data_ref.decode('utf-8'))
+        parsed_html = BeautifulSoup(html, features="html.parser")
+
+        if delimitacion is None:
+            delimitacion_search = re.search(r'del=([0-9]+)&', html)
+            if delimitacion_search:
+                delimitacion = delimitacion_search.group(1)
+
+        if municipio is None:
+            municipio_search = re.search(r'mun=([0-9]+)&', html)
+            if municipio_search:
+                municipio = municipio_search.group(1)
+
+        description = parsed_html.find(id='ctl00_Contenido_tblInmueble')
+
+        picture = None
+        if pictures:
+            picture = cls.scrap_site_picture(delimitacion, municipio, ''.join([rc_1, rc_2]))
+            sleep(config['sleep_time'])
+
+        htmls = []
+        if description is None:
+            # Multiparcela
+            logger.debug("Multiparcela found!")
+            ''' Multiparcela with multiple cadasters '''
+
+            all_cadasters = parsed_html.findAll("div", {"id": re.compile('heading[0-9]+')})
+            logger.debug("->Parcelas found: {}".format(len(all_cadasters)))
+            for partial_cadaster in all_cadasters:
+                partial_cadaster_ref = partial_cadaster.find("b")
+                logger.debug("-->Partial cadaster: {}".format(partial_cadaster_ref.text))
+                partial_cadaster_text = partial_cadaster_ref.text.strip()
+                html = ScrapperHTML.scrap_cadaster_full_code(partial_cadaster_text, delimitacion, municipio)
+                htmls.append((html, picture))
+                sleep(config['sleep_time'])
+        else:
+            # Parcela
+            htmls.append((html, picture))
+
+        return htmls
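ScrapperHTML.scrap_coord now returns raw XML bytes and leaves interpretation to ParserHTML. A small consumption sketch using the same ElementTree lookup the parser performs (the coordinates are illustrative, and the call hits the live Catastro service):

```python
from xml.etree import ElementTree

from src.librecatastro.scrapping.scrappers.scrapper_html import ScrapperHTML

data = ScrapperHTML.scrap_coord(-3.6652, 40.5861)  # illustrative lon/lat
root = ElementTree.fromstring(data)
ns = '{http://www.catastro.meh.es/}'
pc1 = root.find(ns + 'coordenadas//' + ns + 'coord//' + ns + 'pc//' + ns + 'pc1')
```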
src/librecatastro/scrapping/scrappers/scrapper_xml.py (new file, 83 lines)
@@ -0,0 +1,83 @@
+import urllib.parse
+from time import sleep
+
+import requests
+import xmltodict
+from dotmap import DotMap
+
+from src.librecatastro.scrapping.scrapper import Scrapper
+from src.settings import config
+from src.utils.cadastro_logger import CadastroLogger
+
+'''Logger'''
+logger = CadastroLogger(__name__).logger
+
+
+class ScrapperXML(Scrapper):
+
+    def __init__(self):
+        super().__init__()
+
+    @classmethod
+    def get_coord(cls, x, y):
+        params = {'SRS': 'EPSG:4230', 'Coordenada_X': x, 'Coordenada_Y': y}
+        url = cls.URL_LOCATIONS_BASE.format("/OVCCoordenadas.asmx/Consulta_RCCOOR")
+        response = requests.get(url, params=params)
+
+        logger.debug("====Longitude: {} Latitude: {}====".format(x, y))
+        logger.debug("URL for coordinates: {}".format(url + '?' + urllib.parse.urlencode(params)))
+
+        xml_dict_map = DotMap(xmltodict.parse(response.content, process_namespaces=False, xml_attribs=False))
+
+        sleep(config['sleep_time'])
+        return xml_dict_map
+
+    @classmethod
+    def get_cadaster_entries_by_cadaster(cls, provincia, municipio, rc):
+        """ provincia and municipio are optional and can be set to '' """
+
+        params = {"Provincia": provincia,
+                  "Municipio": municipio,
+                  "RC": rc}
+
+        url = cls.URL_LOCATIONS_BASE.format("/OVCCallejero.asmx/Consulta_DNPRC")
+        logger.debug("URL for entry: {}".format(url + '?' + urllib.parse.urlencode(params)))
+        response = requests.get(url, params=params)
+        xml = response.content
+
+        sleep(config['sleep_time'])
+        return DotMap(xmltodict.parse(xml, process_namespaces=False, xml_attribs=False))
+
+    @classmethod
+    def get_cadaster_entries_by_address(cls, provincia, municipio, sigla, calle, numero, bloque=None, escalera=None,
+                                        planta=None, puerta=None):
+        params = {'Provincia': provincia,
+                  'Municipio': municipio,
+                  'Sigla': sigla,
+                  'Calle': calle,
+                  'Numero': str(numero)}
+        if bloque:
+            params['Bloque'] = str(bloque)
+        else:
+            params['Bloque'] = ''
+        if escalera:
+            params['Escalera'] = escalera
+        else:
+            params['Escalera'] = ''
+        if planta:
+            params['Planta'] = str(planta)
+        else:
+            params['Planta'] = ''
+        if puerta:
+            params['Puerta'] = str(puerta)
+        else:
+            params['Puerta'] = ''
+
+        url = cls.URL_LOCATIONS_BASE.format("/OVCCallejero.asmx/Consulta_DNPLOC")
+        logger.debug("URL for entry: {}".format(url + '?' + urllib.parse.urlencode(params)))
+
+        response = requests.get(url, params=params)
+        xml = response.content
+
+        sleep(config['sleep_time'])
+        return DotMap(xmltodict.parse(xml, process_namespaces=False, xml_attribs=False))
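ScrapperXML now owns the raw web-service calls used by ParserXML. A usage sketch (the cadastral reference is made up; per the docstring, provincia and municipio may be passed as empty strings):

```python
from dotmap import DotMap

from src.librecatastro.scrapping.scrappers.scrapper_xml import ScrapperXML

entry = ScrapperXML.get_cadaster_entries_by_cadaster('', '', '0083101WK2008S')
if entry.consulta_dnp.bico != DotMap():
    print('single parcel (bico) returned')
elif entry.consulta_dnp.lrcdnp.rcdnp != DotMap():
    print('multiparcela (list of rcdnp records) returned')
```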
@@ -1,6 +1,7 @@
 #!/usr/bin/env python
 # -*- coding: utf-8 -*-
 
-class Search:
+
+class Searcher:
     def __init__(self):
         pass
@@ -8,7 +8,7 @@ import random
 from time import sleep
 
 from src.librecatastro.domain.geometry.geo_polygon import GeoPolygon
-from src.librecatastro.scrapping.search import Search
+from src.librecatastro.scrapping.searcher import Searcher
 from src.settings import config
 from src.utils.cadastro_logger import CadastroLogger
 from src.utils.list_utils import ListUtils
@@ -17,12 +17,12 @@ from src.utils.list_utils import ListUtils
 logger = CadastroLogger(__name__).logger
 
 
-class CoordinatesSearch(Search):
+class CoordinatesSearcher(Searcher):
     def __init__(self):
         super().__init__()
 
     @classmethod
-    def scrap_coordinates(cls, scrapper, filenames, pictures=False):
+    def search_by_coordinates(cls, scrapper, filenames, pictures=False):
         for r, d, files in os.walk(config['coordinates_path']):
             for file in files:
 
@@ -34,12 +34,12 @@ class CoordinatesSearch(Search):
 
                 try:
                     polygon = GeoPolygon(os.path.join(config['coordinates_path'], file))
-                    CoordinatesSearch.scrap_polygon(scrapper, polygon, pictures)
+                    CoordinatesSearcher.search_in_polygon(scrapper, polygon, pictures)
                 except:
                     logger.error("{} is not formatted properly. Please take a look at the examples.".format(file))
 
     @classmethod
-    def scrap_polygon(cls, scrapper, polygon, pictures=False):
+    def search_in_polygon(cls, scrapper, polygon, pictures=False):
         bb = polygon.get_bounding_box()
         lon_min = int(bb[0] * config['scale'])
         lon_max = int(bb[2] * config['scale'])
@@ -57,7 +57,7 @@ class CoordinatesSearch(Search):
                 logger.info('{},{}'.format(x_scaled, y_scaled))
 
                 try:
-                    scrapper.scrap_coord(x_scaled, y_scaled, pictures)
+                    scrapper.process_search_by_coordinates(x_scaled, y_scaled, pictures)
 
                 except urllib.error.HTTPError as e:
                     logger.error("ERROR AT LONGITUDE {} LATITUDE {}".format(x_scaled, y_scaled))
@@ -76,7 +76,7 @@ class CoordinatesSearch(Search):
                     sleep(config['sleep_time'])
 
     @staticmethod
-    def scrap_results_by_time(seconds, lon_min, lon_max, lat_min, lat_max, scrapper):
+    def search_by_coordinates_max_time(seconds, lon_min, lon_max, lat_min, lat_max, scrapper):
         start_time = time.time()
         results = []
 
@@ -88,7 +88,7 @@ class CoordinatesSearch(Search):
                 y_scaled = y / config['scale']
 
                 try:
-                    result = scrapper.scrap_coord(x_scaled, y_scaled)
+                    result = scrapper.process_search_by_coordinates(x_scaled, y_scaled)
 
                     if result is not None:
                         results.append(result)
@@ -117,9 +117,9 @@ class CoordinatesSearch(Search):
         return ListUtils.flat(results)
 
     @staticmethod
-    def scrap_results_linear_x_times(times, lon_min, lon_max, lat_min, lat_max, scrapper):
+    def search_by_coordinates_linear_max_n_matches(matches, lon_min, lon_max, lat_min, lat_max, scrapper):
         results = []
-        counter = times
+        counter = matches
 
         finished = False
         for x in range(lon_min, lon_max):
@@ -130,7 +130,7 @@ class CoordinatesSearch(Search):
 
                 try:
 
-                    result = scrapper.scrap_coord(x_scaled, y_scaled)
+                    result = scrapper.process_search_by_coordinates(x_scaled, y_scaled)
 
                     if result is not None:
                         results.append(result)
@@ -159,7 +159,7 @@ class CoordinatesSearch(Search):
         return ListUtils.flat(results)
 
     @staticmethod
-    def scrap_results_random_x_times(times, lon_min, lon_max, lat_min, lat_max, scrapper):
+    def search_by_coordinates_random_max_n_matches(times, lon_min, lon_max, lat_min, lat_max, scrapper):
         results = []
         counter = times
         while counter > 0:
@@ -170,7 +170,7 @@ class CoordinatesSearch(Search):
                 y_scaled = y / config['scale']
 
                 try:
-                    cadaster_entry = scrapper.scrap_coord(x_scaled, y_scaled)
+                    cadaster_entry = scrapper.process_search_by_coordinates(x_scaled, y_scaled)
 
                     if len(cadaster_entry) > 0:
                         results.append(cadaster_entry)
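The polygon search walks an integer grid: bounding-box corners are multiplied by config['scale'] so range() can iterate, and each point is divided back down before the lookup. A condensed, self-contained version of that loop (the scale value and bounding box are illustrative; the actual config['scale'] default is not shown in this diff):

```python
SCALE = 1000                          # stand-in for config['scale']
bb = (-3.70, 40.40, -3.69, 40.41)     # (lon_min, lat_min, lon_max, lat_max), illustrative

lon_min, lon_max = int(bb[0] * SCALE), int(bb[2] * SCALE)
lat_min, lat_max = int(bb[1] * SCALE), int(bb[3] * SCALE)

points = 0
for x in range(lon_min, lon_max):
    for y in range(lat_min, lat_max):
        x_scaled, y_scaled = x / SCALE, y / SCALE
        points += 1  # the real loop calls scrapper.process_search_by_coordinates(x_scaled, y_scaled)
print(points)  # 100 grid points for this box at this scale
```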
@@ -1,30 +0,0 @@
-#!/usr/bin/env python
-# -*- coding: utf-8 -*-
-
-from dotmap import DotMap
-
-from src.librecatastro.scrapping.scrapper import Scrapper
-from src.librecatastro.scrapping.search import Search
-from src.utils.cadastro_logger import CadastroLogger
-
-'''Logger'''
-logger = CadastroLogger(__name__).logger
-
-
-class ProvincesSearch(Search):
-    def __init__(self):
-        super().__init__()
-
-    @classmethod
-    def scrap_provinces(cls, scrapper, prov_list, pictures=False, start_from=''):
-        scrapper.scrap_provinces(prov_list, pictures, start_from)
-
-    @classmethod
-    def list_provinces(cls):
-        logger.debug(DotMap.pprint(Scrapper.get_provinces()))
-        return
-
-    @classmethod
-    def list_cities(cls, prov_name):
-        logger.debug(DotMap.pprint(Scrapper.get_cities(prov_name)))
-        return
src/librecatastro/scrapping/searchers/provinces_searcher.py (new file, 35 lines)
@@ -0,0 +1,35 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+from dotmap import DotMap
+
+from src.librecatastro.scrapping.scrapper import Scrapper
+from src.librecatastro.scrapping.searcher import Searcher
+from src.utils.cadastro_logger import CadastroLogger
+
+'''Logger'''
+logger = CadastroLogger(__name__).logger
+
+
+class ProvincesSearcher(Searcher):
+    def __init__(self):
+        super().__init__()
+
+    @classmethod
+    def search_by_provinces(cls, scrapper, prov_list, pictures=False, start_from=''):
+        scrapper.process_search_by_provinces(prov_list, pictures, start_from)
+
+    @classmethod
+    def list_provinces(cls):
+        dotmap = Scrapper.get_provinces()
+        provinces = dotmap.consulta_provinciero.provinciero.prov
+        for province in provinces:
+            logger.debug(province.np)
+
+    @classmethod
+    def list_cities(cls, prov_name):
+        dotmap = Scrapper.get_cities(prov_name)
+        cities = dotmap.consulta_municipiero.municipiero.muni
+        for city in cities:
+            logger.debug(city.nm)
+        return
@@ -16,5 +16,9 @@ config = {
     "sleep_time": 5,
     "sleep_dos_time": 300,
     "width_px": 120,
-    "height_px": 120
+    "height_px": 120,
+    "servers_down_message": "Some of the Cadastro servers are down. "
+                            "Maintenance is usually carried out durign the night or the weekends. Please, retry later."
+                            "As an alternative, your IP address may have been banned. Try to change your public IP"
+
 }
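The new servers_down_message value spans three source lines yet is a single string: Python concatenates adjacent string literals at compile time, no + needed. A two-line demonstration of the mechanism:

```python
msg = ("Some of the Cadastro servers are down. "
       "Please, retry later.")
assert msg == "Some of the Cadastro servers are down. Please, retry later."
```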
@@ -1,5 +0,0 @@
-<owl:NamedIndividual rdf:about="http://semantic-datahub.taiger.io/ontologies/Address/####ADDRESS####">
-    <rdf:type rdf:resource="http://semantic-datahub.taiger.io/ontologies/Address"/>
-    <rdfs:label>####ADDRESS####</rdfs:label>
-    <cadaster:located_in rdf:resource="####CITY####"/>
-</owl:NamedIndividual>

@@ -1,4 +0,0 @@
-<owl:NamedIndividual rdf:about="http://semantic-datahub.taiger.io/ontologies/Cadaster/####CADASTER####">
-    <rdf:type rdf:resource="http://semantic-datahub.taiger.io/ontologies/Cadaster"/>
-    <rdfs:label>####CADASTER####</rdfs:label>
-</owl:NamedIndividual>

@@ -1,5 +0,0 @@
-<owl:NamedIndividual rdf:about="http://semantic-datahub.taiger.io/ontologies/City/####CITY####">
-    <rdf:type rdf:resource="http://semantic-datahub.taiger.io/ontologies/City"/>
-    <rdfs:label>####CITY####</rdfs:label>
-    <cadaster:located_in rdf:resource="####PROVINCE####"/>
-</owl:NamedIndividual>

@@ -1,5 +0,0 @@
-<owl:NamedIndividual rdf:about="http://semantic-datahub.taiger.io/ontologies/GeoCoordinates/####COORDINATES####">
-    <rdf:type rdf:resource="http://semantic-datahub.taiger.io/ontologies/GeoCoordinates"/>
-    <rdfs:label>####COORDINATES####</rdfs:label>
-    <cadaster:located_in rdf:resource="####ADDRESS####"/>
-</owl:NamedIndividual>

@@ -1,5 +0,0 @@
-<owl:NamedIndividual rdf:about="http://semantic-datahub.taiger.io/ontologies/Province/####PROVINCE####">
-    <rdf:type rdf:resource="http://semantic-datahub.taiger.io/ontologies/Province"/>
-    <rdfs:label>####PROVINCE####</rdfs:label>
-    <cadaster:mentioned_in rdf:resource="http://semantic-datahub.taiger.io/ontologies/Cadaster/####CADASTER####"/>
-</owl:NamedIndividual>
ontology.owl
@@ -1,96 +0,0 @@
-<?xml version="1.0"?>
-<rdf:RDF xmlns:owl="http://www.w3.org/2002/07/owl#"
-         xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
-         xmlns:xml="http://www.w3.org/XML/1998/namespace"
-         xmlns:xsd="http://www.w3.org/2001/XMLSchema#"
-         xmlns:rdfs="http://www.w3.org/2000/01/rdf-schema#"
-         xmlns:skos="http://www.w3.org/2004/02/skos/core#"
-         xmlns:terms="http://purl.org/dc/terms/">
-    <owl:Ontology rdf:about="http://semantic-datahub.taiger.io/ontologies/cadaster">
-    </owl:Ontology>
-
-    <!-- Classes -->
-
-    <!-- OUR TOP CLASSES -->
-    <owl:Class rdf:about="http://semantic-datahub.taiger.io/ontologies/Thing">
-        <rdfs:label>Thing</rdfs:label>
-    </owl:Class>
-
-    <owl:Class rdf:about="http://semantic-datahub.taiger.io/ontologies/CadasterEntry">
-        <rdfs:subClassOf rdf:resource="http://semantic-datahub.taiger.io/ontologies/Thing"/>
-        <rdfs:label>CadasterEntry</rdfs:label>
-    </owl:Class>
-
-    <owl:Class rdf:about="http://semantic-datahub.taiger.io/ontologies/Address">
-        <rdfs:subClassOf rdf:resource="http://semantic-datahub.taiger.io/ontologies/Thing"/>
-        <rdfs:label>Address</rdfs:label>
-    </owl:Class>
-
-    <owl:Class rdf:about="http://semantic-datahub.taiger.io/ontologies/Province">
-        <rdfs:subClassOf rdf:resource="http://semantic-datahub.taiger.io/ontologies/Thing"/>
-        <rdfs:label>Province</rdfs:label>
-    </owl:Class>
-
-    <owl:Class rdf:about="http://semantic-datahub.taiger.io/ontologies/City">
-        <rdfs:subClassOf rdf:resource="http://semantic-datahub.taiger.io/ontologies/Thing"/>
-        <rdfs:label>City</rdfs:label>
-    </owl:Class>
-
-    <owl:Class rdf:about="http://semantic-datahub.taiger.io/ontologies/GeoCoordinates">
-        <rdfs:subClassOf rdf:resource="http://semantic-datahub.taiger.io/ontologies/Thing"/>
-        <rdfs:label>Geographical Coordinates</rdfs:label>
-    </owl:Class>
-
-    <!-- Individuals -->
-
-    ####INDIVIDUALS####
-
-    <!-- Annotation properties -->
-    <!-- Left empty -->
-
-    <!-- Object Properties -->
-    <owl:ObjectProperty rdf:about="http://semantic-datahub.taiger.io/ontologies/mentioned_in">
-    </owl:ObjectProperty>
-
-    <owl:ObjectProperty rdf:about="http://semantic-datahub.taiger.io/ontologies/located_in">
-    </owl:ObjectProperty>
-
-    <owl:ObjectProperty rdf:about="http://semantic-datahub.taiger.io/ontologies/registers">
-    </owl:ObjectProperty>
-
-    <!-- Here, for each field of the document, if it has a parent... -->
-</rdf:RDF>
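These deleted templates worked by plain string substitution: each `####...####` placeholder was replaced with a scraped value, and the resulting individuals were spliced into ontology.owl at `####INDIVIDUALS####` (the OntologyConverter removed at the end of this diff did exactly that). A self-contained sketch of the mechanism, with abbreviated templates and illustrative values:

    # Sketch of the placeholder mechanism the deleted templates relied on.
    city_template = (
        '<owl:NamedIndividual rdf:about="http://semantic-datahub.taiger.io/ontologies/City/####CITY####">\n'
        '    <rdfs:label>####CITY####</rdfs:label>\n'
        '</owl:NamedIndividual>'
    )
    ontology_template = '<rdf:RDF>\n####INDIVIDUALS####\n</rdf:RDF>'  # stand-in for ontology.owl

    individual = city_template.replace('####CITY####', 'MADRID')          # fill one placeholder
    ontology = ontology_template.replace('####INDIVIDUALS####', individual)
    print(ontology)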
0
src/tests/scrappers/__init__.py
Normal file
@@ -5,8 +5,8 @@ import os
 import unittest
 
 from src.librecatastro.domain.geometry.geo_polygon import GeoPolygon
-from src.librecatastro.scrapping.format.scrapper_html import ScrapperHTML
+from src.librecatastro.scrapping.parsers.parser_html import ScrapperHTML
-from src.librecatastro.scrapping.searchers.coordinates_search import CoordinatesSearch
+from src.librecatastro.scrapping.searchers.coordinates_searcher import CoordinatesSearcher
 from src.settings import config
 from src.utils.elasticsearch_utils import ElasticSearchUtils
 
 
@@ -22,17 +22,17 @@ class ScrapperHTMLTests(unittest.TestCase):
         assert True
 
     def test_coordinate_creates_cadaster(self):
-        cadaster_list = ScrapperHTML.scrap_coord(-3.68, 40.47)
+        cadaster_list = ScrapperHTML.parse_coord(-3.68, 40.47)
         self.assertEqual(len(cadaster_list), 1)
         cadaster = cadaster_list[0]
         self.assertEqual(cadaster.cadaster, '2302909VK4820A0001GK')
 
     def test_coordinate_multiparcela_creates_cadaster(self):
-        cadaster_list = ScrapperHTML.scrap_coord(-0.33, 39.47)
+        cadaster_list = ScrapperHTML.parse_coord(-0.33, 39.47)
         self.assertTrue(len(cadaster_list) > 1)
 
     def test_coordinate_creates_cadaster_and_stores_in_elasticsearch(self):
-        cadaster_list = ScrapperHTML.scrap_coord(-3.68, 40.47)
+        cadaster_list = ScrapperHTML.parse_coord(-3.68, 40.47)
         self.assertEqual(len(cadaster_list), 1)
         cadaster = cadaster_list[0]
         cadaster.to_elasticsearch()
 
@@ -92,7 +92,7 @@ class ScrapperHTMLTests(unittest.TestCase):
     def scrap_random_until_x_times_found(self, times):
         polygon = GeoPolygon(os.path.join(config['coordinates_path'], 'spain_polygon.json'))
         coord = polygon.get_bounding_box()
-        cadaster_list = CoordinatesSearch.scrap_results_random_x_times(times, int(coord[0] * config['scale']), int(coord[2] * config['scale']), int(coord[1] * config['scale']), int(coord[3] * config['scale']), ScrapperHTML)
+        cadaster_list = CoordinatesSearcher.search_by_coordinates_random_max_n_matches(times, int(coord[0] * config['scale']), int(coord[2] * config['scale']), int(coord[1] * config['scale']), int(coord[3] * config['scale']), ScrapperHTML)
         self.assertTrue(len(cadaster_list) >= times)
         return cadaster_list
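The renamed helper samples random points inside Spain's bounding box; multiplying by `config['scale']` turns fractional degrees into integers so the sampling can run over integer ranges. A standalone sketch of that arithmetic (the scale value and the use of `randrange` are assumptions for illustration; the real values live in src/settings and the searcher):

    # Sketch: scaling a lon/lat bounding box to integer ranges for random sampling.
    from random import randrange

    scale = 10000000                                                 # stand-in for config['scale']
    lon_min, lat_min, lon_max, lat_max = -9.30, 35.90, 4.32, 43.86   # illustrative Spain bbox

    x = randrange(int(lon_min * scale), int(lon_max * scale))
    y = randrange(int(lat_min * scale), int(lat_max * scale))
    lon, lat = x / scale, y / scale   # back to fractional degrees for the scrapper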
@@ -5,44 +5,48 @@ import unittest
 
 from time import sleep
 
-from dotmap import DotMap
-
 from src.librecatastro.domain.cadaster_entry.cadaster_entry_xml import CadasterEntryXML
-from src.librecatastro.scrapping.format.scrapper_xml import ScrapperXML
+from src.librecatastro.scrapping.parsers.parser_xml import ScrapperXML, ParserXML
+from src.librecatastro.scrapping.scrappers.scrapper_xml import ScrapperXML
 from src.settings import config
 
 
 class ScrapperXMLTests(unittest.TestCase):
     def test_scrapper_retrieves_dict_provinces(self):
-        self.assertEqual(ScrapperXML.get_provinces().consulta_provinciero.control.cuprov, '48')
-        sleep(config['sleep_time'])
+        try:
+            self.assertEqual(ScrapperXML.get_provinces().consulta_provinciero.control.cuprov, '48')
+        except:
+            self.assertFalse(config['servers_down_message'])
+            exit(-1)
 
     def test_scrapper_retrieves_dict_cities(self):
-        self.assertEqual(ScrapperXML.get_cities('ALACANT').consulta_municipiero.control.cumun, '141')
-        sleep(config['sleep_time'])
+        try:
+            self.assertEqual(ScrapperXML.get_cities('ALACANT').consulta_municipiero.control.cumun, '141')
+        except:
+            self.assertFalse(config['servers_down_message'])
+            exit(-1)
 
     def test_scrapper_retrieves_dict_addresses(self):
-        self.assertEqual(ScrapperXML.get_addresses('ALACANT','AGOST').consulta_callejero.control.cuca, '117')
-        sleep(config['sleep_time'])
+        try:
+            self.assertEqual(ScrapperXML.get_addresses('ALACANT', 'AGOST').consulta_callejero.control.cuca, '117')
+        except:
+            self.assertFalse(config['servers_down_message'])
+            exit(-1)
 
+    def test_get_cadaster_entries_by_cadaster_is_up(self):
+        cadasters = ['2503906VK4820D0001MX']
+        try:
+            for cadaster in cadasters:
+                ScrapperXML.get_cadaster_entries_by_cadaster('', '', cadaster)
+        except:
+            self.assertFalse(config['servers_down_message'])
+            exit(-1)
 
     def test_scrapper_retrieves_dict_addresses_iter(self):
         iterator = ScrapperXML.get_address_iter()
         address = iterator.__next__()
         self.assertEqual(address[1], '15')
         self.assertEqual(address[3], '7')
-        sleep(config['sleep_time'])
-
-    def test_scrapper_creates_cadaster_entry(self):
-        dotmap_res = ScrapperXML.get_cadaster_entries_by_cadaster('', '', '6375620YH0567S0001GW')
-        self.assertNotEqual(dotmap_res, DotMap())
-        sleep(config['sleep_time'])
-
-    def test_scrapper_creates_cadaster_entry_and_stores_in_elasticsearch(self):
-        entry = ScrapperXML.get_cadaster_entries_by_cadaster('', '', '6375620YH0567S0001GW')
-        cadaster_entry = CadasterEntryXML(entry)
-        cadaster_entry.to_elasticsearch()
-        self.assertIsNotNone(cadaster_entry.from_elasticsearch())
-        sleep(config['sleep_time'])
 
     def test_multiparcela_creates_n_entries_in_elasticsearch(self):
         prov_name = u'A CORUÑA'
@@ -127,7 +131,7 @@ class ScrapperXMLTests(unittest.TestCase):
     def test_multiparcela_coord_creates_n_entries(self):
         lon = -9.2503
         lat = 42.9723
-        self.assertEqual(len(ScrapperXML.scrap_coord(lon, lat, True)), 2)
+        self.assertEqual(len(ParserXML.process_search_by_coordinates(lon, lat, True)), 2)
 
     def test_multiparcela_address_creates_n_entries(self):
         prov_name = u'MADRID'
@@ -136,7 +140,7 @@ class ScrapperXMLTests(unittest.TestCase):
         nv = u'CANARIAS'
         num = 7
         cadaster = ScrapperXML.get_cadaster_by_address(prov_name, city_name, tv, nv, num)
-        self.assertEqual(len(ScrapperXML.process_xml_by_address(cadaster, prov_name, city_name, tv, nv, False)), 8)
+        self.assertEqual(len(ParserXML.process_xml_by_address(cadaster, prov_name, city_name, tv, nv, False)), 8)
 
     def test_multiparcela_address_creates_n_entries_2(self):
         prov_name = u'MADRID'
@@ -145,7 +149,39 @@ class ScrapperXMLTests(unittest.TestCase):
         nv = u'CALVARIO'
         num = 38
         cadaster = ScrapperXML.get_cadaster_by_address(prov_name, city_name, tv, nv, num)
-        self.assertEqual(len(ScrapperXML.process_xml_by_address(cadaster, prov_name, city_name, tv, nv, False)), 8)
+        self.assertEqual(len(ParserXML.process_xml_by_address(cadaster, prov_name, city_name, tv, nv, False)), 8)
+
+    def test_poligono_or_rural_creates_entry(self):
+        tv = 'CL'
+        nv = 'TORREJON'
+        num = 30
+        prov_name = 'MADRID'
+        city_name = 'AJALVIR'
+        cadaster = ScrapperXML.get_cadaster_by_address(prov_name, city_name, tv, nv, num)
+        self.assertEqual(len(ParserXML.process_xml_by_address(cadaster, prov_name, city_name, tv, nv, False)), 16)
+
+    def test_coordinates_are_in_good_format(self):
+        tv = 'CL'
+        nv = 'DE BENICARLO'
+        num = 1
+        prov_name = 'MADRID'
+        city_name = 'GALAPAGAR'
+        xml = ScrapperXML.get_cadaster_by_address(prov_name, city_name, tv, nv, num)
+        cadaster_entry = ParserXML.process_xml_by_address(xml, prov_name, city_name, tv, nv, False)
+        self.assertEqual(cadaster_entry[0].location.lat, 40.6249762551374)
+        self.assertEqual(cadaster_entry[0].location.lon, -4.02755522611211)
+
+    def test_multiparcela_coordinates_are_in_good_format(self):
+        tv = 'CL'
+        nv = 'SAN VICENTE'
+        num = 26
+        prov_name = 'ALACANT'
+        city_name = 'ALICANTE/ALACANT'
+        xml = ScrapperXML.get_cadaster_by_address(prov_name, city_name, tv, nv, num)
+        cadaster_entries = ParserXML.process_xml_by_address(xml, prov_name, city_name, tv, nv, False)
+        for cadaster_entry in cadaster_entries:
+            self.assertEqual(cadaster_entry.location.lat, 38.3495195831056)
+            self.assertEqual(cadaster_entry.location.lon, -0.484612452235845)
 
 
 if __name__ == '__main__':
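Taken together, the new tests fix the shape of the XML pipeline: ScrapperXML fetches raw responses, ParserXML turns them into cadaster entries carrying `location.lat` and `location.lon`. A sketch of that flow outside the test harness (street and city values are copied from the tests above):

    # Sketch of the fetch-then-parse flow the tests above exercise.
    from src.librecatastro.scrapping.parsers.parser_xml import ParserXML
    from src.librecatastro.scrapping.scrappers.scrapper_xml import ScrapperXML

    prov_name, city_name, tv, nv, num = 'MADRID', 'GALAPAGAR', 'CL', 'DE BENICARLO', 1

    xml = ScrapperXML.get_cadaster_by_address(prov_name, city_name, tv, nv, num)   # raw response
    entries = ParserXML.process_xml_by_address(xml, prov_name, city_name, tv, nv, False)
    for entry in entries:
        print(entry.location.lat, entry.location.lon)   # e.g. 40.6249762551374 -4.02755522611211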
@@ -6,7 +6,7 @@ logger = CadastroLogger(__name__).logger
 
 
 class ElasticSearchUtils:
-    """Custom class for managing Elastic Search queries"""
+    """Custom class for managing Elastic Searcher queries"""
 
     def __init__(self):
         pass
@@ -1,77 +0,0 @@
-import copy
-import re
-
-
-class OntologyConverter:
-
-    def __init__(self):
-
-        with open("../templates/ontology.owl") as ont_f, \
-                open("../templates/individual_city.xml") as ind_city_f, \
-                open("../templates/individual_province.xml") as ind_province_f, \
-                open("../templates/individual_coord.xml") as ind_coord_f, \
-                open("../templates/individual_address.xml") as ind_address_f, \
-                open("../templates/individual_cadaster.xml") as ind_cadaster_f:
-
-            self.ont_template = ont_f.read()
-            self.city_template = ind_city_f.read()
-            self.province_template = ind_province_f.read()
-            self.coord_template = ind_coord_f.read()
-            self.address_template = ind_address_f.read()
-            self.cadaster_template = ind_cadaster_f.read()
-
-    def cadastro_dict_to_ontology(self, cadastro_list):
-        ont = copy.deepcopy(self.ont_template)
-
-        for cadastro_entry in cadastro_list:
-            ont = ont.replace("####INDIVIDUALS####", ''.join(["####INDIVIDUALS####",
-                                                              self.instantiate_individual(cadastro_entry)]))
-
-        ont = ont.replace("####INDIVIDUALS####", '')
-        return ont
-
-    def instantiate_individual(self, cadastro_entry):
-        individuals = ''
-        cadaster = ''
-        for header, value in cadastro_entry.items():
-            if header == 'Referencia catastral':
-                txt = copy.deepcopy(self.cadaster_template)
-                txt = txt.replace("####CADASTER####", value)
-                individuals = ''.join([individuals, txt])
-                cadaster = value
-            elif header == 'Localización':
-                city_txt = copy.deepcopy(self.city_template)
-                province_txt = copy.deepcopy(self.province_template)
-                address_txt = copy.deepcopy(self.address_template)
-
-                cp = re.search(r'[0-9]{5}', value)
-                cp_span = cp.span()
-                cp_span_end = cp_span[1]
-
-                city_text = value[cp_span_end:]
-                province = re.search(r'\(([^\)]+)\)', city_text)
-                province_span = province.span()
-                province_start = province_span[0]
-                province_end = province_span[1]
-                province_text = value[province_start:province_end]
-
-                province_txt = province_txt.replace("####CADASTER####", cadaster)
-                province_txt = province_txt.replace("####PROVINCE####", province_text)
-
-                city_txt = city_txt.replace("####CITY####", city_text)
-                city_txt = city_txt.replace("####PROVINCE####", province_text)
-
-                address_txt = address_txt.replace("####ADDRESS####", value)
-                address_txt = address_txt.replace("####CITY####", city_text)
-
-                individuals = ''.join([individuals, province_txt, city_txt, address_txt])
-
-        #print(individuals)
-        return individuals
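For reference, the removed instantiate_individual parsed the 'Localización' string with two regexes: a 5-digit postal code, then a parenthesised province after it. A worked sketch on an illustrative value:

    # Sketch: the postal-code / province extraction the deleted converter performed.
    import re

    value = 'CL EJEMPLO 1 28001 MADRID (MADRID)'   # illustrative 'Localización' string

    cp = re.search(r'[0-9]{5}', value)             # postal code: '28001'
    city_text = value[cp.span()[1]:]               # ' MADRID (MADRID)'
    province = re.search(r'\(([^\)]+)\)', city_text)
    province_text = province.group(0)              # '(MADRID)'
    print(city_text.strip())                       # 'MADRID (MADRID)'
    print(province_text)                           # '(MADRID)'

Note that the deleted code sliced `value` with offsets computed on `city_text`, so the province it extracted could be shifted whenever the postal code did not sit at the start of the string; the sketch above slices `city_text` instead.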