diff --git a/README.md b/README.md
index 7741c51..c0927a5 100644
--- a/README.md
+++ b/README.md
@@ -1,6 +1,6 @@
#libreCATASTRO
An opensource, MIT-licensed application that scraps the official Spanish
-Cadaster registry and stores information in Elastic Search.
+Cadaster registry and stores information in Elastic Search.
**Features**
diff --git a/libreCadastro.py b/libreCadastro.py
index 2655b61..79cb8ee 100644
--- a/libreCadastro.py
+++ b/libreCadastro.py
@@ -4,10 +4,10 @@
import sys
import argparse
-from src.librecatastro.scrapping.format.scrapper_html import ScrapperHTML
-from src.librecatastro.scrapping.format.scrapper_xml import ScrapperXML
-from src.librecatastro.scrapping.searchers.coordinates_search import CoordinatesSearch
-from src.librecatastro.scrapping.searchers.provinces_search import ProvincesSearch
+from src.librecatastro.scrapping.parsers.parser_html import ParserHTML as ScrapperHTML
+from src.librecatastro.scrapping.parsers.parser_xml import ParserXML
+from src.librecatastro.scrapping.searchers.coordinates_searcher import CoordinatesSearcher
+from src.librecatastro.scrapping.searchers.provinces_searcher import ProvincesSearcher
from src.settings import config
if __name__ == "__main__":
@@ -31,7 +31,7 @@ if __name__ == "__main__":
if args.scale:
config['scale'] = args.scale
- scrapper = ScrapperHTML if args.html else ScrapperXML
+ scrapper = ScrapperHTML if args.html else ParserXML
filenames = args.filenames
pictures = args.pictures
@@ -39,14 +39,14 @@ if __name__ == "__main__":
startcity = args.startcity
if args.listprovinces:
- ProvincesSearch.list_provinces()
+ ProvincesSearcher.list_provinces()
exit(0)
if len(args.listcities) == 1:
- ProvincesSearch.list_cities(args.listcities[0])
+ ProvincesSearcher.list_cities(args.listcities[0])
exit(0)
if args.coords:
- CoordinatesSearch.scrap_coordinates(scrapper, filenames, pictures)
+ CoordinatesSearcher.search_by_coordinates(scrapper, filenames, pictures)
else:
- ProvincesSearch.scrap_provinces(scrapper, provinces, pictures, startcity)
+ ProvincesSearcher.search_by_provinces(scrapper, provinces, pictures, startcity)
diff --git a/src/librecatastro/domain/address.py b/src/librecatastro/domain/address.py
index ce8b0be..0753df7 100644
--- a/src/librecatastro/domain/address.py
+++ b/src/librecatastro/domain/address.py
@@ -11,7 +11,7 @@ logger = CadastroLogger(__name__).logger
class Address:
- """ Domain class for storing Address in Catastro format"""
+ """ Domain class for storing Address in Catastro parsers"""
def __init__(self, address):
self.full_address = address.strip()
diff --git a/src/librecatastro/scrapping/parser.py b/src/librecatastro/scrapping/parser.py
new file mode 100644
index 0000000..cb2fa8d
--- /dev/null
+++ b/src/librecatastro/scrapping/parser.py
@@ -0,0 +1,23 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+from src.utils.cadastro_logger import CadastroLogger
+
+'''Logger'''
+logger = CadastroLogger(__name__).logger
+
+
+class Parser:
+ """Generic Parser class"""
+
+ def __init__(self):
+ pass
+
+ ''' Processing signatures'''
+ @classmethod
+ def process_search_by_coordinates(cls, x, y, pictures=False):
+ pass
+
+ @classmethod
+ def process_search_by_provinces(cls, prov_list, pictures=False):
+ pass
\ No newline at end of file
diff --git a/src/librecatastro/scrapping/format/__init__.py b/src/librecatastro/scrapping/parsers/__init__.py
similarity index 100%
rename from src/librecatastro/scrapping/format/__init__.py
rename to src/librecatastro/scrapping/parsers/__init__.py
diff --git a/src/librecatastro/scrapping/format/scrapper_html.py b/src/librecatastro/scrapping/parsers/parser_html.py
similarity index 60%
rename from src/librecatastro/scrapping/format/scrapper_html.py
rename to src/librecatastro/scrapping/parsers/parser_html.py
index a06ca4e..4b3ef4e 100644
--- a/src/librecatastro/scrapping/format/scrapper_html.py
+++ b/src/librecatastro/scrapping/parsers/parser_html.py
@@ -1,17 +1,16 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
-import re
import urllib.error
from time import sleep
-from urllib.request import urlopen
from xml.etree import ElementTree
-from bs4 import BeautifulSoup
from dotmap import DotMap
from src.librecatastro.domain.cadaster_entry.cadaster_entry_html import CadasterEntryHTML
+from src.librecatastro.scrapping.parser import Parser
from src.librecatastro.scrapping.scrapper import Scrapper
+from src.librecatastro.scrapping.scrappers.scrapper_html import ScrapperHTML
from src.settings import config
from src.utils.cadastro_logger import CadastroLogger
@@ -20,32 +19,23 @@ from src.utils.cadastro_logger import CadastroLogger
logger = CadastroLogger(__name__).logger
-class ScrapperHTML(Scrapper):
- """Scrapper class for Catastro HTML"""
+class ParserHTML(Parser):
+ """Parser class for Catastro HTML"""
def __init__(self):
super().__init__()
- '''Catastro web services parametrized'''
- URL = "http://ovc.catastro.meh.es/ovcservweb/ovcswlocalizacionrc/ovccoordenadas.asmx/Consulta_RCCOOR?SRS=EPSG:4230&Coordenada_X={}&Coordenada_Y={}"
-
- URL_REF = "https://www1.sedecatastro.gob.es/CYCBienInmueble/OVCListaBienes.aspx?rc1={}&rc2={}"
- URL_REF_FULL = "https://www1.sedecatastro.gob.es/CYCBienInmueble/OVCConCiud.aspx?RefC={}&RCCompleta={}&del={}&mun={}"
-
'''Information to scrap from HTML'''
description_field_names = [u'Referencia catastral', u'Localización', u'Clase', u'Uso principal',
u'Superficie construida', u'Año construcción']
+
gsurface_field_names = [u'Superficie gráfica']
- """ Scrapping calls """
-
+ """ Processing """
@classmethod
- def scrap_coord(cls, x, y, pictures=False):
- logger.debug("====Longitude: {} Latitude: {}====".format(x, y))
- url = cls.URL.format(x, y)
- logger.debug("URL for coordinates: {}".format(url))
- f = urlopen(url)
- data = f.read()
+ def process_search_by_coordinates(cls, x, y, pictures=False):
+ data = ScrapperHTML.scrap_coord(x, y)
+
root = ElementTree.fromstring(data)
pc1 = root.find(
"{http://www.catastro.meh.es/}coordenadas//{http://www.catastro.meh.es/}coord//{http://www.catastro.meh.es/}pc//{http://www.catastro.meh.es/}pc1")
@@ -55,17 +45,19 @@ class ScrapperHTML(Scrapper):
results = []
if pc1 is not None and pc2 is not None:
cadaster = ''.join([pc1.text, pc2.text])
- cadaster_entries = cls.scrap_cadaster(cadaster, None, None, x, y, pictures)
- for cadaster_entry in cadaster_entries:
+ htmls = ScrapperHTML.scrap_cadaster(cadaster, None, None, pictures)
+            for html, picture in htmls:
+ cadaster_entry = cls.parse_html_parcela(html, x, y, picture)
cadaster_entry.to_elasticsearch()
results.append(cadaster_entry)
return results
@classmethod
- def scrap_provinces(cls, prov_list, pictures=False, start_from=''):
+ def process_search_by_provinces(cls, prov_list, pictures=False, start_from=''):
- for prov_name, prov_num, city_name, city_num, address, tv, nv in cls.get_address_iter(prov_list, start_from):
+ num = ''
+ for prov_name, prov_num, city_name, city_num, address, tv, nv in Scrapper.get_address_iter(prov_list, start_from):
if tv == DotMap() or nv == DotMap():
continue
@@ -74,7 +66,7 @@ class ScrapperHTML(Scrapper):
counter = 1
while num_scrapping_fails > 0:
try:
- numerero_map = cls.get_cadaster_by_address(prov_name, city_name, tv, nv, counter)
+ numerero_map = Scrapper.get_cadaster_by_address(prov_name, city_name, tv, nv, counter)
if numerero_map.consulta_numerero.lerr.err.cod != DotMap():
num_scrapping_fails -= 1
else:
@@ -98,7 +90,7 @@ class ScrapperHTML(Scrapper):
cadaster_num = nump.pc.pc1 + nump.pc.pc2
- coords_map = cls.get_coords_from_cadaster(prov_name, city_name, cadaster_num)
+ coords_map = Scrapper.get_coords_from_cadaster(prov_name, city_name, cadaster_num)
lon = coords_map.consulta_coordenadas.coordenadas.coord.geo.xcen
if lon == DotMap():
@@ -113,13 +105,13 @@ class ScrapperHTML(Scrapper):
num_scrapping_fails = 10
- cadaster_list = cls.scrap_cadaster(cadaster_num, prov_num, city_num, lon, lat, pictures)
+ htmls = ScrapperHTML.scrap_cadaster(cadaster_num, prov_num, city_num, pictures)
- for cadaster in cadaster_list:
- cadaster.to_elasticsearch()
+ for html, picture in htmls:
+ cadaster_entry = cls.parse_html_parcela(html, lon, lat, picture)
+ cadaster_entry.to_elasticsearch()
counter += 1
- sleep(config['sleep_time'])
except urllib.error.HTTPError as e:
logger.error(
@@ -141,72 +133,6 @@ class ScrapperHTML(Scrapper):
num_scrapping_fails -= 1
counter += 1
- sleep(config['sleep_time'])
-
-
- @classmethod
- def scrap_cadaster_full_code(cls, full_cadaster, delimitacion, municipio, x=None, y=None, picture=None):
- url_ref = cls.URL_REF_FULL.format(full_cadaster, full_cadaster, delimitacion, municipio)
- logger.debug("-->FULL URL for cadastral data: {}".format(url_ref))
- f_ref = urlopen(url_ref)
- data_ref = f_ref.read()
- html = str(data_ref.decode('utf-8'))
- parsed_html = BeautifulSoup(html, features="html.parser")
- return ScrapperHTML.parse_html_parcela(parsed_html, x, y, picture)
-
- @classmethod
- def scrap_cadaster(cls, cadaster, delimitacion=None, municipio=None, x=None, y=None, pictures=False):
- rc_1 = cadaster[0:7]
- rc_2 = cadaster[7:14]
- url_ref = cls.URL_REF.format(rc_1, rc_2)
-
- logger.debug("URL for cadastral data: {}".format(url_ref))
-
- f_ref = urlopen(url_ref)
- data_ref = f_ref.read()
- html = str(data_ref.decode('utf-8'))
- parsed_html = BeautifulSoup(html, features="html.parser")
-
- if delimitacion is None:
- delimitacion_search = re.search(r'del=([0-9]+)&', html)
- if delimitacion_search:
- delimitacion = delimitacion_search.group(1)
-
- if municipio is None:
- municipio_search = re.search(r'mun=([0-9]+)&', html)
- if municipio_search:
- municipio = municipio_search.group(1)
-
- picture = None
- if pictures:
- picture = cls.scrap_site_picture(delimitacion, municipio, ''.join([rc_1, rc_2]))
- sleep(config['sleep_time'])
-
- description = parsed_html.find(id='ctl00_Contenido_tblInmueble')
-
- cadasters = []
- if description is None:
- logger.debug("Multiparcela found!")
- ''' Multiparcela with multiple cadasters '''
-
- all_cadasters = parsed_html.findAll("div", {"id": re.compile('heading[0-9]+')})
- logger.debug("->Parcelas found: {}".format(len(all_cadasters)))
- for partial_cadaster in all_cadasters:
- partial_cadaster_ref = partial_cadaster.find("b")
- logger.debug("-->Partial cadaster: {}".format(partial_cadaster_ref.text))
- partial_cadaster_text = partial_cadaster_ref.text.strip()
- cadaster = ScrapperHTML.scrap_cadaster_full_code(partial_cadaster_text, delimitacion, municipio, x, y,
- picture)
- cadasters.append(cadaster)
- sleep(config['sleep_time'])
-
- else:
- cadaster = ScrapperHTML.parse_html_parcela(parsed_html, x, y, picture)
-
- cadasters.append(cadaster)
-
- sleep(config['sleep_time'])
- return cadasters
""" Parsing """
@classmethod
@@ -260,5 +186,6 @@ class ScrapperHTML(Scrapper):
dict(uso=columns[0].text, escalera=columns[1].text, planta=columns[2].text, puerta=columns[3].text,
superficie=columns[4].text, tipo=columns[5].text, fecha=columns[6].text))
+ descriptive_data[u'GráficoParcela']=picture
cadaster_entry = CadasterEntryHTML(descriptive_data)
return cadaster_entry
diff --git a/src/librecatastro/scrapping/format/scrapper_xml.py b/src/librecatastro/scrapping/parsers/parser_xml.py
similarity index 57%
rename from src/librecatastro/scrapping/format/scrapper_xml.py
rename to src/librecatastro/scrapping/parsers/parser_xml.py
index ae1c5e3..af79a9e 100644
--- a/src/librecatastro/scrapping/format/scrapper_xml.py
+++ b/src/librecatastro/scrapping/parsers/parser_xml.py
@@ -4,14 +4,13 @@
import urllib.parse
from urllib import error
-from time import sleep
-
import requests
import xmltodict
from src.librecatastro.domain.cadaster_entry.cadaster_entry_xml import CadasterEntryXML
+from src.librecatastro.scrapping.parser import Parser
from src.librecatastro.scrapping.scrapper import Scrapper
-from src.settings import config
+from src.librecatastro.scrapping.scrappers.scrapper_xml import ScrapperXML
from src.utils.cadastro_logger import CadastroLogger
from dotmap import DotMap
@@ -20,28 +19,20 @@ from dotmap import DotMap
logger = CadastroLogger(__name__).logger
-class ScrapperXML(Scrapper):
- """Scrapper class for Catastro XML"""
+class ParserXML(Parser):
+ """Parser class for Catastro XML"""
def __init__(self):
super().__init__()
- """ Scrapping main calls """
-
+ ''' Processing calls '''
@classmethod
- def scrap_coord(cls, x, y, pictures=False):
+ def process_search_by_coordinates(cls, x, y, pictures=False):
"""Scraps properties by coordinates"""
results = []
- params = {'SRS': 'EPSG:4230', 'Coordenada_X': x, 'Coordenada_Y': y}
- url = cls.URL_LOCATIONS_BASE.format("/OVCCoordenadas.asmx/Consulta_RCCOOR")
- response = requests.get(url, params=params)
-
- logger.debug("====Longitude: {} Latitude: {}====".format(x, y))
- logger.debug("URL for coordinates: {}".format(url + '?' + urllib.parse.urlencode(params)))
-
- xml_dict_map = DotMap(xmltodict.parse(response.content, process_namespaces=False, xml_attribs=False))
+ xml_dict_map = ScrapperXML.get_coord(x, y)
pc1 = None
pc2 = None
if xml_dict_map.consulta_coordenadas.coordenadas.coord.pc != DotMap():
@@ -55,7 +46,7 @@ class ScrapperXML(Scrapper):
if pc1 is not None and pc2 is not None:
- entry = cls.get_cadaster_entries_by_cadaster('', '', ''.join([pc1, pc2]))
+ entry = ScrapperXML.get_cadaster_entries_by_cadaster('', '', ''.join([pc1, pc2]))
picture = None
if entry.consulta_dnp.bico.bi.dt.loine != DotMap():
# Parcela
@@ -63,42 +54,78 @@ class ScrapperXML(Scrapper):
prov_num = entry.consulta_dnp.bico.bi.dt.loine.cp
city_num = entry.consulta_dnp.bico.bi.dt.cmc
if prov_num != DotMap() and city_num != DotMap():
- picture = cls.scrap_site_picture(prov_num, city_num, ''.join([pc1, pc2]))
- cadaster_entry = CadasterEntryXML.create_from_bico(entry, x, y, picture)
+ picture = Scrapper.scrap_site_picture(prov_num, city_num, ''.join([pc1, pc2]))
+ cadaster_entry = CadasterEntryXML(entry, x, y, picture)
cadaster_entry.to_elasticsearch()
- sleep(config['sleep_time'])
results.append(cadaster_entry)
elif entry.consulta_dnp.lrcdnp.rcdnp != DotMap():
# Multiparcela
parcelas = entry.consulta_dnp.lrcdnp.rcdnp
if not isinstance(parcelas, list):
parcelas = [parcelas]
+
for parcela in parcelas:
+ prov_num = parcela.dt.loine.cp
+ city_num = parcela.dt.cmc
+
cadaster = parcela.rc.pc1 if parcela.rc.pc1 != DotMap() else ''
cadaster += parcela.rc.pc2 if parcela.rc.pc2 != DotMap() else ''
cadaster += parcela.rc.car if parcela.rc.car != DotMap() else ''
cadaster += parcela.rc.cc1 if parcela.rc.cc1 != DotMap() else ''
cadaster += parcela.rc.cc2 if parcela.rc.cc2 != DotMap() else ''
- if pictures:
- prov_num = parcela.dt.loine.cp
- city_num = parcela.dt.cmc
- if prov_num != DotMap() and city_num != DotMap():
- picture = cls.scrap_site_picture(prov_num, city_num, cadaster)
+ if pictures and prov_num != DotMap() and city_num != DotMap():
+ picture = Scrapper.scrap_site_picture(prov_num, city_num, cadaster)
- parcela = cls.get_cadaster_entries_by_cadaster('', '', cadaster)
- cadaster_entry = CadasterEntryXML(parcela, x, y, picture)
+ try:
+ # Try to get info by complete cadaster num
+ sub_entry = ScrapperXML.get_cadaster_entries_by_cadaster(prov_num, city_num, cadaster)
+ except:
+ # Cadastro did not return anything by cadaster entry (error? bug?)
+ # Try to get it by complete address
+                    prov_name = parcela.dt.np
+                    if prov_name == DotMap():
+                        continue
+                    city_name = parcela.dt.nm
+                    if city_name == DotMap():
+                        continue
+                    tv = parcela.dt.locs.lous.lourb.dir.tv
+                    if tv == DotMap():
+                        tv = ''
+                    nv = parcela.dt.locs.lous.lourb.dir.nv
+                    if nv == DotMap():
+                        nv = ''
+                    num = parcela.dt.locs.lous.lourb.dir.pnp
+                    if num == DotMap():
+                        num = ''
+
+                    loint = parcela.dt.locs.lous.lourb.loint
+                    if loint == DotMap():
+                        continue
+                    bl = loint.bl
+                    if bl == DotMap():
+                        bl = ''
+                    es = loint.es
+                    if es == DotMap():
+                        es = ''
+                    pt = loint.pt
+                    if pt == DotMap():
+                        pt = ''
+                    pu = loint.pu
+                    if pu == DotMap():
+                        pu = ''
+ sub_entry = ScrapperXML.get_cadaster_entries_by_address(prov_name, city_name, tv, nv, num, bl, es, pt, pu)
+
+ cadaster_entry = CadasterEntryXML(sub_entry, x, y, picture)
cadaster_entry.to_elasticsearch()
results.append(cadaster_entry)
-
- sleep(config['sleep_time'])
return results
@classmethod
- def scrap_provinces(cls, prov_list, pictures=False, start_from=''):
- for prov_name, prov_num, city_name, city_num, address, tv, nv in cls.get_address_iter(prov_list, start_from):
+ def process_search_by_provinces(cls, prov_list, pictures=False, start_from=''):
+ for prov_name, prov_num, city_name, city_num, address, tv, nv in Scrapper.get_address_iter(prov_list, start_from):
if tv == DotMap() or nv == DotMap():
continue
@@ -106,13 +133,12 @@ class ScrapperXML(Scrapper):
counter = 1
while num_scrapping_fails > 0:
try:
- cadaster = cls.get_cadaster_by_address(prov_name, city_name, tv, nv, counter)
+ cadaster = ScrapperXML.get_cadaster_by_address(prov_name, city_name, tv, nv, counter)
res = cls.process_xml_by_address(cadaster, prov_name, city_name, tv, nv, counter, pictures)
if len(res) < 1:
num_scrapping_fails -= 1
else:
num_scrapping_fails = 10
- sleep(config['sleep_time'])
except urllib.error.HTTPError as e:
logger.error(
@@ -123,7 +149,6 @@ class ScrapperXML(Scrapper):
logger.error("=============================================")
''' Could be a service Unavailable or denegation of service'''
num_scrapping_fails -= 1
- sleep(config['sleep_dos_time'])
except Exception as e:
logger.error(
@@ -134,7 +159,8 @@ class ScrapperXML(Scrapper):
num_scrapping_fails -= 1
counter += 1
- sleep(config['sleep_time'])
+
+ ''' Parsing calls '''
@classmethod
def process_xml_by_address(cls, numerero_map, prov_name, city_name, tv, nv, num, pictures=False):
@@ -161,7 +187,7 @@ class ScrapperXML(Scrapper):
cadaster_num = nump.pc.pc1 + nump.pc.pc2
- coords_map = cls.get_coords_from_cadaster(prov_name, city_name, cadaster_num)
+ coords_map = ScrapperXML.get_coords_from_cadaster(prov_name, city_name, cadaster_num)
lon = coords_map.consulta_coordenadas.coordenadas.coord.geo.xcen
if lon == DotMap():
lon = None
@@ -173,7 +199,7 @@ class ScrapperXML(Scrapper):
''' Adding to tracking file'''
logger.info('{},{}'.format(lon, lat))
- entry_map = cls.get_cadaster_entries_by_address(prov_name, city_name, tv, nv, num)
+ entry_map = ScrapperXML.get_cadaster_entries_by_address(prov_name, city_name, tv, nv, num)
picture = None
if entry_map.consulta_dnp.bico != DotMap():
@@ -181,14 +207,13 @@ class ScrapperXML(Scrapper):
city_num = entry_map.consulta_dnp.bico.bi.dt.loine.cm
if pictures and prov_num != DotMap() and city_num != DotMap():
- picture = cls.scrap_site_picture(prov_num, city_num, cadaster_num)
+ picture = Scrapper.scrap_site_picture(prov_num, city_num, cadaster_num)
# Parcela
cadaster_entry = CadasterEntryXML(entry_map, lon, lat, picture)
results.append(cadaster_entry)
cadaster_entry.to_elasticsearch()
- sleep(config['sleep_time'])
elif entry_map.consulta_dnp.lrcdnp.rcdnp != DotMap():
# Multiparcela
for site in entry_map.consulta_dnp.lrcdnp.rcdnp:
@@ -208,18 +233,38 @@ class ScrapperXML(Scrapper):
cadaster += parcela.rc.cc1 if parcela.rc.cc1 != DotMap() else ''
cadaster += parcela.rc.cc2 if parcela.rc.cc2 != DotMap() else ''
- if pictures:
- prov_num = parcela.dt.loine.cp
- city_num = parcela.dt.cmc
- if prov_num != DotMap() and city_num != DotMap():
- picture = cls.scrap_site_picture(prov_num, city_num, cadaster)
+ prov_num = parcela.dt.loine.cp
+ city_num = parcela.dt.cmc
- parcela = cls.get_cadaster_entries_by_cadaster('', '', cadaster)
- cadaster_entry = CadasterEntryXML(parcela, lon, lat, picture)
+ if pictures and prov_num != DotMap() and city_num != DotMap():
+ picture = Scrapper.scrap_site_picture(prov_num, city_num, cadaster)
+
+ try:
+ # Try to get info by complete cadaster num
+ sub_entry = ScrapperXML.get_cadaster_entries_by_cadaster(prov_num, city_num, cadaster)
+ except:
+ # Cadastro did not return anything by cadaster entry (error? bug?)
+ # Try to get it by complete address
+                    loint = parcela.dt.locs.lous.lourb.loint
+                    if loint == DotMap():
+                        continue
+                    bl = loint.bl
+                    if bl == DotMap():
+                        bl = ''
+                    es = loint.es
+                    if es == DotMap():
+                        es = ''
+                    pt = loint.pt
+                    if pt == DotMap():
+                        pt = ''
+                    pu = loint.pu
+                    if pu == DotMap():
+                        pu = ''
+ sub_entry = ScrapperXML.get_cadaster_entries_by_address(prov_name, city_name, tv, nv, num, bl, es, pt, pu)
+
+ cadaster_entry = CadasterEntryXML(sub_entry, lon, lat, picture)
cadaster_entry.to_elasticsearch()
results.append(cadaster_entry)
- sleep(config['sleep_time'])
-
return results
diff --git a/src/librecatastro/scrapping/scrapper.py b/src/librecatastro/scrapping/scrapper.py
index e9a4aa5..82c820c 100644
--- a/src/librecatastro/scrapping/scrapper.py
+++ b/src/librecatastro/scrapping/scrapper.py
@@ -1,9 +1,7 @@
-#!/usr/bin/env python
-# -*- coding: utf-8 -*-
-
import base64
-import urllib.parse
+from time import sleep
from urllib.request import urlopen
+import urllib.parse
import requests
import xmltodict
@@ -17,29 +15,21 @@ logger = CadastroLogger(__name__).logger
class Scrapper:
- """Generic Scrapper class"""
-
- '''Catastro web services parametrized'''
- URL_LOCATIONS_BASE = "http://ovc.catastro.meh.es/ovcservweb/OVCSWLocalizacionRC{}"
+ """Catastro web services parametrized"""
URL_PICTURES = "https://www1.sedecatastro.gob.es/Cartografia/GeneraGraficoParcela.aspx?del={}&mun={}&refcat={}&AnchoPixels={}&AltoPixels={}"
+ URL_LOCATIONS_BASE = "http://ovc.catastro.meh.es/ovcservweb/OVCSWLocalizacionRC{}"
def __init__(self):
pass
- @classmethod
- def scrap_coords(cls, x, y, pictures=False):
- pass
-
- @classmethod
- def scrap_provinces(cls, prov_list, pictures=False):
- pass
-
@classmethod
def get_provinces(cls):
url = cls.URL_LOCATIONS_BASE.format("/OVCCallejero.asmx/ConsultaProvincia")
response = requests.get(url)
xml = response.content
+
+ sleep(config['sleep_time'])
return DotMap(xmltodict.parse(xml, process_namespaces=False, xml_attribs=False))
@classmethod
@@ -52,6 +42,8 @@ class Scrapper:
url = cls.URL_LOCATIONS_BASE.format("/OVCCallejero.asmx/ConsultaMunicipio")
response = requests.get(url, params=params)
xml = response.content
+
+ sleep(config['sleep_time'])
return DotMap(xmltodict.parse(xml, process_namespaces=False, xml_attribs=False))
@classmethod
@@ -70,6 +62,8 @@ class Scrapper:
url = cls.URL_LOCATIONS_BASE.format("/OVCCallejero.asmx/ConsultaVia")
response = requests.get(url, params=params)
xml = response.content
+
+ sleep(config['sleep_time'])
return DotMap(xmltodict.parse(xml, process_namespaces=False, xml_attribs=False))
@classmethod
@@ -125,6 +119,22 @@ class Scrapper:
else:
yield (prov_name, prov_num, city_name, city_num, address_dir, tv, nv)
+ @classmethod
+ def scrap_site_picture(cls, prov_num, city_num, cadaster):
+
+ url_pic = cls.URL_PICTURES.format(prov_num, city_num, cadaster, config['width_px'], config['height_px'])
+
+ logger.debug("URL for picture data: {}".format(url_pic))
+
+ f_pic = urlopen(url_pic)
+
+ data_ref = f_pic.read()
+
+ b64_image = base64.b64encode(data_ref).decode('utf-8')
+
+ sleep(config['sleep_time'])
+ return b64_image
+
@classmethod
def get_cadaster_by_address(cls, provincia, municipio, tipovia, nombrevia, numero):
params = {'Provincia': provincia,
@@ -140,77 +150,20 @@ class Scrapper:
response = requests.get(url, params=params)
xml = response.content
- return DotMap(xmltodict.parse(xml, process_namespaces=False, xml_attribs=False))
- @classmethod
- def get_cadaster_entries_by_address(cls, provincia, municipio, sigla, calle, numero, bloque=None, escalera=None,
- planta=None,puerta=None):
- params = {'Provincia': provincia,
- 'Municipio': municipio,
- 'Sigla': sigla,
- 'Calle': calle,
- 'Numero': str(numero)}
- if bloque:
- params['Bloque'] = str(bloque)
- else:
- params['Bloque'] = ''
- if escalera:
- params['Escalera'] = escalera
- else:
- params['Escalera'] = ''
- if planta:
- params['Planta'] = str(planta)
- else:
- params['Planta'] = ''
- if puerta:
- params['Puerta'] = str(puerta)
- else:
- params['Puerta'] = ''
-
- url = cls.URL_LOCATIONS_BASE.format("/OVCCallejero.asmx/Consulta_DNPLOC")
- logger.debug("URL for entry: {}".format(url + '?' + urllib.parse.urlencode(params)))
-
- response = requests.get(url, params=params)
- xml = response.content
-
- return DotMap(xmltodict.parse(xml, process_namespaces=False, xml_attribs=False))
-
- @classmethod
- def get_cadaster_entries_by_cadaster(cls, provincia, municipio, rc):
- """ provincia and municipio are optional and can be set to ''"""
- params = {"Provincia": provincia,
- "Municipio": municipio,
- "RC": rc}
-
- url = cls.URL_LOCATIONS_BASE.format("/OVCCallejero.asmx/Consulta_DNPRC")
- logger.debug("URL for entry: {}".format(url + '?' + urllib.parse.urlencode(params)))
- response = requests.get(url, params=params)
- xml = response.content
+ sleep(config['sleep_time'])
return DotMap(xmltodict.parse(xml, process_namespaces=False, xml_attribs=False))
@classmethod
def get_coords_from_cadaster(cls, provincia, municipio, cadaster):
- params = {'Provincia': provincia, 'Municipio': municipio, 'SRS': 'EPSG:4230', 'RC': cadaster}
+ params = {'Provincia': provincia, 'Municipio': municipio, 'SRS': 'EPSG:4326', 'RC': cadaster}
url = cls.URL_LOCATIONS_BASE.format("/OVCCoordenadas.asmx/Consulta_CPMRC")
- logger.debug("URL for coords: {}".format(url + '?' + urllib.parse.urlencode(params)))
+ logger.debug("URL for coordinates: {}".format(url + '?' + urllib.parse.urlencode(params)))
response = requests.get(url, params=params)
xml = response.content
+
+ sleep(config['sleep_time'])
return DotMap(xmltodict.parse(xml, process_namespaces=False, xml_attribs=False))
- @classmethod
- def scrap_site_picture(cls, prov_num, city_num, cadaster):
-
- url_pic = cls.URL_PICTURES.format(prov_num, city_num, cadaster, config['width_px'], config['height_px'])
-
- logger.debug("URL for picture data: {}".format(url_pic))
-
- f_pic = urlopen(url_pic)
-
- data_ref = f_pic.read()
-
- b64_image = base64.b64encode(data_ref).decode('utf-8')
-
- return b64_image
-
diff --git a/src/templates/__init__.py b/src/librecatastro/scrapping/scrappers/__init__.py
similarity index 100%
rename from src/templates/__init__.py
rename to src/librecatastro/scrapping/scrappers/__init__.py
diff --git a/src/librecatastro/scrapping/scrappers/scrapper_html.py b/src/librecatastro/scrapping/scrappers/scrapper_html.py
new file mode 100644
index 0000000..d8ac481
--- /dev/null
+++ b/src/librecatastro/scrapping/scrappers/scrapper_html.py
@@ -0,0 +1,99 @@
+import re
+from time import sleep
+from urllib.request import urlopen
+
+from bs4 import BeautifulSoup
+
+from src.librecatastro.scrapping.scrapper import Scrapper
+from src.settings import config
+from src.utils.cadastro_logger import CadastroLogger
+
+'''Logger'''
+logger = CadastroLogger(__name__).logger
+
+
+class ScrapperHTML(Scrapper):
+ """HTML Catastro Scrapper"""
+
+    URL = "http://ovc.catastro.meh.es/ovcservweb/ovcswlocalizacionrc/ovccoordenadas.asmx/Consulta_RCCOOR?SRS=EPSG:4230&Coordenada_X={}&Coordenada_Y={}"
+ URL_REF = "https://www1.sedecatastro.gob.es/CYCBienInmueble/OVCListaBienes.aspx?rc1={}&rc2={}"
+ URL_REF_FULL = "https://www1.sedecatastro.gob.es/CYCBienInmueble/OVCConCiud.aspx?RefC={}&RCCompleta={}&del={}&mun={}"
+
+ def __init__(self):
+ super().__init__()
+
+ @classmethod
+ def scrap_coord(cls, x, y):
+ logger.debug("====Longitude: {} Latitude: {}====".format(x, y))
+ url = cls.URL.format(x, y)
+ logger.debug("URL for coordinates: {}".format(url))
+ f = urlopen(url)
+
+ sleep(config['sleep_time'])
+ return f.read()
+
+ @classmethod
+ def scrap_cadaster_full_code(cls, full_cadaster, delimitacion, municipio):
+ url_ref = cls.URL_REF_FULL.format(full_cadaster, full_cadaster, delimitacion, municipio)
+ logger.debug("-->FULL URL for cadastral data: {}".format(url_ref))
+ f_ref = urlopen(url_ref)
+ data_ref = f_ref.read()
+ html = str(data_ref.decode('utf-8'))
+ parsed_html = BeautifulSoup(html, features="html.parser")
+
+ sleep(config['sleep_time'])
+ return parsed_html
+
+ @classmethod
+ def scrap_cadaster(cls, cadaster, delimitacion=None, municipio=None, pictures=False):
+ rc_1 = cadaster[0:7]
+ rc_2 = cadaster[7:14]
+ url_ref = cls.URL_REF.format(rc_1, rc_2)
+
+ logger.debug("URL for cadastral data: {}".format(url_ref))
+
+ f_ref = urlopen(url_ref)
+ data_ref = f_ref.read()
+ sleep(config['sleep_time'])
+
+ html = str(data_ref.decode('utf-8'))
+ parsed_html = BeautifulSoup(html, features="html.parser")
+
+ if delimitacion is None:
+ delimitacion_search = re.search(r'del=([0-9]+)&', html)
+ if delimitacion_search:
+ delimitacion = delimitacion_search.group(1)
+
+ if municipio is None:
+ municipio_search = re.search(r'mun=([0-9]+)&', html)
+ if municipio_search:
+ municipio = municipio_search.group(1)
+
+ description = parsed_html.find(id='ctl00_Contenido_tblInmueble')
+
+ picture = None
+ if pictures:
+ picture = cls.scrap_site_picture(delimitacion, municipio, ''.join([rc_1, rc_2]))
+ sleep(config['sleep_time'])
+
+ htmls = []
+ if description is None:
+ # Multiparcela
+ logger.debug("Multiparcela found!")
+ ''' Multiparcela with multiple cadasters '''
+
+ all_cadasters = parsed_html.findAll("div", {"id": re.compile('heading[0-9]+')})
+ logger.debug("->Parcelas found: {}".format(len(all_cadasters)))
+ for partial_cadaster in all_cadasters:
+ partial_cadaster_ref = partial_cadaster.find("b")
+ logger.debug("-->Partial cadaster: {}".format(partial_cadaster_ref.text))
+ partial_cadaster_text = partial_cadaster_ref.text.strip()
+ html = ScrapperHTML.scrap_cadaster_full_code(partial_cadaster_text, delimitacion, municipio)
+ htmls.append((html, picture))
+ sleep(config['sleep_time'])
+ else:
+ # Parcela
+            htmls.append((parsed_html, picture))
+
+ return htmls
+
diff --git a/src/librecatastro/scrapping/scrappers/scrapper_xml.py b/src/librecatastro/scrapping/scrappers/scrapper_xml.py
new file mode 100644
index 0000000..4fc66e4
--- /dev/null
+++ b/src/librecatastro/scrapping/scrappers/scrapper_xml.py
@@ -0,0 +1,83 @@
+import urllib.parse
+from time import sleep
+
+import requests
+import xmltodict
+from dotmap import DotMap
+
+from src.librecatastro.scrapping.scrapper import Scrapper
+from src.settings import config
+from src.utils.cadastro_logger import CadastroLogger
+
+'''Logger'''
+logger = CadastroLogger(__name__).logger
+
+
+class ScrapperXML(Scrapper):
+
+ def __init__(self):
+ super().__init__()
+
+ @classmethod
+ def get_coord(cls,x, y):
+ params = {'SRS': 'EPSG:4230', 'Coordenada_X': x, 'Coordenada_Y': y}
+ url = cls.URL_LOCATIONS_BASE.format("/OVCCoordenadas.asmx/Consulta_RCCOOR")
+ response = requests.get(url, params=params)
+
+ logger.debug("====Longitude: {} Latitude: {}====".format(x, y))
+ logger.debug("URL for coordinates: {}".format(url + '?' + urllib.parse.urlencode(params)))
+
+ xml_dict_map = DotMap(xmltodict.parse(response.content, process_namespaces=False, xml_attribs=False))
+
+ sleep(config['sleep_time'])
+ return xml_dict_map
+
+ @classmethod
+ def get_cadaster_entries_by_cadaster(cls, provincia, municipio, rc):
+ """ provincia and municipio are optional and can be set to '' """
+
+ params = {"Provincia": provincia,
+ "Municipio": municipio,
+ "RC": rc}
+
+ url = cls.URL_LOCATIONS_BASE.format("/OVCCallejero.asmx/Consulta_DNPRC")
+ logger.debug("URL for entry: {}".format(url + '?' + urllib.parse.urlencode(params)))
+ response = requests.get(url, params=params)
+ xml = response.content
+
+ sleep(config['sleep_time'])
+ return DotMap(xmltodict.parse(xml, process_namespaces=False, xml_attribs=False))
+
+ @classmethod
+ def get_cadaster_entries_by_address(cls, provincia, municipio, sigla, calle, numero, bloque=None, escalera=None,
+ planta=None,puerta=None):
+ params = {'Provincia': provincia,
+ 'Municipio': municipio,
+ 'Sigla': sigla,
+ 'Calle': calle,
+ 'Numero': str(numero)}
+ if bloque:
+ params['Bloque'] = str(bloque)
+ else:
+ params['Bloque'] = ''
+ if escalera:
+ params['Escalera'] = escalera
+ else:
+ params['Escalera'] = ''
+ if planta:
+ params['Planta'] = str(planta)
+ else:
+ params['Planta'] = ''
+ if puerta:
+ params['Puerta'] = str(puerta)
+ else:
+ params['Puerta'] = ''
+
+ url = cls.URL_LOCATIONS_BASE.format("/OVCCallejero.asmx/Consulta_DNPLOC")
+ logger.debug("URL for entry: {}".format(url + '?' + urllib.parse.urlencode(params)))
+
+ response = requests.get(url, params=params)
+ xml = response.content
+
+ sleep(config['sleep_time'])
+ return DotMap(xmltodict.parse(xml, process_namespaces=False, xml_attribs=False))
\ No newline at end of file
diff --git a/src/librecatastro/scrapping/search.py b/src/librecatastro/scrapping/searcher.py
similarity index 83%
rename from src/librecatastro/scrapping/search.py
rename to src/librecatastro/scrapping/searcher.py
index a071379..b3bfec1 100644
--- a/src/librecatastro/scrapping/search.py
+++ b/src/librecatastro/scrapping/searcher.py
@@ -1,6 +1,7 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
-class Search:
+
+class Searcher:
def __init__(self):
pass
diff --git a/src/librecatastro/scrapping/searchers/coordinates_search.py b/src/librecatastro/scrapping/searchers/coordinates_searcher.py
similarity index 87%
rename from src/librecatastro/scrapping/searchers/coordinates_search.py
rename to src/librecatastro/scrapping/searchers/coordinates_searcher.py
index 0999e27..cf57354 100644
--- a/src/librecatastro/scrapping/searchers/coordinates_search.py
+++ b/src/librecatastro/scrapping/searchers/coordinates_searcher.py
@@ -8,7 +8,7 @@ import random
from time import sleep
from src.librecatastro.domain.geometry.geo_polygon import GeoPolygon
-from src.librecatastro.scrapping.search import Search
+from src.librecatastro.scrapping.searcher import Searcher
from src.settings import config
from src.utils.cadastro_logger import CadastroLogger
from src.utils.list_utils import ListUtils
@@ -17,12 +17,12 @@ from src.utils.list_utils import ListUtils
logger = CadastroLogger(__name__).logger
-class CoordinatesSearch(Search):
+class CoordinatesSearcher(Searcher):
def __init__(self):
super().__init__()
@classmethod
- def scrap_coordinates(cls, scrapper, filenames, pictures=False):
+ def search_by_coordinates(cls, scrapper, filenames, pictures=False):
for r, d, files in os.walk(config['coordinates_path']):
for file in files:
@@ -34,12 +34,12 @@ class CoordinatesSearch(Search):
try:
polygon = GeoPolygon(os.path.join(config['coordinates_path'], file))
- CoordinatesSearch.scrap_polygon(scrapper, polygon, pictures)
+ CoordinatesSearcher.search_in_polygon(scrapper, polygon, pictures)
except:
logger.error("{} is not formatted properly. Please take a look at the examples.".format(file))
@classmethod
- def scrap_polygon(cls, scrapper, polygon, pictures=False):
+ def search_in_polygon(cls, scrapper, polygon, pictures=False):
bb = polygon.get_bounding_box()
lon_min = int(bb[0] * config['scale'])
lon_max = int(bb[2] * config['scale'])
@@ -57,7 +57,7 @@ class CoordinatesSearch(Search):
logger.info('{},{}'.format(x_scaled, y_scaled))
try:
- scrapper.scrap_coord(x_scaled, y_scaled, pictures)
+ scrapper.process_search_by_coordinates(x_scaled, y_scaled, pictures)
except urllib.error.HTTPError as e:
logger.error("ERROR AT LONGITUDE {} LATITUDE {}".format(x_scaled, y_scaled))
@@ -76,7 +76,7 @@ class CoordinatesSearch(Search):
sleep(config['sleep_time'])
@staticmethod
- def scrap_results_by_time(seconds, lon_min, lon_max, lat_min, lat_max, scrapper):
+ def search_by_coordinates_max_time(seconds, lon_min, lon_max, lat_min, lat_max, scrapper):
start_time = time.time()
results = []
@@ -88,7 +88,7 @@ class CoordinatesSearch(Search):
y_scaled = y / config['scale']
try:
- result = scrapper.scrap_coord(x_scaled, y_scaled)
+ result = scrapper.process_search_by_coordinates(x_scaled, y_scaled)
if result is not None:
results.append(result)
@@ -117,9 +117,9 @@ class CoordinatesSearch(Search):
return ListUtils.flat(results)
@staticmethod
- def scrap_results_linear_x_times(times, lon_min, lon_max, lat_min, lat_max, scrapper):
+ def search_by_coordinates_linear_max_n_matches(matches, lon_min, lon_max, lat_min, lat_max, scrapper):
results = []
- counter = times
+ counter = matches
finished = False
for x in range(lon_min, lon_max):
@@ -130,7 +130,7 @@ class CoordinatesSearch(Search):
try:
- result = scrapper.scrap_coord(x_scaled, y_scaled)
+ result = scrapper.process_search_by_coordinates(x_scaled, y_scaled)
if result is not None:
results.append(result)
@@ -159,7 +159,7 @@ class CoordinatesSearch(Search):
return ListUtils.flat(results)
@staticmethod
- def scrap_results_random_x_times(times, lon_min, lon_max, lat_min, lat_max, scrapper):
+ def search_by_coordinates_random_max_n_matches(times, lon_min, lon_max, lat_min, lat_max, scrapper):
results = []
counter = times
while counter > 0:
@@ -170,7 +170,7 @@ class CoordinatesSearch(Search):
y_scaled = y / config['scale']
try:
- cadaster_entry = scrapper.scrap_coord(x_scaled, y_scaled)
+ cadaster_entry = scrapper.process_search_by_coordinates(x_scaled, y_scaled)
if len(cadaster_entry) > 0:
results.append(cadaster_entry)
diff --git a/src/librecatastro/scrapping/searchers/provinces_search.py b/src/librecatastro/scrapping/searchers/provinces_search.py
deleted file mode 100644
index d335b94..0000000
--- a/src/librecatastro/scrapping/searchers/provinces_search.py
+++ /dev/null
@@ -1,30 +0,0 @@
-#!/usr/bin/env python
-# -*- coding: utf-8 -*-
-
-from dotmap import DotMap
-
-from src.librecatastro.scrapping.scrapper import Scrapper
-from src.librecatastro.scrapping.search import Search
-from src.utils.cadastro_logger import CadastroLogger
-
-'''Logger'''
-logger = CadastroLogger(__name__).logger
-
-
-class ProvincesSearch(Search):
- def __init__(self):
- super().__init__()
-
- @classmethod
- def scrap_provinces(cls, scrapper, prov_list, pictures=False, start_from=''):
- scrapper.scrap_provinces(prov_list, pictures, start_from)
-
- @classmethod
- def list_provinces(cls):
- logger.debug(DotMap.pprint(Scrapper.get_provinces()))
- return
-
- @classmethod
- def list_cities(cls, prov_name):
- logger.debug(DotMap.pprint(Scrapper.get_cities(prov_name)))
- return
diff --git a/src/librecatastro/scrapping/searchers/provinces_searcher.py b/src/librecatastro/scrapping/searchers/provinces_searcher.py
new file mode 100644
index 0000000..1bf1200
--- /dev/null
+++ b/src/librecatastro/scrapping/searchers/provinces_searcher.py
@@ -0,0 +1,35 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+from dotmap import DotMap
+
+from src.librecatastro.scrapping.scrapper import Scrapper
+from src.librecatastro.scrapping.searcher import Searcher
+from src.utils.cadastro_logger import CadastroLogger
+
+'''Logger'''
+logger = CadastroLogger(__name__).logger
+
+
+class ProvincesSearcher(Searcher):
+ def __init__(self):
+ super().__init__()
+
+ @classmethod
+ def search_by_provinces(cls, scrapper, prov_list, pictures=False, start_from=''):
+ scrapper.process_search_by_provinces(prov_list, pictures, start_from)
+
+ @classmethod
+ def list_provinces(cls):
+ dotmap = Scrapper.get_provinces()
+ provinces = dotmap.consulta_provinciero.provinciero.prov
+ for province in provinces:
+ logger.debug(province.np)
+
+ @classmethod
+ def list_cities(cls, prov_name):
+ dotmap = Scrapper.get_cities(prov_name)
+ cities = dotmap.consulta_municipiero.municipiero.muni
+ for city in cities:
+ logger.debug(city.nm)
+ return
diff --git a/src/settings.py b/src/settings.py
index 1444c06..b4c660a 100644
--- a/src/settings.py
+++ b/src/settings.py
@@ -16,5 +16,9 @@ config = {
"sleep_time": 5,
"sleep_dos_time": 300,
"width_px": 120,
- "height_px": 120
+ "height_px": 120,
+    "servers_down_message": "Some of the Cadastro servers are down. "
+                            "Maintenance is usually carried out during the night or the weekends. Please, retry later. "
+                            "As an alternative, your IP address may have been banned. Try to change your public IP"
+
}
diff --git a/src/templates/individual_address.xml b/src/templates/individual_address.xml
deleted file mode 100644
index cd8d8bc..0000000
--- a/src/templates/individual_address.xml
+++ /dev/null
@@ -1,5 +0,0 @@
-
-
- ####ADDRESS####
-
-
\ No newline at end of file
diff --git a/src/templates/individual_cadaster.xml b/src/templates/individual_cadaster.xml
deleted file mode 100644
index 90d502f..0000000
--- a/src/templates/individual_cadaster.xml
+++ /dev/null
@@ -1,4 +0,0 @@
-
-
- ####CADASTER####
-
\ No newline at end of file
diff --git a/src/templates/individual_city.xml b/src/templates/individual_city.xml
deleted file mode 100644
index 800f280..0000000
--- a/src/templates/individual_city.xml
+++ /dev/null
@@ -1,5 +0,0 @@
-
-
- ####CITY####
-
-
\ No newline at end of file
diff --git a/src/templates/individual_coord.xml b/src/templates/individual_coord.xml
deleted file mode 100644
index b0cc4df..0000000
--- a/src/templates/individual_coord.xml
+++ /dev/null
@@ -1,5 +0,0 @@
-
-
- ####COORDINATES####
-
-
\ No newline at end of file
diff --git a/src/templates/individual_province.xml b/src/templates/individual_province.xml
deleted file mode 100644
index 9a2b92a..0000000
--- a/src/templates/individual_province.xml
+++ /dev/null
@@ -1,5 +0,0 @@
-
-
- ####PROVINCE####
-
-
\ No newline at end of file
diff --git a/src/templates/ontology.owl b/src/templates/ontology.owl
deleted file mode 100644
index f2c5dd0..0000000
--- a/src/templates/ontology.owl
+++ /dev/null
@@ -1,96 +0,0 @@
-
-
-
-
-
-
-
-
-
-
-
- Thing
-
-
-
-
- CadasterEntry
-
-
-
-
- Address
-
-
-
-
- Province
-
-
-
-
- City
-
-
-
-
- Geographical Coordinates
-
-
-
-
-
- ####INDIVIDUALS####
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
\ No newline at end of file
diff --git a/src/tests/scrappers/__init__.py b/src/tests/scrappers/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/src/tests/scrapper_html_tests.py b/src/tests/scrappers/scrapper_html_tests.py
similarity index 90%
rename from src/tests/scrapper_html_tests.py
rename to src/tests/scrappers/scrapper_html_tests.py
index 1714b6f..198f7b5 100644
--- a/src/tests/scrapper_html_tests.py
+++ b/src/tests/scrappers/scrapper_html_tests.py
@@ -5,8 +5,8 @@ import os
import unittest
from src.librecatastro.domain.geometry.geo_polygon import GeoPolygon
-from src.librecatastro.scrapping.format.scrapper_html import ScrapperHTML
-from src.librecatastro.scrapping.searchers.coordinates_search import CoordinatesSearch
+from src.librecatastro.scrapping.parsers.parser_html import ScrapperHTML
+from src.librecatastro.scrapping.searchers.coordinates_searcher import CoordinatesSearcher
from src.settings import config
from src.utils.elasticsearch_utils import ElasticSearchUtils
@@ -22,17 +22,17 @@ class ScrapperHTMLTests(unittest.TestCase):
assert True
def test_coordinate_creates_cadaster(self):
- cadaster_list = ScrapperHTML.scrap_coord(-3.68, 40.47)
+ cadaster_list = ScrapperHTML.parse_coord(-3.68, 40.47)
self.assertEqual(len(cadaster_list), 1)
cadaster = cadaster_list[0]
self.assertEqual(cadaster.cadaster, '2302909VK4820A0001GK')
def test_coordinate_multiparcela_creates_cadaster(self):
- cadaster_list = ScrapperHTML.scrap_coord(-0.33, 39.47)
+ cadaster_list = ScrapperHTML.parse_coord(-0.33, 39.47)
self.assertTrue(len(cadaster_list) > 1)
def test_coordinate_creates_cadaster_and_stores_in_elasticsearch(self):
- cadaster_list = ScrapperHTML.scrap_coord(-3.68, 40.47)
+ cadaster_list = ScrapperHTML.parse_coord(-3.68, 40.47)
self.assertEqual(len(cadaster_list), 1)
cadaster = cadaster_list[0]
cadaster.to_elasticsearch()
@@ -92,7 +92,7 @@ class ScrapperHTMLTests(unittest.TestCase):
def scrap_random_until_x_times_found(self, times):
polygon = GeoPolygon(os.path.join(config['coordinates_path'], 'spain_polygon.json'))
coord = polygon.get_bounding_box()
- cadaster_list = CoordinatesSearch.scrap_results_random_x_times(times, int(coord[0] * config['scale']), int(coord[2] * config['scale']), int(coord[1] * config['scale']), int(coord[3] * config['scale']), ScrapperHTML)
+ cadaster_list = CoordinatesSearcher.search_by_coordinates_random_max_n_matches(times, int(coord[0] * config['scale']), int(coord[2] * config['scale']), int(coord[1] * config['scale']), int(coord[3] * config['scale']), ScrapperHTML)
self.assertTrue(len(cadaster_list) >= times)
return cadaster_list
diff --git a/src/tests/scrapper_xml_tests.py b/src/tests/scrappers/scrapper_xml_tests.py
similarity index 62%
rename from src/tests/scrapper_xml_tests.py
rename to src/tests/scrappers/scrapper_xml_tests.py
index fca7405..33fa3d0 100644
--- a/src/tests/scrapper_xml_tests.py
+++ b/src/tests/scrappers/scrapper_xml_tests.py
@@ -5,44 +5,48 @@ import unittest
from time import sleep
-from dotmap import DotMap
-
from src.librecatastro.domain.cadaster_entry.cadaster_entry_xml import CadasterEntryXML
-from src.librecatastro.scrapping.format.scrapper_xml import ScrapperXML
+from src.librecatastro.scrapping.parsers.parser_xml import ParserXML
+from src.librecatastro.scrapping.scrappers.scrapper_xml import ScrapperXML
from src.settings import config
class ScrapperXMLTests(unittest.TestCase):
def test_scrapper_retrieves_dict_provinces(self):
- self.assertEqual(ScrapperXML.get_provinces().consulta_provinciero.control.cuprov, '48')
- sleep(config['sleep_time'])
+ try:
+ self.assertEqual(ScrapperXML.get_provinces().consulta_provinciero.control.cuprov, '48')
+ except:
+ self.assertFalse(config['servers_down_message'])
+ exit(-1)
def test_scrapper_retrieves_dict_cities(self):
- self.assertEqual(ScrapperXML.get_cities('ALACANT').consulta_municipiero.control.cumun, '141')
- sleep(config['sleep_time'])
+ try:
+ self.assertEqual(ScrapperXML.get_cities('ALACANT').consulta_municipiero.control.cumun, '141')
+ except:
+ self.assertFalse(config['servers_down_message'])
+ exit(-1)
def test_scrapper_retrieves_dict_addresses(self):
- self.assertEqual(ScrapperXML.get_addresses('ALACANT','AGOST').consulta_callejero.control.cuca, '117')
- sleep(config['sleep_time'])
+ try:
+ self.assertEqual(ScrapperXML.get_addresses('ALACANT', 'AGOST').consulta_callejero.control.cuca, '117')
+ except:
+ self.assertFalse(config['servers_down_message'])
+ exit(-1)
+
+ def test_get_cadaster_entries_by_cadaster_is_up(self):
+ cadasters = ['2503906VK4820D0001MX']
+ try:
+ for cadaster in cadasters:
+ ScrapperXML.get_cadaster_entries_by_cadaster('', '', cadaster)
+ except:
+ self.assertFalse(config['servers_down_message'])
+ exit(-1)
def test_scrapper_retrieves_dict_addresses_iter(self):
iterator = ScrapperXML.get_address_iter()
address = iterator.__next__()
self.assertEqual(address[1], '15')
self.assertEqual(address[3], '7')
- sleep(config['sleep_time'])
-
- def test_scrapper_creates_cadaster_entry(self):
- dotmap_res = ScrapperXML.get_cadaster_entries_by_cadaster('', '', '6375620YH0567S0001GW')
- self.assertNotEqual(dotmap_res, DotMap())
- sleep(config['sleep_time'])
-
- def test_scrapper_creates_cadaster_entry_and_stores_in_elasticsearch(self):
- entry = ScrapperXML.get_cadaster_entries_by_cadaster('', '', '6375620YH0567S0001GW')
- cadaster_entry = CadasterEntryXML(entry)
- cadaster_entry.to_elasticsearch()
- self.assertIsNotNone(cadaster_entry.from_elasticsearch())
- sleep(config['sleep_time'])
def test_multiparcela_creates_n_entries_in_elasticsearch(self):
prov_name = u'A CORUÑA'
@@ -127,7 +131,7 @@ class ScrapperXMLTests(unittest.TestCase):
def test_multiparcela_coord_creates_n_entries(self):
lon = -9.2503
lat = 42.9723
- self.assertEqual(len(ScrapperXML.scrap_coord(lon, lat, True)), 2)
+ self.assertEqual(len(ParserXML.process_search_by_coordinates(lon, lat, True)), 2)
def test_multiparcela_address_creates_n_entries(self):
prov_name = u'MADRID'
@@ -136,7 +140,7 @@ class ScrapperXMLTests(unittest.TestCase):
nv = u'CANARIAS'
num = 7
cadaster = ScrapperXML.get_cadaster_by_address(prov_name, city_name, tv, nv, num)
- self.assertEqual(len(ScrapperXML.process_xml_by_address(cadaster, prov_name, city_name, tv, nv, False)), 8)
+ self.assertEqual(len(ParserXML.process_xml_by_address(cadaster, prov_name, city_name, tv, nv, False)), 8)
def test_multiparcela_address_creates_n_entries_2(self):
prov_name = u'MADRID'
@@ -145,7 +149,39 @@ class ScrapperXMLTests(unittest.TestCase):
nv = u'CALVARIO'
num = 38
cadaster = ScrapperXML.get_cadaster_by_address(prov_name, city_name, tv, nv, num)
- self.assertEqual(len(ScrapperXML.process_xml_by_address(cadaster, prov_name, city_name, tv, nv, False)), 8)
+ self.assertEqual(len(ParserXML.process_xml_by_address(cadaster, prov_name, city_name, tv, nv, False)), 8)
+
+ def test_poligono_or_rural_creates_entry(self):
+ tv = 'CL'
+ nv = 'TORREJON'
+ num = 30
+ prov_name = 'MADRID'
+ city_name = 'AJALVIR'
+ cadaster = ScrapperXML.get_cadaster_by_address(prov_name, city_name, tv, nv, num)
+ self.assertEqual(len(ParserXML.process_xml_by_address(cadaster, prov_name, city_name, tv, nv, False)), 16)
+
+ def test_coordinates_are_in_good_format(self):
+ tv = 'CL'
+ nv = 'DE BENICARLO'
+ num = 1
+ prov_name = 'MADRID'
+ city_name = 'GALAPAGAR'
+ xml = ScrapperXML.get_cadaster_by_address(prov_name, city_name, tv, nv, num)
+ cadaster_entry = ParserXML.process_xml_by_address(xml, prov_name, city_name, tv, nv, False)
+ self.assertEqual(cadaster_entry[0].location.lat, 40.6249762551374)
+ self.assertEqual(cadaster_entry[0].location.lon, -4.02755522611211)
+
+ def test_multiparcela_coordinates_are_in_good_format(self):
+ tv = 'CL'
+ nv = 'SAN VICENTE'
+ num = 26
+ prov_name = 'ALACANT'
+ city_name = 'ALICANTE/ALACANT'
+ xml = ScrapperXML.get_cadaster_by_address(prov_name, city_name, tv, nv, num)
+ cadaster_entries = ParserXML.process_xml_by_address(xml, prov_name, city_name, tv, nv, False)
+ for cadaster_entry in cadaster_entries:
+ self.assertEqual(cadaster_entry.location.lat, 38.3495195831056)
+ self.assertEqual(cadaster_entry.location.lon, -0.484612452235845)
if __name__ == '__main__':
diff --git a/src/utils/elasticsearch_utils.py b/src/utils/elasticsearch_utils.py
index 1d8924e..d930c52 100644
--- a/src/utils/elasticsearch_utils.py
+++ b/src/utils/elasticsearch_utils.py
@@ -6,7 +6,7 @@ logger = CadastroLogger(__name__).logger
class ElasticSearchUtils:
- """Custom class for managing Elastic Search queries"""
+    """Custom class for managing Elasticsearch queries"""
def __init__(self):
pass
diff --git a/src/utils/ontology_converter.py b/src/utils/ontology_converter.py
deleted file mode 100644
index 95128d7..0000000
--- a/src/utils/ontology_converter.py
+++ /dev/null
@@ -1,77 +0,0 @@
-import copy
-import re
-
-
-class OntologyConverter:
-
- def __init__(self):
-
- with open("../templates/ontology.owl") as ont_f, \
- open("../templates/individual_city.xml") as ind_city_f, \
- open("../templates/individual_province.xml") as ind_province_f, \
- open("../templates/individual_coord.xml") as ind_coord_f, \
- open("../templates/individual_address.xml") as ind_address_f, \
- open("../templates/individual_cadaster.xml") as ind_cadaster_f:
-
- self.ont_template = ont_f.read()
- self.city_template = ind_city_f.read()
- self.province_template = ind_province_f.read()
- self.coord_template = ind_coord_f.read()
- self.address_template = ind_address_f.read()
- self.cadaster_template = ind_cadaster_f.read()
-
- def cadastro_dict_to_ontology(self, cadastro_list):
-
- ont = copy.deepcopy(self.ont_template)
-
- for cadastro_entry in cadastro_list:
- ont = ont.replace("####INDIVIDUALS####", ''.join(["####INDIVIDUALS####",
- self.instantiate_individual(cadastro_entry)]))
-
- ont = ont.replace("####INDIVIDUALS####", '')
-
- return ont
-
- def instantiate_individual(self, cadastro_entry):
- individuals = ''
-
- cadaster = ''
- for header, value in cadastro_entry.items():
- if header == 'Referencia catastral':
- txt = copy.deepcopy(self.cadaster_template)
- txt = txt.replace("####CADASTER####", value)
- individuals = ''.join([individuals, txt])
- cadaster = value
- elif header == 'Localización':
- city_txt = copy.deepcopy(self.city_template)
- province_txt = copy.deepcopy(self.province_template)
- address_txt = copy.deepcopy(self.address_template)
-
- cp = re.search(r'[0-9]{5}', value)
- cp_span = cp.span()
- cp_span_end = cp_span[1]
-
- city_text = value[cp_span_end:]
- province = re.search(r'\(([^\)]+)\)', city_text)
- province_span = province.span()
- province_start = province_span[0]
- province_end = province_span[1]
- province_text = value[province_start:province_end]
-
- province_txt = province_txt.replace("####CADASTER####", cadaster)
- province_txt = province_txt.replace("####PROVINCE####", province_text)
-
- city_txt = city_txt.replace("####CITY####", city_text)
- city_txt = city_txt.replace("####PROVINCE####", province_text)
-
- address_txt = address_txt.replace("####ADDRESS####", value)
- address_txt = address_txt.replace("####CITY####", city_text)
-
- individuals = ''.join([individuals, province_txt, city_txt, address_txt])
-
- #print(individuals)
- return individuals
-
-
-
-