diff --git a/src/librecatastro/catastro_scrapper.py b/src/librecatastro/catastro_scrapper.py index 2c1bd4d..c287c7f 100644 --- a/src/librecatastro/catastro_scrapper.py +++ b/src/librecatastro/catastro_scrapper.py @@ -1,48 +1,90 @@ import random import re +import time + from time import sleep from urllib.request import urlopen from xml.etree import ElementTree from bs4 import BeautifulSoup -from src.librecatastro.domain.cadaster import Cadaster +from src.librecatastro.domain.cadaster_entry import CadasterEntry from src.settings import config -from src.utils.ontology_converter import OntologyConverter -'''Constants''' +from src.utils.cadastro_logger import CadastroLogger +from src.utils.list_utils import ListUtils +"""Constants""" + +'''Spain geocoordinates''' LONGITUDE = (4289603, -18024300) # *1000000 LATITUDE = (43769200, 27725500) # *1000000 +'''Scale for scrapping''' SCALE = 1000000 TRUNCATE_RIGHT = 4 +'''Enumerator for tuple access''' MAX = 0 MIN = 1 +'''Catastro web services parametrized''' URL = "http://ovc.catastro.meh.es/ovcservweb/ovcswlocalizacionrc/ovccoordenadas.asmx/Consulta_RCCOOR?SRS=EPSG:4230&Coordenada_X={}&Coordenada_Y={}" URL_REF = "https://www1.sedecatastro.gob.es/CYCBienInmueble/OVCListaBienes.aspx?rc1={}&rc2={}" URL_REF_FULL = "https://www1.sedecatastro.gob.es/CYCBienInmueble/OVCConCiud.aspx?RefC={}&RCCompleta={}&del={}&mun={}" +'''Information to scrap from HTML''' field_names = (u'Referencia catastral', u'Localización', u'Clase', u'Uso principal', u'Superficie construida', u'Año construcción') +'''Logger''' +logger = CadastroLogger(__name__).logger + class CadastroScrapper: - """Scrapper class for Cadastro Web""" + """Scrapper class for Catastro HTML""" def __init__(self): pass + """ Scrapping main calls """ @staticmethod def scrap_all(): + results = [] for j in range(LONGITUDE[MIN], LONGITUDE[MAX]): for i in range(LATITUDE[MIN], LATITUDE[MAX]): - CadastroScrapper.scrap_coord(i, j) + result = CadastroScrapper.scrap_coord(i, j) + if result is not None: + results.append(result) + return ListUtils.flat(results) @staticmethod - def scrap_results_linear(times): + def scrap_results_by_time(seconds): + start_time = time.time() + results = [] + + finished = False + for j in range(LONGITUDE[MIN], LONGITUDE[MAX]): + for i in range(LATITUDE[MIN], LATITUDE[MAX]): + if finished: + break + result = CadastroScrapper.scrap_coord(i, j) + if result is not None: + results.append(result) + now = time.time() + elapsed_time = now - start_time + if elapsed_time > seconds: + finished = True + break + sleep(5) + if finished: + break + return ListUtils.flat(results) + + @staticmethod + def scrap_results_linear_x_times(times): results = [] counter = times + + finished = False for x in range(LONGITUDE[MIN], LONGITUDE[MAX]): for y in range(LATITUDE[MIN], LATITUDE[MAX]): @@ -55,11 +97,17 @@ class CadastroScrapper: results.append(result) counter -= 1 if counter == 0: - return + finished = True + break sleep(5) + if finished: + break + + return ListUtils.flat(results) + @staticmethod - def scrap_results_random(times): + def scrap_results_random_x_times(times): results = [] counter = times while counter > 0: @@ -81,11 +129,12 @@ class CadastroScrapper: #ontology_converter = OntologyConverter() #print(ontology_converter.cadastro_dict_to_ontology(results)) - print("====PROCESSING FINISHED====") - print("Results found: {}".format(times)) - print(results) - return results + logger.info("====PROCESSING FINISHED====") + logger.info("Results found: {}".format(times)) + logger.info(results) + return ListUtils.flat(results) + """ Scrapping secondary calls """ @staticmethod def parse_html_parcela(parsed_html, x=None, y=None): description = parsed_html.find(id='ctl00_Contenido_tblInmueble') @@ -105,21 +154,16 @@ class CadastroScrapper: descriptive_data[field_name] = descriptive_data[field_name].split(' ')[0] descriptive_data[field_name] = descriptive_data[field_name].split('\xa0')[0] elif field_header.text == u'Localización': - descriptive_data[field_name] = field_value.encode_contents().decode('utf-8').replace('
', - config[ - 'separator']).replace( - '
', config['separator']) + descriptive_data[field_name] = field_value.encode_contents().decode('utf-8').replace('
',config['separator']).replace('
', config['separator']) - cadaster_entry = Cadaster(descriptive_data) - print(cadaster_entry.to_json()) + cadaster_entry = CadasterEntry(descriptive_data) + logger.info(cadaster_entry.to_json()) return cadaster_entry @staticmethod - def scrap_cadaster_full_code(full_cadaster, x=None, y=None): - delimitacion = full_cadaster[0:2] - municipio = full_cadaster[2:5] + def scrap_cadaster_full_code(full_cadaster, delimitacion, municipio, x=None, y=None): url_ref = URL_REF_FULL.format(full_cadaster, full_cadaster, delimitacion, municipio) - print("-->FULL URL for cadastral data: {}".format(url_ref)) + logger.info("-->FULL URL for cadastral data: {}".format(url_ref)) f_ref = urlopen(url_ref) data_ref = f_ref.read() html = str(data_ref.decode('utf-8')) @@ -131,34 +175,48 @@ class CadastroScrapper: rc_1 = cadaster[0:7] rc_2 = cadaster[7:14] url_ref = URL_REF.format(rc_1, rc_2) - print("-->URL for cadastral data: {}".format(url_ref)) + logger.info("-->URL for cadastral data: {}".format(url_ref)) f_ref = urlopen(url_ref) data_ref = f_ref.read() html = str(data_ref.decode('utf-8')) parsed_html = BeautifulSoup(html, features="html.parser") + + delimitacion = '' + delimitacion_search = re.search(r'del=([0-9]+)&', html) + if delimitacion_search: + delimitacion = delimitacion_search.group(1) + + municipio = '' + municipio_search = re.search(r'mun=([0-9]+)&', html) + if municipio_search: + municipio = municipio_search.group(1) + description = parsed_html.find(id='ctl00_Contenido_tblInmueble') + cadasters = [] if description is None: - print("Multiparcela found!") + logger.info("Multiparcela found!") ''' Multiparcela with multiple cadasters ''' - cadasters = [] + all_cadasters = parsed_html.findAll("div", {"id": re.compile('heading[0-9]+')}) - print("->Parcelas found: {}".format(len(all_cadasters))) + logger.info("->Parcelas found: {}".format(len(all_cadasters))) for partial_cadaster in all_cadasters: partial_cadaster_ref = partial_cadaster.find("b") - print("-->Partial cadaster: {}".format(partial_cadaster_ref.text)) + logger.info("-->Partial cadaster: {}".format(partial_cadaster_ref.text)) partial_cadaster_text = partial_cadaster_ref.text.strip() - cadaster = CadastroScrapper.scrap_cadaster_full_code(partial_cadaster_text, x, y) + cadaster = CadastroScrapper.scrap_cadaster_full_code(partial_cadaster_text, delimitacion, municipio, x, y) cadasters.append(cadaster) - return cadasters else: - return CadastroScrapper.parse_html_parcela(parsed_html, x, y) + cadaster = CadastroScrapper.parse_html_parcela(parsed_html, x, y) + cadasters.append(cadaster) + + return cadasters @staticmethod def scrap_coord(x, y): - print("====Longitude: {} Latitude: {}====".format(x, y)) + logger.info("====Longitude: {} Latitude: {}====".format(x, y)) url = URL.format(x, y) - print("-->URL for coordinates: {}".format(url)) + logger.info("-->URL for coordinates: {}".format(url)) f = urlopen(url) data = f.read() root = ElementTree.fromstring(data) @@ -167,7 +225,7 @@ class CadastroScrapper: if pc1 is None or pc2 is None: return None else: - print("-->FOUND!") + logger.info("-->FOUND!") cadaster = ''.join([pc1.text,pc2.text]) return CadastroScrapper.scrap_cadaster(cadaster, x, y) diff --git a/src/librecatastro/domain/address.py b/src/librecatastro/domain/address.py index 26e41ba..c7d5b58 100644 --- a/src/librecatastro/domain/address.py +++ b/src/librecatastro/domain/address.py @@ -3,12 +3,16 @@ import re from src.settings import config +from src.utils.cadastro_logger import CadastroLogger + +logger = CadastroLogger(__name__).logger + class Address: def __init__(self, address): self.full_address = address - print("Full address: {}", self.full_address) - print("Separator: {}", config['separator']) + logger.info("Full address: {}".format(self.full_address)) + logger.info("Separator: {}".format(config['separator'])) self.first_line = None self.second_line = None self.street = None @@ -17,10 +21,24 @@ class Address: self.province_parentheses = None self.province = None + self.doorway = None + self.floor = None + self.door = None + + self.site = None + self.lot = None + self.first_line = self.get_first_line() self.second_line = self.get_second_line() self.street = self.get_street() + self.doorway = self.get_doorway() + self.floor = self.get_floor() + self.door = self.get_door() + + self.site = self.get_site() + self.lot = self.get_lot() + self.cp = self.get_cp() self.province_parentheses, self.province = self.get_province() self.city = self.get_city() @@ -45,6 +63,66 @@ class Address: def get_street(self): return self.get_first_line() + def get_doorway(self): + if self.doorway is not None: + return self.doorway + + doorway_text = None + doorway = re.search(r'Es:([-a-zA-Z0-9]+)', self.get_first_line()) + + if doorway: + doorway_text = doorway.group(1) + + return doorway_text + + def get_door(self): + if self.door is not None: + return self.door + + door_text = None + door = re.search(r'Pt:([-a-zA-Z0-9]+)', self.get_first_line()) + + if door: + door_text = door.group(1) + + return door_text + + def get_floor(self): + if self.floor is not None: + return self.floor + + floor_text = None + floor = re.search(r'Pl:([-a-zA-Z0-9]+)', self.get_first_line()) + + if floor: + floor_text = floor.group(1) + + return floor_text + + def get_site(self): + if self.site is not None: + return self.site + + site_text = None + site = re.search(r'Polígono ([-a-zA-Z0-9]+)', self.get_first_line()) + + if site: + site_text = site.group(1) + + return site_text + + def get_lot(self): + if self.lot is not None: + return self.lot + + lot_text = None + lot = re.search(r'Parcela ([-a-zA-Z0-9]+)', self.get_first_line()) + + if lot: + lot_text = lot.group(1) + + return lot_text + def get_cp(self): if self.cp is not None: return self.cp diff --git a/src/librecatastro/domain/cadaster.py b/src/librecatastro/domain/cadaster.py deleted file mode 100644 index 8267f55..0000000 --- a/src/librecatastro/domain/cadaster.py +++ /dev/null @@ -1,37 +0,0 @@ -import json - -from datetime import datetime -from elasticsearch import Elasticsearch - -from src.librecatastro.domain.address import Address -from src.librecatastro.domain.location import Location -from src.settings import config - - -class Cadaster: - def __init__(self, dict): - self.address = Address(dict[u'Localización']) - self.cadaster = dict[u'Referencia catastral'] - self.type = dict[u'Clase'] if u'Clase' in dict else None - self.use = dict[u'Uso principal'] if u'Uso principal' in dict else None - self.surface = dict[u'Superficie construida'] if u'Superficie construida' in dict else None - self.year = dict[u'Año construcción'] if u'Año construcción' in dict else None - self.location = Location(dict[u'Longitud'], dict[u'Latitud']) if u'Longitud' in dict and u'Latitud' in dict else None - self.timestamp = str(datetime.now()) - - def to_json(self): - return json.dumps(self, default=lambda o: o.__dict__, - sort_keys=True, indent=4) - - def to_elasticsearch(self): - es = Elasticsearch() - res = es.index(index=config['elasticsearch-index'], doc_type='cadaster_doc', id=self.cadaster, body=self.to_json()) - print(res) - return res - - def from_elasticsearch(self): - es = Elasticsearch() - query = '{"query":{"bool":{"must":[{"match":{"cadaster":"' + self.cadaster + '"}}],"must_not":[],"should":[]}},"from":0,"size":10,"sort":[],"aggs":{}}' - res = es.search(index=config['elasticsearch-index'], body=query) - print(res) - return res diff --git a/src/librecatastro/domain/cadaster_entry.py b/src/librecatastro/domain/cadaster_entry.py new file mode 100644 index 0000000..2ee3d2e --- /dev/null +++ b/src/librecatastro/domain/cadaster_entry.py @@ -0,0 +1,42 @@ +import json + +from datetime import datetime +from elasticsearch import Elasticsearch + +from src.librecatastro.domain.address import Address +from src.librecatastro.domain.location import Location +from src.settings import config +from src.utils.cadastro_logger import CadastroLogger + +logger = CadastroLogger(__name__).logger + + +class CadasterEntry: + """Cadaster class, that stores all the information about a surface and its properties""" + + def __init__(self, description_data): + self.address = Address(description_data[u'Localización']) + self.cadaster = description_data[u'Referencia catastral'] + self.type = description_data[u'Clase'] if u'Clase' in description_data else None + self.use = description_data[u'Uso principal'] if u'Uso principal' in description_data else None + self.surface = description_data[u'Superficie construida'] if u'Superficie construida' in description_data else None + self.year = description_data[u'Año construcción'] if u'Año construcción' in description_data else None + self.location = Location(description_data[u'Longitud'], description_data[u'Latitud']) if u'Longitud' in description_data and u'Latitud' in description_data else None + self.timestamp = str(datetime.now()) + + def to_json(self): + return json.dumps(self, default=lambda o: o.__dict__, + sort_keys=True, indent=4) + + def to_elasticsearch(self): + es = Elasticsearch() + res = es.index(index=config['elasticsearch-index'], doc_type='cadaster_doc', id=self.cadaster, body=self.to_json()) + logger.info(res) + return res + + def from_elasticsearch(self): + es = Elasticsearch() + query = '{"query":{"bool":{"must":[{"match":{"cadaster":"' + self.cadaster + '"}}],"must_not":[],"should":[]}},"from":0,"size":10,"sort":[],"aggs":{}}' + res = es.search(index=config['elasticsearch-index'], body=query) + logger.info(res) + return res diff --git a/src/logger.cfg b/src/logger.cfg new file mode 100644 index 0000000..3068694 --- /dev/null +++ b/src/logger.cfg @@ -0,0 +1,33 @@ +[loggers] +keys=root,sampleLogger + +[handlers] +keys=consoleHandler,fileHandler + +[formatters] +keys=sampleFormatter + +[logger_root] +level=INFO +handlers=consoleHandler,fileHandler + +[logger_sampleLogger] +level=INFO +handlers=consoleHandler +qualname=sampleLogger +propagate=0 + +[handler_consoleHandler] +class=StreamHandler +level=INFO +formatter=sampleFormatter +args=(sys.stdout,) + +[handler_fileHandler] +class=FileHandler +level=INFO +formatter=sampleFormatter +args=('%(logfilename)s', 'a', 'utf-8', False) + +[formatter_sampleFormatter] +format=%(asctime)s - %(name)s - %(levelname)s - %(message)s \ No newline at end of file diff --git a/src/logs/__init__.py b/src/logs/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/settings.py b/src/settings.py index bf8d785..0e75395 100644 --- a/src/settings.py +++ b/src/settings.py @@ -1,4 +1,10 @@ +import os + +root_path = os.path.dirname(os.path.abspath(__file__)) + config = { "separator": "####", - "elasticsearch-index": "cadaster" + "elasticsearch-index": "cadaster", + "log_config": os.path.join(root_path, 'logger.cfg'), + "log": os.path.join(root_path, 'logs', 'log') } \ No newline at end of file diff --git a/src/templates/ontology.owl b/src/templates/ontology.owl index b20b66f..f2c5dd0 100644 --- a/src/templates/ontology.owl +++ b/src/templates/ontology.owl @@ -24,9 +24,9 @@ Thing - + - Cadaster + CadasterEntry diff --git a/src/tests/scrapper_tests.py b/src/tests/scrapper_tests.py index 2ffb865..f9c8d70 100644 --- a/src/tests/scrapper_tests.py +++ b/src/tests/scrapper_tests.py @@ -2,7 +2,6 @@ import unittest from src.librecatastro.catastro_scrapper import CadastroScrapper from src.utils.elasticsearch_utils import ElasticSearchUtils -from src.utils.ontology_converter import OntologyConverter class MyTestCase(unittest.TestCase): @@ -19,11 +18,20 @@ class MyTestCase(unittest.TestCase): cadaster = CadastroScrapper.scrap_coord(-3.68, 40.47) self.assertEqual(cadaster.cadaster, '2302909VK4820A0001GK') + def test_coordinate_multiparcela_creates_cadaster_2(self): + cadaster = CadastroScrapper.scrap_coord(-0.33, 39.47) + self.assertTrue(len(cadaster) > 0) + def test_coordinate_creates_cadaster_and_stores_in_elasticsearch(self): cadaster = CadastroScrapper.scrap_coord(-3.68, 40.47) cadaster.to_elasticsearch() self.assertIsNotNone(cadaster.from_elasticsearch()) + def test_cadaster_site_lot_creates_cadaster_and_sets_site_lot(self): + cadaster = CadastroScrapper.scrap_cadaster('45134A02500003') + self.assertEqual(cadaster.address.site, '25') + self.assertEqual(cadaster.address.lot, '3') + def test_cadaster_full_creates_cadaster(self): cadaster = CadastroScrapper.scrap_cadaster('0083101WK2008S0001PD') self.assertEqual(cadaster.address.city, 'ALMONACID DEL MARQUESADO') @@ -53,30 +61,37 @@ class MyTestCase(unittest.TestCase): self.assertIsNotNone(cadaster.from_elasticsearch()) def scrap_random_until_x_times_found(self, times): - cadaster_list = CadastroScrapper.scrap_results_random(times) + cadaster_list = CadastroScrapper.scrap_results_random_x_times(times) self.assertEqual(len(cadaster_list), times) return cadaster_list def test_scrap_random_until_5_found(self): self.scrap_random_until_x_times_found(5) - def test_scrap_random_until_5_is_stores_in_elasticsearch(self): + def test_scrap_random_until_5_is_stored_in_elasticsearch(self): cadaster_list = self.scrap_random_until_x_times_found(5) for cadaster in cadaster_list: cadaster.to_elasticsearch() self.assertIsNotNone(cadaster.from_elasticsearch()) - def test_scrap_random_until_1_is_stores_in_elasticsearch(self): + def test_scrap_random_until_100_is_stored_in_elasticsearch(self): + cadaster_list = self.scrap_random_until_x_times_found(100) + for cadaster in cadaster_list: + cadaster.to_elasticsearch() + self.assertIsNotNone(cadaster.from_elasticsearch()) + + def test_scrap_random_until_1_is_stored_in_elasticsearch(self): cadaster_list = self.scrap_random_until_x_times_found(1) for cadaster in cadaster_list: cadaster.to_elasticsearch() self.assertIsNotNone(cadaster.from_elasticsearch()) - def test_create_ontology_with_one_scrap_result(self): + """def test_create_ontology_with_one_scrap_result(self): ontology_converter = OntologyConverter() results = list() results.append(CadastroScrapper.scrap_coord(-3.68, 40.47)) print(ontology_converter.cadastro_dict_to_ontology(results)) + """ if __name__ == '__main__': diff --git a/src/utils/cadastro_logger.py b/src/utils/cadastro_logger.py new file mode 100644 index 0000000..0cada0a --- /dev/null +++ b/src/utils/cadastro_logger.py @@ -0,0 +1,21 @@ +import logging +import logging.config +from logging.handlers import RotatingFileHandler + +from src.settings import config + + +class CadastroLogger: + """Custom logger for keeping track of the Catastro Scrapping""" + + def __init__(self, class_name): + logging.config.fileConfig(fname=config['log_config'], defaults={'logfilename': config['log']}, disable_existing_loggers=False) + + self.logger = logging.getLogger(class_name) + + my_handler = RotatingFileHandler(config['log'], mode='a', maxBytes=5 * 1024 * 1024, + backupCount=100, encoding='utf-8', delay=0) + + self.logger.addHandler(my_handler) + pass + diff --git a/src/utils/elasticsearch_utils.py b/src/utils/elasticsearch_utils.py index 9f0aa32..83551b2 100644 --- a/src/utils/elasticsearch_utils.py +++ b/src/utils/elasticsearch_utils.py @@ -1,7 +1,13 @@ from elasticsearch import Elasticsearch +from src.utils.cadastro_logger import CadastroLogger + +logger = CadastroLogger(__name__).logger + class ElasticSearchUtils: + """Custom class for managing Elastic Search queries""" + def __init__(self): pass @@ -25,11 +31,13 @@ class ElasticSearchUtils: } } } - print("Creating 'cadaster' index...") - es.indices.create(index='cadaster', body=request_body) + logger.info("Creating 'cadaster' index...") + res = es.indices.create(index='cadaster', body=request_body) + logger.info(res) @staticmethod def remove_index(): es = Elasticsearch() + logger.info("Deleting 'cadaster' index...") res = es.indices.delete(index='cadaster', ignore=[400, 404]) - print(res) \ No newline at end of file + logger.info(res) diff --git a/src/utils/list_utils.py b/src/utils/list_utils.py new file mode 100644 index 0000000..483fecb --- /dev/null +++ b/src/utils/list_utils.py @@ -0,0 +1,8 @@ +class ListUtils: + """ Different functions for make working with lists easier""" + def __init__(self): + pass + + @staticmethod + def flat(non_flat_list): + return [item for sublist in non_flat_list for item in sublist] diff --git a/src/utils/ontology_converter.py b/src/utils/ontology_converter.py index 7a7503f..95128d7 100644 --- a/src/utils/ontology_converter.py +++ b/src/utils/ontology_converter.py @@ -69,7 +69,7 @@ class OntologyConverter: individuals = ''.join([individuals, province_txt, city_txt, address_txt]) - print(individuals) + #print(individuals) return individuals