Adds logger, utils, tests, and new time-limited scraping; fixes multiparcela errors.
This commit is contained in:
parent
b7a778d55c
commit
103c87778b
|
@ -1,48 +1,90 @@
|
|||
import random
|
||||
import re
|
||||
import time
|
||||
|
||||
from time import sleep
|
||||
from urllib.request import urlopen
|
||||
from xml.etree import ElementTree
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
from src.librecatastro.domain.cadaster import Cadaster
|
||||
from src.librecatastro.domain.cadaster_entry import CadasterEntry
|
||||
from src.settings import config
|
||||
from src.utils.ontology_converter import OntologyConverter
|
||||
|
||||
'''Constants'''
|
||||
from src.utils.cadastro_logger import CadastroLogger
|
||||
from src.utils.list_utils import ListUtils
|
||||
|
||||
"""Constants"""
|
||||
|
||||
'''Spain geocoordinates'''
|
||||
LONGITUDE = (4289603, -18024300) # *1000000
|
||||
LATITUDE = (43769200, 27725500) # *1000000
|
||||
|
||||
'''Scale for scrapping'''
|
||||
SCALE = 1000000
|
||||
TRUNCATE_RIGHT = 4
|
||||
|
||||
'''Enumerator for tuple access'''
|
||||
MAX = 0
|
||||
MIN = 1
|
||||
|
||||
'''Catastro web services parametrized'''
|
||||
URL = "http://ovc.catastro.meh.es/ovcservweb/ovcswlocalizacionrc/ovccoordenadas.asmx/Consulta_RCCOOR?SRS=EPSG:4230&Coordenada_X={}&Coordenada_Y={}"
|
||||
URL_REF = "https://www1.sedecatastro.gob.es/CYCBienInmueble/OVCListaBienes.aspx?rc1={}&rc2={}"
|
||||
URL_REF_FULL = "https://www1.sedecatastro.gob.es/CYCBienInmueble/OVCConCiud.aspx?RefC={}&RCCompleta={}&del={}&mun={}"
|
||||
|
||||
'''Information to scrap from HTML'''
|
||||
field_names = (u'Referencia catastral', u'Localización', u'Clase', u'Uso principal', u'Superficie construida', u'Año construcción')
|
||||
|
||||
'''Logger'''
|
||||
logger = CadastroLogger(__name__).logger
|
||||
|
||||
|
||||
class CadastroScrapper:
|
||||
"""Scrapper class for Cadastro Web"""
|
||||
"""Scrapper class for Catastro HTML"""
|
||||
|
||||
def __init__(self):
|
||||
pass
|
||||
|
||||
""" Scrapping main calls """
|
||||
@staticmethod
def scrap_all():
    """Scrap every coordinate inside the Spain bounding box.

    Walks the LONGITUDE x LATITUDE grid one integer step at a time and
    queries the Catastro service for each point.

    Returns:
        A flat list with every cadaster entry found. Coordinates for which
        ``scrap_coord`` returns None are skipped.
    """
    results = []
    for lon in range(LONGITUDE[MIN], LONGITUDE[MAX]):
        for lat in range(LATITUDE[MIN], LATITUDE[MAX]):
            # The bounds are stored multiplied by SCALE (see the
            # "# *1000000" notes on the constants), and scrap_coord takes
            # (longitude, latitude) in that order. Each point is queried
            # exactly once.
            result = CadastroScrapper.scrap_coord(lon / SCALE, lat / SCALE)
            if result is not None:
                results.append(result)
    return ListUtils.flat(results)
|
||||
|
||||
@staticmethod
|
||||
def scrap_results_linear(times):
|
||||
def scrap_results_by_time(seconds):
|
||||
start_time = time.time()
|
||||
results = []
|
||||
|
||||
finished = False
|
||||
for j in range(LONGITUDE[MIN], LONGITUDE[MAX]):
|
||||
for i in range(LATITUDE[MIN], LATITUDE[MAX]):
|
||||
if finished:
|
||||
break
|
||||
result = CadastroScrapper.scrap_coord(i, j)
|
||||
if result is not None:
|
||||
results.append(result)
|
||||
now = time.time()
|
||||
elapsed_time = now - start_time
|
||||
if elapsed_time > seconds:
|
||||
finished = True
|
||||
break
|
||||
sleep(5)
|
||||
if finished:
|
||||
break
|
||||
return ListUtils.flat(results)
|
||||
|
||||
@staticmethod
|
||||
def scrap_results_linear_x_times(times):
|
||||
results = []
|
||||
counter = times
|
||||
|
||||
finished = False
|
||||
for x in range(LONGITUDE[MIN], LONGITUDE[MAX]):
|
||||
for y in range(LATITUDE[MIN], LATITUDE[MAX]):
|
||||
|
||||
|
@ -55,11 +97,17 @@ class CadastroScrapper:
|
|||
results.append(result)
|
||||
counter -= 1
|
||||
if counter == 0:
|
||||
return
|
||||
finished = True
|
||||
break
|
||||
sleep(5)
|
||||
|
||||
if finished:
|
||||
break
|
||||
|
||||
return ListUtils.flat(results)
|
||||
|
||||
@staticmethod
|
||||
def scrap_results_random(times):
|
||||
def scrap_results_random_x_times(times):
|
||||
results = []
|
||||
counter = times
|
||||
while counter > 0:
|
||||
|
@ -81,11 +129,12 @@ class CadastroScrapper:
|
|||
|
||||
#ontology_converter = OntologyConverter()
|
||||
#print(ontology_converter.cadastro_dict_to_ontology(results))
|
||||
print("====PROCESSING FINISHED====")
|
||||
print("Results found: {}".format(times))
|
||||
print(results)
|
||||
return results
|
||||
logger.info("====PROCESSING FINISHED====")
|
||||
logger.info("Results found: {}".format(times))
|
||||
logger.info(results)
|
||||
return ListUtils.flat(results)
|
||||
|
||||
""" Scrapping secondary calls """
|
||||
@staticmethod
|
||||
def parse_html_parcela(parsed_html, x=None, y=None):
|
||||
description = parsed_html.find(id='ctl00_Contenido_tblInmueble')
|
||||
|
@ -105,21 +154,16 @@ class CadastroScrapper:
|
|||
descriptive_data[field_name] = descriptive_data[field_name].split(' ')[0]
|
||||
descriptive_data[field_name] = descriptive_data[field_name].split('\xa0')[0]
|
||||
elif field_header.text == u'Localización':
|
||||
descriptive_data[field_name] = field_value.encode_contents().decode('utf-8').replace('<br/>',
|
||||
config[
|
||||
'separator']).replace(
|
||||
'<br>', config['separator'])
|
||||
descriptive_data[field_name] = field_value.encode_contents().decode('utf-8').replace('<br/>',config['separator']).replace('<br>', config['separator'])
|
||||
|
||||
cadaster_entry = Cadaster(descriptive_data)
|
||||
print(cadaster_entry.to_json())
|
||||
cadaster_entry = CadasterEntry(descriptive_data)
|
||||
logger.info(cadaster_entry.to_json())
|
||||
return cadaster_entry
|
||||
|
||||
@staticmethod
|
||||
def scrap_cadaster_full_code(full_cadaster, x=None, y=None):
|
||||
delimitacion = full_cadaster[0:2]
|
||||
municipio = full_cadaster[2:5]
|
||||
def scrap_cadaster_full_code(full_cadaster, delimitacion, municipio, x=None, y=None):
|
||||
url_ref = URL_REF_FULL.format(full_cadaster, full_cadaster, delimitacion, municipio)
|
||||
print("-->FULL URL for cadastral data: {}".format(url_ref))
|
||||
logger.info("-->FULL URL for cadastral data: {}".format(url_ref))
|
||||
f_ref = urlopen(url_ref)
|
||||
data_ref = f_ref.read()
|
||||
html = str(data_ref.decode('utf-8'))
|
||||
|
@ -131,34 +175,48 @@ class CadastroScrapper:
|
|||
rc_1 = cadaster[0:7]
|
||||
rc_2 = cadaster[7:14]
|
||||
url_ref = URL_REF.format(rc_1, rc_2)
|
||||
print("-->URL for cadastral data: {}".format(url_ref))
|
||||
logger.info("-->URL for cadastral data: {}".format(url_ref))
|
||||
f_ref = urlopen(url_ref)
|
||||
data_ref = f_ref.read()
|
||||
html = str(data_ref.decode('utf-8'))
|
||||
parsed_html = BeautifulSoup(html, features="html.parser")
|
||||
|
||||
delimitacion = ''
|
||||
delimitacion_search = re.search(r'del=([0-9]+)&', html)
|
||||
if delimitacion_search:
|
||||
delimitacion = delimitacion_search.group(1)
|
||||
|
||||
municipio = ''
|
||||
municipio_search = re.search(r'mun=([0-9]+)&', html)
|
||||
if municipio_search:
|
||||
municipio = municipio_search.group(1)
|
||||
|
||||
description = parsed_html.find(id='ctl00_Contenido_tblInmueble')
|
||||
|
||||
cadasters = []
|
||||
if description is None:
|
||||
print("Multiparcela found!")
|
||||
logger.info("Multiparcela found!")
|
||||
''' Multiparcela with multiple cadasters '''
|
||||
cadasters = []
|
||||
|
||||
all_cadasters = parsed_html.findAll("div", {"id": re.compile('heading[0-9]+')})
|
||||
print("->Parcelas found: {}".format(len(all_cadasters)))
|
||||
logger.info("->Parcelas found: {}".format(len(all_cadasters)))
|
||||
for partial_cadaster in all_cadasters:
|
||||
partial_cadaster_ref = partial_cadaster.find("b")
|
||||
print("-->Partial cadaster: {}".format(partial_cadaster_ref.text))
|
||||
logger.info("-->Partial cadaster: {}".format(partial_cadaster_ref.text))
|
||||
partial_cadaster_text = partial_cadaster_ref.text.strip()
|
||||
cadaster = CadastroScrapper.scrap_cadaster_full_code(partial_cadaster_text, x, y)
|
||||
cadaster = CadastroScrapper.scrap_cadaster_full_code(partial_cadaster_text, delimitacion, municipio, x, y)
|
||||
cadasters.append(cadaster)
|
||||
return cadasters
|
||||
else:
|
||||
return CadastroScrapper.parse_html_parcela(parsed_html, x, y)
|
||||
cadaster = CadastroScrapper.parse_html_parcela(parsed_html, x, y)
|
||||
cadasters.append(cadaster)
|
||||
|
||||
return cadasters
|
||||
|
||||
@staticmethod
|
||||
def scrap_coord(x, y):
|
||||
print("====Longitude: {} Latitude: {}====".format(x, y))
|
||||
logger.info("====Longitude: {} Latitude: {}====".format(x, y))
|
||||
url = URL.format(x, y)
|
||||
print("-->URL for coordinates: {}".format(url))
|
||||
logger.info("-->URL for coordinates: {}".format(url))
|
||||
f = urlopen(url)
|
||||
data = f.read()
|
||||
root = ElementTree.fromstring(data)
|
||||
|
@ -167,7 +225,7 @@ class CadastroScrapper:
|
|||
if pc1 is None or pc2 is None:
|
||||
return None
|
||||
else:
|
||||
print("-->FOUND!")
|
||||
logger.info("-->FOUND!")
|
||||
cadaster = ''.join([pc1.text,pc2.text])
|
||||
|
||||
return CadastroScrapper.scrap_cadaster(cadaster, x, y)
|
||||
|
|
|
@ -3,12 +3,16 @@ import re
|
|||
|
||||
from src.settings import config
|
||||
|
||||
from src.utils.cadastro_logger import CadastroLogger
|
||||
|
||||
logger = CadastroLogger(__name__).logger
|
||||
|
||||
|
||||
class Address:
|
||||
def __init__(self, address):
|
||||
self.full_address = address
|
||||
print("Full address: {}", self.full_address)
|
||||
print("Separator: {}", config['separator'])
|
||||
logger.info("Full address: {}".format(self.full_address))
|
||||
logger.info("Separator: {}".format(config['separator']))
|
||||
self.first_line = None
|
||||
self.second_line = None
|
||||
self.street = None
|
||||
|
@ -17,10 +21,24 @@ class Address:
|
|||
self.province_parentheses = None
|
||||
self.province = None
|
||||
|
||||
self.doorway = None
|
||||
self.floor = None
|
||||
self.door = None
|
||||
|
||||
self.site = None
|
||||
self.lot = None
|
||||
|
||||
self.first_line = self.get_first_line()
|
||||
self.second_line = self.get_second_line()
|
||||
|
||||
self.street = self.get_street()
|
||||
self.doorway = self.get_doorway()
|
||||
self.floor = self.get_floor()
|
||||
self.door = self.get_door()
|
||||
|
||||
self.site = self.get_site()
|
||||
self.lot = self.get_lot()
|
||||
|
||||
self.cp = self.get_cp()
|
||||
self.province_parentheses, self.province = self.get_province()
|
||||
self.city = self.get_city()
|
||||
|
@ -45,6 +63,66 @@ class Address:
|
|||
def get_street(self):
|
||||
return self.get_first_line()
|
||||
|
||||
def get_doorway(self):
    """Return the doorway code that follows 'Es:' in the first address line.

    A value already stored in ``self.doorway`` is returned as is. When the
    first line carries no 'Es:' token the result is None.
    """
    cached = self.doorway
    if cached is not None:
        return cached

    match = re.search(r'Es:([-a-zA-Z0-9]+)', self.get_first_line())
    return match.group(1) if match else None
|
||||
|
||||
def get_door(self):
    """Return the door code that follows 'Pt:' in the first address line.

    A value already stored in ``self.door`` is returned as is. When the
    first line carries no 'Pt:' token the result is None.
    """
    cached = self.door
    if cached is not None:
        return cached

    match = re.search(r'Pt:([-a-zA-Z0-9]+)', self.get_first_line())
    return match.group(1) if match else None
|
||||
|
||||
def get_floor(self):
    """Return the floor code that follows 'Pl:' in the first address line.

    A value already stored in ``self.floor`` is returned as is. When the
    first line carries no 'Pl:' token the result is None.
    """
    cached = self.floor
    if cached is not None:
        return cached

    match = re.search(r'Pl:([-a-zA-Z0-9]+)', self.get_first_line())
    return match.group(1) if match else None
|
||||
|
||||
def get_site(self):
    """Return the site code that follows 'Polígono ' in the first address line.

    A value already stored in ``self.site`` is returned as is. When the
    first line carries no 'Polígono ' token the result is None.
    """
    cached = self.site
    if cached is not None:
        return cached

    match = re.search(r'Polígono ([-a-zA-Z0-9]+)', self.get_first_line())
    return match.group(1) if match else None
|
||||
|
||||
def get_lot(self):
    """Return the lot code that follows 'Parcela ' in the first address line.

    A value already stored in ``self.lot`` is returned as is. When the
    first line carries no 'Parcela ' token the result is None.
    """
    cached = self.lot
    if cached is not None:
        return cached

    match = re.search(r'Parcela ([-a-zA-Z0-9]+)', self.get_first_line())
    return match.group(1) if match else None
|
||||
|
||||
def get_cp(self):
|
||||
if self.cp is not None:
|
||||
return self.cp
|
||||
|
|
|
@ -1,37 +0,0 @@
|
|||
import json
|
||||
|
||||
from datetime import datetime
|
||||
from elasticsearch import Elasticsearch
|
||||
|
||||
from src.librecatastro.domain.address import Address
|
||||
from src.librecatastro.domain.location import Location
|
||||
from src.settings import config
|
||||
|
||||
|
||||
class Cadaster:
|
||||
def __init__(self, dict):
|
||||
self.address = Address(dict[u'Localización'])
|
||||
self.cadaster = dict[u'Referencia catastral']
|
||||
self.type = dict[u'Clase'] if u'Clase' in dict else None
|
||||
self.use = dict[u'Uso principal'] if u'Uso principal' in dict else None
|
||||
self.surface = dict[u'Superficie construida'] if u'Superficie construida' in dict else None
|
||||
self.year = dict[u'Año construcción'] if u'Año construcción' in dict else None
|
||||
self.location = Location(dict[u'Longitud'], dict[u'Latitud']) if u'Longitud' in dict and u'Latitud' in dict else None
|
||||
self.timestamp = str(datetime.now())
|
||||
|
||||
def to_json(self):
|
||||
return json.dumps(self, default=lambda o: o.__dict__,
|
||||
sort_keys=True, indent=4)
|
||||
|
||||
def to_elasticsearch(self):
|
||||
es = Elasticsearch()
|
||||
res = es.index(index=config['elasticsearch-index'], doc_type='cadaster_doc', id=self.cadaster, body=self.to_json())
|
||||
print(res)
|
||||
return res
|
||||
|
||||
def from_elasticsearch(self):
|
||||
es = Elasticsearch()
|
||||
query = '{"query":{"bool":{"must":[{"match":{"cadaster":"' + self.cadaster + '"}}],"must_not":[],"should":[]}},"from":0,"size":10,"sort":[],"aggs":{}}'
|
||||
res = es.search(index=config['elasticsearch-index'], body=query)
|
||||
print(res)
|
||||
return res
|
|
@ -0,0 +1,42 @@
|
|||
import json
|
||||
|
||||
from datetime import datetime
|
||||
from elasticsearch import Elasticsearch
|
||||
|
||||
from src.librecatastro.domain.address import Address
|
||||
from src.librecatastro.domain.location import Location
|
||||
from src.settings import config
|
||||
from src.utils.cadastro_logger import CadastroLogger
|
||||
|
||||
logger = CadastroLogger(__name__).logger
|
||||
|
||||
|
||||
class CadasterEntry:
    """Cadaster entry that stores all the information about a surface and its properties."""

    def __init__(self, description_data):
        """Build the entry from the dictionary of scraped fields.

        Args:
            description_data: mapping of scraped field names to values.
                'Localización' and 'Referencia catastral' are required
                (a KeyError is raised if either is missing); every other
                field is optional and defaults to None.
        """
        self.address = Address(description_data[u'Localización'])
        self.cadaster = description_data[u'Referencia catastral']
        self.type = description_data.get(u'Clase')
        self.use = description_data.get(u'Uso principal')
        self.surface = description_data.get(u'Superficie construida')
        self.year = description_data.get(u'Año construcción')

        # A location is only built when both coordinates were scraped.
        has_coordinates = u'Longitud' in description_data and u'Latitud' in description_data
        self.location = Location(description_data[u'Longitud'], description_data[u'Latitud']) if has_coordinates else None

        self.timestamp = str(datetime.now())

    def to_json(self):
        """Serialize the entry, and any nested object via its ``__dict__``, to indented JSON."""
        return json.dumps(self, default=lambda o: o.__dict__,
                          sort_keys=True, indent=4)

    def to_elasticsearch(self):
        """Index this entry in Elasticsearch under its cadaster reference and return the response."""
        es = Elasticsearch()
        res = es.index(index=config['elasticsearch-index'], doc_type='cadaster_doc', id=self.cadaster, body=self.to_json())
        logger.info(res)
        return res

    def from_elasticsearch(self):
        """Search Elasticsearch for entries matching this cadaster reference and return the response."""
        es = Elasticsearch()
        res = es.search(index=config['elasticsearch-index'], body=self._search_query())
        logger.info(res)
        return res

    def _search_query(self):
        """Return the JSON body of the match query for this cadaster reference."""
        # json.dumps escapes quotes and backslashes in the reference, which
        # plain string concatenation would leave as invalid JSON.
        query = {
            "query": {
                "bool": {
                    "must": [{"match": {"cadaster": self.cadaster}}],
                    "must_not": [],
                    "should": [],
                }
            },
            "from": 0,
            "size": 10,
            "sort": [],
            "aggs": {},
        }
        return json.dumps(query, separators=(',', ':'))
|
|
@ -0,0 +1,33 @@
|
|||
[loggers]
|
||||
keys=root,sampleLogger
|
||||
|
||||
[handlers]
|
||||
keys=consoleHandler,fileHandler
|
||||
|
||||
[formatters]
|
||||
keys=sampleFormatter
|
||||
|
||||
[logger_root]
|
||||
level=INFO
|
||||
handlers=consoleHandler,fileHandler
|
||||
|
||||
[logger_sampleLogger]
|
||||
level=INFO
|
||||
handlers=consoleHandler
|
||||
qualname=sampleLogger
|
||||
propagate=0
|
||||
|
||||
[handler_consoleHandler]
|
||||
class=StreamHandler
|
||||
level=INFO
|
||||
formatter=sampleFormatter
|
||||
args=(sys.stdout,)
|
||||
|
||||
[handler_fileHandler]
|
||||
class=FileHandler
|
||||
level=INFO
|
||||
formatter=sampleFormatter
|
||||
args=('%(logfilename)s', 'a', 'utf-8', False)
|
||||
|
||||
[formatter_sampleFormatter]
|
||||
format=%(asctime)s - %(name)s - %(levelname)s - %(message)s
|
|
@ -1,4 +1,10 @@
|
|||
import os
|
||||
|
||||
root_path = os.path.dirname(os.path.abspath(__file__))
|
||||
|
||||
config = {
|
||||
"separator": "####",
|
||||
"elasticsearch-index": "cadaster"
|
||||
"elasticsearch-index": "cadaster",
|
||||
"log_config": os.path.join(root_path, 'logger.cfg'),
|
||||
"log": os.path.join(root_path, 'logs', 'log')
|
||||
}
|
|
@ -24,9 +24,9 @@
|
|||
<rdfs:label>Thing</rdfs:label>
|
||||
</owl:Class>
|
||||
|
||||
<owl:Class rdf:about="http://semantic-datahub.taiger.io/ontologies/Cadaster">
|
||||
<owl:Class rdf:about="http://semantic-datahub.taiger.io/ontologies/CadasterEntry">
|
||||
<rdfs:subClassOf rdf:resource="http://semantic-datahub.taiger.io/ontologies/Thing"/>
|
||||
<rdfs:label>Cadaster</rdfs:label>
|
||||
<rdfs:label>CadasterEntry</rdfs:label>
|
||||
</owl:Class>
|
||||
|
||||
<owl:Class rdf:about="http://semantic-datahub.taiger.io/ontologies/Address">
|
||||
|
|
|
@ -2,7 +2,6 @@ import unittest
|
|||
|
||||
from src.librecatastro.catastro_scrapper import CadastroScrapper
|
||||
from src.utils.elasticsearch_utils import ElasticSearchUtils
|
||||
from src.utils.ontology_converter import OntologyConverter
|
||||
|
||||
|
||||
class MyTestCase(unittest.TestCase):
|
||||
|
@ -19,11 +18,20 @@ class MyTestCase(unittest.TestCase):
|
|||
cadaster = CadastroScrapper.scrap_coord(-3.68, 40.47)
|
||||
self.assertEqual(cadaster.cadaster, '2302909VK4820A0001GK')
|
||||
|
||||
def test_coordinate_multiparcela_creates_cadaster_2(self):
|
||||
cadaster = CadastroScrapper.scrap_coord(-0.33, 39.47)
|
||||
self.assertTrue(len(cadaster) > 0)
|
||||
|
||||
def test_coordinate_creates_cadaster_and_stores_in_elasticsearch(self):
|
||||
cadaster = CadastroScrapper.scrap_coord(-3.68, 40.47)
|
||||
cadaster.to_elasticsearch()
|
||||
self.assertIsNotNone(cadaster.from_elasticsearch())
|
||||
|
||||
def test_cadaster_site_lot_creates_cadaster_and_sets_site_lot(self):
|
||||
cadaster = CadastroScrapper.scrap_cadaster('45134A02500003')
|
||||
self.assertEqual(cadaster.address.site, '25')
|
||||
self.assertEqual(cadaster.address.lot, '3')
|
||||
|
||||
def test_cadaster_full_creates_cadaster(self):
|
||||
cadaster = CadastroScrapper.scrap_cadaster('0083101WK2008S0001PD')
|
||||
self.assertEqual(cadaster.address.city, 'ALMONACID DEL MARQUESADO')
|
||||
|
@ -53,30 +61,37 @@ class MyTestCase(unittest.TestCase):
|
|||
self.assertIsNotNone(cadaster.from_elasticsearch())
|
||||
|
||||
def scrap_random_until_x_times_found(self, times):
|
||||
cadaster_list = CadastroScrapper.scrap_results_random(times)
|
||||
cadaster_list = CadastroScrapper.scrap_results_random_x_times(times)
|
||||
self.assertEqual(len(cadaster_list), times)
|
||||
return cadaster_list
|
||||
|
||||
def test_scrap_random_until_5_found(self):
|
||||
self.scrap_random_until_x_times_found(5)
|
||||
|
||||
def test_scrap_random_until_5_is_stores_in_elasticsearch(self):
|
||||
def test_scrap_random_until_5_is_stored_in_elasticsearch(self):
|
||||
cadaster_list = self.scrap_random_until_x_times_found(5)
|
||||
for cadaster in cadaster_list:
|
||||
cadaster.to_elasticsearch()
|
||||
self.assertIsNotNone(cadaster.from_elasticsearch())
|
||||
|
||||
def test_scrap_random_until_1_is_stores_in_elasticsearch(self):
|
||||
def test_scrap_random_until_100_is_stored_in_elasticsearch(self):
|
||||
cadaster_list = self.scrap_random_until_x_times_found(100)
|
||||
for cadaster in cadaster_list:
|
||||
cadaster.to_elasticsearch()
|
||||
self.assertIsNotNone(cadaster.from_elasticsearch())
|
||||
|
||||
def test_scrap_random_until_1_is_stored_in_elasticsearch(self):
|
||||
cadaster_list = self.scrap_random_until_x_times_found(1)
|
||||
for cadaster in cadaster_list:
|
||||
cadaster.to_elasticsearch()
|
||||
self.assertIsNotNone(cadaster.from_elasticsearch())
|
||||
|
||||
def test_create_ontology_with_one_scrap_result(self):
|
||||
"""def test_create_ontology_with_one_scrap_result(self):
|
||||
ontology_converter = OntologyConverter()
|
||||
results = list()
|
||||
results.append(CadastroScrapper.scrap_coord(-3.68, 40.47))
|
||||
print(ontology_converter.cadastro_dict_to_ontology(results))
|
||||
"""
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
|
|
|
@ -0,0 +1,21 @@
|
|||
import logging
|
||||
import logging.config
|
||||
from logging.handlers import RotatingFileHandler
|
||||
|
||||
from src.settings import config
|
||||
|
||||
|
||||
class CadastroLogger:
    """Custom logger for keeping track of the Catastro scraping."""

    # Set once the logging configuration file has been applied.
    # logging.config.fileConfig shuts down every existing handler, so it
    # must not run again for each module that asks for a logger.
    _configured = False

    def __init__(self, class_name):
        """Configure logging on first use and expose ``self.logger``.

        Args:
            class_name: name of the logger to obtain, typically the
                ``__name__`` of the calling module.
        """
        import os

        # The file handlers fail to open the log when its directory is missing.
        log_dir = os.path.dirname(config['log'])
        if log_dir:
            os.makedirs(log_dir, exist_ok=True)

        if not CadastroLogger._configured:
            logging.config.fileConfig(fname=config['log_config'], defaults={'logfilename': config['log']}, disable_existing_loggers=False)
            CadastroLogger._configured = True

        self.logger = logging.getLogger(class_name)

        # logging.getLogger returns the same object for the same name, so
        # only attach the rotating handler once to avoid duplicated lines.
        # NOTE(review): if the root logger is also configured with a file
        # handler on this same path, each record still reaches the file
        # twice - confirm whether both handlers are wanted.
        if not any(isinstance(handler, RotatingFileHandler) for handler in self.logger.handlers):
            rotating_handler = RotatingFileHandler(config['log'], mode='a', maxBytes=5 * 1024 * 1024,
                                                   backupCount=100, encoding='utf-8', delay=0)
            self.logger.addHandler(rotating_handler)
|
||||
|
|
@ -1,7 +1,13 @@
|
|||
from elasticsearch import Elasticsearch
|
||||
|
||||
from src.utils.cadastro_logger import CadastroLogger
|
||||
|
||||
logger = CadastroLogger(__name__).logger
|
||||
|
||||
|
||||
class ElasticSearchUtils:
|
||||
"""Custom class for managing Elastic Search queries"""
|
||||
|
||||
def __init__(self):
|
||||
pass
|
||||
|
||||
|
@ -25,11 +31,13 @@ class ElasticSearchUtils:
|
|||
}
|
||||
}
|
||||
}
|
||||
print("Creating 'cadaster' index...")
|
||||
es.indices.create(index='cadaster', body=request_body)
|
||||
logger.info("Creating 'cadaster' index...")
|
||||
res = es.indices.create(index='cadaster', body=request_body)
|
||||
logger.info(res)
|
||||
|
||||
@staticmethod
|
||||
def remove_index():
|
||||
es = Elasticsearch()
|
||||
logger.info("Deleting 'cadaster' index...")
|
||||
res = es.indices.delete(index='cadaster', ignore=[400, 404])
|
||||
print(res)
|
||||
logger.info(res)
|
||||
|
|
|
@ -0,0 +1,8 @@
|
|||
class ListUtils:
    """Helper functions that make working with lists easier."""

    def __init__(self):
        pass

    @staticmethod
    def flat(non_flat_list):
        """Flatten one level of nesting: return the items of every sublist, in order."""
        flattened = []
        for sublist in non_flat_list:
            flattened.extend(sublist)
        return flattened
|
|
@ -69,7 +69,7 @@ class OntologyConverter:
|
|||
|
||||
individuals = ''.join([individuals, province_txt, city_txt, address_txt])
|
||||
|
||||
print(individuals)
|
||||
#print(individuals)
|
||||
return individuals
|
||||
|
||||
|
||||
|
|
Loading…
Reference in New Issue