Adds a JSONEncoder class to recursively handle JSON serialization for Elasticsearch (ES). Changes the scraping functions to return lists (because of multiparcelas, i.e. queries that match more than one parcel).

This commit is contained in:
J 2019-09-16 12:41:52 +02:00
parent 103c87778b
commit 7aa3e17912
6 changed files with 64 additions and 27 deletions

View File

@ -141,6 +141,7 @@ class CadastroScrapper:
descriptive_data = dict()
descriptive_data[u'Longitud'] = x
descriptive_data[u'Latitud'] = y
''' Parcela '''
fields = description.find_all('div')
for field in fields:
@ -157,7 +158,6 @@ class CadastroScrapper:
descriptive_data[field_name] = field_value.encode_contents().decode('utf-8').replace('<br/>',config['separator']).replace('<br>', config['separator'])
cadaster_entry = CadasterEntry(descriptive_data)
logger.info(cadaster_entry.to_json())
return cadaster_entry
@staticmethod

View File

@ -1,4 +1,3 @@
import json
import re
from src.settings import config
@ -9,10 +8,11 @@ logger = CadastroLogger(__name__).logger
class Address:
""" Domain class for storing Address in Catastro format"""
def __init__(self, address):
self.full_address = address
logger.info("Full address: {}".format(self.full_address))
logger.info("Separator: {}".format(config['separator']))
''' Initialization in case some data is not present'''
self.first_line = None
self.second_line = None
self.street = None
@ -28,6 +28,7 @@ class Address:
self.site = None
self.lot = None
''' NLP search '''
self.first_line = self.get_first_line()
self.second_line = self.get_second_line()
@ -159,5 +160,4 @@ class Address:
return province_parentheses_text, province_text
def to_json(self):
return json.dumps(self, default=lambda o: o.__dict__,
sort_keys=True, indent=4)
return dict(full_address=self.full_address, first_line=self.first_line, second_line=self.second_line, street=self.street, cp=self.cp, city=self.city, province_parantheses=self.province_parentheses, province=self.province, doorway=self.doorway, floor=self.floor, door=self.door, site=self.site, lot=self.lot)

View File

@ -7,6 +7,7 @@ from src.librecatastro.domain.address import Address
from src.librecatastro.domain.location import Location
from src.settings import config
from src.utils.cadastro_logger import CadastroLogger
from src.utils.json_enconder import JSONEncoder
logger = CadastroLogger(__name__).logger
@ -21,16 +22,18 @@ class CadasterEntry:
self.use = description_data[u'Uso principal'] if u'Uso principal' in description_data else None
self.surface = description_data[u'Superficie construida'] if u'Superficie construida' in description_data else None
self.year = description_data[u'Año construcción'] if u'Año construcción' in description_data else None
self.location = Location(description_data[u'Longitud'], description_data[u'Latitud']) if u'Longitud' in description_data and u'Latitud' in description_data else None
self.location = Location(description_data[u'Longitud'], description_data[u'Latitud'])
self.timestamp = str(datetime.now())
def to_json(self):
return json.dumps(self, default=lambda o: o.__dict__,
sort_keys=True, indent=4)
return dict(address=self.address, cadaster=self.cadaster, type=self.type, use=self.use, surface=self.surface, year=self.year, location=self.location, timestamp=self.timestamp)
def to_elasticsearch(self):
es = Elasticsearch()
res = es.index(index=config['elasticsearch-index'], doc_type='cadaster_doc', id=self.cadaster, body=self.to_json())
body = json.dumps(self.to_json(), cls=JSONEncoder,sort_keys=True,
indent=4, separators=(',', ': '))
logger.info("Sending to Elastic Search\n:{}".format(body))
res = es.index(index=config['elasticsearch-index'], doc_type='cadaster_doc', id=self.cadaster, body=body)
logger.info(res)
return res

View File

@ -1,8 +1,17 @@
import json
from src.utils.cadastro_logger import CadastroLogger
logger = CadastroLogger(__name__).logger
class Location:
def __init__(self, longitude, latitude):
self.lon = longitude
self.lat = latitude
self.lon = float(longitude) if longitude is not None else None
self.lat = float(latitude) if latitude is not None else None
def to_json(self):
return "{'location': {'lon': {}, 'lat': {}}".format(float(self.lon) if self.lon is not None else None,
float(self.lat) if self.lat is not None else None)
if self.lon is None and self.lat is None:
return None
else:
return dict(lon=self.lon, lat=self.lat)

View File

@ -15,54 +15,70 @@ class MyTestCase(unittest.TestCase):
assert True
def test_coordinate_creates_cadaster(self):
cadaster = CadastroScrapper.scrap_coord(-3.68, 40.47)
cadaster_list = CadastroScrapper.scrap_coord(-3.68, 40.47)
self.assertEqual(len(cadaster_list), 1)
cadaster = cadaster_list[0]
self.assertEqual(cadaster.cadaster, '2302909VK4820A0001GK')
def test_coordinate_multiparcela_creates_cadaster_2(self):
cadaster = CadastroScrapper.scrap_coord(-0.33, 39.47)
self.assertTrue(len(cadaster) > 0)
cadaster_list = CadastroScrapper.scrap_coord(-0.33, 39.47)
self.assertTrue(len(cadaster_list) > 1)
def test_coordinate_creates_cadaster_and_stores_in_elasticsearch(self):
cadaster = CadastroScrapper.scrap_coord(-3.68, 40.47)
cadaster_list = CadastroScrapper.scrap_coord(-3.68, 40.47)
self.assertEqual(len(cadaster_list), 1)
cadaster = cadaster_list[0]
cadaster.to_elasticsearch()
self.assertIsNotNone(cadaster.from_elasticsearch())
def test_cadaster_site_lot_creates_cadaster_and_sets_site_lot(self):
cadaster = CadastroScrapper.scrap_cadaster('45134A02500003')
cadaster_list = CadastroScrapper.scrap_cadaster('45134A02500003')
self.assertEqual(len(cadaster_list), 1)
cadaster = cadaster_list[0]
self.assertEqual(cadaster.address.site, '25')
self.assertEqual(cadaster.address.lot, '3')
def test_cadaster_full_creates_cadaster(self):
cadaster = CadastroScrapper.scrap_cadaster('0083101WK2008S0001PD')
cadaster_list = CadastroScrapper.scrap_cadaster('0083101WK2008S0001PD')
self.assertEqual(len(cadaster_list), 1)
cadaster = cadaster_list[0]
self.assertEqual(cadaster.address.city, 'ALMONACID DEL MARQUESADO')
self.assertEqual(cadaster.address.province, 'CUENCA')
def test_cadaster_half_creates_cadaster(self):
cadaster = CadastroScrapper.scrap_cadaster('0183001WK2008S')
cadaster_list = CadastroScrapper.scrap_cadaster('0183001WK2008S')
self.assertEqual(len(cadaster_list), 1)
cadaster = cadaster_list[0]
self.assertEqual(cadaster.address.city, 'ALMONACID DEL MARQUESADO')
self.assertEqual(cadaster.address.province, 'CUENCA')
def test_cadaster_half_creates_cadaster_2(self):
cadaster = CadastroScrapper.scrap_cadaster('21012A03100046')
cadaster_list = CadastroScrapper.scrap_cadaster('21012A03100046')
self.assertEqual(len(cadaster_list), 1)
cadaster = cadaster_list[0]
self.assertEqual(cadaster.address.province, 'HUELVA')
def test_cadaster_no_cp_creates_cadaster(self):
cadaster = CadastroScrapper.scrap_cadaster('06145A00500028')
cadaster_list = CadastroScrapper.scrap_cadaster('06145A00500028')
self.assertEqual(len(cadaster_list), 1)
cadaster = cadaster_list[0]
self.assertIsNone(cadaster.address.cp)
self.assertEqual(cadaster.address.province, 'BADAJOZ')
def test_cadaster_multiparcela_returns_list(self):
cadaster_list= CadastroScrapper.scrap_cadaster('22282A00900547')
def test_cadaster_multiparcela_returns_list_of_2(self):
cadaster_list = CadastroScrapper.scrap_cadaster('22282A00900547')
self.assertEqual(len(cadaster_list), 2)
def test_cadaster_is_stored_in_elasticsearch(self):
cadaster = CadastroScrapper.scrap_cadaster('0183001WK2008S')
cadaster_list = CadastroScrapper.scrap_cadaster('0183001WK2008S')
self.assertEqual(len(cadaster_list), 1)
cadaster = cadaster_list[0]
cadaster.to_elasticsearch()
self.assertIsNotNone(cadaster.from_elasticsearch())
def scrap_random_until_x_times_found(self, times):
cadaster_list = CadastroScrapper.scrap_results_random_x_times(times)
self.assertEqual(len(cadaster_list), times)
self.assertTrue(len(cadaster_list) >= times)
return cadaster_list
def test_scrap_random_until_5_found(self):

View File

@ -0,0 +1,9 @@
import json
class JSONEncoder(json.JSONEncoder):
    """JSON encoder that serializes objects through their ``to_json`` method.

    Any instance exposing a callable ``to_json`` is replaced by the value
    that method returns. The ``json`` machinery then encodes that value
    again, so objects nested inside the returned structure are handled the
    same way. Everything else is delegated to the standard encoder.
    """

    def default(self, obj):
        """Return a JSON-serializable stand-in for ``obj``.

        :param obj: object the standard encoder could not serialize.
        :return: the result of ``obj.to_json()`` when available.
        :raises TypeError: if ``obj`` offers no usable ``to_json`` method.
        """
        to_json = getattr(obj, 'to_json', None)
        # Only call real bound methods: skip non-callable attributes and
        # class objects (whose ``to_json`` would be missing ``self``), so
        # such values get the standard "not JSON serializable" error.
        if callable(to_json) and not isinstance(obj, type):
            return to_json()
        return json.JSONEncoder.default(self, obj)