Adds final documentation for most functions and methods.

This commit is contained in:
josejuanmartinez 2019-09-28 11:20:40 +02:00
parent 6c6da34adf
commit 6a36266886
15 changed files with 282 additions and 56 deletions

View File

@ -1,3 +1,6 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import json
from collections import namedtuple
@ -10,6 +13,10 @@ logger = CadastroLogger(__name__).logger
class GeoPolygon:
"""
A GeoPolygon is a series of lon,lat points in a json. This class uses shapely.geometry
to convert points into Point objects and these into a Polygon class.
"""
def __init__(self, file):
self.polygon = None
@ -27,10 +34,20 @@ class GeoPolygon:
logger.error("{} is not formatted properly. Please take a look at the examples.".format(file))
def is_point_in_polygon(self, lon, lat):
    """
    Checks whether the point (lon, lat) lies inside this Polygon.
    :param lon: longitude
    :param lat: latitude
    :return: True if point is inside polygon. False otherwise
    """
    return self.polygon.contains(Point(lon, lat))
def get_bounding_box(self):
"""
Gets the bounding box of a polygon
:return: A Box object from shapely.geometry containing inside the Polygon
"""
if self.polygon is not None:
return self.polygon.bounds
else:

View File

@ -21,7 +21,7 @@ logger = CadastroLogger(__name__).logger
class ParserHTML(Parser):
"""Parser class for Catastro HTML"""
"""Class that manages the processing of scrapped HTML from Cadastro webpage"""
def __init__(self):
super().__init__()
@ -35,6 +35,13 @@ class ParserHTML(Parser):
""" Processing """
@classmethod
def process_search_by_coordinates(cls, x, y, pictures=False):
"""
Searches by coordinate from HTML and processes the result.
:param x: longitude
:param y: latitude
:param pictures: True if we want house plan pictures to be scrapped
:return: List of CadasterEntry objects
"""
data = ScrapperHTML.scrap_coord(x, y)
root = ElementTree.fromstring(data)
@ -60,8 +67,15 @@ class ParserHTML(Parser):
return results
@classmethod
def process_search_by_provinces(cls, prov_list, pictures=False, start_from='', max_times=None):
def process_search_by_provinces(cls, prov_list, pictures=False, start_from='', matches=None):
"""
Searches by province from HTML and processes the result.
:param prov_list: List of province names
:param start_from: Name of the city of the first province to start from
:param pictures: True if we want house plan pictures to be scrapped
:param matches: Max number of matches (for debugging purposes mainly)
:return: List of CadasterEntry objects
"""
times = 0
results = []
@ -129,7 +143,7 @@ class ParserHTML(Parser):
counter += 1
times += 1
if max_times is not None and times >= max_times:
if matches is not None and times >= matches:
return results
except urllib.error.HTTPError as e:
@ -157,6 +171,13 @@ class ParserHTML(Parser):
""" Parsing """
@classmethod
def parse_html_parcela(cls, parsed_html, x=None, y=None, picture=None):
"""
Parses an HTML and creates a CadasterEntry object
:param x: longitude obtained previously
:param y: latitude obtained previously
:param picture: base64 picture obtained previously
:return: CadasterEntry object
"""
description = parsed_html.find(id='ctl00_Contenido_tblInmueble')
descriptive_data = dict()

View File

@ -4,9 +4,6 @@
import urllib.parse
from urllib import error
import requests
import xmltodict
from src.librecatastro.domain.cadaster_entry.cadaster_entry_xml import CadasterEntryXML
from src.librecatastro.scrapping.parser import Parser
from src.librecatastro.scrapping.scrapper import Scrapper
@ -23,7 +20,7 @@ logger = CadastroLogger(__name__).logger
class ParserXML(Parser):
"""Parser class for Catastro XML"""
"""Class that manages the processing of scrapped XML from Cadastro webservices"""
def __init__(self):
super().__init__()
@ -31,11 +28,17 @@ class ParserXML(Parser):
''' Processing calls '''
@classmethod
def process_search_by_coordinates(cls, x, y, pictures=False):
"""Scraps properties by coordinates"""
"""
Searches by coordinate from XML and processes the result.
:param x: longitude
:param y: latitude
:param pictures: True if we want house plan pictures to be scrapped
:return: List of CadasterEntry objects
"""
results = []
xml_dict_map = ScrapperXML.get_coord(x, y)
xml_dict_map = ScrapperXML.scrap_coord(x, y)
pc1 = None
pc2 = None
if xml_dict_map.consulta_coordenadas.coordenadas.coord.pc != DotMap():
@ -127,8 +130,15 @@ class ParserXML(Parser):
return results
@classmethod
def process_search_by_provinces(cls, prov_list, pictures=False, start_from='', max_times=None):
def process_search_by_provinces(cls, prov_list, pictures=False, start_from='', matches=None):
"""
Searches by province from XML and processes the result.
:param prov_list: List of province names
:param start_from: Name of the city of the first province to start from
:param pictures: True if we want house plan pictures to be scrapped
:param matches: Max number of matches (for debugging purposes mainly)
:return: List of CadasterEntry objects
"""
times = 0
results = []
@ -146,14 +156,14 @@ class ParserXML(Parser):
while num_scrapping_fails > 0:
try:
cadaster = ScrapperXML.get_cadaster_by_address(prov_name, city_name, tv, nv, counter)
res = cls.process_xml_by_address(cadaster, prov_name, city_name, tv, nv, counter, pictures)
res = cls.parse_xml_by_address(cadaster, prov_name, city_name, tv, nv, counter, pictures)
if len(res) < 1:
num_scrapping_fails -= 1
else:
num_scrapping_fails = 10
times += 1
results.append(res)
if max_times is not None and times >= max_times:
if matches is not None and times >= matches:
return ListUtils.flat(results)
except urllib.error.HTTPError as e:
@ -182,7 +192,18 @@ class ParserXML(Parser):
''' Parsing calls '''
@classmethod
def process_xml_by_address(cls, numerero_map, prov_name, city_name, tv, nv, num, pictures=False):
def parse_xml_by_address(cls, numerero_map, prov_name, city_name, tv, nv, num, pictures=False):
"""
Parses an XML and creates a CadasterEntry object
:param numerero_map: DotMap obtained from a previous call with information about the address to parse
:param prov_name: Province Name
:param city_name: City Name
:param tv: Kind of way (Tipo de Via) - CL (calle), AV (Avenida) ...
:param nv: Street name (Nombre de via)
:param num: Street number (Numero de via)
:param pictures: True if we want to scrap also house plan pictures. False otherwise.
:return: List of CadasterEntry objects
"""
results = []
if numerero_map.consulta_numerero.lerr.err.cod != DotMap():
return results

View File

@ -1,3 +1,6 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import base64
from time import sleep
from urllib.request import urlopen

View File

@ -1,3 +1,6 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import re
from time import sleep
from urllib.request import urlopen
@ -13,8 +16,9 @@ logger = CadastroLogger(__name__).logger
class ScrapperHTML(Scrapper):
"""HTML Catastro Scrapper"""
"""Class that manages the HTML scrapping from the Cadastro Official Page """
''' Some reference URLs'''
URL = "http://ovc.catastro.meh.es/ovcservweb/ovcswlocalizacionrc/ovccoordenadas.asmx/Consulta_RCCOOR?SRS=EPSG:4326&Coordenada_X={}&Coordenada_Y={}"
URL_REF = "https://www1.sedecatastro.gob.es/CYCBienInmueble/OVCListaBienes.aspx?rc1={}&rc2={}"
URL_REF_FULL = "https://www1.sedecatastro.gob.es/CYCBienInmueble/OVCConCiud.aspx?RefC={}&RCCompleta={}&del={}&mun={}"
@ -24,6 +28,12 @@ class ScrapperHTML(Scrapper):
@classmethod
def scrap_coord(cls, x, y):
"""
Scraps HTML by coordinates
:param x: Longitude
:param y: Latitude
:return: HTML content of the cadaster entry
"""
logger.debug("====Longitude: {} Latitude: {}====".format(x, y))
url = cls.URL.format(x, y)
logger.debug("URL for coordinates: {}".format(url))
@ -33,8 +43,15 @@ class ScrapperHTML(Scrapper):
return f.read()
@classmethod
def scrap_cadaster_full_code(cls, full_cadaster, delimitacion, municipio):
url_ref = cls.URL_REF_FULL.format(full_cadaster, full_cadaster, delimitacion, municipio)
def scrap_cadaster_full_code(cls, full_cadaster, prov_num, city_num):
"""
Scraps HTML by cadaster full code, province (delimitacion) and city (municipio)
:param full_cadaster: Full cadaster code (>14 characters)
:param prov_num: Province number
:param city_num: City number
:return: BeautifulSoup-parsed HTML content of the cadaster entry
"""
url_ref = cls.URL_REF_FULL.format(full_cadaster, full_cadaster, prov_num, city_num)
logger.debug("-->FULL URL for cadastral data: {}".format(url_ref))
f_ref = urlopen(url_ref)
data_ref = f_ref.read()
@ -45,7 +62,18 @@ class ScrapperHTML(Scrapper):
return parsed_html
@classmethod
def scrap_cadaster(cls, cadaster, delimitacion=None, municipio=None, pictures=False):
def scrap_cadaster(cls, cadaster, prov_num=None, city_num=None, pictures=False):
"""
Scraps HTML by cadaster code. This probably will return several entries (Multiparcela), since a non-full cadaster
belongs to entire buildings. But sometimes it will return just one entry (Parcela) in case of, for example,
country houses.
:param cadaster: 14-characters code of a cadaster
:param prov_num: Province number
:param city_num: City number
:param pictures: True if we want to obtain the house plan picture. False otherwise.
:return: A List of CadasterEntry objects.
"""
rc_1 = cadaster[0:7]
rc_2 = cadaster[7:14]
url_ref = cls.URL_REF.format(rc_1, rc_2)
@ -59,21 +87,21 @@ class ScrapperHTML(Scrapper):
html = str(data_ref.decode('utf-8'))
parsed_html = BeautifulSoup(html, features="html.parser")
if delimitacion is None:
if prov_num is None:
delimitacion_search = re.search(r'del=([0-9]+)&', html)
if delimitacion_search:
delimitacion = delimitacion_search.group(1)
prov_num = delimitacion_search.group(1)
if municipio is None:
if city_num is None:
municipio_search = re.search(r'mun=([0-9]+)&', html)
if municipio_search:
municipio = municipio_search.group(1)
city_num = municipio_search.group(1)
description = parsed_html.find(id='ctl00_Contenido_tblInmueble')
picture = None
if pictures:
picture = cls.scrap_site_picture(delimitacion, municipio, ''.join([rc_1, rc_2]))
picture = cls.scrap_site_picture(prov_num, city_num, ''.join([rc_1, rc_2]))
sleep(config['sleep_time'])
htmls = []
@ -88,7 +116,7 @@ class ScrapperHTML(Scrapper):
partial_cadaster_ref = partial_cadaster.find("b")
logger.debug("-->Partial cadaster: {}".format(partial_cadaster_ref.text))
partial_cadaster_text = partial_cadaster_ref.text.strip()
parsed_html = ScrapperHTML.scrap_cadaster_full_code(partial_cadaster_text, delimitacion, municipio)
parsed_html = ScrapperHTML.scrap_cadaster_full_code(partial_cadaster_text, prov_num, city_num)
htmls.append((parsed_html, picture))
sleep(config['sleep_time'])
else:

View File

@ -1,3 +1,6 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import urllib.parse
from time import sleep
@ -14,12 +17,19 @@ logger = CadastroLogger(__name__).logger
class ScrapperXML(Scrapper):
"""Class that manages the XML scrapping from the Cadastro webservices """
def __init__(self):
super().__init__()
@classmethod
def get_coord(cls,x, y):
def scrap_coord(cls, x, y):
"""
Scraps XML by coordinates
:param x: Longitude
:param y: Latitude
:return: DotMap dictionary with scrapped results
"""
params = {'SRS': 'EPSG:4326', 'Coordenada_X': x, 'Coordenada_Y': y}
url = cls.URL_LOCATIONS_BASE.format("/OVCCoordenadas.asmx/Consulta_RCCOOR")
response = requests.get(url, params=params)
@ -33,12 +43,18 @@ class ScrapperXML(Scrapper):
return xml_dict_map
@classmethod
def get_cadaster_entries_by_cadaster(cls, prov_name, city_name, rc):
""" provincia and municipio are optional and can be set to '' """
def get_cadaster_entries_by_cadaster(cls, prov_name, city_name, cadaster):
"""
Scraps XML by cadaster, prov_name (optional) and city_name (optional)
:param prov_name: Name of the province (can be set to '')
:param city_name: Name of the city (can be set to '')
:param cadaster: Cadaster code
:return: DotMap dictionary with scrapped results
"""
params = {"Provincia": prov_name,
"Municipio": city_name,
"RC": rc}
"RC": cadaster}
url = cls.URL_LOCATIONS_BASE.format("/OVCCallejero.asmx/Consulta_DNPRC")
logger.debug("URL for entry: {}".format(url + '?' + urllib.parse.urlencode(params)))
@ -49,27 +65,40 @@ class ScrapperXML(Scrapper):
return DotMap(xmltodict.parse(xml, process_namespaces=False, xml_attribs=False))
@classmethod
def get_cadaster_entries_by_address(cls, provincia, municipio, sigla, calle, numero, bloque=None, escalera=None,
planta=None,puerta=None):
params = {'Provincia': provincia,
'Municipio': municipio,
'Sigla': sigla,
'Calle': calle,
'Numero': str(numero)}
if bloque:
params['Bloque'] = str(bloque)
def get_cadaster_entries_by_address(cls, prov_name, city_name, tv, nv, num, bl=None, es=None,
pl=None, pu=None):
"""
Scraps XML by address
:param prov_name: Name of the province (can be set to '')
:param city_name: Name of the city (can be set to '')
:param tv: Kind of street (CL - Calle, AV - Avenida, etc)
:param nv: Name of street
:param num: Street number
:param bl: Block (Bloque)
:param es: Doorway (Escalera)
:param pl: Floor (Planta)
:param pu: Door (Puerta)
:return: DotMap dictionary with scrapped results
"""
params = {'Provincia': prov_name,
'Municipio': city_name,
'Sigla': tv,
'Calle': nv,
'Numero': str(num)}
if bl:
params['Bloque'] = str(bl)
else:
params['Bloque'] = ''
if escalera:
params['Escalera'] = escalera
if es:
params['Escalera'] = es
else:
params['Escalera'] = ''
if planta:
params['Planta'] = str(planta)
if pl:
params['Planta'] = str(pl)
else:
params['Planta'] = ''
if puerta:
params['Puerta'] = str(puerta)
if pu:
params['Puerta'] = str(pu)
else:
params['Puerta'] = ''

View File

@ -18,11 +18,22 @@ logger = CadastroLogger(__name__).logger
class CoordinatesSearcher(Searcher):
"""
Class that inherits from Searcher Abstract Class and implements
functions regarding coordinates search.
"""
def __init__(self):
super().__init__()
@classmethod
def search_by_coordinates(cls, scrapper, filenames, pictures=False):
"""
Function that searches Cadastro (HTML or XML) by coordinates
:param scrapper: HTMLScrapper or XMLScrapper classes
:param filenames: Names of the filenames with coordinates to scrap
:param pictures: Do we want to scrap house plan pictures?
"""
for r, d, files in os.walk(config['coordinates_path']):
for file in files:
@ -40,6 +51,14 @@ class CoordinatesSearcher(Searcher):
@classmethod
def search_in_polygon(cls, scrapper, polygon, pictures=False):
"""
Function that searches by coordinates strictly inside a Polygon
defined by the user.
:param scrapper: HTMLScrapper or XMLScrapper classes
:param polygon: a GeoPolygon class object
:param pictures: Do we want to scrap house plan pictures?
"""
bb = polygon.get_bounding_box()
lon_min = int(bb[0] * config['scale'])
lon_max = int(bb[2] * config['scale'])
@ -78,6 +97,18 @@ class CoordinatesSearcher(Searcher):
@staticmethod
def search_by_coordinates_max_time(seconds, lon_min, lon_max, lat_min, lat_max, scrapper):
"""
Function that allows searching in lon, lat for a maximum number of seconds.
Mainly used for debugging purposes.
:param seconds: Total of seconds to scrap
:param lon_min: Minimum longitude
:param lon_max: Maximum longitude
:param lat_min: Minimum latitude
:param lat_max: Maximum latitude
:param scrapper: HTML or XML Scrapper
:return: a List of CadasterEntry objects
"""
start_time = time.time()
results = []
@ -119,6 +150,18 @@ class CoordinatesSearcher(Searcher):
@staticmethod
def search_by_coordinates_linear_max_n_matches(matches, lon_min, lon_max, lat_min, lat_max, scrapper):
"""
Function that allows searching in lon, lat for a maximum number of matches.
Mainly used for debugging purposes.
:param matches: Total of matches to scrap
:param lon_min: Minimum longitude
:param lon_max: Maximum longitude
:param lat_min: Minimum latitude
:param lat_max: Maximum latitude
:param scrapper: HTML or XML Scrapper
:return: a List of CadasterEntry objects
"""
results = []
counter = matches
@ -160,9 +203,21 @@ class CoordinatesSearcher(Searcher):
return ListUtils.flat(results)
@staticmethod
def search_by_coordinates_random_max_n_matches(times, lon_min, lon_max, lat_min, lat_max, parser):
def search_by_coordinates_random_max_n_matches(matches, lon_min, lon_max, lat_min, lat_max, scrapper):
"""
Function that allows searching in lon, lat for a maximum number of matches.
Mainly used for debugging purposes.
:param matches: Total of matches to scrap
:param lon_min: Minimum longitude
:param lon_max: Maximum longitude
:param lat_min: Minimum latitude
:param lat_max: Maximum latitude
:param scrapper: HTML or XML Scrapper
:return: a List of CadasterEntry objects
"""
results = []
counter = times
counter = matches
while counter > 0:
x = random.randrange(lon_min, lon_max)
@ -172,7 +227,7 @@ class CoordinatesSearcher(Searcher):
y_scaled = y / config['scale']
try:
cadaster_entry = parser.process_search_by_coordinates(x_scaled, y_scaled)
cadaster_entry = scrapper.process_search_by_coordinates(x_scaled, y_scaled)
if len(cadaster_entry) > 0:
results.append(cadaster_entry)
@ -194,5 +249,5 @@ class CoordinatesSearcher(Searcher):
sleep(config['sleep_time'])
logger.debug("====PROCESSING FINISHED====")
logger.debug("Results found: {}".format(times))
logger.debug("Results found: {}".format(matches))
return ListUtils.flat(results)

View File

@ -10,15 +10,31 @@ logger = CadastroLogger(__name__).logger
class ProvincesSearcher(Searcher):
"""
Class that allows searching Cadastro by provinces, cities, addresses
"""
def __init__(self):
super().__init__()
@classmethod
def search_by_provinces(cls, scrapper, prov_list, pictures=False, start_from=''):
    """
    Searches Cadastro by a list of provinces. We can optionally set if we want
    pictures to be scrapped as well (of the house plan) or if we want to start from
    a specific city. Example: I want to scrap Madrid province starting alphabetically
    from 'Fuenlabrada'
    :param scrapper: XML or HTML Scrapper
    :param prov_list: List of province names
    :param pictures: True if we want house plan pictures to be scrapped. False otherwise.
    :param start_from: Name of the city we want to start from (from the first province)
    """
    scrapper.process_search_by_provinces(prov_list, pictures, start_from)
@classmethod
def list_provinces(cls):
"""
Lists province names from Cadastro
"""
dotmap = Scrapper.get_provinces()
provinces = dotmap.consulta_provinciero.provinciero.prov
for province in provinces:
@ -26,6 +42,9 @@ class ProvincesSearcher(Searcher):
@classmethod
def list_cities(cls, prov_name):
"""
Lists city names from Cadastro
"""
dotmap = Scrapper.get_cities(prov_name)
cities = dotmap.consulta_municipiero.municipiero.muni
for city in cities:

View File

@ -13,7 +13,7 @@ class ParserHTMLTests(unittest.TestCase):
self.assertTrue(cadaster.from_elasticsearch())
def test_search_by_provinces_creates_and_stores_in_elasticsearch(self):
cadaster_list = ParserHTML.process_search_by_provinces(['MADRID'], max_times=1)
cadaster_list = ParserHTML.process_search_by_provinces(['MADRID'], matches=1)
self.assertEqual(len(cadaster_list), 14)
for cadaster in cadaster_list:
self.assertTrue(cadaster.from_elasticsearch())

View File

@ -21,7 +21,7 @@ class ParserXMLTests(unittest.TestCase):
self.assertTrue(cadaster.from_elasticsearch())
def test_search_by_provinces_creates_and_stores_in_elasticsearch(self):
cadaster_list = ParserXML.process_search_by_provinces(['MADRID'], max_times=1)
cadaster_list = ParserXML.process_search_by_provinces(['MADRID'], matches=1)
self.assertEqual(len(cadaster_list), 1)
sleep(5)
for cadaster in cadaster_list:

View File

@ -111,7 +111,7 @@ class ScrapperXMLTests(unittest.TestCase):
nv = u'CANARIAS'
num = 7
cadaster = ScrapperXML.get_cadaster_by_address(prov_name, city_name, tv, nv, num)
self.assertEqual(len(ParserXML.process_xml_by_address(cadaster, prov_name, city_name, tv, nv, False)), 8)
self.assertEqual(len(ParserXML.parse_xml_by_address(cadaster, prov_name, city_name, tv, nv, False)), 8)
def test_multiparcela_address_creates_n_entries_2(self):
prov_name = u'MADRID'
@ -120,7 +120,7 @@ class ScrapperXMLTests(unittest.TestCase):
nv = u'CALVARIO'
num = 38
cadaster = ScrapperXML.get_cadaster_by_address(prov_name, city_name, tv, nv, num)
self.assertEqual(len(ParserXML.process_xml_by_address(cadaster, prov_name, city_name, tv, nv, False)), 8)
self.assertEqual(len(ParserXML.parse_xml_by_address(cadaster, prov_name, city_name, tv, nv, False)), 8)
def test_poligono_or_rural_creates_entry(self):
tv = 'CL'
@ -129,7 +129,7 @@ class ScrapperXMLTests(unittest.TestCase):
prov_name = 'MADRID'
city_name = 'AJALVIR'
cadaster = ScrapperXML.get_cadaster_by_address(prov_name, city_name, tv, nv, num)
self.assertEqual(len(ParserXML.process_xml_by_address(cadaster, prov_name, city_name, tv, nv, False)), 16)
self.assertEqual(len(ParserXML.parse_xml_by_address(cadaster, prov_name, city_name, tv, nv, False)), 16)
def test_coordinates_are_in_good_format(self):
tv = 'CL'
@ -138,7 +138,7 @@ class ScrapperXMLTests(unittest.TestCase):
prov_name = 'MADRID'
city_name = 'GALAPAGAR'
xml = ScrapperXML.get_cadaster_by_address(prov_name, city_name, tv, nv, num)
cadaster_entry = ParserXML.process_xml_by_address(xml, prov_name, city_name, tv, nv, False)
cadaster_entry = ParserXML.parse_xml_by_address(xml, prov_name, city_name, tv, nv, False)
self.assertEqual(cadaster_entry[0].location.lat, 40.6249762551374)
self.assertEqual(cadaster_entry[0].location.lon, -4.02755522611211)
@ -149,7 +149,7 @@ class ScrapperXMLTests(unittest.TestCase):
prov_name = 'ALACANT'
city_name = 'ALICANTE/ALACANT'
xml = ScrapperXML.get_cadaster_by_address(prov_name, city_name, tv, nv, num)
cadaster_entries = ParserXML.process_xml_by_address(xml, prov_name, city_name, tv, nv, False)
cadaster_entries = ParserXML.parse_xml_by_address(xml, prov_name, city_name, tv, nv, False)
for cadaster_entry in cadaster_entries:
self.assertEqual(cadaster_entry.location.lat, 38.3495195831056)
self.assertEqual(cadaster_entry.location.lon, -0.484612452235845)

View File

@ -1,3 +1,6 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import logging
import logging.config
import sys
@ -40,5 +43,4 @@ class CadastroLogger:
self.logger.addHandler(debug_file_handler)
self.logger.addHandler(error_file_handler)
self.logger.addHandler(tracking_file_handler)
pass

View File

@ -1,3 +1,6 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from dotmap import DotMap
from elasticsearch import Elasticsearch
@ -15,6 +18,9 @@ class ElasticSearchUtils:
@staticmethod
def create_index():
"""
Creates index in ElasticSearch
"""
ElasticSearchUtils.remove_index()
es = Elasticsearch()
request_body = {
@ -78,6 +84,9 @@ class ElasticSearchUtils:
@staticmethod
def remove_index():
"""
Removes index from ElasticSearch
"""
es = Elasticsearch()
logger.debug("Deleting 'cadaster' index...")
try:
@ -90,6 +99,13 @@ class ElasticSearchUtils:
@staticmethod
def check_if_address_present(address, city_name, province_name):
"""
Checks if an address has been already scrapped (to skip it).
:param address: full address (including tipo de via, nombre de via ...)
:param city_name: City Name
:param province_name: Province Name
:return: True if already scrapped, False otherwise
"""
res = False
query = {"query":
{"bool":

View File

@ -1,7 +1,13 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import json
class JSONEncoder(json.JSONEncoder):
"""
Class that recursively encodes classes into json dictionaries
"""
def default(self, obj):
if hasattr(obj, 'to_json'):
return obj.to_json()

View File

@ -1,3 +1,7 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
class ListUtils:
""" Different functions for make working with lists easier"""
def __init__(self):
@ -5,4 +9,9 @@ class ListUtils:
@staticmethod
def flat(non_flat_list):
    """
    Flattens a multilevel list [[], []...] -> [, , , ]
    :param non_flat_list: Multilevel list
    :return: A flattened list
    """
    flattened = []
    for sublist in non_flat_list:
        flattened.extend(sublist)
    return flattened