mirror of
https://github.com/josejuanmartinez/libreCatastro.git
synced 2024-07-06 15:22:28 +02:00
Adds final documentation for most functions and methods.
This commit is contained in:
parent
6c6da34adf
commit
6a36266886
@ -1,3 +1,6 @@
|
||||
#!/usr/bin/env python
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
import json
|
||||
from collections import namedtuple
|
||||
|
||||
@ -10,6 +13,10 @@ logger = CadastroLogger(__name__).logger
|
||||
|
||||
|
||||
class GeoPolygon:
|
||||
"""
|
||||
A GeoPolygon is a series of lon,lat points in a json. This class uses shapely.geometry
|
||||
to convert points into Point objects and these into a Polygon class.
|
||||
"""
|
||||
|
||||
def __init__(self, file):
|
||||
self.polygon = None
|
||||
@ -27,10 +34,20 @@ class GeoPolygon:
|
||||
logger.error("{} is not formatted properly. Please take a look at the examples.".format(file))
|
||||
|
||||
def is_point_in_polygon(self, lon, lat):
|
||||
"""
|
||||
Check if a point (lon, lat) is inside this Polygon
|
||||
:param lon: longitude
|
||||
:param lat: latitude
|
||||
:return: True if point is inside polygon. False otherwise
|
||||
"""
|
||||
p = Point(lon, lat)
|
||||
return self.polygon.contains(p)
|
||||
|
||||
def get_bounding_box(self):
|
||||
"""
|
||||
Gets the bounding box of a polygon
|
||||
:return: The bounds of the Polygon as a (min_lon, min_lat, max_lon, max_lat) tuple, when a polygon is loaded
|
||||
"""
|
||||
if self.polygon is not None:
|
||||
return self.polygon.bounds
|
||||
else:
|
||||
|
@ -21,7 +21,7 @@ logger = CadastroLogger(__name__).logger
|
||||
|
||||
|
||||
class ParserHTML(Parser):
|
||||
"""Parser class for Catastro HTML"""
|
||||
"""Class that manages the processing of scrapped HTML from Cadastro webpage"""
|
||||
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
@ -35,6 +35,13 @@ class ParserHTML(Parser):
|
||||
""" Processing """
|
||||
@classmethod
|
||||
def process_search_by_coordinates(cls, x, y, pictures=False):
|
||||
"""
|
||||
Searches by coordinate from HTML and processes the result.
|
||||
:param x: longitude
|
||||
:param y: latitude
|
||||
:param pictures: True if we want house plan pictures to be scrapped
|
||||
:return: List of CadasterEntry objects
|
||||
"""
|
||||
data = ScrapperHTML.scrap_coord(x, y)
|
||||
|
||||
root = ElementTree.fromstring(data)
|
||||
@ -60,8 +67,15 @@ class ParserHTML(Parser):
|
||||
return results
|
||||
|
||||
@classmethod
|
||||
def process_search_by_provinces(cls, prov_list, pictures=False, start_from='', max_times=None):
|
||||
|
||||
def process_search_by_provinces(cls, prov_list, pictures=False, start_from='', matches=None):
|
||||
"""
|
||||
Searches by province from HTML and processes the result.
|
||||
:param prov_list: List of province names
|
||||
:param start_from: Name of the city of the first province to start from
|
||||
:param pictures: True if we want house plan pictures to be scrapped
|
||||
:param matches: Max number of matches (mainly for debugging purposes)
|
||||
:return: List of CadasterEntry objects
|
||||
"""
|
||||
times = 0
|
||||
results = []
|
||||
|
||||
@ -129,7 +143,7 @@ class ParserHTML(Parser):
|
||||
counter += 1
|
||||
times += 1
|
||||
|
||||
if max_times is not None and times >= max_times:
|
||||
if matches is not None and times >= matches:
|
||||
return results
|
||||
|
||||
except urllib.error.HTTPError as e:
|
||||
@ -157,6 +171,13 @@ class ParserHTML(Parser):
|
||||
""" Parsing """
|
||||
@classmethod
|
||||
def parse_html_parcela(cls, parsed_html, x=None, y=None, picture=None):
|
||||
"""
|
||||
Parses an HTML and creates a CadasterEntry object
|
||||
:param x: longitude obtained previously
|
||||
:param y: latitude obtained previously
|
||||
:param picture: base64 picture obtained previously
|
||||
:return: CadasterEntry object
|
||||
"""
|
||||
description = parsed_html.find(id='ctl00_Contenido_tblInmueble')
|
||||
|
||||
descriptive_data = dict()
|
||||
|
@ -4,9 +4,6 @@
|
||||
import urllib.parse
|
||||
from urllib import error
|
||||
|
||||
import requests
|
||||
import xmltodict
|
||||
|
||||
from src.librecatastro.domain.cadaster_entry.cadaster_entry_xml import CadasterEntryXML
|
||||
from src.librecatastro.scrapping.parser import Parser
|
||||
from src.librecatastro.scrapping.scrapper import Scrapper
|
||||
@ -23,7 +20,7 @@ logger = CadastroLogger(__name__).logger
|
||||
|
||||
|
||||
class ParserXML(Parser):
|
||||
"""Parser class for Catastro XML"""
|
||||
"""Class that manages the processing of scrapped XML from Cadastro webservices"""
|
||||
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
@ -31,11 +28,17 @@ class ParserXML(Parser):
|
||||
''' Processing calls '''
|
||||
@classmethod
|
||||
def process_search_by_coordinates(cls, x, y, pictures=False):
|
||||
"""Scraps properties by coordinates"""
|
||||
"""
|
||||
Searches by coordinate from XML and processes the result.
|
||||
:param x: longitude
|
||||
:param y: latitude
|
||||
:param pictures: True if we want house plan pictures to be scrapped
|
||||
:return: List of CadasterEntry objects
|
||||
"""
|
||||
|
||||
results = []
|
||||
|
||||
xml_dict_map = ScrapperXML.get_coord(x, y)
|
||||
xml_dict_map = ScrapperXML.scrap_coord(x, y)
|
||||
pc1 = None
|
||||
pc2 = None
|
||||
if xml_dict_map.consulta_coordenadas.coordenadas.coord.pc != DotMap():
|
||||
@ -127,8 +130,15 @@ class ParserXML(Parser):
|
||||
return results
|
||||
|
||||
@classmethod
|
||||
def process_search_by_provinces(cls, prov_list, pictures=False, start_from='', max_times=None):
|
||||
|
||||
def process_search_by_provinces(cls, prov_list, pictures=False, start_from='', matches=None):
|
||||
"""
|
||||
Searches by province from XML and processes the result.
|
||||
:param prov_list: List of province names
|
||||
:param start_from: Name of the city of the first province to start from
|
||||
:param pictures: True if we want house plan pictures to be scrapped
|
||||
:param matches: Max number of matches (mainly for debugging purposes)
|
||||
:return: List of CadasterEntry objects
|
||||
"""
|
||||
times = 0
|
||||
results = []
|
||||
|
||||
@ -146,14 +156,14 @@ class ParserXML(Parser):
|
||||
while num_scrapping_fails > 0:
|
||||
try:
|
||||
cadaster = ScrapperXML.get_cadaster_by_address(prov_name, city_name, tv, nv, counter)
|
||||
res = cls.process_xml_by_address(cadaster, prov_name, city_name, tv, nv, counter, pictures)
|
||||
res = cls.parse_xml_by_address(cadaster, prov_name, city_name, tv, nv, counter, pictures)
|
||||
if len(res) < 1:
|
||||
num_scrapping_fails -= 1
|
||||
else:
|
||||
num_scrapping_fails = 10
|
||||
times += 1
|
||||
results.append(res)
|
||||
if max_times is not None and times >= max_times:
|
||||
if matches is not None and times >= matches:
|
||||
return ListUtils.flat(results)
|
||||
|
||||
except urllib.error.HTTPError as e:
|
||||
@ -182,7 +192,18 @@ class ParserXML(Parser):
|
||||
''' Parsing calls '''
|
||||
|
||||
@classmethod
|
||||
def process_xml_by_address(cls, numerero_map, prov_name, city_name, tv, nv, num, pictures=False):
|
||||
def parse_xml_by_address(cls, numerero_map, prov_name, city_name, tv, nv, num, pictures=False):
|
||||
"""
|
||||
Parses an XML and creates CadasterEntry objects
|
||||
:param numerero_map: DotMap obtained from a previous call with information about the address to parse
|
||||
:param prov_name: Province Name
|
||||
:param city_name: City Name
|
||||
:param tv: Kind of way (Tipo de Via) - CL (calle), AV (Avenida) ...
|
||||
:param nv: Street name (Nombre de via)
|
||||
:param num: Street number (Numero de via)
|
||||
:param pictures: True if we want to scrap also house plan pictures. False otherwise.
|
||||
:return: List of CadasterEntry objects
|
||||
"""
|
||||
results = []
|
||||
if numerero_map.consulta_numerero.lerr.err.cod != DotMap():
|
||||
return results
|
||||
|
@ -1,3 +1,6 @@
|
||||
#!/usr/bin/env python
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
import base64
|
||||
from time import sleep
|
||||
from urllib.request import urlopen
|
||||
|
@ -1,3 +1,6 @@
|
||||
#!/usr/bin/env python
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
import re
|
||||
from time import sleep
|
||||
from urllib.request import urlopen
|
||||
@ -13,8 +16,9 @@ logger = CadastroLogger(__name__).logger
|
||||
|
||||
|
||||
class ScrapperHTML(Scrapper):
|
||||
"""HTML Catastro Scrapper"""
|
||||
"""Class that manages the HTML scrapping from the Cadastro Official Page """
|
||||
|
||||
''' Some reference URLs'''
|
||||
URL = "http://ovc.catastro.meh.es/ovcservweb/ovcswlocalizacionrc/ovccoordenadas.asmx/Consulta_RCCOOR?SRS=EPSG:4326&Coordenada_X={}&Coordenada_Y={}"
|
||||
URL_REF = "https://www1.sedecatastro.gob.es/CYCBienInmueble/OVCListaBienes.aspx?rc1={}&rc2={}"
|
||||
URL_REF_FULL = "https://www1.sedecatastro.gob.es/CYCBienInmueble/OVCConCiud.aspx?RefC={}&RCCompleta={}&del={}&mun={}"
|
||||
@ -24,6 +28,12 @@ class ScrapperHTML(Scrapper):
|
||||
|
||||
@classmethod
|
||||
def scrap_coord(cls, x, y):
|
||||
"""
|
||||
Scraps HTML by coordinates
|
||||
:param x: Longitude
|
||||
:param y: Latitude
|
||||
:return: HTML content of the cadaster entry
|
||||
"""
|
||||
logger.debug("====Longitude: {} Latitude: {}====".format(x, y))
|
||||
url = cls.URL.format(x, y)
|
||||
logger.debug("URL for coordinates: {}".format(url))
|
||||
@ -33,8 +43,15 @@ class ScrapperHTML(Scrapper):
|
||||
return f.read()
|
||||
|
||||
@classmethod
|
||||
def scrap_cadaster_full_code(cls, full_cadaster, delimitacion, municipio):
|
||||
url_ref = cls.URL_REF_FULL.format(full_cadaster, full_cadaster, delimitacion, municipio)
|
||||
def scrap_cadaster_full_code(cls, full_cadaster, prov_num, city_num):
|
||||
"""
|
||||
Scraps HTML by cadaster full code, province (delimitacion) and city (municipio)
|
||||
:param full_cadaster: Full cadaster code (>14 characters)
|
||||
:param prov_num: Province number
|
||||
:param city_num: City number
|
||||
:return: BeautifulSoup-parsed HTML content of the cadaster entry
|
||||
"""
|
||||
url_ref = cls.URL_REF_FULL.format(full_cadaster, full_cadaster, prov_num, city_num)
|
||||
logger.debug("-->FULL URL for cadastral data: {}".format(url_ref))
|
||||
f_ref = urlopen(url_ref)
|
||||
data_ref = f_ref.read()
|
||||
@ -45,7 +62,18 @@ class ScrapperHTML(Scrapper):
|
||||
return parsed_html
|
||||
|
||||
@classmethod
|
||||
def scrap_cadaster(cls, cadaster, delimitacion=None, municipio=None, pictures=False):
|
||||
def scrap_cadaster(cls, cadaster, prov_num=None, city_num=None, pictures=False):
|
||||
"""
|
||||
Scraps HTML by cadaster code. This probably will return several entries (Multiparcela), since a non-full cadaster
|
||||
belongs to entire buildings. But sometimes it will return just one entry (Parcela) in case of, for example,
|
||||
country houses.
|
||||
|
||||
:param cadaster: 14-characters code of a cadaster
|
||||
:param prov_num: Province number
|
||||
:param city_num: City number
|
||||
:param pictures: True if we want to obtain the house plan picture. False otherwise.
|
||||
:return: A List of CadasterEntry objects.
|
||||
"""
|
||||
rc_1 = cadaster[0:7]
|
||||
rc_2 = cadaster[7:14]
|
||||
url_ref = cls.URL_REF.format(rc_1, rc_2)
|
||||
@ -59,21 +87,21 @@ class ScrapperHTML(Scrapper):
|
||||
html = str(data_ref.decode('utf-8'))
|
||||
parsed_html = BeautifulSoup(html, features="html.parser")
|
||||
|
||||
if delimitacion is None:
|
||||
if prov_num is None:
|
||||
delimitacion_search = re.search(r'del=([0-9]+)&', html)
|
||||
if delimitacion_search:
|
||||
delimitacion = delimitacion_search.group(1)
|
||||
prov_num = delimitacion_search.group(1)
|
||||
|
||||
if municipio is None:
|
||||
if city_num is None:
|
||||
municipio_search = re.search(r'mun=([0-9]+)&', html)
|
||||
if municipio_search:
|
||||
municipio = municipio_search.group(1)
|
||||
city_num = municipio_search.group(1)
|
||||
|
||||
description = parsed_html.find(id='ctl00_Contenido_tblInmueble')
|
||||
|
||||
picture = None
|
||||
if pictures:
|
||||
picture = cls.scrap_site_picture(delimitacion, municipio, ''.join([rc_1, rc_2]))
|
||||
picture = cls.scrap_site_picture(prov_num, city_num, ''.join([rc_1, rc_2]))
|
||||
sleep(config['sleep_time'])
|
||||
|
||||
htmls = []
|
||||
@ -88,7 +116,7 @@ class ScrapperHTML(Scrapper):
|
||||
partial_cadaster_ref = partial_cadaster.find("b")
|
||||
logger.debug("-->Partial cadaster: {}".format(partial_cadaster_ref.text))
|
||||
partial_cadaster_text = partial_cadaster_ref.text.strip()
|
||||
parsed_html = ScrapperHTML.scrap_cadaster_full_code(partial_cadaster_text, delimitacion, municipio)
|
||||
parsed_html = ScrapperHTML.scrap_cadaster_full_code(partial_cadaster_text, prov_num, city_num)
|
||||
htmls.append((parsed_html, picture))
|
||||
sleep(config['sleep_time'])
|
||||
else:
|
||||
|
@ -1,3 +1,6 @@
|
||||
#!/usr/bin/env python
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
import urllib.parse
|
||||
from time import sleep
|
||||
|
||||
@ -14,12 +17,19 @@ logger = CadastroLogger(__name__).logger
|
||||
|
||||
|
||||
class ScrapperXML(Scrapper):
|
||||
"""Class that manages the XML scrapping from the Cadastro webservices """
|
||||
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
|
||||
@classmethod
|
||||
def get_coord(cls,x, y):
|
||||
def scrap_coord(cls, x, y):
|
||||
"""
|
||||
Scraps XML by coordinates
|
||||
:param x: Longitude
|
||||
:param y: Latitude
|
||||
:return: DotMap dictionary with scrapped results
|
||||
"""
|
||||
params = {'SRS': 'EPSG:4326', 'Coordenada_X': x, 'Coordenada_Y': y}
|
||||
url = cls.URL_LOCATIONS_BASE.format("/OVCCoordenadas.asmx/Consulta_RCCOOR")
|
||||
response = requests.get(url, params=params)
|
||||
@ -33,12 +43,18 @@ class ScrapperXML(Scrapper):
|
||||
return xml_dict_map
|
||||
|
||||
@classmethod
|
||||
def get_cadaster_entries_by_cadaster(cls, prov_name, city_name, rc):
|
||||
""" provincia and municipio are optional and can be set to '' """
|
||||
def get_cadaster_entries_by_cadaster(cls, prov_name, city_name, cadaster):
|
||||
"""
|
||||
Scraps XML by cadaster, prov_name (optional) and city_name (optional)
|
||||
:param prov_name: Name of the province (can be set to '')
|
||||
:param city_name: Name of the city (can be set to '')
|
||||
:param cadaster: Cadaster code
|
||||
:return: DotMap dictionary with scrapped results
|
||||
"""
|
||||
|
||||
params = {"Provincia": prov_name,
|
||||
"Municipio": city_name,
|
||||
"RC": rc}
|
||||
"RC": cadaster}
|
||||
|
||||
url = cls.URL_LOCATIONS_BASE.format("/OVCCallejero.asmx/Consulta_DNPRC")
|
||||
logger.debug("URL for entry: {}".format(url + '?' + urllib.parse.urlencode(params)))
|
||||
@ -49,27 +65,40 @@ class ScrapperXML(Scrapper):
|
||||
return DotMap(xmltodict.parse(xml, process_namespaces=False, xml_attribs=False))
|
||||
|
||||
@classmethod
|
||||
def get_cadaster_entries_by_address(cls, provincia, municipio, sigla, calle, numero, bloque=None, escalera=None,
|
||||
planta=None,puerta=None):
|
||||
params = {'Provincia': provincia,
|
||||
'Municipio': municipio,
|
||||
'Sigla': sigla,
|
||||
'Calle': calle,
|
||||
'Numero': str(numero)}
|
||||
if bloque:
|
||||
params['Bloque'] = str(bloque)
|
||||
def get_cadaster_entries_by_address(cls, prov_name, city_name, tv, nv, num, bl=None, es=None,
|
||||
pl=None, pu=None):
|
||||
"""
|
||||
Scraps XML by address
|
||||
:param prov_name: Name of the province (can be set to '')
|
||||
:param city_name: Name of the city (can be set to '')
|
||||
:param tv: Kind of street (CL - Calle, AV - Avenida, etc)
|
||||
:param nv: Name of street
|
||||
:param num: Street number
|
||||
:param bl: Block (Bloque)
|
||||
:param es: Staircase (Escalera)
|
||||
:param pl: Floor (Planta)
|
||||
:param pu: Door (Puerta)
|
||||
:return: DotMap dictionary with scrapped results
|
||||
"""
|
||||
params = {'Provincia': prov_name,
|
||||
'Municipio': city_name,
|
||||
'Sigla': tv,
|
||||
'Calle': nv,
|
||||
'Numero': str(num)}
|
||||
if bl:
|
||||
params['Bloque'] = str(bl)
|
||||
else:
|
||||
params['Bloque'] = ''
|
||||
if escalera:
|
||||
params['Escalera'] = escalera
|
||||
if es:
|
||||
params['Escalera'] = es
|
||||
else:
|
||||
params['Escalera'] = ''
|
||||
if planta:
|
||||
params['Planta'] = str(planta)
|
||||
if pl:
|
||||
params['Planta'] = str(pl)
|
||||
else:
|
||||
params['Planta'] = ''
|
||||
if puerta:
|
||||
params['Puerta'] = str(puerta)
|
||||
if pu:
|
||||
params['Puerta'] = str(pu)
|
||||
else:
|
||||
params['Puerta'] = ''
|
||||
|
||||
|
@ -18,11 +18,22 @@ logger = CadastroLogger(__name__).logger
|
||||
|
||||
|
||||
class CoordinatesSearcher(Searcher):
|
||||
"""
|
||||
Class that inherits from the Searcher abstract class and implements
|
||||
functions regarding coordinates search.
|
||||
"""
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
|
||||
@classmethod
|
||||
def search_by_coordinates(cls, scrapper, filenames, pictures=False):
|
||||
"""
|
||||
Function that searches Cadastro (HTML or XML) by coordinates
|
||||
:param scrapper: HTMLScrapper or XMLScrapper classes
|
||||
:param filenames: Names of the filenames with coordinates to scrap
|
||||
:param pictures: Do we want to scrap house plan pictures?
|
||||
|
||||
"""
|
||||
for r, d, files in os.walk(config['coordinates_path']):
|
||||
for file in files:
|
||||
|
||||
@ -40,6 +51,14 @@ class CoordinatesSearcher(Searcher):
|
||||
|
||||
@classmethod
|
||||
def search_in_polygon(cls, scrapper, polygon, pictures=False):
|
||||
"""
|
||||
Function that searches by coordinates strictly inside a Polygon
|
||||
defined by the user.
|
||||
|
||||
:param scrapper: HTMLScrapper or XMLScrapper classes
|
||||
:param polygon: a GeoPolygon class object
|
||||
:param pictures: Do we want to scrap house plan pictures?
|
||||
"""
|
||||
bb = polygon.get_bounding_box()
|
||||
lon_min = int(bb[0] * config['scale'])
|
||||
lon_max = int(bb[2] * config['scale'])
|
||||
@ -78,6 +97,18 @@ class CoordinatesSearcher(Searcher):
|
||||
|
||||
@staticmethod
|
||||
def search_by_coordinates_max_time(seconds, lon_min, lon_max, lat_min, lat_max, scrapper):
|
||||
"""
|
||||
Function that allows searching in lon, lat for a maximum number of seconds.
|
||||
Mainly used for debugging purposes.
|
||||
|
||||
:param seconds: Total of seconds to scrap
|
||||
:param lon_min: Minimum longitude
|
||||
:param lon_max: Maximum longitude
|
||||
:param lat_min: Minimum latitude
|
||||
:param lat_max: Maximum latitude
|
||||
:param scrapper: HTML or XML Scrapper
|
||||
:return: a List of CadasterEntry objects
|
||||
"""
|
||||
start_time = time.time()
|
||||
results = []
|
||||
|
||||
@ -119,6 +150,18 @@ class CoordinatesSearcher(Searcher):
|
||||
|
||||
@staticmethod
|
||||
def search_by_coordinates_linear_max_n_matches(matches, lon_min, lon_max, lat_min, lat_max, scrapper):
|
||||
"""
|
||||
Function that allows searching in lon, lat for a maximum number of matches.
|
||||
Mainly used for debugging purposes.
|
||||
|
||||
:param matches: Total of matches to scrap
|
||||
:param lon_min: Minimum longitude
|
||||
:param lon_max: Maximum longitude
|
||||
:param lat_min: Minimum latitude
|
||||
:param lat_max: Maximum latitude
|
||||
:param scrapper: HTML or XML Scrapper
|
||||
:return: a List of CadasterEntry objects
|
||||
"""
|
||||
results = []
|
||||
counter = matches
|
||||
|
||||
@ -160,9 +203,21 @@ class CoordinatesSearcher(Searcher):
|
||||
return ListUtils.flat(results)
|
||||
|
||||
@staticmethod
|
||||
def search_by_coordinates_random_max_n_matches(times, lon_min, lon_max, lat_min, lat_max, parser):
|
||||
def search_by_coordinates_random_max_n_matches(matches, lon_min, lon_max, lat_min, lat_max, scrapper):
|
||||
"""
|
||||
Function that allows searching in lon, lat for a maximum number of matches.
|
||||
Mainly used for debugging purposes.
|
||||
|
||||
:param matches: Total of matches to scrap
|
||||
:param lon_min: Minimum longitude
|
||||
:param lon_max: Maximum longitude
|
||||
:param lat_min: Minimum latitude
|
||||
:param lat_max: Maximum latitude
|
||||
:param scrapper: HTML or XML Scrapper
|
||||
:return: a List of CadasterEntry objects
|
||||
"""
|
||||
results = []
|
||||
counter = times
|
||||
counter = matches
|
||||
while counter > 0:
|
||||
|
||||
x = random.randrange(lon_min, lon_max)
|
||||
@ -172,7 +227,7 @@ class CoordinatesSearcher(Searcher):
|
||||
y_scaled = y / config['scale']
|
||||
|
||||
try:
|
||||
cadaster_entry = parser.process_search_by_coordinates(x_scaled, y_scaled)
|
||||
cadaster_entry = scrapper.process_search_by_coordinates(x_scaled, y_scaled)
|
||||
|
||||
if len(cadaster_entry) > 0:
|
||||
results.append(cadaster_entry)
|
||||
@ -194,5 +249,5 @@ class CoordinatesSearcher(Searcher):
|
||||
sleep(config['sleep_time'])
|
||||
|
||||
logger.debug("====PROCESSING FINISHED====")
|
||||
logger.debug("Results found: {}".format(times))
|
||||
logger.debug("Results found: {}".format(matches))
|
||||
return ListUtils.flat(results)
|
||||
|
@ -10,15 +10,31 @@ logger = CadastroLogger(__name__).logger
|
||||
|
||||
|
||||
class ProvincesSearcher(Searcher):
|
||||
"""
|
||||
Class that allows searching Cadastro by provinces, cities, addresses
|
||||
"""
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
|
||||
@classmethod
|
||||
def search_by_provinces(cls, scrapper, prov_list, pictures=False, start_from=''):
|
||||
"""
|
||||
Searches Cadastro by a list of provinces. We can optionally set if we want
|
||||
pictures to be scrapped as well (of the house plan) or if we want to start from
|
||||
a specific city. Example: I want to scrap Madrid province starting alphabetically
|
||||
from 'Fuenlabrada'
|
||||
:param scrapper: XML or HTML Scrapper
|
||||
:param prov_list: List of province names
|
||||
:param pictures: True if we want house plan pictures to be scrapped. False otherwise.
|
||||
:param start_from: Name of the city we want to start from (from the first province)
|
||||
"""
|
||||
scrapper.process_search_by_provinces(prov_list, pictures, start_from)
|
||||
|
||||
@classmethod
|
||||
def list_provinces(cls):
|
||||
"""
|
||||
Lists province names from Cadastro
|
||||
"""
|
||||
dotmap = Scrapper.get_provinces()
|
||||
provinces = dotmap.consulta_provinciero.provinciero.prov
|
||||
for province in provinces:
|
||||
@ -26,6 +42,9 @@ class ProvincesSearcher(Searcher):
|
||||
|
||||
@classmethod
|
||||
def list_cities(cls, prov_name):
|
||||
"""
|
||||
Lists city names from Cadastro
|
||||
"""
|
||||
dotmap = Scrapper.get_cities(prov_name)
|
||||
cities = dotmap.consulta_municipiero.municipiero.muni
|
||||
for city in cities:
|
||||
|
@ -13,7 +13,7 @@ class ParserHTMLTests(unittest.TestCase):
|
||||
self.assertTrue(cadaster.from_elasticsearch())
|
||||
|
||||
def test_search_by_provinces_creates_and_stores_in_elasticsearch(self):
|
||||
cadaster_list = ParserHTML.process_search_by_provinces(['MADRID'], max_times=1)
|
||||
cadaster_list = ParserHTML.process_search_by_provinces(['MADRID'], matches=1)
|
||||
self.assertEqual(len(cadaster_list), 14)
|
||||
for cadaster in cadaster_list:
|
||||
self.assertTrue(cadaster.from_elasticsearch())
|
||||
|
@ -21,7 +21,7 @@ class ParserXMLTests(unittest.TestCase):
|
||||
self.assertTrue(cadaster.from_elasticsearch())
|
||||
|
||||
def test_search_by_provinces_creates_and_stores_in_elasticsearch(self):
|
||||
cadaster_list = ParserXML.process_search_by_provinces(['MADRID'], max_times=1)
|
||||
cadaster_list = ParserXML.process_search_by_provinces(['MADRID'], matches=1)
|
||||
self.assertEqual(len(cadaster_list), 1)
|
||||
sleep(5)
|
||||
for cadaster in cadaster_list:
|
||||
|
@ -111,7 +111,7 @@ class ScrapperXMLTests(unittest.TestCase):
|
||||
nv = u'CANARIAS'
|
||||
num = 7
|
||||
cadaster = ScrapperXML.get_cadaster_by_address(prov_name, city_name, tv, nv, num)
|
||||
self.assertEqual(len(ParserXML.process_xml_by_address(cadaster, prov_name, city_name, tv, nv, False)), 8)
|
||||
self.assertEqual(len(ParserXML.parse_xml_by_address(cadaster, prov_name, city_name, tv, nv, False)), 8)
|
||||
|
||||
def test_multiparcela_address_creates_n_entries_2(self):
|
||||
prov_name = u'MADRID'
|
||||
@ -120,7 +120,7 @@ class ScrapperXMLTests(unittest.TestCase):
|
||||
nv = u'CALVARIO'
|
||||
num = 38
|
||||
cadaster = ScrapperXML.get_cadaster_by_address(prov_name, city_name, tv, nv, num)
|
||||
self.assertEqual(len(ParserXML.process_xml_by_address(cadaster, prov_name, city_name, tv, nv, False)), 8)
|
||||
self.assertEqual(len(ParserXML.parse_xml_by_address(cadaster, prov_name, city_name, tv, nv, False)), 8)
|
||||
|
||||
def test_poligono_or_rural_creates_entry(self):
|
||||
tv = 'CL'
|
||||
@ -129,7 +129,7 @@ class ScrapperXMLTests(unittest.TestCase):
|
||||
prov_name = 'MADRID'
|
||||
city_name = 'AJALVIR'
|
||||
cadaster = ScrapperXML.get_cadaster_by_address(prov_name, city_name, tv, nv, num)
|
||||
self.assertEqual(len(ParserXML.process_xml_by_address(cadaster, prov_name, city_name, tv, nv, False)), 16)
|
||||
self.assertEqual(len(ParserXML.parse_xml_by_address(cadaster, prov_name, city_name, tv, nv, False)), 16)
|
||||
|
||||
def test_coordinates_are_in_good_format(self):
|
||||
tv = 'CL'
|
||||
@ -138,7 +138,7 @@ class ScrapperXMLTests(unittest.TestCase):
|
||||
prov_name = 'MADRID'
|
||||
city_name = 'GALAPAGAR'
|
||||
xml = ScrapperXML.get_cadaster_by_address(prov_name, city_name, tv, nv, num)
|
||||
cadaster_entry = ParserXML.process_xml_by_address(xml, prov_name, city_name, tv, nv, False)
|
||||
cadaster_entry = ParserXML.parse_xml_by_address(xml, prov_name, city_name, tv, nv, False)
|
||||
self.assertEqual(cadaster_entry[0].location.lat, 40.6249762551374)
|
||||
self.assertEqual(cadaster_entry[0].location.lon, -4.02755522611211)
|
||||
|
||||
@ -149,7 +149,7 @@ class ScrapperXMLTests(unittest.TestCase):
|
||||
prov_name = 'ALACANT'
|
||||
city_name = 'ALICANTE/ALACANT'
|
||||
xml = ScrapperXML.get_cadaster_by_address(prov_name, city_name, tv, nv, num)
|
||||
cadaster_entries = ParserXML.process_xml_by_address(xml, prov_name, city_name, tv, nv, False)
|
||||
cadaster_entries = ParserXML.parse_xml_by_address(xml, prov_name, city_name, tv, nv, False)
|
||||
for cadaster_entry in cadaster_entries:
|
||||
self.assertEqual(cadaster_entry.location.lat, 38.3495195831056)
|
||||
self.assertEqual(cadaster_entry.location.lon, -0.484612452235845)
|
||||
|
@ -1,3 +1,6 @@
|
||||
#!/usr/bin/env python
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
import logging
|
||||
import logging.config
|
||||
import sys
|
||||
@ -40,5 +43,4 @@ class CadastroLogger:
|
||||
self.logger.addHandler(debug_file_handler)
|
||||
self.logger.addHandler(error_file_handler)
|
||||
self.logger.addHandler(tracking_file_handler)
|
||||
pass
|
||||
|
||||
|
@ -1,3 +1,6 @@
|
||||
#!/usr/bin/env python
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
from dotmap import DotMap
|
||||
from elasticsearch import Elasticsearch
|
||||
|
||||
@ -15,6 +18,9 @@ class ElasticSearchUtils:
|
||||
|
||||
@staticmethod
|
||||
def create_index():
|
||||
"""
|
||||
Creates index in ElasticSearch
|
||||
"""
|
||||
ElasticSearchUtils.remove_index()
|
||||
es = Elasticsearch()
|
||||
request_body = {
|
||||
@ -78,6 +84,9 @@ class ElasticSearchUtils:
|
||||
|
||||
@staticmethod
|
||||
def remove_index():
|
||||
"""
|
||||
Removes index from ElasticSearch
|
||||
"""
|
||||
es = Elasticsearch()
|
||||
logger.debug("Deleting 'cadaster' index...")
|
||||
try:
|
||||
@ -90,6 +99,13 @@ class ElasticSearchUtils:
|
||||
|
||||
@staticmethod
|
||||
def check_if_address_present(address, city_name, province_name):
|
||||
"""
|
||||
Checks if an address has been already scrapped (to skip it).
|
||||
:param address: full address (including tipo de via, nombre de via ...)
|
||||
:param city_name: City Name
|
||||
:param province_name: Province Name
|
||||
:return: True if already scrapped, False otherwise
|
||||
"""
|
||||
res = False
|
||||
query = {"query":
|
||||
{"bool":
|
||||
|
@ -1,7 +1,13 @@
|
||||
#!/usr/bin/env python
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
import json
|
||||
|
||||
|
||||
class JSONEncoder(json.JSONEncoder):
|
||||
"""
|
||||
Class that recursively encodes classes into json dictionaries
|
||||
"""
|
||||
def default(self, obj):
|
||||
if hasattr(obj, 'to_json'):
|
||||
return obj.to_json()
|
||||
|
@ -1,3 +1,7 @@
|
||||
#!/usr/bin/env python
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
|
||||
class ListUtils:
|
||||
""" Different functions to make working with lists easier"""
|
||||
def __init__(self):
|
||||
@ -5,4 +9,9 @@ class ListUtils:
|
||||
|
||||
@staticmethod
|
||||
def flat(non_flat_list):
|
||||
"""
|
||||
Flattens a multilevel list [[], []...] -> [, , , ]
|
||||
:param non_flat_list: Multilevel list
|
||||
:return: A flattened list
|
||||
"""
|
||||
return [item for sublist in non_flat_list for item in sublist]
|
||||
|
Loading…
Reference in New Issue
Block a user