Fixes XML scraping to handle optional arguments. Removes bounding boxes, to be eventually replaced with polygons. Adds parameters to process by province.

J 2019-09-18 18:24:53 +02:00
parent 0478146b27
commit 9f7d5fda51
28 changed files with 282 additions and 344 deletions

View File

@@ -16,6 +16,6 @@ services:
ports:
- "9200:9200"
kibana:
image: docker.elastic.co/kibana/kibana:6.3.2
image: docker.elastic.co/geometry/geometry:6.3.2
ports:
- "5601:5601"

View File

@@ -7,11 +7,11 @@ from src.librecatastro.scrapping.scrapper_xml import ScrapperXML
if __name__ == "__main__":
parser = argparse.ArgumentParser(description='Runs the Cadastro Parser')
parser.add_argument('--coords', dest='coords', default=False, action='store_true')
parser.add_argument('--filename', nargs=1, dest='filename', default='')
parser.add_argument('--filenames', action='store', nargs='+', dest='filenames', default=[])
parser.add_argument('--provinces', action='store', nargs='+', dest='provinces', default=[])
args = parser.parse_args(sys.argv[1:])
if args.coords:
ScrapperHTML.scrap_all_coordinates_files(args['filename'])
ScrapperHTML.scrap_all_coordinates_files(args.filenames)
else:
ScrapperXML.scrap_all_addresses()
ScrapperXML.scrap_all_addresses(args.provinces)
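For reference, a hedged sketch of how the new flags behave. The argparse definitions are copied from the diff above; the argv values and province names are illustrative only:

import argparse

# Same definitions as in the diff above.
parser = argparse.ArgumentParser(description='Runs the Cadastro Parser')
parser.add_argument('--coords', dest='coords', default=False, action='store_true')
parser.add_argument('--filenames', action='store', nargs='+', dest='filenames', default=[])
parser.add_argument('--provinces', action='store', nargs='+', dest='provinces', default=[])

# Equivalent of invoking the launcher with: --provinces MADRID BARCELONA
args = parser.parse_args(['--provinces', 'MADRID', 'BARCELONA'])
assert args.provinces == ['MADRID', 'BARCELONA'] and not args.coords and args.filenames == []

With --coords, --filenames restricts HTML scraping to the given coordinate files; without it, --provinces restricts XML scraping to the named provinces.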

View File

@@ -1,3 +1,4 @@
shapely
beautifulsoup4==4.8.0
elasticsearch>=6.0.0,<7.0.0
requests==2.22.0

View File

@@ -1,15 +0,0 @@
{
"geo_bounding_box": {
"ignore_unmapped": true,
"location": {
"bottom_right": {
"lat": 36.686041276581925,
"lon": -2.0214843750000004
},
"top_left": {
"lat": 38.324420427006544,
"lon": -7.514648437500001
}
}
}
}

View File

@@ -1,15 +0,0 @@
{
"geo_bounding_box": {
"ignore_unmapped": true,
"location": {
"bottom_right": {
"lat": 35.995785386420344,
"lon": -2.0434570312500004
},
"top_left": {
"lat": 37.37015718405753,
"lon": -6.833496093750001
}
}
}
}

View File

@@ -1,15 +0,0 @@
{
"geo_bounding_box": {
"ignore_unmapped": true,
"location": {
"bottom_right": {
"lat": 38.631890929028370,
"lon": 4.361572265625001
},
"top_left": {
"lat": 40.101185062587010,
"lon": 1.208496093750000
}
}
}
}

View File

@@ -1,15 +0,0 @@
{
"geo_bounding_box": {
"ignore_unmapped": true,
"location": {
"bottom_right": {
"lat": 27.615406013399590,
"lon":-13.403320312500002
},
"top_left": {
"lat": 29.458731185355344,
"lon":-18.160400390625004
}
}
}
}

View File

@@ -1,15 +0,0 @@
{
"geo_bounding_box": {
"ignore_unmapped": true,
"location": {
"bottom_right": {
"lat": 37.57941251343841,
"lon": -0.9008789062500001
},
"top_left": {
"lat": 41.983994270935625,
"lon": -6.218261718750001
}
}
}
}

View File

@@ -1,15 +0,0 @@
{
"geo_bounding_box": {
"ignore_unmapped": true,
"location": {
"bottom_right": {
"lat": 35.869994909901720,
"lon": -5.275497436523438
},
"top_left": {
"lat": 35.922281333698294,
"lon": -5.383987426757813
}
}
}
}

View File

@@ -1,15 +0,0 @@
{
"geo_bounding_box": {
"ignore_unmapped": true,
"location": {
"bottom_right": {
"lat": 27.638523614271946,
"lon": -17.880249023437504
},
"top_left": {
"lat": 27.847576211806295,
"lon": -18.157653808593754
}
}
}
}

View File

@@ -1,15 +0,0 @@
{
"geo_bounding_box": {
"ignore_unmapped": true,
"location": {
"bottom_right": {
"lat": 27.72486719795934,
"lon": -15.353393554687502
},
"top_left": {
"lat": 28.173717624327864,
"lon": -15.839538574218752
}
}
}
}

View File

@@ -1,15 +0,0 @@
{
"geo_bounding_box": {
"ignore_unmapped": true,
"location": {
"bottom_right": {
"lat": 35.264683153268145,
"lon": -2.927513122558594
},
"top_left": {
"lat": 35.321008047212080,
"lon": -2.972831726074218
}
}
}
}

View File

@@ -1,15 +0,0 @@
{
"geo_bounding_box": {
"ignore_unmapped": true,
"location": {
"bottom_right": {
"lat": 37.56199695314352,
"lon": -0.81298828125
},
"top_left": {
"lat": 38.87392853923632,
"lon": -2.0214843750000004
}
}
}
}

View File

@@ -1,15 +0,0 @@
{
"geo_bounding_box": {
"ignore_unmapped": true,
"location": {
"bottom_right": {
"lat": 42.081916678306335,
"lon": -1.7358398437500002
},
"top_left": {
"lat": 43.27720532212024,
"lon": -8.679199218750002
}
}
}
}

View File

@@ -1,15 +0,0 @@
{
"geo_bounding_box": {
"ignore_unmapped": true,
"location": {
"bottom_right": {
"lat": 41.705728515237524,
"lon": 3.1201171875
},
"top_left": {
"lat": 42.45588764197166,
"lon": -2.7685546875000004
}
}
}
}

View File

@@ -1,15 +0,0 @@
{
"geo_bounding_box": {
"ignore_unmapped": true,
"location": {
"bottom_right": {
"lat": 39.257778150283364,
"lon": 3.4881591796875004
},
"top_left": {
"lat": 39.96870074491696,
"lon": 2.3098754882812504
}
}
}
}

View File

@@ -1,15 +0,0 @@
{
"geo_bounding_box": {
"ignore_unmapped": true,
"location": {
"bottom_right": {
"lat": 37.17782559332976,
"lon": -6.04248046875
},
"top_left": {
"lat": 42.27730877423709,
"lon": -8.811035156250002
}
}
}
}

View File

@@ -1,15 +0,0 @@
{
"geo_bounding_box": {
"ignore_unmapped": true,
"location": {
"bottom_right": {
"lat": 37.57941251343841,
"lon": 0.21972656250000003
},
"top_left": {
"lat": 41.32732632036624,
"lon": -1.2304687500000002
}
}
}
}

View File

@@ -14,25 +14,33 @@ logger = CadastroLogger(__name__).logger
class CadasterEntryXML(CadasterEntry):
"""Cadaster class, that stores all the information about a surface and its properties"""
def __init__(self, xml, lon, lat):
def __init__(self, xml, lon, lat, is_property=True):
self.address = Address(xml['consulta_dnp']['bico']['bi']['ldt'])
self.cadaster = xml['consulta_dnp']['bico']['bi']['idbi']['rc']['pc1'] + \
xml['consulta_dnp']['bico']['bi']['idbi']['rc']['pc2'] + \
xml['consulta_dnp']['bico']['bi']['idbi']['rc']['car'] + \
xml['consulta_dnp']['bico']['bi']['idbi']['rc']['cc1'] + \
xml['consulta_dnp']['bico']['bi']['idbi']['rc']['cc2']
self.year = xml['consulta_dnp']['bico']['bi']['debi']['ant']
xml['consulta_dnp']['bico']['bi']['idbi']['rc']['pc2'] + \
xml['consulta_dnp']['bico']['bi']['idbi']['rc']['car'] + \
xml['consulta_dnp']['bico']['bi']['idbi']['rc']['cc1'] + \
xml['consulta_dnp']['bico']['bi']['idbi']['rc']['cc2']
type = xml['consulta_dnp']['bico']['bi']['idbi']['cn']
self.type = u'Urbano' if type == 'UR' else u'Rústico'
self.use = xml['consulta_dnp']['bico']['bi']['debi']['luso']
self.surface = xml['consulta_dnp']['bico']['bi']['debi']['sfc'] + 'm2'
self.year = xml['consulta_dnp']['bico']['bi']['debi']['ant']
self.year = xml['consulta_dnp']['bico']['bi']['debi']['ant'] \
if 'debi' in xml['consulta_dnp']['bico']['bi'] and\
'ant' in xml['consulta_dnp']['bico']['bi']['debi'] else None
self.type = xml['consulta_dnp']['bico']['bi']['idbi']['cn'] if 'cn' in xml['consulta_dnp']['bico']['bi']['idbi'] else None
if self.type is not None:
self.type = u'Urbano' if self.type == 'UR' else u'Rústico'
self.use = xml['consulta_dnp']['bico']['bi']['debi']['luso'] if 'luso' in xml['consulta_dnp']['bico']['bi']['debi'] else None
self.surface = xml['consulta_dnp']['bico']['bi']['debi']['sfc'] + 'm2' if 'sfc' in xml['consulta_dnp']['bico']['bi']['debi'] else None
self.location = Location(lon, lat)
self.gsurface = config['not_available_via_XML']
self.constructions = []
constructions = xml['consulta_dnp']['bico']['lcons']['cons']
constructions = []
if 'lcons' in xml['consulta_dnp']['bico']:
constructions = xml['consulta_dnp']['bico']['lcons']['cons']
''' Bad XML design, instead of returning a list with 1 element, it returns
the element'''
@@ -40,20 +48,17 @@ class CadasterEntryXML(CadasterEntry):
constructions = [constructions]
for construction in constructions:
use = construction['lcd']
doorway = construction['dt']['lourb']['loint']['es']
floor = construction['dt']['lourb']['loint']['pt']
door = construction['dt']['lourb']['loint']['pt']
surface = construction['dfcons']['stl']
use = construction['lcd'] if 'lcd' in construction else None
doorway = construction['dt']['lourb']['loint']['es'] if 'dt' in construction else None
floor = construction['dt']['lourb']['loint']['pt'] if 'dt' in construction else None
door = construction['dt']['lourb']['loint']['pu'] if 'dt' in construction else None
surface = construction['dfcons']['stl'] if 'dfcons' in construction and 'stl' in construction['dfcons'] else None
reform_type = config['not_available_via_XML']
reform_date = config['not_available_via_XML']
self.constructions.append(Construction(dict(uso=use, escalera=doorway, planta=floor, puerta=door, superficie=surface, tipo=reform_type, fecha=reform_date)))
self.constructions.append(Construction(
dict(uso=use, escalera=doorway, planta=floor, puerta=door, superficie=surface, tipo=reform_type,
fecha=reform_date)))
self.timestamp = str(datetime.now())
super().__init__(self)
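Every optional field above is guarded with a chain of 'key' in dict checks. A minimal sketch of a hypothetical helper (not part of this commit) that captures the same pattern over xmltodict output:

def get_nested(data, *keys, default=None):
    """Walk nested dicts parsed from xmltodict; return default if any key is missing."""
    for key in keys:
        if not isinstance(data, dict) or key not in data:
            return default
        data = data[key]
    return data

# e.g. the guarded 'year' lookup above would become:
# self.year = get_nested(xml, 'consulta_dnp', 'bico', 'bi', 'debi', 'ant')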

View File

@@ -8,25 +8,25 @@ from src.utils.cadastro_logger import CadastroLogger
logger = CadastroLogger(__name__).logger
class KibanaGeoBoundingBox:
class GeoBoundingBox:
def __init__(self, data):
self.data = json.loads(data, object_hook=lambda d: namedtuple('X', d.keys())(*d.values()))
def get_coordinates_tuple(self):
return KibanaGeoBoundingBox.get_coordinates_tuple_static(self.data)
return GeoBoundingBox.get_bb_from_file_static(self.data)
@staticmethod
def get_coordinates_tuple_static(data):
def get_bb_from_file_static(data):
location = data.geo_bounding_box.location
return int(location.top_left.lon * config['scale']), int(location.bottom_right.lon * config['scale']), int(location.bottom_right.lat * config['scale']), int(location.top_left.lat * config['scale'])
@staticmethod
def get_coordinate_tuple_from_file(file):
def get_bb_from_file(file):
f = open(file, "r")
content = f.read()
try:
data = json.loads(content, object_hook=lambda d: namedtuple('X', d.keys())(*d.values()))
return KibanaGeoBoundingBox.get_coordinates_tuple_static(data)
return GeoBoundingBox.get_bb_from_file_static(data)
except:
logger.error("{} is not formatted properly. Please take a look at the examples.".format(file))
return None

View File

@@ -0,0 +1,34 @@
import json
from collections import namedtuple
from shapely.geometry import Point, Polygon
from src.utils.cadastro_logger import CadastroLogger
'''Logger'''
logger = CadastroLogger(__name__).logger
class GeoPolygon:
def __init__(self, file):
self.polygon = None
try:
with open(file, "r") as f:
content = f.read()
data = json.loads(content, object_hook=lambda d: namedtuple('X', d.keys())(*d.values()))
points = data.geo_polygon.location.points
points_list = []
for point in points:
points_list.append((point.lon, point.lat))
self.polygon = Polygon(points_list)
except Exception as e:
logger.error("{} is not formatted properly. Please take a look at the examples.".format(file))
def is_point_in_polygon(self, lon, lat):
p = Point(lon, lat)
return self.polygon.contains(p)
def get_bounding_box(self):
pass
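get_bounding_box is left as a stub here. A minimal sketch of what it could return, assuming shapely's Polygon.bounds property, which yields (minx, miny, maxx, maxy), i.e. (min_lon, min_lat, max_lon, max_lat) for points stored as (lon, lat):

    def get_bounding_box(self):
        # shapely exposes the envelope as (minx, miny, maxx, maxy);
        # with (lon, lat) points this is (min_lon, min_lat, max_lon, max_lat).
        return self.polygon.bounds if self.polygon is not None else None

Note the parser above expects input JSON shaped like {"geo_polygon": {"location": {"points": [{"lon": ..., "lat": ...}, ...]}}}.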

View File

@@ -1,10 +1,3 @@
import re
from urllib.request import urlopen
from bs4 import BeautifulSoup
from src.librecatastro.domain.cadaster_entry.cadaster_entry_html import CadasterEntryHTML
from src.settings import config
from src.utils.cadastro_logger import CadastroLogger
'''Logger'''

View File

@@ -11,7 +11,8 @@ from xml.etree import ElementTree
from bs4 import BeautifulSoup
from src.librecatastro.domain.cadaster_entry.cadaster_entry_html import CadasterEntryHTML
from src.librecatastro.domain.kibana_geo_bounding_box import KibanaGeoBoundingBox
from src.librecatastro.domain.geometry.geo_bounding_box import GeoBoundingBox
from src.librecatastro.domain.geometry.geo_polygon import GeoPolygon
from src.librecatastro.scrapping.scrapper import Scrapper
from src.settings import config
@@ -27,43 +28,35 @@ class ScrapperHTML(Scrapper):
def __init__(self):
super().__init__()
pass
""" Scrapping main calls """
@staticmethod
def scrap_all_coordinates_files(filename=''):
@classmethod
def scrap_all_coordinates_files(cls, filenames):
for r, d, files in os.walk(config['coordinates_path']):
for file in files:
if '.json' in file and ((filename != '' and file == filename) or filename == ''):
f = open(os.path.join(config['coordinates_path'], file), "r")
content = f.read()
try:
bb = KibanaGeoBoundingBox(content)
coordinates_tuple = bb.get_coordinates_tuple()
ScrapperHTML.scrap_range_of_coordinates(coordinates_tuple[0], coordinates_tuple[1],
coordinates_tuple[2], coordinates_tuple[3])
except:
logger.error("{} is not formatted properly. Please take a look at the examples.".format(file))
if len(filenames) > 0 and file not in filenames:
continue
if '.json' not in file:
continue
try:
polygon = GeoPolygon(os.path.join(config['coordinates_path'], file))
ScrapperHTML.scrap_polygon(polygon)
except:
logger.error("{} is not formatted properly. Please take a look at the examples.".format(file))
@staticmethod
def scrap_all_coordinates_files(filename=''):
for r, d, files in os.walk(config['coordinates_path']):
for file in files:
if '.json' in file and ((filename != '' and file == filename) or filename == ''):
f = open(os.path.join(config['coordinates_path'], file), "r")
content = f.read()
try:
bb = KibanaGeoBoundingBox(content)
coordinates_tuple = bb.get_coordinates_tuple()
ScrapperHTML.scrap_range_of_coordinates(coordinates_tuple[0], coordinates_tuple[1], coordinates_tuple[2], coordinates_tuple[3])
except:
logger.error("{} is not formatted properly. Please take a look at the examples.".format(file))
@staticmethod
def scrap_range_of_coordinates(long_min, long_max, lat_min, lat_max):
for x in range(long_min, long_max):
def scrap_polygon(polygon):
bb = polygon.get_bounding_box()
lon_min = 0
lon_max = 0
lat_min = 0
lat_max = 0
for x in range(lon_min, lon_max):
for y in range(lat_min, lat_max):
x_scaled = x / config['scale']
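The zeroed lon/lat bounds in scrap_polygon are placeholders. A hedged sketch of how the ranges might be derived once get_bounding_box is implemented, assuming it returns (min_lon, min_lat, max_lon, max_lat) as in the sketch above; scrap_coordinate is a hypothetical stand-in for the per-point work in the hunks below:

from src.settings import config

def scrap_coordinate(lon, lat):
    """Hypothetical stand-in for the per-point request/parse logic."""
    pass

def scrap_polygon(polygon):
    # Hypothetical completion: scale the polygon's bounds to the integer
    # grid used by the old bounding-box code, then walk every grid cell.
    min_lon, min_lat, max_lon, max_lat = polygon.get_bounding_box()
    scale = config['scale']
    for x in range(int(min_lon * scale), int(max_lon * scale)):
        for y in range(int(min_lat * scale), int(max_lat * scale)):
            x_scaled, y_scaled = x / scale, y / scale
            # Skip grid points that fall outside the polygon itself.
            if not polygon.is_point_in_polygon(x_scaled, y_scaled):
                continue
            scrap_coordinate(x_scaled, y_scaled)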
@@ -83,16 +76,17 @@ class ScrapperHTML(Scrapper):
logger.error("ERROR AT LONGITUDE {} LATITUDE {}".format(x_scaled, y_scaled))
logger.error("=============================================")
logger.error(e, exc_info=True)
logger.error("...sleeping...")
logger.error("=============================================")
''' Could be a service Unavailable or denegation of service'''
sleep(300)
sleep(config['sleep_dos_time'])
except Exception as e:
logger.error("ERROR AT LONGITUDE {} LATITUDE {}".format(x_scaled, y_scaled))
logger.error("=============================================")
logger.error(e, exc_info=True)
logger.error("=============================================")
sleep(5)
sleep(config['sleep_time'])
@staticmethod
def scrap_results_by_time(seconds, lon_min, lon_max, lat_min, lat_max):
@@ -123,13 +117,13 @@ class ScrapperHTML(Scrapper):
logger.error(e, exc_info=True)
logger.error("=============================================")
''' Could be a service Unavailable or denegation of service'''
sleep(300)
sleep(config['sleep_dos_time'])
except Exception as e:
logger.error("ERROR AT LONGITUDE {} LATITUDE {}".format(x_scaled, y_scaled))
logger.error("=============================================")
logger.error(e, exc_info=True)
logger.error("=============================================")
sleep(5)
sleep(config['sleep_time'])
if finished:
break
@@ -164,13 +158,13 @@ class ScrapperHTML(Scrapper):
logger.error(e, exc_info=True)
logger.error("=============================================")
''' Could be a service Unavailable or denegation of service'''
sleep(300)
sleep(config['sleep_dos_time'])
except Exception as e:
logger.error("ERROR AT LONGITUDE {} LATITUDE {}".format(x_scaled, y_scaled))
logger.error("=============================================")
logger.error(e, exc_info=True)
logger.error("=============================================")
sleep(5)
sleep(config['sleep_time'])
if finished:
break
@@ -202,19 +196,16 @@ class ScrapperHTML(Scrapper):
logger.error(e, exc_info=True)
logger.error("=============================================")
''' Could be a service Unavailable or denegation of service'''
sleep(300)
sleep(config['sleep_dos_time'])
except Exception as e:
logger.error("ERROR AT LONGITUDE {} LATITUDE {}".format(x_scaled, y_scaled))
logger.error("=============================================")
logger.error(e, exc_info=True)
logger.error("=============================================")
sleep(5)
sleep(config['sleep_time'])
#ontology_converter = OntologyConverter()
#print(ontology_converter.cadastro_dict_to_ontology(results))
logger.debug("====PROCESSING FINISHED====")
logger.debug("Results found: {}".format(times))
#logger.debug(results)
return ListUtils.flat(results)
@staticmethod

View File

@@ -1,10 +1,13 @@
import urllib.parse
from urllib import error
from time import sleep
import requests
import xmltodict as xmltodict
from src.librecatastro.domain.cadaster_entry.cadaster_entry_xml import CadasterEntryXML
from src.settings import config
from src.utils.cadastro_logger import CadastroLogger
'''Logger'''
@@ -21,11 +24,19 @@ class ScrapperXML:
pass
""" Scrapping main calls """
@classmethod
def scrap_all_addresses(cls):
def scrap_all_addresses(cls, prov_list):
"""Scraps properties by addresses. ONLY URBAN"""
provinces = ScrapperXML.get_provinces()['consulta_provinciero']['provinciero']['prov']
for province in provinces:
prov_name = province['np']
if len(prov_list) > 0 and prov_name not in prov_list:
continue
cities = ScrapperXML.get_cities(prov_name)['consulta_municipiero']['municipiero']['muni']
for city in cities:
city_name = city['nm']
@@ -38,53 +49,100 @@ class ScrapperXML:
num_scrapping_fails = 10
counter = 1
while num_scrapping_fails > 0:
cadaster = ScrapperXML.get_cadaster_by_address(prov_name, city_name, tv, nv, counter)
if 'lerr' in cadaster['consulta_numerero'] and \
'err' in cadaster['consulta_numerero']['lerr'] and \
'cod' in cadaster['consulta_numerero']['lerr']['err'] and \
cadaster['consulta_numerero']['lerr']['err']['cod'] == '43':
try:
cadaster = ScrapperXML.get_cadaster_by_address(prov_name, city_name, tv, nv, counter)
if 'lerr' in cadaster['consulta_numerero'] and \
'err' in cadaster['consulta_numerero']['lerr'] and \
'cod' in cadaster['consulta_numerero']['lerr']['err'] and \
cadaster['consulta_numerero']['lerr']['err']['cod'] == '43':
num_scrapping_fails -= 1
else:
logger.debug("||||| ] FOUND!")
numps = cadaster['consulta_numerero']['numerero']['nump']
if not isinstance(numps, list):
numps = [numps]
for nump in numps:
num = nump['num']['pnp']
cadaster_num = nump['pc']['pc1'] + nump['pc']['pc2']
coords = ScrapperXML.get_coords_from_cadaster(prov_name, city_name,
cadaster_num)
lon = coords['consulta_coordenadas']['coordenadas']['coord']['geo']['xcen']
lat = coords['consulta_coordenadas']['coordenadas']['coord']['geo']['ycen']
''' Adding to tracking file'''
logger.info('{},{}'.format(lon, lat))
num_scrapping_fails = 10
entry = ScrapperXML.get_cadaster_entries_by_address(prov_name, city_name, tv,
nv, num)
if 'bico' in entry['consulta_dnp']:
# Parcela
cadaster_entry = CadasterEntryXML(entry, lon, lat)
cadaster_entry.to_elasticsearch()
elif 'lrcdnp' in entry['consulta_dnp']:
# Multiparcela
for site in entry['consulta_dnp']['lrcdnp']['rcdnp']:
cadaster = site['rc']['pc1'] + \
site['rc']['pc2'] + \
site['rc']['car'] + \
site['rc']['cc1'] + \
site['rc']['cc2']
sub_entry = ScrapperXML.get_cadaster_entries_by_cadaster(prov_name,
city_name,
cadaster)
cadaster_entry = CadasterEntryXML(sub_entry, lon, lat)
cadaster_entry.to_elasticsearch()
sleep(config['sleep_time'])
logger.debug("[|||||||||||] SUCCESS!")
sleep(config['sleep_time'])
except urllib.error.HTTPError as e:
logger.error(
"ERROR AT ADDRESS {} {} {} {} {}".format(tv, nv, num, prov_name, city_name))
logger.error("=============================================")
logger.error(e, exc_info=True)
logger.error("...sleeping...")
logger.error("=============================================")
''' Could be a service Unavailable or denegation of service'''
num_scrapping_fails -= 1
else:
logger.debug("||||| ] FOUND!")
sleep(config['sleep_dos_time'])
num = cadaster['consulta_numerero']['numerero']['nump']['num']['pnp']
cadaster_num = cadaster['consulta_numerero']['numerero']['nump']['pc']['pc1'] + \
cadaster['consulta_numerero']['numerero']['nump']['pc']['pc2']
coords = ScrapperXML.get_coords_from_cadaster(prov_name, city_name, cadaster_num)
lon = coords['consulta_coordenadas']['coordenadas']['coord']['geo']['xcen']
lat = coords['consulta_coordenadas']['coordenadas']['coord']['geo']['ycen']
''' Adding to tracking file'''
logger.info('{},{}'.format(lon, lat))
num_scrapping_fails = 10
entry = ScrapperXML.get_cadaster_entries_by_address(prov_name, city_name, tv, nv, num)
cadaster_entry = CadasterEntryXML(entry, lon, lat)
cadaster_entry.to_elasticsearch()
logger.debug("[|||||||||||] SUCCESS!")
except Exception as e:
logger.error(
"ERROR AT ADDRESS {} {} {} {} {}".format(tv, nv, num, prov_name, city_name))
logger.error("=============================================")
logger.error(e, exc_info=True)
logger.error("=============================================")
num_scrapping_fails -= 1
counter += 1
sleep(5)
sleep(config['sleep_time'])
return
""" Scrapping secondary calls """
@classmethod
def get_coords_from_cadaster(cls, provincia, municipio, cadaster):
params = {'Provincia': provincia, 'Municipio': municipio, 'SRS': 'EPSG:4230', 'RC': cadaster}
url = cls.URL_LOCATIONS_BASE.format("/OVCCoordenadas.asmx/Consulta_CPMRC")
logger.debug("[|||||||| ] URL for coords: {} Params: {}".format(url, params))
logger.debug("[|||||||| ] URL for coords: {}".format(url + '?' + urllib.parse.urlencode(params)))
response = requests.get(url, params=params)
xml = response.content
return xmltodict.parse(xml, process_namespaces=False, xml_attribs=False)
@classmethod
def get_provinces(cls):
url = cls.URL_LOCATIONS_BASE.format("/OVCCallejero.asmx/ConsultaProvincia")
@@ -92,6 +150,7 @@ class ScrapperXML:
xml = response.content
return xmltodict.parse(xml, process_namespaces=False, xml_attribs=False)
@classmethod
def get_cities(cls, provincia, municipio=None):
params = {'Provincia': provincia}
@@ -104,9 +163,9 @@ class ScrapperXML:
xml = response.content
return xmltodict.parse(xml, process_namespaces=False, xml_attribs=False)
@classmethod
def get_addresses(cls, provincia, municipio, tipovia=None, nombrevia=None):
params = {'Provincia': provincia,
'Municipio': municipio}
if tipovia:
@@ -123,6 +182,7 @@ class ScrapperXML:
xml = response.content
return xmltodict.parse(xml, process_namespaces=False, xml_attribs=False)
@classmethod
def get_cadaster_by_address(cls, provincia, municipio, tipovia, nombrevia, numero):
params = {'Provincia': provincia,
@@ -134,16 +194,17 @@ class ScrapperXML:
url = cls.URL_LOCATIONS_BASE.format("/OVCCallejero.asmx/ConsultaNumero")
logger.debug("====Dir: {} {} {} {} {}====".format(tipovia, nombrevia, numero, municipio, provincia))
logger.debug("[||| ] URL for address: {} Params: {}".format(url, params))
logger.debug("[||| ] URL for address: {}".format(url + '?' + urllib.parse.urlencode(params)))
response = requests.get(url, params=params)
xml = response.content
return xmltodict.parse(xml, process_namespaces=False, xml_attribs=False)
@classmethod
def get_cadaster_entries_by_address(cls, provincia, municipio, sigla, calle, numero, bloque=None, escalera=None, planta=None,
puerta=None):
@classmethod
def get_cadaster_entries_by_address(cls, provincia, municipio, sigla, calle, numero, bloque=None, escalera=None,
planta=None,
puerta=None):
params = {'Provincia': provincia,
'Municipio': municipio,
'Sigla': sigla,
@@ -167,12 +228,13 @@ class ScrapperXML:
params['Puerta'] = ''
url = cls.URL_LOCATIONS_BASE.format("/OVCCallejero.asmx/Consulta_DNPLOC")
logger.debug("[|||||||||| ] URL for entry: {} Params: {}".format(url, params))
logger.debug("[|||||||||| ] URL for entry: {}".format(url + '?' + urllib.parse.urlencode(params)))
response = requests.get(url, params=params)
xml = response.content
return xmltodict.parse(xml, process_namespaces=False, xml_attribs=False)
@classmethod
def get_cadaster_entries_by_cadaster(cls, provincia, municipio, rc):
params = {"Provincia": provincia,
@@ -180,10 +242,12 @@ class ScrapperXML:
"RC": rc}
url = cls.URL_LOCATIONS_BASE.format("/OVCCallejero.asmx/Consulta_DNPRC")
logger.debug("[|||||||||| ] URL for entry: {}".format(url + '?' + urllib.parse.urlencode(params)))
response = requests.get(url, params=params)
xml = response.content
return xmltodict.parse(xml, process_namespaces=False, xml_attribs=False)
@classmethod
def Consulta_DNPPP(cls, provincia, municipio, poligono, parcela):
"""Proporciona los datos catastrales no protegidos de un inmueble
@@ -207,6 +271,7 @@ class ScrapperXML:
response = requests.get(url, params=params)
return xmltodict.parse(response.content, process_namespaces=False, xml_attribs=False)
@classmethod
def Consulta_DNPLOC_Codigos(cls, provincia, municipio, sigla, nombrevia, numero, bloque=None, escalera=None,
planta=None, puerta=None):
@@ -257,6 +322,7 @@ class ScrapperXML:
response = requests.get(url, params=params)
return xmltodict.parse(response.content, process_namespaces=False, xml_attribs=False)
@classmethod
def Consulta_DNPRC_Codigos(cls, provincia, municipio, rc):
"""Proporciona los datos catastrales de un inmueble,
@@ -279,6 +345,7 @@ class ScrapperXML:
response = requests.get(url, params=params)
return xmltodict.parse(response.content, process_namespaces=False, xml_attribs=False)
@classmethod
def Consulta_DNPPP_Codigos(cls, provincia, municipio, poligono, parcela):
"""Proporciona los datos catastrales de un inmueble.

View File

@@ -9,5 +9,7 @@ config = {
"tracking_log_file": os.path.join(root_path, 'logs', 'track'),
"scale": 1000000,
"coordinates_path": os.path.join(root_path, 'coordinates'),
"not_available_via_XML": "(Not available via XML)"
"not_available_via_XML": "(Not available via XML)",
"sleep_time": 5,
"sleep_dos_time": 300
}

View File

@@ -1,8 +1,11 @@
import os
import unittest
from shapely.geometry import Point
from src.librecatastro.domain.geometry.geo_polygon import GeoPolygon
from src.librecatastro.scrapping.scrapper_html import ScrapperHTML
from src.librecatastro.domain.kibana_geo_bounding_box import KibanaGeoBoundingBox
from src.librecatastro.domain.geometry.geo_bounding_box import GeoBoundingBox
from src.settings import config
from src.utils.elasticsearch_utils import ElasticSearchUtils
@@ -86,7 +89,7 @@ class ScrapperHTMLTests(unittest.TestCase):
self.assertIsNotNone(cadaster.from_elasticsearch())
def scrap_random_until_x_times_found(self, times):
coord = KibanaGeoBoundingBox.get_coordinate_tuple_from_file(os.path.join(config['coordinates_path'], 'central_peninsulae.json'))
coord = GeoBoundingBox.get_bb_from_file(os.path.join(config['coordinates_path'], 'central_peninsulae.json'))
cadaster_list = ScrapperHTML.scrap_results_random_x_times(times, coord[0], coord[1], coord[2], coord[3])
self.assertTrue(len(cadaster_list) >= times)
return cadaster_list
@@ -106,5 +109,14 @@ class ScrapperHTMLTests(unittest.TestCase):
cadaster.to_elasticsearch()
self.assertIsNotNone(cadaster.from_elasticsearch())
def test_loading_point_is_in_polygon_returns_true(self):
polygon = GeoPolygon(os.path.join(config['coordinates_path'], 'spain_polygon.json'))
self.assertTrue(polygon.is_point_in_polygon(lon=-5.295410156250001, lat=40.069664523297774))
def test_loading_point_is_not_in_polygon_returns_false(self):
polygon = GeoPolygon(os.path.join(config['coordinates_path'], 'spain_polygon.json'))
self.assertFalse(polygon.is_point_in_polygon(lon=-1.9335937500000002, lat=48.31242790407178))
if __name__ == '__main__':
unittest.main()

View File

@@ -4,6 +4,7 @@ from time import sleep
from src.librecatastro.domain.cadaster_entry.cadaster_entry_xml import CadasterEntryXML
from src.librecatastro.scrapping.scrapper_xml import ScrapperXML
from src.settings import config
class ScrapperXMLTests(unittest.TestCase):
@@ -145,6 +146,63 @@ class ScrapperXMLTests(unittest.TestCase):
self.assertIsNotNone(cadaster_entry.from_elasticsearch())
return
def test_multiparcela_creates_n_entries_in_elasticsearch(self):
prov_name = u'A CORUÑA'
city_name = u'A BAÑA'
tv = u'LG'
nv = u'ARZÓN'
num = 21
entry = ScrapperXML.get_cadaster_entries_by_address(prov_name, city_name, tv, nv, num)
for site in entry['consulta_dnp']['lrcdnp']['rcdnp']:
cadaster = site['rc']['pc1'] + \
site['rc']['pc2'] + \
site['rc']['car'] + \
site['rc']['cc1'] + \
site['rc']['cc2']
sub_entry = ScrapperXML.get_cadaster_entries_by_cadaster(prov_name, city_name, cadaster)
cadaster_entry = CadasterEntryXML(sub_entry, None, None)
cadaster_entry.to_elasticsearch()
self.assertIsNotNone(cadaster_entry.from_elasticsearch())
sleep(config['sleep_time'])
def test_no_use_creates_entry_in_elasticsearch(self):
prov_name = u'A CORUÑA'
city_name = u'A BAÑA'
tv = u'LG'
nv = u'BARCALA'
num = 5
entry = ScrapperXML.get_cadaster_entries_by_address(prov_name, city_name, tv, nv, num)
for site in entry['consulta_dnp']['lrcdnp']['rcdnp']:
cadaster = site['rc']['pc1'] + \
site['rc']['pc2'] + \
site['rc']['car'] + \
site['rc']['cc1'] + \
site['rc']['cc2']
sub_entry = ScrapperXML.get_cadaster_entries_by_cadaster(prov_name, city_name, cadaster)
cadaster_entry = CadasterEntryXML(sub_entry, None, None)
cadaster_entry.to_elasticsearch()
self.assertIsNotNone(cadaster_entry.from_elasticsearch())
sleep(config['sleep_time'])
def test_no_es_pt_pu_creates_entry_in_elasticsearch(self):
prov_name = u'A CORUÑA'
city_name = u'A BAÑA'
tv = u'RU'
nv = u'CASTELAO'
num = 1
entry = ScrapperXML.get_cadaster_entries_by_address(prov_name, city_name, tv, nv, num)
for site in entry['consulta_dnp']['lrcdnp']['rcdnp']:
cadaster = site['rc']['pc1'] + \
site['rc']['pc2'] + \
site['rc']['car'] + \
site['rc']['cc1'] + \
site['rc']['cc2']
sub_entry = ScrapperXML.get_cadaster_entries_by_cadaster(prov_name, city_name, cadaster)
cadaster_entry = CadasterEntryXML(sub_entry, None, None)
cadaster_entry.to_elasticsearch()
self.assertIsNotNone(cadaster_entry.from_elasticsearch())
sleep(config['sleep_time'])
if __name__ == '__main__':
unittest.main()