mirror of
https://github.com/josejuanmartinez/libreCatastro.git
synced 2024-07-06 15:22:28 +02:00
Fixes XML scraping to handle optional arguments. Removes bounding boxes, which will eventually be replaced by polygons. Adds parameters to process by province.
This commit is contained in:
parent
0478146b27
commit
9f7d5fda51
@ -16,6 +16,6 @@ services:
|
||||
ports:
|
||||
- "9200:9200"
|
||||
kibana:
|
||||
image: docker.elastic.co/kibana/kibana:6.3.2
|
||||
image: docker.elastic.co/geometry/geometry:6.3.2
|
||||
ports:
|
||||
- "5601:5601"
|
8
main.py
8
main.py
@ -7,11 +7,11 @@ from src.librecatastro.scrapping.scrapper_xml import ScrapperXML
|
||||
if __name__ == "__main__":
|
||||
parser = argparse.ArgumentParser(description='Runs the Cadastro Parser')
|
||||
parser.add_argument('--coords', dest='coords', default=False, action='store_true')
|
||||
parser.add_argument('--filename', nargs=1, dest='filename', default='')
|
||||
parser.add_argument('--filenames', action='store', nargs='+', dest='filenames', default=[])
|
||||
parser.add_argument('--provinces', action='store', nargs='+', dest='provinces', default=[])
|
||||
|
||||
args = parser.parse_args(sys.argv[1:])
|
||||
|
||||
if args.coords:
|
||||
ScrapperHTML.scrap_all_coordinates_files(args['filename'])
|
||||
ScrapperHTML.scrap_all_coordinates_files(args.filenames)
|
||||
else:
|
||||
ScrapperXML.scrap_all_addresses()
|
||||
ScrapperXML.scrap_all_addresses(args.provinces)
|
||||
|
@ -1,3 +1,4 @@
|
||||
shapely
|
||||
beautifulsoup4==4.8.0
|
||||
elasticsearch>=6.0.0,<7.0.0
|
||||
requests==2.22.0
|
||||
|
@ -1,15 +0,0 @@
|
||||
{
|
||||
"geo_bounding_box": {
|
||||
"ignore_unmapped": true,
|
||||
"location": {
|
||||
"bottom_right": {
|
||||
"lat": 36.686041276581925,
|
||||
"lon": -2.0214843750000004
|
||||
},
|
||||
"top_left": {
|
||||
"lat": 38.324420427006544,
|
||||
"lon": -7.514648437500001
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
@ -1,15 +0,0 @@
|
||||
{
|
||||
"geo_bounding_box": {
|
||||
"ignore_unmapped": true,
|
||||
"location": {
|
||||
"bottom_right": {
|
||||
"lat": 35.995785386420344,
|
||||
"lon": -2.0434570312500004
|
||||
},
|
||||
"top_left": {
|
||||
"lat": 37.37015718405753,
|
||||
"lon": -6.833496093750001
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
@ -1,15 +0,0 @@
|
||||
{
|
||||
"geo_bounding_box": {
|
||||
"ignore_unmapped": true,
|
||||
"location": {
|
||||
"bottom_right": {
|
||||
"lat": 38.631890929028370,
|
||||
"lon": 4.361572265625001
|
||||
},
|
||||
"top_left": {
|
||||
"lat": 40.101185062587010,
|
||||
"lon": 1.208496093750000
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
@ -1,15 +0,0 @@
|
||||
{
|
||||
"geo_bounding_box": {
|
||||
"ignore_unmapped": true,
|
||||
"location": {
|
||||
"bottom_right": {
|
||||
"lat": 27.615406013399590,
|
||||
"lon":-13.403320312500002
|
||||
},
|
||||
"top_left": {
|
||||
"lat": 29.458731185355344,
|
||||
"lon":-18.160400390625004
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
@ -1,15 +0,0 @@
|
||||
{
|
||||
"geo_bounding_box": {
|
||||
"ignore_unmapped": true,
|
||||
"location": {
|
||||
"bottom_right": {
|
||||
"lat": 37.57941251343841,
|
||||
"lon": -0.9008789062500001
|
||||
},
|
||||
"top_left": {
|
||||
"lat": 41.983994270935625,
|
||||
"lon": -6.218261718750001
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
@ -1,15 +0,0 @@
|
||||
{
|
||||
"geo_bounding_box": {
|
||||
"ignore_unmapped": true,
|
||||
"location": {
|
||||
"bottom_right": {
|
||||
"lat": 35.869994909901720,
|
||||
"lon": -5.275497436523438
|
||||
},
|
||||
"top_left": {
|
||||
"lat": 35.922281333698294,
|
||||
"lon": -5.383987426757813
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
@ -1,15 +0,0 @@
|
||||
{
|
||||
"geo_bounding_box": {
|
||||
"ignore_unmapped": true,
|
||||
"location": {
|
||||
"bottom_right": {
|
||||
"lat": 27.638523614271946,
|
||||
"lon": -17.880249023437504
|
||||
},
|
||||
"top_left": {
|
||||
"lat": 27.847576211806295,
|
||||
"lon": -18.157653808593754
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
@ -1,15 +0,0 @@
|
||||
{
|
||||
"geo_bounding_box": {
|
||||
"ignore_unmapped": true,
|
||||
"location": {
|
||||
"bottom_right": {
|
||||
"lat": 27.72486719795934,
|
||||
"lon": -15.353393554687502
|
||||
},
|
||||
"top_left": {
|
||||
"lat": 28.173717624327864,
|
||||
"lon": -15.839538574218752
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
@ -1,15 +0,0 @@
|
||||
{
|
||||
"geo_bounding_box": {
|
||||
"ignore_unmapped": true,
|
||||
"location": {
|
||||
"bottom_right": {
|
||||
"lat": 35.264683153268145,
|
||||
"lon": -2.927513122558594
|
||||
},
|
||||
"top_left": {
|
||||
"lat": 35.321008047212080,
|
||||
"lon": -2.972831726074218
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
@ -1,15 +0,0 @@
|
||||
{
|
||||
"geo_bounding_box": {
|
||||
"ignore_unmapped": true,
|
||||
"location": {
|
||||
"bottom_right": {
|
||||
"lat": 37.56199695314352,
|
||||
"lon": -0.81298828125
|
||||
},
|
||||
"top_left": {
|
||||
"lat": 38.87392853923632,
|
||||
"lon": -2.0214843750000004
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
@ -1,15 +0,0 @@
|
||||
{
|
||||
"geo_bounding_box": {
|
||||
"ignore_unmapped": true,
|
||||
"location": {
|
||||
"bottom_right": {
|
||||
"lat": 42.081916678306335,
|
||||
"lon": -1.7358398437500002
|
||||
},
|
||||
"top_left": {
|
||||
"lat": 43.27720532212024,
|
||||
"lon": -8.679199218750002
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
@ -1,15 +0,0 @@
|
||||
{
|
||||
"geo_bounding_box": {
|
||||
"ignore_unmapped": true,
|
||||
"location": {
|
||||
"bottom_right": {
|
||||
"lat": 41.705728515237524,
|
||||
"lon": 3.1201171875
|
||||
},
|
||||
"top_left": {
|
||||
"lat": 42.45588764197166,
|
||||
"lon": -2.7685546875000004
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
@ -1,15 +0,0 @@
|
||||
{
|
||||
"geo_bounding_box": {
|
||||
"ignore_unmapped": true,
|
||||
"location": {
|
||||
"bottom_right": {
|
||||
"lat": 39.257778150283364,
|
||||
"lon": 3.4881591796875004
|
||||
},
|
||||
"top_left": {
|
||||
"lat": 39.96870074491696,
|
||||
"lon": 2.3098754882812504
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
@ -1,15 +0,0 @@
|
||||
{
|
||||
"geo_bounding_box": {
|
||||
"ignore_unmapped": true,
|
||||
"location": {
|
||||
"bottom_right": {
|
||||
"lat": 37.17782559332976,
|
||||
"lon": -6.04248046875
|
||||
},
|
||||
"top_left": {
|
||||
"lat": 42.27730877423709,
|
||||
"lon": -8.811035156250002
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
@ -1,15 +0,0 @@
|
||||
{
|
||||
"geo_bounding_box": {
|
||||
"ignore_unmapped": true,
|
||||
"location": {
|
||||
"bottom_right": {
|
||||
"lat": 37.57941251343841,
|
||||
"lon": 0.21972656250000003
|
||||
},
|
||||
"top_left": {
|
||||
"lat": 41.32732632036624,
|
||||
"lon": -1.2304687500000002
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
@ -14,25 +14,33 @@ logger = CadastroLogger(__name__).logger
|
||||
class CadasterEntryXML(CadasterEntry):
|
||||
"""Cadaster class, that stores all the information about a surface and its properties"""
|
||||
|
||||
def __init__(self, xml, lon, lat):
|
||||
def __init__(self, xml, lon, lat, is_property=True):
|
||||
|
||||
self.address = Address(xml['consulta_dnp']['bico']['bi']['ldt'])
|
||||
|
||||
self.cadaster = xml['consulta_dnp']['bico']['bi']['idbi']['rc']['pc1'] + \
|
||||
xml['consulta_dnp']['bico']['bi']['idbi']['rc']['pc2'] + \
|
||||
xml['consulta_dnp']['bico']['bi']['idbi']['rc']['car'] + \
|
||||
xml['consulta_dnp']['bico']['bi']['idbi']['rc']['cc1'] + \
|
||||
xml['consulta_dnp']['bico']['bi']['idbi']['rc']['cc2']
|
||||
self.year = xml['consulta_dnp']['bico']['bi']['debi']['ant']
|
||||
xml['consulta_dnp']['bico']['bi']['idbi']['rc']['pc2'] + \
|
||||
xml['consulta_dnp']['bico']['bi']['idbi']['rc']['car'] + \
|
||||
xml['consulta_dnp']['bico']['bi']['idbi']['rc']['cc1'] + \
|
||||
xml['consulta_dnp']['bico']['bi']['idbi']['rc']['cc2']
|
||||
|
||||
type = xml['consulta_dnp']['bico']['bi']['idbi']['cn']
|
||||
self.type = u'Urbano' if type == 'UR' else u'Rústico'
|
||||
self.use = xml['consulta_dnp']['bico']['bi']['debi']['luso']
|
||||
self.surface = xml['consulta_dnp']['bico']['bi']['debi']['sfc'] + 'm2'
|
||||
self.year = xml['consulta_dnp']['bico']['bi']['debi']['ant']
|
||||
self.year = xml['consulta_dnp']['bico']['bi']['debi']['ant'] \
|
||||
if 'debi' in xml['consulta_dnp']['bico']['bi'] and\
|
||||
'ant' in xml['consulta_dnp']['bico']['bi']['debi'] else None
|
||||
|
||||
self.type = xml['consulta_dnp']['bico']['bi']['idbi']['cn'] if 'cn' in xml['consulta_dnp']['bico']['bi']['idbi'] else None
|
||||
if self.type is not None:
|
||||
self.type = u'Urbano' if self.type == 'UR' else u'Rústico'
|
||||
|
||||
self.use = xml['consulta_dnp']['bico']['bi']['debi']['luso'] if 'luso' in xml['consulta_dnp']['bico']['bi']['debi'] else None
|
||||
self.surface = xml['consulta_dnp']['bico']['bi']['debi']['sfc'] + 'm2' if 'sfc' in xml['consulta_dnp']['bico']['bi']['debi'] else None
|
||||
self.location = Location(lon, lat)
|
||||
self.gsurface = config['not_available_via_XML']
|
||||
self.constructions = []
|
||||
constructions = xml['consulta_dnp']['bico']['lcons']['cons']
|
||||
|
||||
constructions = []
|
||||
if 'lcons' in xml['consulta_dnp']['bico']:
|
||||
constructions = xml['consulta_dnp']['bico']['lcons']['cons']
|
||||
|
||||
''' Bad XML design, instead of returning a list with 1 element, it returns
|
||||
the element'''
|
||||
@ -40,20 +48,17 @@ class CadasterEntryXML(CadasterEntry):
|
||||
constructions = [constructions]
|
||||
|
||||
for construction in constructions:
|
||||
use = construction['lcd']
|
||||
doorway = construction['dt']['lourb']['loint']['es']
|
||||
floor = construction['dt']['lourb']['loint']['pt']
|
||||
door = construction['dt']['lourb']['loint']['pt']
|
||||
surface = construction['dfcons']['stl']
|
||||
use = construction['lcd'] if 'lcd' in construction else None
|
||||
doorway = construction['dt']['lourb']['loint']['es'] if 'dt' in construction else None
|
||||
floor = construction['dt']['lourb']['loint']['pt'] if 'dt' in construction else None
|
||||
door = construction['dt']['lourb']['loint']['pu'] if 'dt' in construction else None
|
||||
surface = construction['dfcons']['stl'] if 'dfcons' in construction and 'stl' in construction['dfcons'] else None
|
||||
reform_type = config['not_available_via_XML']
|
||||
reform_date = config['not_available_via_XML']
|
||||
|
||||
self.constructions.append(Construction(dict(uso=use, escalera=doorway, planta=floor, puerta=door, superficie=surface, tipo=reform_type, fecha=reform_date)))
|
||||
self.constructions.append(Construction(
|
||||
dict(uso=use, escalera=doorway, planta=floor, puerta=door, superficie=surface, tipo=reform_type,
|
||||
fecha=reform_date)))
|
||||
|
||||
self.timestamp = str(datetime.now())
|
||||
super().__init__(self)
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
0
src/librecatastro/domain/geometry/__init__.py
Normal file
0
src/librecatastro/domain/geometry/__init__.py
Normal file
@ -8,25 +8,25 @@ from src.utils.cadastro_logger import CadastroLogger
|
||||
logger = CadastroLogger(__name__).logger
|
||||
|
||||
|
||||
class KibanaGeoBoundingBox:
|
||||
class GeoBoundingBox:
|
||||
def __init__(self, data):
|
||||
self.data = json.loads(data, object_hook=lambda d: namedtuple('X', d.keys())(*d.values()))
|
||||
|
||||
def get_coordinates_tuple(self):
|
||||
return KibanaGeoBoundingBox.get_coordinates_tuple_static(self.data)
|
||||
return GeoBoundingBox.get_bb_from_file_static(self.data)
|
||||
|
||||
@staticmethod
|
||||
def get_coordinates_tuple_static(data):
|
||||
def get_bb_from_file_static(data):
|
||||
location = data.geo_bounding_box.location
|
||||
return int(location.top_left.lon * config['scale']), int(location.bottom_right.lon * config['scale']), int(location.bottom_right.lat * config['scale']), int(location.top_left.lat * config['scale'])
|
||||
|
||||
@staticmethod
|
||||
def get_coordinate_tuple_from_file(file):
|
||||
def get_bb_from_file(file):
|
||||
f = open(file, "r")
|
||||
content = f.read()
|
||||
try:
|
||||
data = json.loads(content, object_hook=lambda d: namedtuple('X', d.keys())(*d.values()))
|
||||
return KibanaGeoBoundingBox.get_coordinates_tuple_static(data)
|
||||
return GeoBoundingBox.get_bb_from_file_static(data)
|
||||
except:
|
||||
logger.error("{} is not formatted properly. Please take a look at the examples.".format(file))
|
||||
return None
|
34
src/librecatastro/domain/geometry/geo_polygon.py
Normal file
34
src/librecatastro/domain/geometry/geo_polygon.py
Normal file
@ -0,0 +1,34 @@
|
||||
import json
|
||||
from collections import namedtuple
|
||||
|
||||
from shapely.geometry import Point, Polygon
|
||||
|
||||
from src.utils.cadastro_logger import CadastroLogger
|
||||
|
||||
'''Logger'''
|
||||
logger = CadastroLogger(__name__).logger
|
||||
|
||||
|
||||
class GeoPolygon:
    """Polygon loaded from a JSON coordinates file, used for point-in-polygon tests.

    The file is expected to expose ``geo_polygon.location.points``: a list of
    objects that each carry a ``lon`` and a ``lat`` member. When the file
    cannot be read or does not have that shape, the failure is logged and
    ``self.polygon`` is left as ``None``.
    """

    def __init__(self, file):
        """Build the polygon described by the JSON file at path ``file``.

        Loading is best-effort: any error is logged (with its traceback) and
        not re-raised, so callers must be prepared for ``self.polygon`` to be
        ``None``.
        """
        self.polygon = None
        try:
            with open(file, "r") as f:
                content = f.read()

            # Turn every JSON object into a namedtuple so nested members can
            # be reached with attribute access (data.geo_polygon.location...).
            data = json.loads(content, object_hook=lambda d: namedtuple('X', d.keys())(*d.values()))
            points = data.geo_polygon.location.points

            # The polygon vertices are built as (lon, lat) pairs, i.e. x = lon, y = lat.
            self.polygon = Polygon([(point.lon, point.lat) for point in points])
        except Exception:
            # Keep the original message, but attach the traceback so the real
            # cause (missing file, invalid JSON, bad geometry) is not lost.
            logger.error("{} is not formatted properly. Please take a look at the examples.".format(file),
                         exc_info=True)

    def is_point_in_polygon(self, lon, lat):
        """Return whether the point (``lon``, ``lat``) lies inside the polygon.

        Returns ``False`` when no polygon could be loaded, instead of failing
        with an ``AttributeError`` on ``None``.
        """
        if self.polygon is None:
            return False

        return self.polygon.contains(Point(lon, lat))

    def get_bounding_box(self):
        """Placeholder: not implemented yet, currently returns ``None``."""
        # TODO: return the polygon's bounding box once the expected tuple
        # order and scaling are defined by the callers.
        pass
|
@ -1,10 +1,3 @@
|
||||
import re
|
||||
from urllib.request import urlopen
|
||||
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
from src.librecatastro.domain.cadaster_entry.cadaster_entry_html import CadasterEntryHTML
|
||||
from src.settings import config
|
||||
from src.utils.cadastro_logger import CadastroLogger
|
||||
|
||||
'''Logger'''
|
||||
|
@ -11,7 +11,8 @@ from xml.etree import ElementTree
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
from src.librecatastro.domain.cadaster_entry.cadaster_entry_html import CadasterEntryHTML
|
||||
from src.librecatastro.domain.kibana_geo_bounding_box import KibanaGeoBoundingBox
|
||||
from src.librecatastro.domain.geometry.geo_bounding_box import GeoBoundingBox
|
||||
from src.librecatastro.domain.geometry.geo_polygon import GeoPolygon
|
||||
from src.librecatastro.scrapping.scrapper import Scrapper
|
||||
from src.settings import config
|
||||
|
||||
@ -27,43 +28,35 @@ class ScrapperHTML(Scrapper):
|
||||
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
pass
|
||||
|
||||
""" Scrapping main calls """
|
||||
|
||||
@staticmethod
|
||||
def scrap_all_coordinates_files(filename=''):
|
||||
@classmethod
|
||||
def scrap_all_coordinates_files(cls, filenames):
|
||||
|
||||
for r, d, files in os.walk(config['coordinates_path']):
|
||||
for file in files:
|
||||
if '.json' in file and ((filename != '' and file == filename) or filename == ''):
|
||||
f = open(os.path.join(config['coordinates_path'], file), "r")
|
||||
content = f.read()
|
||||
try:
|
||||
bb = KibanaGeoBoundingBox(content)
|
||||
coordinates_tuple = bb.get_coordinates_tuple()
|
||||
ScrapperHTML.scrap_range_of_coordinates(coordinates_tuple[0], coordinates_tuple[1],
|
||||
coordinates_tuple[2], coordinates_tuple[3])
|
||||
except:
|
||||
logger.error("{} is not formatted properly. Please take a look at the examples.".format(file))
|
||||
|
||||
if len(filenames) > 0 and file not in filenames:
|
||||
continue
|
||||
|
||||
if '.json' not in file:
|
||||
continue
|
||||
|
||||
try:
|
||||
polygon = GeoPolygon(os.path.join(config['coordinates_path'], file))
|
||||
ScrapperHTML.scrap_polygon(polygon)
|
||||
except:
|
||||
logger.error("{} is not formatted properly. Please take a look at the examples.".format(file))
|
||||
|
||||
@staticmethod
|
||||
def scrap_all_coordinates_files(filename=''):
|
||||
for r, d, files in os.walk(config['coordinates_path']):
|
||||
for file in files:
|
||||
if '.json' in file and ((filename != '' and file == filename) or filename == ''):
|
||||
f = open(os.path.join(config['coordinates_path'], file), "r")
|
||||
content = f.read()
|
||||
try:
|
||||
bb = KibanaGeoBoundingBox(content)
|
||||
coordinates_tuple = bb.get_coordinates_tuple()
|
||||
ScrapperHTML.scrap_range_of_coordinates(coordinates_tuple[0], coordinates_tuple[1], coordinates_tuple[2], coordinates_tuple[3])
|
||||
except:
|
||||
logger.error("{} is not formatted properly. Please take a look at the examples.".format(file))
|
||||
|
||||
@staticmethod
|
||||
def scrap_range_of_coordinates(long_min, long_max, lat_min, lat_max):
|
||||
for x in range(long_min, long_max):
|
||||
def scrap_polygon(polygon):
|
||||
bb = polygon.get_bounding_box()
|
||||
lon_min = 0
|
||||
lon_max = 0
|
||||
lat_min = 0
|
||||
lat_max = 0
|
||||
for x in range(lon_min, lon_max):
|
||||
for y in range(lat_min, lat_max):
|
||||
|
||||
x_scaled = x / config['scale']
|
||||
@ -83,16 +76,17 @@ class ScrapperHTML(Scrapper):
|
||||
logger.error("ERROR AT LONGITUDE {} LATITUDE {}".format(x_scaled, y_scaled))
|
||||
logger.error("=============================================")
|
||||
logger.error(e, exc_info=True)
|
||||
logger.error("...sleeping...")
|
||||
logger.error("=============================================")
|
||||
''' Could be a service Unavailable or denegation of service'''
|
||||
sleep(300)
|
||||
sleep(config['sleep_dos_time'])
|
||||
except Exception as e:
|
||||
logger.error("ERROR AT LONGITUDE {} LATITUDE {}".format(x_scaled, y_scaled))
|
||||
logger.error("=============================================")
|
||||
logger.error(e, exc_info=True)
|
||||
logger.error("=============================================")
|
||||
|
||||
sleep(5)
|
||||
sleep(config['sleep_time'])
|
||||
|
||||
@staticmethod
|
||||
def scrap_results_by_time(seconds, lon_min, lon_max, lat_min, lat_max):
|
||||
@ -123,13 +117,13 @@ class ScrapperHTML(Scrapper):
|
||||
logger.error(e, exc_info=True)
|
||||
logger.error("=============================================")
|
||||
''' Could be a service Unavailable or denegation of service'''
|
||||
sleep(300)
|
||||
sleep(config['sleep_dos_time'])
|
||||
except Exception as e:
|
||||
logger.error("ERROR AT LONGITUDE {} LATITUDE {}".format(x_scaled, y_scaled))
|
||||
logger.error("=============================================")
|
||||
logger.error(e, exc_info=True)
|
||||
logger.error("=============================================")
|
||||
sleep(5)
|
||||
sleep(config['sleep_time'])
|
||||
|
||||
if finished:
|
||||
break
|
||||
@ -164,13 +158,13 @@ class ScrapperHTML(Scrapper):
|
||||
logger.error(e, exc_info=True)
|
||||
logger.error("=============================================")
|
||||
''' Could be a service Unavailable or denegation of service'''
|
||||
sleep(300)
|
||||
sleep(config['sleep_dos_time'])
|
||||
except Exception as e:
|
||||
logger.error("ERROR AT LONGITUDE {} LATITUDE {}".format(x_scaled, y_scaled))
|
||||
logger.error("=============================================")
|
||||
logger.error(e, exc_info=True)
|
||||
logger.error("=============================================")
|
||||
sleep(5)
|
||||
sleep(config['sleep_time'])
|
||||
|
||||
if finished:
|
||||
break
|
||||
@ -202,19 +196,16 @@ class ScrapperHTML(Scrapper):
|
||||
logger.error(e, exc_info=True)
|
||||
logger.error("=============================================")
|
||||
''' Could be a service Unavailable or denegation of service'''
|
||||
sleep(300)
|
||||
sleep(config['sleep_dos_time'])
|
||||
except Exception as e:
|
||||
logger.error("ERROR AT LONGITUDE {} LATITUDE {}".format(x_scaled, y_scaled))
|
||||
logger.error("=============================================")
|
||||
logger.error(e, exc_info=True)
|
||||
logger.error("=============================================")
|
||||
sleep(5)
|
||||
sleep(config['sleep_time'])
|
||||
|
||||
#ontology_converter = OntologyConverter()
|
||||
#print(ontology_converter.cadastro_dict_to_ontology(results))
|
||||
logger.debug("====PROCESSING FINISHED====")
|
||||
logger.debug("Results found: {}".format(times))
|
||||
#logger.debug(results)
|
||||
return ListUtils.flat(results)
|
||||
|
||||
@staticmethod
|
||||
|
@ -1,10 +1,13 @@
|
||||
import urllib.parse
|
||||
from urllib import error
|
||||
|
||||
from time import sleep
|
||||
|
||||
import requests
|
||||
import xmltodict as xmltodict
|
||||
|
||||
from src.librecatastro.domain.cadaster_entry.cadaster_entry_xml import CadasterEntryXML
|
||||
from src.settings import config
|
||||
from src.utils.cadastro_logger import CadastroLogger
|
||||
|
||||
'''Logger'''
|
||||
@ -21,11 +24,19 @@ class ScrapperXML:
|
||||
pass
|
||||
|
||||
""" Scrapping main calls """
|
||||
|
||||
@classmethod
|
||||
def scrap_all_addresses(cls):
|
||||
def scrap_all_addresses(cls, prov_list):
|
||||
"""Scraps properties by addresses. ONLY URBAN"""
|
||||
|
||||
provinces = ScrapperXML.get_provinces()['consulta_provinciero']['provinciero']['prov']
|
||||
|
||||
for province in provinces:
|
||||
prov_name = province['np']
|
||||
|
||||
if len(prov_list) > 0 and prov_name not in prov_list:
|
||||
continue
|
||||
|
||||
cities = ScrapperXML.get_cities(prov_name)['consulta_municipiero']['municipiero']['muni']
|
||||
for city in cities:
|
||||
city_name = city['nm']
|
||||
@ -38,53 +49,100 @@ class ScrapperXML:
|
||||
|
||||
num_scrapping_fails = 10
|
||||
counter = 1
|
||||
|
||||
while num_scrapping_fails > 0:
|
||||
cadaster = ScrapperXML.get_cadaster_by_address(prov_name, city_name, tv, nv, counter)
|
||||
if 'lerr' in cadaster['consulta_numerero'] and \
|
||||
'err' in cadaster['consulta_numerero']['lerr'] and \
|
||||
'cod' in cadaster['consulta_numerero']['lerr']['err'] and \
|
||||
cadaster['consulta_numerero']['lerr']['err']['cod'] == '43':
|
||||
try:
|
||||
cadaster = ScrapperXML.get_cadaster_by_address(prov_name, city_name, tv, nv, counter)
|
||||
if 'lerr' in cadaster['consulta_numerero'] and \
|
||||
'err' in cadaster['consulta_numerero']['lerr'] and \
|
||||
'cod' in cadaster['consulta_numerero']['lerr']['err'] and \
|
||||
cadaster['consulta_numerero']['lerr']['err']['cod'] == '43':
|
||||
num_scrapping_fails -= 1
|
||||
else:
|
||||
logger.debug("||||| ] FOUND!")
|
||||
|
||||
numps = cadaster['consulta_numerero']['numerero']['nump']
|
||||
|
||||
if not isinstance(numps, list):
|
||||
numps = [numps]
|
||||
|
||||
for nump in numps:
|
||||
num = nump['num']['pnp']
|
||||
cadaster_num = nump['pc']['pc1'] + nump['pc']['pc2']
|
||||
|
||||
coords = ScrapperXML.get_coords_from_cadaster(prov_name, city_name,
|
||||
cadaster_num)
|
||||
lon = coords['consulta_coordenadas']['coordenadas']['coord']['geo']['xcen']
|
||||
lat = coords['consulta_coordenadas']['coordenadas']['coord']['geo']['ycen']
|
||||
|
||||
''' Adding to tracking file'''
|
||||
logger.info('{},{}'.format(lon, lat))
|
||||
|
||||
num_scrapping_fails = 10
|
||||
|
||||
entry = ScrapperXML.get_cadaster_entries_by_address(prov_name, city_name, tv,
|
||||
nv, num)
|
||||
|
||||
if 'bico' in entry['consulta_dnp']:
|
||||
# Parcela
|
||||
cadaster_entry = CadasterEntryXML(entry, lon, lat)
|
||||
cadaster_entry.to_elasticsearch()
|
||||
elif 'lrcdnp' in entry['consulta_dnp']:
|
||||
# Multiparcela
|
||||
for site in entry['consulta_dnp']['lrcdnp']['rcdnp']:
|
||||
cadaster = site['rc']['pc1'] + \
|
||||
site['rc']['pc2'] + \
|
||||
site['rc']['car'] + \
|
||||
site['rc']['cc1'] + \
|
||||
site['rc']['cc2']
|
||||
sub_entry = ScrapperXML.get_cadaster_entries_by_cadaster(prov_name,
|
||||
city_name,
|
||||
cadaster)
|
||||
cadaster_entry = CadasterEntryXML(sub_entry, lon, lat)
|
||||
cadaster_entry.to_elasticsearch()
|
||||
sleep(config['sleep_time'])
|
||||
|
||||
logger.debug("[|||||||||||] SUCCESS!")
|
||||
sleep(config['sleep_time'])
|
||||
|
||||
except urllib.error.HTTPError as e:
|
||||
logger.error(
|
||||
"ERROR AT ADDRESS {} {} {} {} {}".format(tv, nv, num, prov_name, city_name))
|
||||
logger.error("=============================================")
|
||||
logger.error(e, exc_info=True)
|
||||
logger.error("...sleeping...")
|
||||
logger.error("=============================================")
|
||||
''' Could be a service Unavailable or denegation of service'''
|
||||
num_scrapping_fails -= 1
|
||||
else:
|
||||
logger.debug("||||| ] FOUND!")
|
||||
sleep(config['sleep_dos_time'])
|
||||
|
||||
num = cadaster['consulta_numerero']['numerero']['nump']['num']['pnp']
|
||||
cadaster_num = cadaster['consulta_numerero']['numerero']['nump']['pc']['pc1'] + \
|
||||
cadaster['consulta_numerero']['numerero']['nump']['pc']['pc2']
|
||||
|
||||
coords = ScrapperXML.get_coords_from_cadaster(prov_name, city_name, cadaster_num)
|
||||
lon = coords['consulta_coordenadas']['coordenadas']['coord']['geo']['xcen']
|
||||
lat = coords['consulta_coordenadas']['coordenadas']['coord']['geo']['ycen']
|
||||
|
||||
''' Adding to tracking file'''
|
||||
logger.info('{},{}'.format(lon, lat))
|
||||
|
||||
num_scrapping_fails = 10
|
||||
|
||||
entry = ScrapperXML.get_cadaster_entries_by_address(prov_name, city_name, tv, nv, num)
|
||||
cadaster_entry = CadasterEntryXML(entry, lon, lat)
|
||||
cadaster_entry.to_elasticsearch()
|
||||
logger.debug("[|||||||||||] SUCCESS!")
|
||||
except Exception as e:
|
||||
logger.error(
|
||||
"ERROR AT ADDRESS {} {} {} {} {}".format(tv, nv, num, prov_name, city_name))
|
||||
logger.error("=============================================")
|
||||
logger.error(e, exc_info=True)
|
||||
logger.error("=============================================")
|
||||
num_scrapping_fails -= 1
|
||||
|
||||
counter += 1
|
||||
sleep(5)
|
||||
sleep(config['sleep_time'])
|
||||
|
||||
|
||||
return
|
||||
|
||||
""" Scrapping secondary calls """
|
||||
|
||||
|
||||
@classmethod
|
||||
def get_coords_from_cadaster(cls, provincia, municipio, cadaster):
|
||||
params = {'Provincia': provincia, 'Municipio': municipio, 'SRS': 'EPSG:4230', 'RC': cadaster}
|
||||
url = cls.URL_LOCATIONS_BASE.format("/OVCCoordenadas.asmx/Consulta_CPMRC")
|
||||
|
||||
logger.debug("[|||||||| ] URL for coords: {} Params: {}".format(url, params))
|
||||
logger.debug("[|||||||| ] URL for coords: {}".format(url + '?' + urllib.parse.urlencode(params)))
|
||||
|
||||
response = requests.get(url, params=params)
|
||||
xml = response.content
|
||||
return xmltodict.parse(xml, process_namespaces=False, xml_attribs=False)
|
||||
|
||||
|
||||
@classmethod
|
||||
def get_provinces(cls):
|
||||
url = cls.URL_LOCATIONS_BASE.format("/OVCCallejero.asmx/ConsultaProvincia")
|
||||
@ -92,6 +150,7 @@ class ScrapperXML:
|
||||
xml = response.content
|
||||
return xmltodict.parse(xml, process_namespaces=False, xml_attribs=False)
|
||||
|
||||
|
||||
@classmethod
|
||||
def get_cities(cls, provincia, municipio=None):
|
||||
params = {'Provincia': provincia}
|
||||
@ -104,9 +163,9 @@ class ScrapperXML:
|
||||
xml = response.content
|
||||
return xmltodict.parse(xml, process_namespaces=False, xml_attribs=False)
|
||||
|
||||
|
||||
@classmethod
|
||||
def get_addresses(cls, provincia, municipio, tipovia=None, nombrevia=None):
|
||||
|
||||
params = {'Provincia': provincia,
|
||||
'Municipio': municipio}
|
||||
if tipovia:
|
||||
@ -123,6 +182,7 @@ class ScrapperXML:
|
||||
xml = response.content
|
||||
return xmltodict.parse(xml, process_namespaces=False, xml_attribs=False)
|
||||
|
||||
|
||||
@classmethod
|
||||
def get_cadaster_by_address(cls, provincia, municipio, tipovia, nombrevia, numero):
|
||||
params = {'Provincia': provincia,
|
||||
@ -134,16 +194,17 @@ class ScrapperXML:
|
||||
url = cls.URL_LOCATIONS_BASE.format("/OVCCallejero.asmx/ConsultaNumero")
|
||||
|
||||
logger.debug("====Dir: {} {} {} {} {}====".format(tipovia, nombrevia, numero, municipio, provincia))
|
||||
logger.debug("[||| ] URL for address: {} Params: {}".format(url, params))
|
||||
logger.debug("[||| ] URL for address: {}".format(url + '?' + urllib.parse.urlencode(params)))
|
||||
|
||||
response = requests.get(url, params=params)
|
||||
xml = response.content
|
||||
return xmltodict.parse(xml, process_namespaces=False, xml_attribs=False)
|
||||
|
||||
@classmethod
|
||||
def get_cadaster_entries_by_address(cls, provincia, municipio, sigla, calle, numero, bloque=None, escalera=None, planta=None,
|
||||
puerta=None):
|
||||
|
||||
@classmethod
|
||||
def get_cadaster_entries_by_address(cls, provincia, municipio, sigla, calle, numero, bloque=None, escalera=None,
|
||||
planta=None,
|
||||
puerta=None):
|
||||
params = {'Provincia': provincia,
|
||||
'Municipio': municipio,
|
||||
'Sigla': sigla,
|
||||
@ -167,12 +228,13 @@ class ScrapperXML:
|
||||
params['Puerta'] = ''
|
||||
|
||||
url = cls.URL_LOCATIONS_BASE.format("/OVCCallejero.asmx/Consulta_DNPLOC")
|
||||
logger.debug("[|||||||||| ] URL for entry: {} Params: {}".format(url, params))
|
||||
logger.debug("[|||||||||| ] URL for entry: {}".format(url + '?' + urllib.parse.urlencode(params)))
|
||||
|
||||
response = requests.get(url, params=params)
|
||||
xml = response.content
|
||||
return xmltodict.parse(xml, process_namespaces=False, xml_attribs=False)
|
||||
|
||||
|
||||
@classmethod
|
||||
def get_cadaster_entries_by_cadaster(cls, provincia, municipio, rc):
|
||||
params = {"Provincia": provincia,
|
||||
@ -180,10 +242,12 @@ class ScrapperXML:
|
||||
"RC": rc}
|
||||
|
||||
url = cls.URL_LOCATIONS_BASE.format("/OVCCallejero.asmx/Consulta_DNPRC")
|
||||
logger.debug("[|||||||||| ] URL for entry: {}".format(url + '?' + urllib.parse.urlencode(params)))
|
||||
response = requests.get(url, params=params)
|
||||
xml = response.content
|
||||
return xmltodict.parse(xml, process_namespaces=False, xml_attribs=False)
|
||||
|
||||
|
||||
@classmethod
|
||||
def Consulta_DNPPP(cls, provincia, municipio, poligono, parcela):
|
||||
"""Proporciona los datos catastrales no protegidos de un inmueble
|
||||
@ -207,6 +271,7 @@ class ScrapperXML:
|
||||
response = requests.get(url, params=params)
|
||||
return xmltodict.parse(response.content, process_namespaces=False, xml_attribs=False)
|
||||
|
||||
|
||||
@classmethod
|
||||
def Consulta_DNPLOC_Codigos(cls, provincia, municipio, sigla, nombrevia, numero, bloque=None, escalera=None,
|
||||
planta=None, puerta=None):
|
||||
@ -257,6 +322,7 @@ class ScrapperXML:
|
||||
response = requests.get(url, params=params)
|
||||
return xmltodict.parse(response.content, process_namespaces=False, xml_attribs=False)
|
||||
|
||||
|
||||
@classmethod
|
||||
def Consulta_DNPRC_Codigos(cls, provincia, municipio, rc):
|
||||
"""Proporciona los datos catastrales de un inmueble,
|
||||
@ -279,6 +345,7 @@ class ScrapperXML:
|
||||
response = requests.get(url, params=params)
|
||||
return xmltodict.parse(response.content, process_namespaces=False, xml_attribs=False)
|
||||
|
||||
|
||||
@classmethod
|
||||
def Consulta_DNPPP_Codigos(cls, provincia, municipio, poligono, parcela):
|
||||
"""Proporciona los datos catastrales de un inmueble.
|
||||
|
@ -9,5 +9,7 @@ config = {
|
||||
"tracking_log_file": os.path.join(root_path, 'logs', 'track'),
|
||||
"scale": 1000000,
|
||||
"coordinates_path": os.path.join(root_path, 'coordinates'),
|
||||
"not_available_via_XML": "(Not available via XML)"
|
||||
"not_available_via_XML": "(Not available via XML)",
|
||||
"sleep_time": 5,
|
||||
"sleep_dos_time": 300
|
||||
}
|
||||
|
@ -1,8 +1,11 @@
|
||||
import os
|
||||
import unittest
|
||||
|
||||
from shapely.geometry import Point
|
||||
|
||||
from src.librecatastro.domain.geometry.geo_polygon import GeoPolygon
|
||||
from src.librecatastro.scrapping.scrapper_html import ScrapperHTML
|
||||
from src.librecatastro.domain.kibana_geo_bounding_box import KibanaGeoBoundingBox
|
||||
from src.librecatastro.domain.geometry.geo_bounding_box import GeoBoundingBox
|
||||
from src.settings import config
|
||||
from src.utils.elasticsearch_utils import ElasticSearchUtils
|
||||
|
||||
@ -86,7 +89,7 @@ class ScrapperHTMLTests(unittest.TestCase):
|
||||
self.assertIsNotNone(cadaster.from_elasticsearch())
|
||||
|
||||
def scrap_random_until_x_times_found(self, times):
    """Scrape random coordinates and require at least ``times`` results.

    Loads a bounding box from ``central_peninsulae.json`` under
    ``config['coordinates_path']`` and forwards its first four components
    to ``ScrapperHTML.scrap_results_random_x_times``.

    :param times: minimum number of entries the scrapper must return.
    :return: the list returned by the scrapper.
    """
    bb_path = os.path.join(config['coordinates_path'], 'central_peninsulae.json')
    coord = GeoBoundingBox.get_bb_from_file(bb_path)
    cadaster_list = ScrapperHTML.scrap_results_random_x_times(
        times, coord[0], coord[1], coord[2], coord[3])
    # assertGreaterEqual reports both operands when the check fails.
    self.assertGreaterEqual(len(cadaster_list), times)
    return cadaster_list
|
||||
@ -106,5 +109,14 @@ class ScrapperHTMLTests(unittest.TestCase):
|
||||
cadaster.to_elasticsearch()
|
||||
self.assertIsNotNone(cadaster.from_elasticsearch())
|
||||
|
||||
def test_loading_point_is_in_polygon_returns_true(self):
    """The polygon loaded from ``spain_polygon.json`` contains the given point."""
    polygon_file = os.path.join(config['coordinates_path'], 'spain_polygon.json')
    spain = GeoPolygon(polygon_file)
    inside = spain.is_point_in_polygon(lon=-5.295410156250001, lat=40.069664523297774)
    self.assertTrue(inside)
|
||||
|
||||
def test_loading_point_is_not_in_polygon_returns_false(self):
    """The polygon loaded from ``spain_polygon.json`` excludes the given point."""
    polygon_file = os.path.join(config['coordinates_path'], 'spain_polygon.json')
    spain = GeoPolygon(polygon_file)
    inside = spain.is_point_in_polygon(lon=-1.9335937500000002, lat=48.31242790407178)
    self.assertFalse(inside)
|
||||
|
||||
|
||||
# Run the test suite when this module is executed directly.
if __name__ == '__main__':
    unittest.main()
|
||||
|
@ -4,6 +4,7 @@ from time import sleep
|
||||
|
||||
from src.librecatastro.domain.cadaster_entry.cadaster_entry_xml import CadasterEntryXML
|
||||
from src.librecatastro.scrapping.scrapper_xml import ScrapperXML
|
||||
from src.settings import config
|
||||
|
||||
|
||||
class ScrapperXMLTests(unittest.TestCase):
|
||||
@ -145,6 +146,63 @@ class ScrapperXMLTests(unittest.TestCase):
|
||||
self.assertIsNotNone(cadaster_entry.from_elasticsearch())
|
||||
return
|
||||
|
||||
def test_multiparcela_creates_n_entries_in_elasticsearch(self):
    """Round-trip every site returned for an address through Elasticsearch.

    Looks up the address, rebuilds the cadastral reference of each returned
    site from its ``rc`` parts, fetches the entry for that reference, calls
    ``to_elasticsearch()`` and asserts ``from_elasticsearch()`` is not None.
    """
    prov_name = u'A CORUÑA'
    city_name = u'A BAÑA'
    tv = u'LG'
    nv = u'ARZÓN'
    num = 21
    entry = ScrapperXML.get_cadaster_entries_by_address(prov_name, city_name, tv, nv, num)

    sites = entry['consulta_dnp']['lrcdnp']['rcdnp']
    # NOTE(review): assumes `entry` is xmltodict output. xmltodict yields a
    # single mapping instead of a list when only one <rcdnp> element exists;
    # wrap it so the loop visits sites rather than the mapping's keys.
    if not isinstance(sites, list):
        sites = [sites]

    for site in sites:
        rc = site['rc']
        cadaster = rc['pc1'] + rc['pc2'] + rc['car'] + rc['cc1'] + rc['cc2']
        sub_entry = ScrapperXML.get_cadaster_entries_by_cadaster(prov_name, city_name, cadaster)
        cadaster_entry = CadasterEntryXML(sub_entry, None, None)
        cadaster_entry.to_elasticsearch()
        self.assertIsNotNone(cadaster_entry.from_elasticsearch())
        # Pause config['sleep_time'] seconds before the next lookup.
        sleep(config['sleep_time'])
|
||||
|
||||
def test_no_use_creates_entry_in_elasticsearch(self):
    """Round-trip every site returned for an address through Elasticsearch.

    Looks up the address, rebuilds the cadastral reference of each returned
    site from its ``rc`` parts, fetches the entry for that reference, calls
    ``to_elasticsearch()`` and asserts ``from_elasticsearch()`` is not None.
    """
    prov_name = u'A CORUÑA'
    city_name = u'A BAÑA'
    tv = u'LG'
    nv = u'BARCALA'
    num = 5
    entry = ScrapperXML.get_cadaster_entries_by_address(prov_name, city_name, tv, nv, num)

    sites = entry['consulta_dnp']['lrcdnp']['rcdnp']
    # NOTE(review): assumes `entry` is xmltodict output. xmltodict yields a
    # single mapping instead of a list when only one <rcdnp> element exists;
    # wrap it so the loop visits sites rather than the mapping's keys.
    if not isinstance(sites, list):
        sites = [sites]

    for site in sites:
        rc = site['rc']
        cadaster = rc['pc1'] + rc['pc2'] + rc['car'] + rc['cc1'] + rc['cc2']
        sub_entry = ScrapperXML.get_cadaster_entries_by_cadaster(prov_name, city_name, cadaster)
        cadaster_entry = CadasterEntryXML(sub_entry, None, None)
        cadaster_entry.to_elasticsearch()
        self.assertIsNotNone(cadaster_entry.from_elasticsearch())
        # Pause config['sleep_time'] seconds before the next lookup.
        sleep(config['sleep_time'])
|
||||
|
||||
def test_no_es_pt_pu_creates_entry_in_elasticsearch(self):
    """Round-trip every site returned for an address through Elasticsearch.

    Looks up the address, rebuilds the cadastral reference of each returned
    site from its ``rc`` parts, fetches the entry for that reference, calls
    ``to_elasticsearch()`` and asserts ``from_elasticsearch()`` is not None.
    """
    prov_name = u'A CORUÑA'
    city_name = u'A BAÑA'
    tv = u'RU'
    nv = u'CASTELAO'
    num = 1
    entry = ScrapperXML.get_cadaster_entries_by_address(prov_name, city_name, tv, nv, num)

    sites = entry['consulta_dnp']['lrcdnp']['rcdnp']
    # NOTE(review): assumes `entry` is xmltodict output. xmltodict yields a
    # single mapping instead of a list when only one <rcdnp> element exists;
    # wrap it so the loop visits sites rather than the mapping's keys.
    if not isinstance(sites, list):
        sites = [sites]

    for site in sites:
        rc = site['rc']
        cadaster = rc['pc1'] + rc['pc2'] + rc['car'] + rc['cc1'] + rc['cc2']
        sub_entry = ScrapperXML.get_cadaster_entries_by_cadaster(prov_name, city_name, cadaster)
        cadaster_entry = CadasterEntryXML(sub_entry, None, None)
        cadaster_entry.to_elasticsearch()
        self.assertIsNotNone(cadaster_entry.from_elasticsearch())
        # Pause config['sleep_time'] seconds before the next lookup.
        sleep(config['sleep_time'])
|
||||
|
||||
|
||||
# Run the test suite when this module is executed directly.
if __name__ == '__main__':
    unittest.main()
|
||||
|
Loading…
Reference in New Issue
Block a user