Changes the coordinate system: it now uses the Kibana Geo Point JSON format. Several separate regions are provided to avoid one big, suboptimal square containing lots of sea points

This commit is contained in:
J 2019-09-16 21:22:59 +02:00
parent c29de7faf2
commit 89b3cb5994
13 changed files with 166 additions and 35 deletions

View File

View File

@ -0,0 +1,15 @@
{
"geo_bounding_box": {
"ignore_unmapped": true,
"location": {
"bottom_right": {
"lat": 38.631890929028370,
"lon": 4.361572265625001
},
"top_left": {
"lat": 40.101185062587010,
"lon": 1.208496093750000
}
}
}
}

View File

@ -0,0 +1,15 @@
{
"geo_bounding_box": {
"ignore_unmapped": true,
"location": {
"bottom_right": {
"lat": 27.615406013399590,
"lon":-13.403320312500002
},
"top_left": {
"lat": 29.458731185355344,
"lon":-18.160400390625004
}
}
}
}

View File

@ -0,0 +1,15 @@
{
"geo_bounding_box": {
"ignore_unmapped": true,
"location": {
"bottom_right": {
"lat": 35.995785386420344,
"lon": 1.098632812500000
},
"top_left": {
"lat": 41.162113939396920,
"lon": -7.602539062500001
}
}
}
}

View File

@ -0,0 +1,15 @@
{
"geo_bounding_box": {
"ignore_unmapped": true,
"location": {
"bottom_right": {
"lat": 35.869994909901720,
"lon": -5.275497436523438
},
"top_left": {
"lat": 35.922281333698294,
"lon": -5.383987426757813
}
}
}
}

View File

@ -0,0 +1,15 @@
{
"geo_bounding_box": {
"ignore_unmapped": true,
"location": {
"bottom_right": {
"lat": 35.264683153268145,
"lon": -2.927513122558594
},
"top_left": {
"lat": 35.321008047212080,
"lon": -2.972831726074218
}
}
}
}

View File

@ -0,0 +1,15 @@
{
"geo_bounding_box": {
"ignore_unmapped": true,
"location": {
"bottom_right": {
"lat": 41.013065787006300,
"lon": 3.669433593750000
},
"top_left": {
"lat": 43.755225053069280,
"lon": -9.316406250000002
}
}
}
}

View File

@ -1,3 +1,4 @@
import os
import random
import re
import time
@ -10,24 +11,12 @@ from xml.etree import ElementTree
from bs4 import BeautifulSoup
from src.librecatastro.domain.cadaster_entry import CadasterEntry
from src.librecatastro.domain.kibana_geo_bounding_box import KibanaGeoBoundingBox
from src.settings import config
from src.utils.cadastro_logger import CadastroLogger
from src.utils.list_utils import ListUtils
"""Constants"""
'''Spain geocoordinates'''
LONGITUDE = (42896, -180243) # *1000000
LATITUDE = (437692, 277255) # *1000000
'''Scale for scrapping'''
SCALE = 10000
'''Enumerator for tuple access'''
MAX = 0
MIN = 1
'''Catastro web services parametrized'''
URL = "http://ovc.catastro.meh.es/ovcservweb/ovcswlocalizacionrc/ovccoordenadas.asmx/Consulta_RCCOOR?SRS=EPSG:4230&Coordenada_X={}&Coordenada_Y={}"
URL_REF = "https://www1.sedecatastro.gob.es/CYCBienInmueble/OVCListaBienes.aspx?rc1={}&rc2={}"
@ -49,11 +38,25 @@ class CadastroScrapper:
""" Scrapping main calls """
@staticmethod
def scrap_all():
for x in range(LONGITUDE[MIN], LONGITUDE[MAX]):
for y in range(LATITUDE[MIN], LATITUDE[MAX]):
for r, d, files in os.walk(config['coordinates_path']):
for file in files:
if '.json' in file:
f = open(os.path.join(config['coordinates_path'], file), "r")
content = f.read()
try:
bb = KibanaGeoBoundingBox(content)
coordinates_tuple = bb.get_coordinates_tuple()
CadastroScrapper.scrap_range_of_coordinates(coordinates_tuple[0], coordinates_tuple[1], coordinates_tuple[2], coordinates_tuple[3])
except:
logger.error("{} is not formatted properly. Please take a look at the examples.".format(file))
x_scaled = x / SCALE
y_scaled = y / SCALE
@staticmethod
def scrap_range_of_coordinates(long_min, long_max, lat_min, lat_max):
for x in range(long_min, long_max):
for y in range(lat_min, lat_max):
x_scaled = x / config['scale']
y_scaled = y / config['scale']
''' Adding to tracking file'''
logger.info('{},{}'.format(x_scaled, y_scaled))
@ -81,16 +84,16 @@ class CadastroScrapper:
sleep(5)
@staticmethod
def scrap_results_by_time(seconds):
def scrap_results_by_time(seconds, lon_min, lon_max, lat_min, lat_max):
start_time = time.time()
results = []
finished = False
for x in range(LONGITUDE[MIN], LONGITUDE[MAX]):
for y in range(LATITUDE[MIN], LATITUDE[MAX]):
for x in range(lon_min, lon_max):
for y in range(lat_min, lat_max):
x_scaled = x / SCALE
y_scaled = y / SCALE
x_scaled = x / config['scale']
y_scaled = y / config['scale']
try:
result = CadastroScrapper.scrap_coord(x_scaled, y_scaled)
@ -122,16 +125,16 @@ class CadastroScrapper:
return ListUtils.flat(results)
@staticmethod
def scrap_results_linear_x_times(times):
def scrap_results_linear_x_times(times, lon_min, lon_max, lat_min, lat_max):
results = []
counter = times
finished = False
for x in range(LONGITUDE[MIN], LONGITUDE[MAX]):
for y in range(LATITUDE[MIN], LATITUDE[MAX]):
for x in range(lon_min, lon_max):
for y in range(lat_min, lat_max):
x_scaled = x / SCALE
y_scaled = y / SCALE
x_scaled = x / config['scale']
y_scaled = y / config['scale']
try:
@ -164,15 +167,15 @@ class CadastroScrapper:
return ListUtils.flat(results)
@staticmethod
def scrap_results_random_x_times(times):
def scrap_results_random_x_times(times, lon_min, lon_max, lat_min, lat_max):
results = []
counter = times
while counter > 0:
x = random.randrange(LONGITUDE[MIN], LONGITUDE[MAX])
y = random.randrange(LATITUDE[MIN], LATITUDE[MAX])
x = random.randrange(lon_min, lon_max)
y = random.randrange(lat_min, lat_max)
x_scaled = x / SCALE
y_scaled = y / SCALE
x_scaled = x / config['scale']
y_scaled = y / config['scale']
try:
cadaster_entry = CadastroScrapper.scrap_coord(x_scaled, y_scaled)

View File

@ -7,7 +7,7 @@ from src.librecatastro.domain.address import Address
from src.librecatastro.domain.location import Location
from src.settings import config
from src.utils.cadastro_logger import CadastroLogger
from src.utils.json_enconder import JSONEncoder
from src.utils.json_encoder import JSONEncoder
logger = CadastroLogger(__name__).logger

View File

@ -0,0 +1,32 @@
import json
from collections import namedtuple
from src.settings import config
from src.utils.cadastro_logger import CadastroLogger
'''Logger'''
logger = CadastroLogger(__name__).logger
class KibanaGeoBoundingBox:
    """Bounding box read from a Kibana ``geo_bounding_box`` JSON document.

    The JSON text is parsed into nested namedtuples so that the values can be
    reached with attribute access, e.g.
    ``data.geo_bounding_box.location.top_left.lon``.
    """

    def __init__(self, data):
        """Parse the JSON text *data* and keep the result in ``self.data``.

        Raises whatever ``json.loads`` / ``namedtuple`` raise on malformed
        input (e.g. ``ValueError`` for invalid JSON or for keys that are not
        valid identifiers).
        """
        self.data = KibanaGeoBoundingBox._parse(data)

    @staticmethod
    def _parse(content):
        """Turn JSON text into nested namedtuples (one per JSON object)."""
        return json.loads(
            content,
            object_hook=lambda d: namedtuple('X', d.keys())(*d.values()),
        )

    def get_coordinates_tuple(self):
        """Return the scaled coordinates of this bounding box.

        See :meth:`get_coordinates_tuple_static` for the tuple layout.
        """
        return KibanaGeoBoundingBox.get_coordinates_tuple_static(self.data)

    @staticmethod
    def get_coordinates_tuple_static(data):
        """Return ``(lon_min, lon_max, lat_min, lat_max)`` as scaled integers.

        *data* is the parsed document (as produced by the constructor). Each
        coordinate is multiplied by ``config['scale']`` and truncated with
        ``int()``. The order is: top-left longitude, bottom-right longitude,
        bottom-right latitude, top-left latitude.
        """
        location = data.geo_bounding_box.location
        scale = config['scale']
        return (
            int(location.top_left.lon * scale),
            int(location.bottom_right.lon * scale),
            int(location.bottom_right.lat * scale),
            int(location.top_left.lat * scale),
        )

    @staticmethod
    def get_coordinate_tuple_from_file(file):
        """Read the JSON file at path *file* and return its scaled coordinates.

        Returns the same tuple as :meth:`get_coordinates_tuple_static`, or
        ``None`` (after logging an error) when the content cannot be parsed
        or lacks the expected structure. Errors while opening or reading the
        file are not caught here and propagate to the caller.
        """
        # The context manager guarantees the handle is closed even if the
        # read fails.
        with open(file, "r") as f:
            content = f.read()

        try:
            data = KibanaGeoBoundingBox._parse(content)
            return KibanaGeoBoundingBox.get_coordinates_tuple_static(data)
        except (ValueError, TypeError, AttributeError, KeyError, OverflowError):
            # Invalid JSON, keys unusable as namedtuple fields, missing
            # attributes or non-numeric coordinates all mean a bad file.
            logger.error("{} is not formatted properly. Please take a look at the examples.".format(file))
            return None

View File

@ -6,5 +6,7 @@ config = {
"separator": "####",
"elasticsearch-index": "cadaster",
"error_log_file": os.path.join(root_path, 'logs', 'log'),
"tracking_log_file": os.path.join(root_path, 'logs', 'track')
"tracking_log_file": os.path.join(root_path, 'logs', 'track'),
"scale": 1000000,
"coordinates_path": os.path.join(root_path, 'coordinates')
}

View File

@ -1,6 +1,9 @@
import os
import unittest
from src.librecatastro.catastro_scrapper import CadastroScrapper
from src.librecatastro.domain.kibana_geo_bounding_box import KibanaGeoBoundingBox
from src.settings import config
from src.utils.elasticsearch_utils import ElasticSearchUtils
@ -77,7 +80,8 @@ class MyTestCase(unittest.TestCase):
self.assertIsNotNone(cadaster.from_elasticsearch())
def scrap_random_until_x_times_found(self, times):
cadaster_list = CadastroScrapper.scrap_results_random_x_times(times)
coord = KibanaGeoBoundingBox.get_coordinate_tuple_from_file(os.path.join(config['coordinates_path'], 'central_peninsulae.json'))
cadaster_list = CadastroScrapper.scrap_results_random_x_times(times, coord[0], coord[1], coord[2], coord[3])
self.assertTrue(len(cadaster_list) >= times)
return cadaster_list