Refactoring of tests. Added health check and some minor changes.

This commit is contained in:
J 2019-09-22 14:48:41 +02:00
parent 4cb916b67b
commit 2606fc95f0
16 changed files with 260 additions and 158 deletions

View File

@ -4,14 +4,15 @@
import sys
import argparse
from src.librecatastro.scrapping.parsers.parser_html import ScrapperHTML, ParserHTML
from src.librecatastro.scrapping.parsers.parser_html import ParserHTML
from src.librecatastro.scrapping.parsers.parser_xml import ParserXML
from src.librecatastro.scrapping.searchers.coordinates_searcher import CoordinatesSearcher
from src.librecatastro.scrapping.searchers.provinces_searcher import ProvincesSearcher
from src.settings import config
from src.tests.servers_health.server_health_tests import ServerHealthTests
if __name__ == "__main__":
parser = argparse.ArgumentParser(description='Runs the Cadastro Parser')
parser = argparse.ArgumentParser(description='Runs libreCadastro')
parser.add_argument('--coords', action='store_true', dest='coords', default=False)
parser.add_argument('--filenames', action='store', nargs='+', dest='filenames', default=[])
parser.add_argument('--provinces', action='store', nargs='+', dest='provinces', default=[])
@ -22,6 +23,7 @@ if __name__ == "__main__":
parser.add_argument('--startcity', action='store', dest='startcity', default='')
parser.add_argument('--listprovinces', action='store_true', dest='listprovinces', default=False)
parser.add_argument('--listcities', action='store', nargs=1, dest='listcities', default=[])
parser.add_argument('--health', action='store_true', dest='health', default=False)
args = parser.parse_args(sys.argv[1:])
@ -31,13 +33,6 @@ if __name__ == "__main__":
if args.scale:
config['scale'] = args.scale
parser = ParserHTML if args.html else ParserXML
filenames = args.filenames
pictures = args.pictures
provinces = args.provinces
startcity = args.startcity
if args.listprovinces:
ProvincesSearcher.list_provinces()
exit(0)
@ -46,6 +41,17 @@ if __name__ == "__main__":
ProvincesSearcher.list_cities(args.listcities[0])
exit(0)
if args.health:
ServerHealthTests.healthcheck()
exit(0)
parser = ParserHTML if args.html else ParserXML
filenames = args.filenames
pictures = args.pictures
provinces = args.provinces
startcity = args.startcity
if args.coords:
CoordinatesSearcher.search_by_coordinates(parser, filenames, pictures)
else:

View File

@ -45,8 +45,13 @@ class ParserHTML(Parser):
results = []
if pc1 is not None and pc2 is not None:
cadaster = ''.join([pc1.text, pc2.text])
htmls = ScrapperHTML.scrap_cadaster(cadaster, None, None, pictures)
for html, picture in htmls.items():
html_picture_tuples = ScrapperHTML.scrap_cadaster(cadaster, None, None, pictures)
if not isinstance(html_picture_tuples, list):
html_picture_tuples = [html_picture_tuples]
for html_picture_tuple in html_picture_tuples:
html, picture = html_picture_tuple
cadaster_entry = cls.parse_html_parcela(html, x, y, picture)
cadaster_entry.to_elasticsearch()
results.append(cadaster_entry)
@ -54,7 +59,10 @@ class ParserHTML(Parser):
return results
@classmethod
def process_search_by_provinces(cls, prov_list, pictures=False, start_from=''):
def process_search_by_provinces(cls, prov_list, pictures=False, start_from='', max_times=None):
times = 0
results = []
num = ''
for prov_name, prov_num, city_name, city_num, address, tv, nv in Scrapper.get_address_iter(prov_list, start_from):
@ -110,8 +118,13 @@ class ParserHTML(Parser):
for html, picture in htmls:
cadaster_entry = cls.parse_html_parcela(html, lon, lat, picture)
cadaster_entry.to_elasticsearch()
results.append(cadaster_entry)
counter += 1
times += 1
if max_times is not None and times >= max_times:
return results
except urllib.error.HTTPError as e:
logger.error(

View File

@ -15,6 +15,8 @@ from src.utils.cadastro_logger import CadastroLogger
from dotmap import DotMap
from src.utils.list_utils import ListUtils
'''Logger'''
logger = CadastroLogger(__name__).logger
@ -124,7 +126,11 @@ class ParserXML(Parser):
return results
@classmethod
def process_search_by_provinces(cls, prov_list, pictures=False, start_from=''):
def process_search_by_provinces(cls, prov_list, pictures=False, start_from='', max_times=None):
times = 0
results = []
for prov_name, prov_num, city_name, city_num, address, tv, nv in Scrapper.get_address_iter(prov_list, start_from):
if tv == DotMap() or nv == DotMap():
continue
@ -139,6 +145,10 @@ class ParserXML(Parser):
num_scrapping_fails -= 1
else:
num_scrapping_fails = 10
times += 1
results.append(res)
if max_times is not None and times >= max_times:
return ListUtils.flat(results)
except urllib.error.HTTPError as e:
logger.error(
@ -160,6 +170,8 @@ class ParserXML(Parser):
counter += 1
return results
''' Parsing calls '''
@classmethod

View File

@ -15,7 +15,7 @@ logger = CadastroLogger(__name__).logger
class ScrapperHTML(Scrapper):
"""HTML Catastro Scrapper"""
URL = "http://ovc.catastro.meh.es/ovcservweb/ovcswlocalizacionrc/ovccoordenadas.asmx/Consulta_RCCOOR?SRS=EPSG:4226&Coordenada_X={}&Coordenada_Y={}"
URL = "http://ovc.catastro.meh.es/ovcservweb/ovcswlocalizacionrc/ovccoordenadas.asmx/Consulta_RCCOOR?SRS=EPSG:4326&Coordenada_X={}&Coordenada_Y={}"
URL_REF = "https://www1.sedecatastro.gob.es/CYCBienInmueble/OVCListaBienes.aspx?rc1={}&rc2={}"
URL_REF_FULL = "https://www1.sedecatastro.gob.es/CYCBienInmueble/OVCConCiud.aspx?RefC={}&RCCompleta={}&del={}&mun={}"

View File

@ -20,7 +20,7 @@ class ScrapperXML(Scrapper):
@classmethod
def get_coord(cls,x, y):
params = {'SRS': 'EPSG:4230', 'Coordenada_X': x, 'Coordenada_Y': y}
params = {'SRS': 'EPSG:4326', 'Coordenada_X': x, 'Coordenada_Y': y}
url = cls.URL_LOCATIONS_BASE.format("/OVCCoordenadas.asmx/Consulta_RCCOOR")
response = requests.get(url, params=params)

View File

@ -159,10 +159,11 @@ class CoordinatesSearcher(Searcher):
return ListUtils.flat(results)
@staticmethod
def search_by_coordinates_random_max_n_matches(times, lon_min, lon_max, lat_min, lat_max, scrapper):
def search_by_coordinates_random_max_n_matches(times, lon_min, lon_max, lat_min, lat_max, parser):
results = []
counter = times
while counter > 0:
x = random.randrange(lon_min, lon_max)
y = random.randrange(lat_min, lat_max)
@ -170,7 +171,7 @@ class CoordinatesSearcher(Searcher):
y_scaled = y / config['scale']
try:
cadaster_entry = scrapper.process_search_by_coordinates(x_scaled, y_scaled)
cadaster_entry = parser.process_search_by_coordinates(x_scaled, y_scaled)
if len(cadaster_entry) > 0:
results.append(cadaster_entry)

View File

@ -17,8 +17,13 @@ config = {
"sleep_dos_time": 300,
"width_px": 120,
"height_px": 120,
"servers_down_message": "Some of the Cadastro servers are down. "
"Maintenance is usually carried out durign the night or the weekends. Please, retry later."
"As an alternative, your IP address may have been banned. Try to change your public IP"
"servers_down_message_001": "Error 001: Cadastro server to get provinces and cities is down.\n"
"Consequence: Search by provinces will fail.\n"
"Maintenance is usually carried out durign the night or the weekends. Please, retry later.\n"
"As an alternative, your IP address may have been banned. Try to change your public IP",
"servers_down_message_002": "Error 002: Cadastro server to query by cadaster number is off.\n"
"Search by Coordinates will fail.\n"
"Maintenance is usually carried out durign the night or the weekends. Please, retry later.\n"
"As an alternative, your IP address may have been banned. Try to change your public IP\n"
}

View File

View File

@ -0,0 +1,49 @@
import unittest
from src.librecatastro.scrapping.parsers.parser_html import ParserHTML
from src.librecatastro.scrapping.scrappers.scrapper_html import ScrapperHTML
class ParserHTMLTests(unittest.TestCase):
    """Integration tests for ParserHTML.

    NOTE(review): these tests hit the live Cadastro web services and an
    Elasticsearch instance — they are integration tests, not unit tests.
    """

    def test_search_by_coordinates_creates_and_stores_in_elasticsearch(self):
        """A known coordinate pair yields 14 entries, each retrievable from ES."""
        cadaster_list = ParserHTML.process_search_by_coordinates(-3.47600944027389, 40.5370635727521)
        self.assertEqual(len(cadaster_list), 14)
        for cadaster in cadaster_list:
            self.assertIsNotNone(cadaster.from_elasticsearch())

    def test_search_by_provinces_creates_and_stores_in_elasticsearch(self):
        """A province search capped at one address yields entries retrievable from ES."""
        cadaster_list = ParserHTML.process_search_by_provinces(['MADRID'], max_times=1)
        self.assertEqual(len(cadaster_list), 14)
        for cadaster in cadaster_list:
            self.assertIsNotNone(cadaster.from_elasticsearch())

    def test_search_site_lot_is_set(self):
        """A rustic cadaster reference parses with both site and lot set."""
        cadaster_list = ScrapperHTML.scrap_cadaster('45134A02500003')
        html, picture = cadaster_list[0]
        cadaster = ParserHTML.parse_html_parcela(html)
        self.assertIsNotNone(cadaster.address.site)
        self.assertIsNotNone(cadaster.address.lot)

    def test_search_constructions_are_set(self):
        """An urban cadaster reference yields at least one construction record."""
        cadaster_list = ScrapperHTML.scrap_cadaster('5036901NH2553N0001HB')
        html, picture = cadaster_list[0]
        cadaster = ParserHTML.parse_html_parcela(html)
        self.assertTrue(len(cadaster.constructions) > 0)

    def test_search_no_cp_is_correctly_set(self):
        """A cadaster without a postal code parses with address.cp left as None.

        NOTE(review): renamed from ``test_seach_no_cp_is_correctly_set`` (typo);
        the ``test_`` prefix is preserved so unittest discovery still finds it.
        """
        cadaster_list = ScrapperHTML.scrap_cadaster('06145A00500028')
        html, picture = cadaster_list[0]
        cadaster = ParserHTML.parse_html_parcela(html)
        self.assertIsNone(cadaster.address.cp)

    def test_search_multiparcela_2_cadasters_are_set(self):
        """A multi-parcel reference returns several (html, picture) tuples, each parseable."""
        cadaster_list = ScrapperHTML.scrap_cadaster('22282A00900547')
        for html_picture in cadaster_list:
            # Keep the tuple and the parsed entry in distinct names; the original
            # rebound the loop variable, which obscured what was being iterated.
            html, picture = html_picture
            entry = ParserHTML.parse_html_parcela(html)
            self.assertIsNotNone(entry.cadaster)


if __name__ == '__main__':
    unittest.main()

View File

@ -0,0 +1,22 @@
import unittest
from src.librecatastro.scrapping.parsers.parser_xml import ParserXML
class ParserXMLTests(unittest.TestCase):
    """Integration tests for ParserXML.

    NOTE(review): these exercise the live Cadastro services and an
    Elasticsearch backend; they require both to be reachable.
    """

    def test_search_by_coordinates_creates_and_stores_in_elasticsearch(self):
        """A coordinate search returns 14 entries, each readable back from ES."""
        entries = ParserXML.process_search_by_coordinates(-3.47600944027389, 40.5370635727521)
        self.assertEqual(14, len(entries))
        for entry in entries:
            self.assertIsNotNone(entry.from_elasticsearch())

    def test_search_by_provinces_creates_and_stores_in_elasticsearch(self):
        """A province search capped at a single address returns one stored entry."""
        entries = ParserXML.process_search_by_provinces(['MADRID'], max_times=1)
        self.assertEqual(1, len(entries))
        for entry in entries:
            self.assertIsNotNone(entry.from_elasticsearch())


if __name__ == '__main__':
    unittest.main()

View File

@ -1,132 +1,59 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import os
import unittest
from src.librecatastro.domain.geometry.geo_polygon import GeoPolygon
from src.librecatastro.scrapping.parsers.parser_html import ScrapperHTML
from src.librecatastro.scrapping.searchers.coordinates_searcher import CoordinatesSearcher
from src.settings import config
from src.utils.elasticsearch_utils import ElasticSearchUtils
from src.librecatastro.scrapping.parsers.parser_html import ScrapperHTML, ParserHTML
class ScrapperHTMLTests(unittest.TestCase):
def test_remove_index_elasticsearch_works(self):
ElasticSearchUtils.remove_index()
assert True
def test_create_index_elasticsearch_works(self):
ElasticSearchUtils.create_index()
assert True
def test_coordinate_creates_cadaster(self):
cadaster_list = ScrapperHTML.parse_coord(-3.68, 40.47)
self.assertEqual(len(cadaster_list), 1)
cadaster = cadaster_list[0]
self.assertEqual(cadaster.cadaster, '2302909VK4820A0001GK')
def test_coordinate_multiparcela_creates_cadaster(self):
cadaster_list = ScrapperHTML.parse_coord(-0.33, 39.47)
self.assertTrue(len(cadaster_list) > 1)
def test_coordinate_creates_cadaster_and_stores_in_elasticsearch(self):
cadaster_list = ScrapperHTML.parse_coord(-3.68, 40.47)
self.assertEqual(len(cadaster_list), 1)
cadaster = cadaster_list[0]
cadaster.to_elasticsearch()
self.assertIsNotNone(cadaster.from_elasticsearch())
def test_cadaster_site_lot_creates_cadaster_and_sets_site_lot(self):
cadaster_list = ScrapperHTML.scrap_cadaster('45134A02500003')
self.assertEqual(len(cadaster_list), 1)
cadaster = cadaster_list[0]
self.assertEqual(cadaster.address.site, '25')
self.assertEqual(cadaster.address.lot, '3')
def test_cadaster_full_creates_cadaster(self):
def test_cadaster_full_returns_html(self):
cadaster_list = ScrapperHTML.scrap_cadaster('0083101WK2008S0001PD')
self.assertEqual(len(cadaster_list), 1)
cadaster = cadaster_list[0]
self.assertEqual(cadaster.address.city, 'ALMONACID DEL MARQUESADO')
self.assertEqual(cadaster.address.province, 'CUENCA')
html, picture = cadaster_list[0]
self.assertIsNotNone(html)
self.assertIsNone(picture)
def test_cadaster_full_creates_cadaster_with_constructions(self):
def test_cadaster_full_with_picture_returns_html_and_picture(self):
cadaster_list = ScrapperHTML.scrap_cadaster('0083101WK2008S0001PD', pictures=True)
self.assertEqual(len(cadaster_list), 1)
html, picture = cadaster_list[0]
self.assertIsNotNone(html)
self.assertIsNotNone(picture)
def test_cadaster_half_site_lot_returns_html(self):
cadaster_list = ScrapperHTML.scrap_cadaster('45134A02500003')
self.assertEqual(len(cadaster_list), 1)
html, picture = cadaster_list[0]
self.assertIsNotNone(html)
self.assertIsNone(picture)
def test_cadaster_half_site_lot_returns_html_and_picture(self):
cadaster_list = ScrapperHTML.scrap_cadaster('45134A02500003', pictures=True)
self.assertEqual(len(cadaster_list), 1)
html, picture = cadaster_list[0]
self.assertIsNotNone(html)
self.assertIsNotNone(picture)
def test_cadaster_full_with_constructions_returns_html_and_picture(self):
cadaster_list = ScrapperHTML.scrap_cadaster('5036901NH2553N0001HB')
self.assertEqual(len(cadaster_list), 1)
cadaster = cadaster_list[0]
self.assertTrue(len(cadaster.constructions) > 0)
html, picture = cadaster_list[0]
self.assertIsNotNone(html)
self.assertIsNone(picture)
def test_cadaster_half_creates_cadaster(self):
cadaster_list = ScrapperHTML.scrap_cadaster('0183001WK2008S')
self.assertEqual(len(cadaster_list), 1)
cadaster = cadaster_list[0]
self.assertEqual(cadaster.address.city, 'ALMONACID DEL MARQUESADO')
self.assertEqual(cadaster.address.province, 'CUENCA')
def test_cadaster_half_creates_cadaster_2(self):
cadaster_list = ScrapperHTML.scrap_cadaster('21012A03100046')
self.assertEqual(len(cadaster_list), 1)
cadaster = cadaster_list[0]
self.assertEqual(cadaster.address.province, 'HUELVA')
def test_cadaster_no_cp_creates_cadaster(self):
def test_cadaster_no_cp_returns_html(self):
cadaster_list = ScrapperHTML.scrap_cadaster('06145A00500028')
self.assertEqual(len(cadaster_list), 1)
cadaster = cadaster_list[0]
self.assertIsNone(cadaster.address.cp)
self.assertEqual(cadaster.address.province, 'BADAJOZ')
html, picture = cadaster_list[0]
self.assertIsNotNone(html)
self.assertIsNone(picture)
def test_cadaster_multiparcela_returns_list_of_2(self):
cadaster_list = ScrapperHTML.scrap_cadaster('22282A00900547')
self.assertEqual(len(cadaster_list), 2)
def test_cadaster_is_stored_in_elasticsearch(self):
cadaster_list = ScrapperHTML.scrap_cadaster('0183001WK2008S')
self.assertEqual(len(cadaster_list), 1)
cadaster = cadaster_list[0]
cadaster.to_elasticsearch()
self.assertIsNotNone(cadaster.from_elasticsearch())
def scrap_random_until_x_times_found(self, times):
polygon = GeoPolygon(os.path.join(config['coordinates_path'], 'spain_polygon.json'))
coord = polygon.get_bounding_box()
cadaster_list = CoordinatesSearcher.search_by_coordinates_random_max_n_matches(times, int(coord[0] * config['scale']), int(coord[2] * config['scale']), int(coord[1] * config['scale']), int(coord[3] * config['scale']), ScrapperHTML)
self.assertTrue(len(cadaster_list) >= times)
return cadaster_list
def test_scrap_random_until_5_found(self):
self.scrap_random_until_x_times_found(5)
def test_scrap_random_until_5_is_stored_in_elasticsearch(self):
cadaster_list = self.scrap_random_until_x_times_found(5)
for cadaster in cadaster_list:
cadaster.to_elasticsearch()
self.assertIsNotNone(cadaster.from_elasticsearch())
def test_scrap_random_until_1_is_stored_in_elasticsearch(self):
cadaster_list = self.scrap_random_until_x_times_found(1)
for cadaster in cadaster_list:
cadaster.to_elasticsearch()
self.assertIsNotNone(cadaster.from_elasticsearch())
def test_loading_point_is_in_polygon_returns_true(self):
polygon = GeoPolygon(os.path.join(config['coordinates_path'], 'spain_polygon.json'))
self.assertTrue(polygon.is_point_in_polygon(lon=-5.295410156250001, lat=40.069664523297774))
def test_loading_point_is_not_in_polygon_returns_false(self):
polygon = GeoPolygon(os.path.join(config['coordinates_path'], 'spain_polygon.json'))
self.assertFalse(polygon.is_point_in_polygon(lon=-1.9335937500000002, lat=48.31242790407178))
def test_polygon_has_correct_bounding_box(self):
polygon = GeoPolygon(os.path.join(config['coordinates_path'], 'spain_polygon.json'))
self.assertIsNotNone(polygon.get_bounding_box())
def test_if_pictures_enabled_picture_is_set(self):
cadaster_list = ScrapperHTML.scrap_cadaster('06145A00500028', pictures=True)
self.assertIsNotNone(cadaster_list[0].picture)
if __name__ == '__main__':
unittest.main()

View File

@ -6,42 +6,12 @@ import unittest
from time import sleep
from src.librecatastro.domain.cadaster_entry.cadaster_entry_xml import CadasterEntryXML
from src.librecatastro.scrapping.parsers.parser_xml import ScrapperXML, ParserXML
from src.librecatastro.scrapping.parsers.parser_xml import ParserXML
from src.librecatastro.scrapping.scrappers.scrapper_xml import ScrapperXML
from src.settings import config
from src.utils.elasticsearch_utils import ElasticSearchUtils
class ScrapperXMLTests(unittest.TestCase):
def test_scrapper_retrieves_dict_provinces(self):
try:
self.assertEqual(ScrapperXML.get_provinces().consulta_provinciero.control.cuprov, '48')
except:
self.assertFalse(config['servers_down_message'])
exit(-1)
def test_scrapper_retrieves_dict_cities(self):
try:
self.assertEqual(ScrapperXML.get_cities('ALACANT').consulta_municipiero.control.cumun, '141')
except:
self.assertFalse(config['servers_down_message'])
exit(-1)
def test_scrapper_retrieves_dict_addresses(self):
try:
self.assertEqual(ScrapperXML.get_addresses('ALACANT', 'AGOST').consulta_callejero.control.cuca, '117')
except:
self.assertFalse(config['servers_down_message'])
exit(-1)
def test_get_cadaster_entries_by_cadaster_is_up(self):
cadasters = ['2503906VK4820D0001MX']
try:
for cadaster in cadasters:
ScrapperXML.get_cadaster_entries_by_cadaster('', '', cadaster)
except:
self.assertFalse(config['servers_down_message'])
exit(-1)
def test_scrapper_retrieves_dict_addresses_iter(self):
iterator = ScrapperXML.get_address_iter()

View File

View File

@ -0,0 +1,48 @@
import os
import unittest
from src.librecatastro.domain.geometry.geo_polygon import GeoPolygon
from src.librecatastro.scrapping.parsers.parser_html import ParserHTML
from src.librecatastro.scrapping.searchers.coordinates_searcher import CoordinatesSearcher
from src.settings import config
class CoordinatesSearcherTests(unittest.TestCase):
    """Tests for CoordinatesSearcher random search and the Spain polygon helpers."""

    @staticmethod
    def _spain_polygon():
        # The Spain boundary polygon shipped with the project configuration;
        # built fresh per call, exactly as each original test did inline.
        return GeoPolygon(os.path.join(config['coordinates_path'], 'spain_polygon.json'))

    def search_random_until_x_times_found_by_html(self, times):
        """Probe random coordinates inside Spain's bounding box until `times` matches are found."""
        bbox = self._spain_polygon().get_bounding_box()
        scale = config['scale']
        found = CoordinatesSearcher.search_by_coordinates_random_max_n_matches(
            times,
            int(bbox[0] * scale), int(bbox[2] * scale),
            int(bbox[1] * scale), int(bbox[3] * scale),
            ParserHTML)
        self.assertTrue(len(found) >= times)
        return found

    def test_search_random_until_5_found(self):
        """At least five random matches can be located."""
        self.search_random_until_x_times_found_by_html(5)

    def test_search_random_until_5_is_stored_in_elasticsearch(self):
        """Five random matches can be written to and read back from Elasticsearch."""
        for entry in self.search_random_until_x_times_found_by_html(5):
            entry.to_elasticsearch()
            self.assertIsNotNone(entry.from_elasticsearch())

    def test_search_random_until_1_is_stored_in_elasticsearch(self):
        """A single random match can be written to and read back from Elasticsearch."""
        for entry in self.search_random_until_x_times_found_by_html(1):
            entry.to_elasticsearch()
            self.assertIsNotNone(entry.from_elasticsearch())

    def test_loading_point_is_in_polygon_returns_true(self):
        """A point inside mainland Spain is reported as inside the polygon."""
        self.assertTrue(self._spain_polygon().is_point_in_polygon(
            lon=-5.295410156250001, lat=40.069664523297774))

    def test_loading_point_is_not_in_polygon_returns_false(self):
        """A point outside Spain is reported as outside the polygon."""
        self.assertFalse(self._spain_polygon().is_point_in_polygon(
            lon=-1.9335937500000002, lat=48.31242790407178))

    def test_polygon_has_correct_bounding_box(self):
        """The polygon exposes a non-None bounding box."""
        self.assertIsNotNone(self._spain_polygon().get_bounding_box())


if __name__ == '__main__':
    unittest.main()

View File

View File

@ -0,0 +1,49 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import unittest
from src.librecatastro.scrapping.scrapper import Scrapper
from src.librecatastro.scrapping.scrappers.scrapper_xml import ScrapperXML
from src.settings import config
from src.utils.cadastro_logger import CadastroLogger
'''Logger'''
logger = CadastroLogger(__name__).logger
class ServerHealthTests(unittest.TestCase):
    """Health checks against the live Cadastro web services.

    Each check queries one endpoint and logs the matching configured
    diagnostic message when the call fails. Failures are logged rather
    than raised (AssertionError is also caught), so a run reports
    server health through the log output, not through test failures —
    this mirrors the original behavior used by the ``--health`` flag.
    """

    def test_scrapper_retrieves_dict_provinces(self):
        """The provinces endpoint is up and reports the expected count of 48."""
        try:
            self.assertEqual(Scrapper.get_provinces().consulta_provinciero.control.cuprov, '48')
        except Exception:
            # Narrowed from a bare `except:` so SystemExit/KeyboardInterrupt propagate.
            logger.debug(config['servers_down_message_001'])

    def test_scrapper_retrieves_dict_cities(self):
        """The cities endpoint is up and reports the expected count for ALACANT."""
        try:
            self.assertEqual(Scrapper.get_cities('ALACANT').consulta_municipiero.control.cumun, '141')
        except Exception:
            # Narrowed from a bare `except:` so SystemExit/KeyboardInterrupt propagate.
            logger.debug(config['servers_down_message_001'])

    def test_scrapper_retrieves_dict_addresses(self):
        """The addresses endpoint is up and reports the expected count for AGOST."""
        try:
            self.assertEqual(Scrapper.get_addresses('ALACANT', 'AGOST').consulta_callejero.control.cuca, '117')
        except Exception:
            # Narrowed from a bare `except:` so SystemExit/KeyboardInterrupt propagate.
            logger.debug(config['servers_down_message_001'])

    def test_get_cadaster_entries_by_cadaster_is_up(self):
        """The cadaster-number query endpoint responds for a known reference."""
        cadasters = ['2503906VK4820D0001MX']
        try:
            for cadaster in cadasters:
                ScrapperXML.get_cadaster_entries_by_cadaster('', '', cadaster)
        except Exception:
            # Narrowed from a bare `except:` so SystemExit/KeyboardInterrupt propagate.
            logger.debug(config['servers_down_message_002'])

    @staticmethod
    def healthcheck():
        """Run this class as a test suite; invoked by the CLI ``--health`` flag."""
        suite = unittest.TestLoader().loadTestsFromTestCase(ServerHealthTests)
        unittest.TextTestRunner().run(suite)


if __name__ == '__main__':
    unittest.main()