Manually closes the Elasticsearch socket. Fixes and updates tests.

J 2019-09-18 19:11:04 +02:00
parent 9f7d5fda51
commit 50d4ad6e93
7 changed files with 49 additions and 21 deletions

View File

@@ -12,6 +12,7 @@ if __name__ == "__main__":
     args = parser.parse_args(sys.argv[1:])

     if args.coords:
+        print(args.filenames)
         ScrapperHTML.scrap_all_coordinates_files(args.filenames)
     else:
         ScrapperXML.scrap_all_addresses(args.provinces)

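The entry point dispatches on a coordinates flag: coordinate files go to the HTML scrapper, provinces to the XML one, and the new print(args.filenames) traces which files were dispatched. A minimal sketch of the argparse wiring this implies; the exact flag names and defaults are assumptions inferred from args.coords / args.filenames / args.provinces, not shown in this diff:

import argparse
import sys

parser = argparse.ArgumentParser(description='librecatastro scraper')
# Hypothetical flags, inferred from the attribute names used above:
parser.add_argument('--coords', action='store_true',
                    help='scrap coordinate files with the HTML scrapper')
parser.add_argument('--filenames', nargs='*', default=[],
                    help='polygon/coordinate JSON files to process')
parser.add_argument('--provinces', nargs='*', default=[],
                    help='provinces for the XML scrapper')
args = parser.parse_args(sys.argv[1:])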
View File

@@ -158,6 +158,7 @@ class Address:
             return self.city

         city_text = self.second_line.replace(self.province_parentheses, '')
         city_text = city_text.replace(config['separator'], '').strip()
-        city_text = city_text.replace(self.cp, '')
+        if self.cp is not None:
+            city_text = city_text.replace(self.cp, '')

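The new guard matters because str.replace raises a TypeError when handed None, so an address without a postal code (cp) would previously crash city extraction. A standalone illustration, not the project's code:

# Stripping an optional postal code out of the city line.
city_text = 'MADRID 28001'
cp = None  # address with no postal code

if cp is not None:
    city_text = city_text.replace(cp, '')
# Without the guard, city_text.replace(None, '') raises:
# TypeError: replace() argument 1 must be str, not None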
View File

@@ -4,8 +4,12 @@ from abc import abstractmethod

 from elasticsearch import Elasticsearch

 from src.settings import config
+from src.utils.cadastro_logger import CadastroLogger
 from src.utils.json_encoder import JSONEncoder

+'''Logger'''
+logger = CadastroLogger(__name__).logger
+

 class CadasterEntry:
@@ -30,18 +34,30 @@ class CadasterEntry:
                           indent=4, separators=(',', ': '))

     def to_elasticsearch(self):
-        es = Elasticsearch()
-        body = json.dumps(self.to_json(), cls=JSONEncoder, sort_keys=True,
-                          indent=4, separators=(',', ': '))
-        #logger.debug("Sending to Elastic Search\n:{}".format(body))
-        res = es.index(index=config['elasticsearch-index'], doc_type='cadaster_doc', id=self.cadaster, body=body)
-        #logger.debug(res)
+        res = None
+        try:
+            es = Elasticsearch()
+            body = json.dumps(self.to_json(), cls=JSONEncoder, sort_keys=True,
+                              indent=4, separators=(',', ': '))
+            #logger.debug("Sending to Elastic Search\n:{}".format(body))
+            res = es.index(index=config['elasticsearch-index'], doc_type='cadaster_doc', id=self.cadaster, body=body)
+            #logger.debug(res)
+        except Exception as e:
+            logger.error(e)
+        finally:
+            es.transport.close()
         return res

     def from_elasticsearch(self):
-        es = Elasticsearch()
-        query = '{"query":{"bool":{"must":[{"match":{"cadaster":"' + self.cadaster + '"}}],"must_not":[],"should":[]}},"from":0,"size":10,"sort":[],"aggs":{}}'
-        res = es.search(index=config['elasticsearch-index'], body=query)
-        #logger.debug(res)
+        res = None
+        try:
+            es = Elasticsearch()
+            query = '{"query":{"bool":{"must":[{"match":{"cadaster":"' + self.cadaster + '"}}],"must_not":[],"should":[]}},"from":0,"size":10,"sort":[],"aggs":{}}'
+            res = es.search(index=config['elasticsearch-index'], body=query)
+            #logger.debug(res)
+        except Exception as e:
+            logger.error(e)
+        finally:
+            es.transport.close()
         return res

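This is the change the commit message describes: each use of the Elasticsearch client is wrapped in try/except/finally and the transport is closed explicitly, so every call releases its sockets instead of leaving them open. The same pattern can be factored into a context manager; a hedged sketch against the same elasticsearch-py client, with a hypothetical helper name (closing_client is not part of the project). It also sidesteps a corner case of the hand-written version: if Elasticsearch() itself raised, the finally block would reference an unbound es.

from contextlib import contextmanager
from elasticsearch import Elasticsearch

@contextmanager
def closing_client():
    es = Elasticsearch()  # constructed inside, so it is always bound
    try:
        yield es
    finally:
        es.transport.close()  # release the pooled connections/sockets

# Usage sketch: exceptions still propagate, but the socket is closed.
# with closing_client() as es:
#     es.index(index=config['elasticsearch-index'], doc_type='cadaster_doc',
#              id=cadaster, body=body)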
View File

@@ -31,4 +31,7 @@ class GeoPolygon:
         return self.polygon.contains(p)

     def get_bounding_box(self):
-        pass
+        if self.polygon is not None:
+            return self.polygon.bounds
+        else:
+            return None

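get_bounding_box now delegates to Shapely: .bounds on a shapely geometry returns (minx, miny, maxx, maxy), which for lon/lat points reads as (lon_min, lat_min, lon_max, lat_max). That ordering is why the scrapper below indexes bb[0]/bb[2] for longitude and bb[1]/bb[3] for latitude. A standalone illustration (it assumes self.polygon wraps a shapely.geometry.Polygon, as the use of contains() above suggests):

from shapely.geometry import Polygon

# A 1x1 degree square: lon in [-3, -2], lat in [40, 41].
square = Polygon([(-3.0, 40.0), (-2.0, 40.0), (-2.0, 41.0), (-3.0, 41.0)])
print(square.bounds)  # (-3.0, 40.0, -2.0, 41.0) = (lon_min, lat_min, lon_max, lat_max)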
View File

@@ -11,7 +11,6 @@ from xml.etree import ElementTree
 from bs4 import BeautifulSoup

 from src.librecatastro.domain.cadaster_entry.cadaster_entry_html import CadasterEntryHTML
-from src.librecatastro.domain.geometry.geo_bounding_box import GeoBoundingBox
 from src.librecatastro.domain.geometry.geo_polygon import GeoPolygon
 from src.librecatastro.scrapping.scrapper import Scrapper
 from src.settings import config
@@ -33,7 +32,6 @@ class ScrapperHTML(Scrapper):
     @classmethod
     def scrap_all_coordinates_files(cls, filenames):
         for r, d, files in os.walk(config['coordinates_path']):
             for file in files:
@@ -52,15 +50,17 @@ class ScrapperHTML(Scrapper):
     @staticmethod
     def scrap_polygon(polygon):
         bb = polygon.get_bounding_box()
-        lon_min = 0
-        lon_max = 0
-        lat_min = 0
-        lat_max = 0
+        lon_min = int(bb[0] * config['scale'])
+        lon_max = int(bb[2] * config['scale'])
+        lat_min = int(bb[1] * config['scale'])
+        lat_max = int(bb[3] * config['scale'])

         for x in range(lon_min, lon_max):
             for y in range(lat_min, lat_max):
                 x_scaled = x / config['scale']
                 y_scaled = y / config['scale']

                 if not polygon.is_point_in_polygon(x_scaled, y_scaled):
                     continue

+                ''' Adding to tracking file'''
+                logger.info('{},{}'.format(x_scaled, y_scaled))
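The hard-coded zero bounds are replaced by the polygon's real bounding box, scaled to integers because range() cannot step over floats: degrees are multiplied by config['scale'], iterated as ints, then divided back before the point-in-polygon test. The same trick condensed into a self-contained generator (iter_polygon_points is a hypothetical name; it assumes a polygon object with the two methods shown in this diff):

def iter_polygon_points(polygon, scale=10000):  # scale as set in src.settings
    lon_min, lat_min, lon_max, lat_max = polygon.get_bounding_box()
    for x in range(int(lon_min * scale), int(lon_max * scale)):
        for y in range(int(lat_min * scale), int(lat_max * scale)):
            lon, lat = x / scale, y / scale
            # Skip grid points inside the bounding box but outside the polygon.
            if polygon.is_point_in_polygon(lon, lat):
                yield lon, lat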
@@ -263,7 +263,11 @@ class ScrapperHTML(Scrapper):
             descriptive_data[field_name] = field_value.text.strip()

         '''Constructions'''
-        constructions = parsed_html.find(id='ctl00_Contenido_tblLocales').find_all('tr')
+        constructions_table = parsed_html.find(id='ctl00_Contenido_tblLocales')
+        if constructions_table is None:
+            constructions = []
+        else:
+            constructions = constructions_table.find_all('tr')
         header = True
         for construction in constructions:
             if header:
@@ -273,8 +277,6 @@ class ScrapperHTML(Scrapper):
             descriptive_data[u'Construcciones'].append(dict(uso=columns[0].text, escalera=columns[1].text, planta=columns[2].text, puerta=columns[3].text, superficie=columns[4].text, tipo=columns[5].text, fecha=columns[6].text))

         cadaster_entry = CadasterEntryHTML(descriptive_data)
         return cadaster_entry

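The constructions fix has the same defensive shape as the bounding-box one: BeautifulSoup's find() returns None when the id is absent (presumably some cadastral records list no constructions), and calling .find_all() on that None crashed the parse. A standalone illustration with a toy document:

from bs4 import BeautifulSoup

parsed_html = BeautifulSoup('<html><body></body></html>', 'html.parser')

table = parsed_html.find(id='ctl00_Contenido_tblLocales')
rows = [] if table is None else table.find_all('tr')
# Before the fix: parsed_html.find(id=...).find_all('tr') raises
# AttributeError: 'NoneType' object has no attribute 'find_all'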
View File

@@ -7,7 +7,7 @@ config = {
     "elasticsearch-index": "cadaster",
     "error_log_file": os.path.join(root_path, 'logs', 'log'),
     "tracking_log_file": os.path.join(root_path, 'logs', 'track'),
-    "scale": 1000000,
+    "scale": 10000,
     "coordinates_path": os.path.join(root_path, 'coordinates'),
     "not_available_via_XML": "(Not available via XML)",
     "sleep_time": 5,

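Dropping scale from 1,000,000 to 10,000 coarsens the probe grid by 100x per axis, i.e. 10,000x fewer grid points overall. Since one degree of latitude is roughly 111 km, the step between neighbouring probes grows from about 0.11 m, far finer than any parcel, to a more workable ~11 m. A back-of-envelope check:

# Approximate distance between neighbouring grid probes (1 deg lat ~ 111 km).
for scale in (1000000, 10000):
    step_deg = 1 / scale
    print(scale, step_deg * 111_000, 'm')  # ~0.111 m old, ~11.1 m new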
View File

@@ -89,8 +89,9 @@ class ScrapperHTMLTests(unittest.TestCase):
         self.assertIsNotNone(cadaster.from_elasticsearch())

     def scrap_random_until_x_times_found(self, times):
-        coord = GeoBoundingBox.get_bb_from_file(os.path.join(config['coordinates_path'], 'central_peninsulae.json'))
-        cadaster_list = ScrapperHTML.scrap_results_random_x_times(times, coord[0], coord[1], coord[2], coord[3])
+        polygon = GeoPolygon(os.path.join(config['coordinates_path'], 'spain_polygon.json'))
+        coord = polygon.get_bounding_box()
+        cadaster_list = ScrapperHTML.scrap_results_random_x_times(times, int(coord[0]*config['scale']), int(coord[2]*config['scale']), int(coord[1]*config['scale']), int(coord[3]*config['scale']))
         self.assertTrue(len(cadaster_list) >= times)
         return cadaster_list
@@ -117,6 +118,10 @@ class ScrapperHTMLTests(unittest.TestCase):
         polygon = GeoPolygon(os.path.join(config['coordinates_path'], 'spain_polygon.json'))
         self.assertFalse(polygon.is_point_in_polygon(lon=-1.9335937500000002, lat=48.31242790407178))

+    def test_polygon_has_correct_bounding_box(self):
+        polygon = GeoPolygon(os.path.join(config['coordinates_path'], 'spain_polygon.json'))
+        self.assertIsNotNone(polygon.get_bounding_box())
+

 if __name__ == '__main__':
     unittest.main()
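The rewritten test derives the scrapper's arguments from the polygon's bounds instead of a separate bounding-box file. Note the reordering: scrap_results_random_x_times takes lon_min, lon_max, lat_min, lat_max, while Shapely-style bounds arrive as (lon_min, lat_min, lon_max, lat_max), hence coord[0], coord[2], coord[1], coord[3]. The same massaging as a tiny helper (scaled_scrap_args is a hypothetical name, not part of the project):

def scaled_scrap_args(polygon, scale):
    lon_min, lat_min, lon_max, lat_max = polygon.get_bounding_box()
    return (int(lon_min * scale), int(lon_max * scale),
            int(lat_min * scale), int(lat_max * scale))

# cadaster_list = ScrapperHTML.scrap_results_random_x_times(
#     times, *scaled_scrap_args(polygon, config['scale']))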