Checks whether an address is already present in Elasticsearch and skips it. Adds the ELK_DATA env var to docker-compose

This commit is contained in:
J 2019-09-23 13:01:05 +02:00
parent c8ec760ef2
commit d38f0905ee
13 changed files with 137 additions and 27 deletions

2
.env Normal file
View File

@ -0,0 +1,2 @@
ELK_DATA=/Users/josejuan.martinez/ELK_DATA
#ELK_DATA=/home/ubuntu/ELK_DATA

View File

@ -10,7 +10,7 @@ services:
- bootstrap.memory_lock=true - bootstrap.memory_lock=true
- "ES_JAVA_OPTS=-Xms512m -Xmx512m" - "ES_JAVA_OPTS=-Xms512m -Xmx512m"
volumes: volumes:
- /home/ubuntu/ELK_data/elasticsearch:/usr/share/elasticsearch/data:rw - ${ELK_DATA}/elasticsearch:/usr/share/elasticsearch/data:rw
hostname: elasticsearch hostname: elasticsearch
ulimits: ulimits:
memlock: memlock:
@ -34,7 +34,7 @@ services:
depends_on: depends_on:
- elasticsearch - elasticsearch
volumes: volumes:
- /home/ubuntu/ELK_data/logstash/csv:/shared:rw - ${ELK_DATA}/logstash/csv:/shared:rw
hostname: logstash hostname: logstash
networks: networks:
- ELK - ELK

View File

@ -4,6 +4,7 @@
import json import json
from abc import abstractmethod from abc import abstractmethod
from dotmap import DotMap
from elasticsearch import Elasticsearch from elasticsearch import Elasticsearch
from src.settings import config from src.settings import config
@ -53,11 +54,15 @@ class CadasterEntry:
return res return res
def from_elasticsearch(self): def from_elasticsearch(self):
res = None res = False
es = Elasticsearch() es = Elasticsearch()
try: try:
query = '{"query":{"bool":{"must":[{"match":{"cadaster":"' + self.cadaster + '"}}],"must_not":[],"should":[]}},"from":0,"size":10,"sort":[],"aggs":{}}' query = '{"query":{"bool":{"must":[{"match":{"cadaster":"' + self.cadaster + '"}}],"must_not":[],"should":[]}},"from":0,"size":10,"sort":[],"aggs":{}}'
res = es.search(index=config['elasticsearch-index'], body=query) res = es.search(index=config['elasticsearch-index'], body=query)
hits = DotMap(res).hits.total
if hits == DotMap():
hits = 0
res = (hits > 0)
except Exception as e: except Exception as e:
logger.error(e) logger.error(e)

View File

@ -14,6 +14,7 @@ from src.librecatastro.scrapping.scrappers.scrapper_html import ScrapperHTML
from src.settings import config from src.settings import config
from src.utils.cadastro_logger import CadastroLogger from src.utils.cadastro_logger import CadastroLogger
from src.utils.elasticsearch_utils import ElasticSearchUtils
'''Logger''' '''Logger'''
logger = CadastroLogger(__name__).logger logger = CadastroLogger(__name__).logger
@ -70,6 +71,11 @@ class ParserHTML(Parser):
if tv == DotMap() or nv == DotMap(): if tv == DotMap() or nv == DotMap():
continue continue
if ElasticSearchUtils.check_if_address_present("{} {} {}".format(tv, address, nv), prov_name, city_name):
logger.debug("Skipping {} {} {} {} {} because it's been already scrapped.".format(tv, address, nv,
prov_name, city_name))
continue
num_scrapping_fails = 10 num_scrapping_fails = 10
counter = 1 counter = 1
while num_scrapping_fails > 0: while num_scrapping_fails > 0:

View File

@ -15,6 +15,7 @@ from src.utils.cadastro_logger import CadastroLogger
from dotmap import DotMap from dotmap import DotMap
from src.utils.elasticsearch_utils import ElasticSearchUtils
from src.utils.list_utils import ListUtils from src.utils.list_utils import ListUtils
'''Logger''' '''Logger'''
@ -135,6 +136,11 @@ class ParserXML(Parser):
if tv == DotMap() or nv == DotMap(): if tv == DotMap() or nv == DotMap():
continue continue
if ElasticSearchUtils.check_if_address_present("{} {} {}".format(tv, address, nv), prov_name, city_name):
logger.debug("Skipping {} {} {} {} {} because it's been already scrapped.".format(tv, address, nv,
prov_name, city_name))
continue
num_scrapping_fails = 10 num_scrapping_fails = 10
counter = 1 counter = 1
while num_scrapping_fails > 0: while num_scrapping_fails > 0:

View File

@ -8,6 +8,7 @@ root_path = os.path.dirname(os.path.abspath(__file__))
config = { config = {
"separator": "####", "separator": "####",
"elasticsearch-index": "cadaster", "elasticsearch-index": "cadaster",
"elasticsearch-doc": "cadaster_doc",
"error_log_file": os.path.join(root_path, 'logs', 'log'), "error_log_file": os.path.join(root_path, 'logs', 'log'),
"tracking_log_file": os.path.join(root_path, 'logs', 'track'), "tracking_log_file": os.path.join(root_path, 'logs', 'track'),
"scale": 10000, "scale": 10000,

View File

@ -10,13 +10,13 @@ class ParserHTMLTests(unittest.TestCase):
cadaster_list = ParserHTML.process_search_by_coordinates(-3.47600944027389, 40.5370635727521) cadaster_list = ParserHTML.process_search_by_coordinates(-3.47600944027389, 40.5370635727521)
self.assertEqual(len(cadaster_list), 14) self.assertEqual(len(cadaster_list), 14)
for cadaster in cadaster_list: for cadaster in cadaster_list:
self.assertIsNotNone(cadaster.from_elasticsearch()) self.assertTrue(cadaster.from_elasticsearch())
def test_search_by_provinces_creates_and_stores_in_elasticsearch(self): def test_search_by_provinces_creates_and_stores_in_elasticsearch(self):
cadaster_list = ParserHTML.process_search_by_provinces(['MADRID'], max_times=1) cadaster_list = ParserHTML.process_search_by_provinces(['MADRID'], max_times=1)
self.assertEqual(len(cadaster_list), 14) self.assertEqual(len(cadaster_list), 14)
for cadaster in cadaster_list: for cadaster in cadaster_list:
self.assertIsNotNone(cadaster.from_elasticsearch()) self.assertTrue(cadaster.from_elasticsearch())
def test_search_site_lot_is_set(self): def test_search_site_lot_is_set(self):
cadaster_list = ScrapperHTML.scrap_cadaster('45134A02500003') cadaster_list = ScrapperHTML.scrap_cadaster('45134A02500003')

View File

@ -9,13 +9,13 @@ class ParserXMLTests(unittest.TestCase):
cadaster_list = ParserXML.process_search_by_coordinates(-3.47600944027389, 40.5370635727521) cadaster_list = ParserXML.process_search_by_coordinates(-3.47600944027389, 40.5370635727521)
self.assertEqual(len(cadaster_list), 14) self.assertEqual(len(cadaster_list), 14)
for cadaster in cadaster_list: for cadaster in cadaster_list:
self.assertIsNotNone(cadaster.from_elasticsearch()) self.assertTrue(cadaster.from_elasticsearch())
def test_search_by_provinces_creates_and_stores_in_elasticsearch(self): def test_search_by_provinces_creates_and_stores_in_elasticsearch(self):
cadaster_list = ParserXML.process_search_by_provinces(['MADRID'], max_times=1) cadaster_list = ParserXML.process_search_by_provinces(['MADRID'], max_times=1)
self.assertEqual(len(cadaster_list), 1) self.assertEqual(len(cadaster_list), 1)
for cadaster in cadaster_list: for cadaster in cadaster_list:
self.assertIsNotNone(cadaster.from_elasticsearch()) self.assertTrue(cadaster.from_elasticsearch())
if __name__ == '__main__': if __name__ == '__main__':

View File

@ -36,7 +36,7 @@ class ScrapperXMLTests(unittest.TestCase):
sub_entry = ScrapperXML.get_cadaster_entries_by_cadaster(prov_name, city_name, cadaster) sub_entry = ScrapperXML.get_cadaster_entries_by_cadaster(prov_name, city_name, cadaster)
cadaster_entry = CadasterEntryXML(sub_entry) cadaster_entry = CadasterEntryXML(sub_entry)
cadaster_entry.to_elasticsearch() cadaster_entry.to_elasticsearch()
self.assertIsNotNone(cadaster_entry.from_elasticsearch()) self.assertTrue(cadaster_entry.from_elasticsearch())
sleep(config['sleep_time']) sleep(config['sleep_time'])
counter += 1 counter += 1
self.assertEqual(counter, 2) self.assertEqual(counter, 2)
@ -57,7 +57,7 @@ class ScrapperXMLTests(unittest.TestCase):
sub_entry = ScrapperXML.get_cadaster_entries_by_cadaster(prov_name, city_name, cadaster) sub_entry = ScrapperXML.get_cadaster_entries_by_cadaster(prov_name, city_name, cadaster)
cadaster_entry = CadasterEntryXML(sub_entry) cadaster_entry = CadasterEntryXML(sub_entry)
cadaster_entry.to_elasticsearch() cadaster_entry.to_elasticsearch()
self.assertIsNotNone(cadaster_entry.from_elasticsearch()) self.assertTrue(cadaster_entry.from_elasticsearch())
sleep(config['sleep_time']) sleep(config['sleep_time'])
def test_no_es_pt_pu_creates_entry_in_elasticsearch(self): def test_no_es_pt_pu_creates_entry_in_elasticsearch(self):
@ -76,7 +76,7 @@ class ScrapperXMLTests(unittest.TestCase):
sub_entry = ScrapperXML.get_cadaster_entries_by_cadaster(prov_name, city_name, cadaster) sub_entry = ScrapperXML.get_cadaster_entries_by_cadaster(prov_name, city_name, cadaster)
cadaster_entry = CadasterEntryXML(sub_entry) cadaster_entry = CadasterEntryXML(sub_entry)
cadaster_entry.to_elasticsearch() cadaster_entry.to_elasticsearch()
self.assertIsNotNone(cadaster_entry.from_elasticsearch()) self.assertTrue(cadaster_entry.from_elasticsearch())
sleep(config['sleep_time']) sleep(config['sleep_time'])
def test_no_es_pt_pu_creates_entry_in_elasticsearch_2(self): def test_no_es_pt_pu_creates_entry_in_elasticsearch_2(self):
@ -96,7 +96,7 @@ class ScrapperXMLTests(unittest.TestCase):
sub_entry = ScrapperXML.get_cadaster_entries_by_cadaster(prov_name, city_name, cadaster) sub_entry = ScrapperXML.get_cadaster_entries_by_cadaster(prov_name, city_name, cadaster)
cadaster_entry = CadasterEntryXML(sub_entry) cadaster_entry = CadasterEntryXML(sub_entry)
cadaster_entry.to_elasticsearch() cadaster_entry.to_elasticsearch()
self.assertIsNotNone(cadaster_entry.from_elasticsearch()) self.assertTrue(cadaster_entry.from_elasticsearch())
sleep(config['sleep_time']) sleep(config['sleep_time'])
def test_multiparcela_coord_creates_n_entries(self): def test_multiparcela_coord_creates_n_entries(self):

View File

@ -23,13 +23,13 @@ class CoordinatesSearcherTests(unittest.TestCase):
cadaster_list = self.search_random_until_x_times_found_by_html(5) cadaster_list = self.search_random_until_x_times_found_by_html(5)
for cadaster in cadaster_list: for cadaster in cadaster_list:
cadaster.to_elasticsearch() cadaster.to_elasticsearch()
self.assertIsNotNone(cadaster.from_elasticsearch()) self.assertTrue(cadaster.from_elasticsearch())
def test_search_random_until_1_is_stored_in_elasticsearch(self): def test_search_random_until_1_is_stored_in_elasticsearch(self):
cadaster_list = self.search_random_until_x_times_found_by_html(1) cadaster_list = self.search_random_until_x_times_found_by_html(1)
for cadaster in cadaster_list: for cadaster in cadaster_list:
cadaster.to_elasticsearch() cadaster.to_elasticsearch()
self.assertIsNotNone(cadaster.from_elasticsearch()) self.assertTrue(cadaster.from_elasticsearch())
def test_loading_point_is_in_polygon_returns_true(self): def test_loading_point_is_in_polygon_returns_true(self):
polygon = GeoPolygon(os.path.join(config['coordinates_path'], 'spain_polygon.json')) polygon = GeoPolygon(os.path.join(config['coordinates_path'], 'spain_polygon.json'))

View File

View File

@ -0,0 +1,59 @@
import unittest
from datetime import datetime
from time import sleep
from dotmap import DotMap
from src.librecatastro.domain.address import Address
from src.librecatastro.domain.cadaster_entry.cadaster_entry import CadasterEntry
from src.utils.elasticsearch_utils import ElasticSearchUtils
class ElasticSearchTests(unittest.TestCase):
    """Round-trip checks for the address lookup helper in ElasticSearchUtils.

    The individual steps are plain methods (no ``test_`` prefix) so that
    unittest does not collect them on its own; ``test_run_tests`` is the only
    collected test and runs them in a fixed order, because the search steps
    rely on the document stored by the insert step.
    """

    def setup_environment(self):
        """Drop and recreate the index, pausing five seconds after each call."""
        # NOTE(review): the pauses presumably give Elasticsearch time to apply
        # each change before the next step runs - confirm.
        for rebuild_step in (ElasticSearchUtils.remove_index,
                             ElasticSearchUtils.create_index):
            rebuild_step()
            sleep(5)

    def insert_stores_document_in_elasticsearch(self):
        """Store one hand-built cadaster entry and assert it can be read back."""
        record = DotMap()
        for field_name, field_value in (
            ("address", Address("CL TESTTEST 17 03005 AJALVIR (MURCIA)")),
            ("cadaster", "AAAAA"),
            ("type", "Urbano"),
            ("use", "Religioso"),
            ("surface", "100m2"),
            ("year", "1970"),
            ("location", None),
            ("gsurface", "1200m2"),
            ("constructions", None),
            ("picture", None),
            ("timestamp", str(datetime.now())),
        ):
            setattr(record, field_name, field_value)

        entry = CadasterEntry(record)
        entry.to_elasticsearch()
        # Pause before reading the document back (same five seconds as above).
        sleep(5)
        self.assertTrue(entry.from_elasticsearch())

    def search_retrieves_document_from_elasticsearch(self):
        """The address stored by the insert step must be reported as present."""
        found = ElasticSearchUtils.check_if_address_present("CL TESTTEST 17", "AJALVIR", "MURCIA")
        self.assertTrue(found)

    def search_does_not_retrieve_document_from_elasticsearch(self):
        """A street number that was never stored must be reported as absent."""
        found = ElasticSearchUtils.check_if_address_present("CL TESTTEST 25", "AJALVIR", "MURCIA")
        self.assertFalse(found)

    def test_run_tests(self):
        """Run every step in order: reset, insert, positive search, negative search."""
        ordered_steps = (
            self.setup_environment,
            self.insert_stores_document_in_elasticsearch,
            self.search_retrieves_document_from_elasticsearch,
            self.search_does_not_retrieve_document_from_elasticsearch,
        )
        for run_step in ordered_steps:
            run_step()
# Run the unittest runner when this module is executed directly as a script.
if __name__ == '__main__':
    unittest.main()

View File

@ -1,5 +1,7 @@
from dotmap import DotMap
from elasticsearch import Elasticsearch from elasticsearch import Elasticsearch
from src.settings import config
from src.utils.cadastro_logger import CadastroLogger from src.utils.cadastro_logger import CadastroLogger
logger = CadastroLogger(__name__).logger logger = CadastroLogger(__name__).logger
@ -66,26 +68,55 @@ class ElasticSearchUtils:
} }
} }
logger.debug("Creating 'cadaster' index...") logger.debug("Creating 'cadaster' index...")
res = es.indices.create(index='cadaster', body=request_body) try:
logger.debug(res) res = es.indices.create(index='cadaster', body=request_body)
logger.debug(res)
except Exception as e:
logger.debug(e)
es.transport.close()
@staticmethod
def remove_index():
    """Delete the 'cadaster' index and log what Elasticsearch answered.

    The delete call is issued with ``ignore=[400, 404]``. Any exception it
    raises is caught and logged at debug level instead of being propagated,
    after which the client transport is closed.
    """
    client = Elasticsearch()
    logger.debug("Deleting 'cadaster' index...")
    try:
        logger.debug(client.indices.delete(index='cadaster', ignore=[400, 404]))
    except Exception as error:
        logger.debug(error)
    client.transport.close()
@staticmethod
def check_if_address_present(address, city_name, province_name):
    """Tell whether an address is already stored in the cadaster index.

    Sends a ``bool``/``must`` query to the index and doc type named in
    ``config``: a ``prefix`` clause on ``address.full_address.keyword`` plus
    ``match`` clauses on ``address.city.keyword`` and
    ``address.province.keyword``.

    Mind the positional order: the city comes BEFORE the province. Passing
    them by keyword at call sites avoids swapping them silently.

    :param address: leading part of the full address, e.g. "CL TESTTEST 17".
    :param city_name: value matched against ``address.city.keyword``.
    :param province_name: value matched against ``address.province.keyword``.
    :return: ``True`` when at least one document matches; ``False`` when
        nothing matches or when the lookup fails for any reason (the error
        is logged at debug level, never raised).
    """
    query = {
        "query": {
            "bool": {
                "must": [
                    {"prefix": {"address.full_address.keyword": "{}".format(address)}},
                    {"match": {"address.province.keyword": "{}".format(province_name)}},
                    {"match": {"address.city.keyword": "{}".format(city_name)}},
                ],
                "must_not": [],
                "should": [],
            }
        },
        "from": 0,
        "size": 11,
        "sort": [],
        "aggs": {},
    }

    # The verdict lives in its own variable: the raw (truthy) search response
    # must never leak out as the return value if something fails after the
    # search itself succeeded.
    found = False
    es = Elasticsearch()
    try:
        # Keyword arguments keep the call independent of the positional
        # parameter order of the installed client version.
        response = es.search(index=config['elasticsearch-index'],
                             doc_type=config['elasticsearch-doc'],
                             body=query)
        total = response.get("hits", {}).get("total", 0)
        # Elasticsearch 7+ reports the total as {"value": N, "relation": ...};
        # older servers return a bare integer.
        if isinstance(total, dict):
            total = total.get("value", 0)
        hits = total or 0
        found = hits > 0
        logger.debug("Found in ES: {}".format(hits))
    except Exception as e:
        logger.debug(e)
    finally:
        # Release the connection on every path.
        es.transport.close()

    return found