diff --git a/.env b/.env new file mode 100644 index 0000000..0a127dd --- /dev/null +++ b/.env @@ -0,0 +1,2 @@ +ELK_DATA=/Users/josejuan.martinez/ELK_DATA +#ELK_DATA=/home/ubuntu/ELK_DATA \ No newline at end of file diff --git a/docker-compose.yml b/docker-compose.yml index e4e7e2f..d4526e4 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -10,7 +10,7 @@ services: - bootstrap.memory_lock=true - "ES_JAVA_OPTS=-Xms512m -Xmx512m" volumes: - - /home/ubuntu/ELK_data/elasticsearch:/usr/share/elasticsearch/data:rw + - ${ELK_DATA}/elasticsearch:/usr/share/elasticsearch/data:rw hostname: elasticsearch ulimits: memlock: @@ -34,7 +34,7 @@ services: depends_on: - elasticsearch volumes: - - /home/ubuntu/ELK_data/logstash/csv:/shared:rw + - ${ELK_DATA}/logstash/csv:/shared:rw hostname: logstash networks: - ELK diff --git a/src/librecatastro/domain/cadaster_entry/cadaster_entry.py b/src/librecatastro/domain/cadaster_entry/cadaster_entry.py index 849f606..cb2b5f5 100644 --- a/src/librecatastro/domain/cadaster_entry/cadaster_entry.py +++ b/src/librecatastro/domain/cadaster_entry/cadaster_entry.py @@ -4,6 +4,7 @@ import json from abc import abstractmethod +from dotmap import DotMap from elasticsearch import Elasticsearch from src.settings import config @@ -53,11 +54,15 @@ class CadasterEntry: return res def from_elasticsearch(self): - res = None + res = False es = Elasticsearch() try: query = '{"query":{"bool":{"must":[{"match":{"cadaster":"' + self.cadaster + '"}}],"must_not":[],"should":[]}},"from":0,"size":10,"sort":[],"aggs":{}}' res = es.search(index=config['elasticsearch-index'], body=query) + hits = DotMap(res).hits.total + if hits == DotMap(): + hits = 0 + res = (hits > 0) except Exception as e: logger.error(e) diff --git a/src/librecatastro/scrapping/parsers/parser_html.py b/src/librecatastro/scrapping/parsers/parser_html.py index b924755..a534fd6 100644 --- a/src/librecatastro/scrapping/parsers/parser_html.py +++ 
b/src/librecatastro/scrapping/parsers/parser_html.py @@ -14,6 +14,7 @@ from src.librecatastro.scrapping.scrappers.scrapper_html import ScrapperHTML from src.settings import config from src.utils.cadastro_logger import CadastroLogger +from src.utils.elasticsearch_utils import ElasticSearchUtils '''Logger''' logger = CadastroLogger(__name__).logger @@ -70,6 +71,11 @@ class ParserHTML(Parser): if tv == DotMap() or nv == DotMap(): continue + if ElasticSearchUtils.check_if_address_present("{} {} {}".format(tv, address, nv), prov_name, city_name): + logger.debug("Skipping {} {} {} {} {} because it's been already scrapped.".format(tv, address, nv, + prov_name, city_name)) + continue + num_scrapping_fails = 10 counter = 1 while num_scrapping_fails > 0: diff --git a/src/librecatastro/scrapping/parsers/parser_xml.py b/src/librecatastro/scrapping/parsers/parser_xml.py index 718e6a9..c0c9666 100644 --- a/src/librecatastro/scrapping/parsers/parser_xml.py +++ b/src/librecatastro/scrapping/parsers/parser_xml.py @@ -15,6 +15,7 @@ from src.utils.cadastro_logger import CadastroLogger from dotmap import DotMap +from src.utils.elasticsearch_utils import ElasticSearchUtils from src.utils.list_utils import ListUtils '''Logger''' @@ -135,6 +136,11 @@ class ParserXML(Parser): if tv == DotMap() or nv == DotMap(): continue + if ElasticSearchUtils.check_if_address_present("{} {} {}".format(tv, address, nv), prov_name, city_name): + logger.debug("Skipping {} {} {} {} {} because it's been already scrapped.".format(tv, address, nv, + prov_name, city_name)) + continue + num_scrapping_fails = 10 counter = 1 while num_scrapping_fails > 0: diff --git a/src/settings.py b/src/settings.py index 28592c4..eec3db3 100644 --- a/src/settings.py +++ b/src/settings.py @@ -8,6 +8,7 @@ root_path = os.path.dirname(os.path.abspath(__file__)) config = { "separator": "####", "elasticsearch-index": "cadaster", + "elasticsearch-doc": "cadaster_doc", "error_log_file": os.path.join(root_path, 'logs', 'log'), 
"tracking_log_file": os.path.join(root_path, 'logs', 'track'), "scale": 10000, diff --git a/src/tests/parsers/parser_html_tests.py b/src/tests/parsers/parser_html_tests.py index 3f67768..00da22e 100644 --- a/src/tests/parsers/parser_html_tests.py +++ b/src/tests/parsers/parser_html_tests.py @@ -10,13 +10,13 @@ class ParserHTMLTests(unittest.TestCase): cadaster_list = ParserHTML.process_search_by_coordinates(-3.47600944027389, 40.5370635727521) self.assertEqual(len(cadaster_list), 14) for cadaster in cadaster_list: - self.assertIsNotNone(cadaster.from_elasticsearch()) + self.assertTrue(cadaster.from_elasticsearch()) def test_search_by_provinces_creates_and_stores_in_elasticsearch(self): cadaster_list = ParserHTML.process_search_by_provinces(['MADRID'], max_times=1) self.assertEqual(len(cadaster_list), 14) for cadaster in cadaster_list: - self.assertIsNotNone(cadaster.from_elasticsearch()) + self.assertTrue(cadaster.from_elasticsearch()) def test_search_site_lot_is_set(self): cadaster_list = ScrapperHTML.scrap_cadaster('45134A02500003') diff --git a/src/tests/parsers/parser_xml_tests.py b/src/tests/parsers/parser_xml_tests.py index 3a397fc..0722870 100644 --- a/src/tests/parsers/parser_xml_tests.py +++ b/src/tests/parsers/parser_xml_tests.py @@ -9,13 +9,13 @@ class ParserXMLTests(unittest.TestCase): cadaster_list = ParserXML.process_search_by_coordinates(-3.47600944027389, 40.5370635727521) self.assertEqual(len(cadaster_list), 14) for cadaster in cadaster_list: - self.assertIsNotNone(cadaster.from_elasticsearch()) + self.assertTrue(cadaster.from_elasticsearch()) def test_search_by_provinces_creates_and_stores_in_elasticsearch(self): cadaster_list = ParserXML.process_search_by_provinces(['MADRID'], max_times=1) self.assertEqual(len(cadaster_list), 1) for cadaster in cadaster_list: - self.assertIsNotNone(cadaster.from_elasticsearch()) + self.assertTrue(cadaster.from_elasticsearch()) if __name__ == '__main__': diff --git a/src/tests/scrappers/scrapper_xml_tests.py 
b/src/tests/scrappers/scrapper_xml_tests.py index f02baaf..dd76b93 100644 --- a/src/tests/scrappers/scrapper_xml_tests.py +++ b/src/tests/scrappers/scrapper_xml_tests.py @@ -36,7 +36,7 @@ class ScrapperXMLTests(unittest.TestCase): sub_entry = ScrapperXML.get_cadaster_entries_by_cadaster(prov_name, city_name, cadaster) cadaster_entry = CadasterEntryXML(sub_entry) cadaster_entry.to_elasticsearch() - self.assertIsNotNone(cadaster_entry.from_elasticsearch()) + self.assertTrue(cadaster_entry.from_elasticsearch()) sleep(config['sleep_time']) counter += 1 self.assertEqual(counter, 2) @@ -57,7 +57,7 @@ class ScrapperXMLTests(unittest.TestCase): sub_entry = ScrapperXML.get_cadaster_entries_by_cadaster(prov_name, city_name, cadaster) cadaster_entry = CadasterEntryXML(sub_entry) cadaster_entry.to_elasticsearch() - self.assertIsNotNone(cadaster_entry.from_elasticsearch()) + self.assertTrue(cadaster_entry.from_elasticsearch()) sleep(config['sleep_time']) def test_no_es_pt_pu_creates_entry_in_elasticsearch(self): @@ -76,7 +76,7 @@ class ScrapperXMLTests(unittest.TestCase): sub_entry = ScrapperXML.get_cadaster_entries_by_cadaster(prov_name, city_name, cadaster) cadaster_entry = CadasterEntryXML(sub_entry) cadaster_entry.to_elasticsearch() - self.assertIsNotNone(cadaster_entry.from_elasticsearch()) + self.assertTrue(cadaster_entry.from_elasticsearch()) sleep(config['sleep_time']) def test_no_es_pt_pu_creates_entry_in_elasticsearch_2(self): @@ -96,7 +96,7 @@ class ScrapperXMLTests(unittest.TestCase): sub_entry = ScrapperXML.get_cadaster_entries_by_cadaster(prov_name, city_name, cadaster) cadaster_entry = CadasterEntryXML(sub_entry) cadaster_entry.to_elasticsearch() - self.assertIsNotNone(cadaster_entry.from_elasticsearch()) + self.assertTrue(cadaster_entry.from_elasticsearch()) sleep(config['sleep_time']) def test_multiparcela_coord_creates_n_entries(self): diff --git a/src/tests/searchers/coordinates_searcher_tests.py b/src/tests/searchers/coordinates_searcher_tests.py index 
da2c4f3..2989e76 100644 --- a/src/tests/searchers/coordinates_searcher_tests.py +++ b/src/tests/searchers/coordinates_searcher_tests.py @@ -23,13 +23,13 @@ class CoordinatesSearcherTests(unittest.TestCase): cadaster_list = self.search_random_until_x_times_found_by_html(5) for cadaster in cadaster_list: cadaster.to_elasticsearch() - self.assertIsNotNone(cadaster.from_elasticsearch()) + self.assertTrue(cadaster.from_elasticsearch()) def test_search_random_until_1_is_stored_in_elasticsearch(self): cadaster_list = self.search_random_until_x_times_found_by_html(1) for cadaster in cadaster_list: cadaster.to_elasticsearch() - self.assertIsNotNone(cadaster.from_elasticsearch()) + self.assertTrue(cadaster.from_elasticsearch()) def test_loading_point_is_in_polygon_returns_true(self): polygon = GeoPolygon(os.path.join(config['coordinates_path'], 'spain_polygon.json')) diff --git a/src/tests/utils/__init__.py b/src/tests/utils/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/tests/utils/elasticsearch_tests.py b/src/tests/utils/elasticsearch_tests.py new file mode 100644 index 0000000..c3dfc49 --- /dev/null +++ b/src/tests/utils/elasticsearch_tests.py @@ -0,0 +1,59 @@ +import unittest +from datetime import datetime +from time import sleep + +from dotmap import DotMap + +from src.librecatastro.domain.address import Address +from src.librecatastro.domain.cadaster_entry.cadaster_entry import CadasterEntry +from src.utils.elasticsearch_utils import ElasticSearchUtils + + +class ElasticSearchTests(unittest.TestCase): + + def setup_environment(self): + ElasticSearchUtils.remove_index() + sleep(5) + ElasticSearchUtils.create_index() + sleep(5) + + def insert_stores_document_in_elasticsearch(self): + + cadaster = DotMap() + cadaster.address = Address("CL TESTTEST 17 03005 AJALVIR (MURCIA)") + cadaster.cadaster = "AAAAA" + cadaster.type = "Urbano" + cadaster.use = "Religioso" + cadaster.surface = "100m2" + cadaster.year = "1970" + cadaster.location = None + 
cadaster.gsurface = "1200m2" + cadaster.constructions = None + cadaster.picture = None + cadaster.timestamp = str(datetime.now()) + + cadaster_entry = CadasterEntry(cadaster) + + cadaster_entry.to_elasticsearch() + + sleep(5) + + self.assertTrue(cadaster_entry.from_elasticsearch()) + + def search_retrieves_document_from_elasticsearch(self): + res = ElasticSearchUtils.check_if_address_present("CL TESTTEST 17", "AJALVIR", "MURCIA") + self.assertTrue(res) + + def search_does_not_retrieve_document_from_elasticsearch(self): + res = ElasticSearchUtils.check_if_address_present("CL TESTTEST 25", "AJALVIR", "MURCIA") + self.assertFalse(res) + + def test_run_tests(self): + self.setup_environment() + self.insert_stores_document_in_elasticsearch() + self.search_retrieves_document_from_elasticsearch() + self.search_does_not_retrieve_document_from_elasticsearch() + + +if __name__ == '__main__': + unittest.main() diff --git a/src/utils/elasticsearch_utils.py b/src/utils/elasticsearch_utils.py index 6f65ce5..a3787a2 100644 --- a/src/utils/elasticsearch_utils.py +++ b/src/utils/elasticsearch_utils.py @@ -1,5 +1,7 @@ +from dotmap import DotMap from elasticsearch import Elasticsearch +from src.settings import config from src.utils.cadastro_logger import CadastroLogger logger = CadastroLogger(__name__).logger @@ -66,26 +68,55 @@ class ElasticSearchUtils: } } logger.debug("Creating 'cadaster' index...") - res = es.indices.create(index='cadaster', body=request_body) - logger.debug(res) + try: + res = es.indices.create(index='cadaster', body=request_body) + logger.debug(res) + except Exception as e: + logger.debug(e) + + es.transport.close() @staticmethod def remove_index(): es = Elasticsearch() logger.debug("Deleting 'cadaster' index...") - res = es.indices.delete(index='cadaster', ignore=[400, 404]) - logger.debug(res) + try: + res = es.indices.delete(index='cadaster', ignore=[400, 404]) + logger.debug(res) + except Exception as e: + logger.debug(e) + + es.transport.close() 
@staticmethod
def check_if_address_present(address, city_name, province_name):
    """Tell whether Elasticsearch already holds an entry for an address.

    Runs a bool query against the index named by
    ``config['elasticsearch-index']`` requiring all of:

    * ``address.full_address.keyword`` starts with ``address`` (prefix),
    * ``address.province.keyword`` matches ``province_name``,
    * ``address.city.keyword`` matches ``city_name``.

    :param address: beginning of the full address to look for.
    :param city_name: city the address belongs to.
    :param province_name: province the address belongs to.
    :return: ``True`` when at least one document matches; ``False`` when
        nothing matches or when the lookup fails (the failure is logged,
        never raised).

    NOTE(review): the parser call sites added by this same patch pass
    ``prov_name, city_name`` in that order, i.e. swapped with respect to
    this signature -- confirm and fix at the callers.
    """
    query = {
        "query": {
            "bool": {
                "must": [
                    {"prefix": {"address.full_address.keyword": "{}".format(address)}},
                    {"match": {"address.province.keyword": "{}".format(province_name)}},
                    {"match": {"address.city.keyword": "{}".format(city_name)}},
                ],
                "must_not": [],
                "should": [],
            }
        },
        "from": 0,
        "size": 11,
        "sort": [],
        "aggs": {},
    }

    # Kept separate from the raw response on purpose: if anything fails
    # after the search call, the caller must get False, not a truthy dict.
    found = False
    es = Elasticsearch()
    try:
        # Keyword arguments: the positional order of index/doc_type/body is
        # not stable across elasticsearch-py major versions.
        response = es.search(index=config['elasticsearch-index'],
                             doc_type=config['elasticsearch-doc'],
                             body=query)
        total = DotMap(response).hits.total
        if isinstance(total, DotMap):
            # Either the key was missing (empty DotMap) or the server is
            # Elasticsearch 7+, which reports {"value": N, "relation": ...}.
            total = total.toDict().get("value", 0)
        found = total > 0
        logger.debug("Found in ES: {}".format(total))
    except Exception as e:
        # Best effort: a failed lookup is treated as "not present".
        logger.debug(e)
    finally:
        # Release the connection even if the lookup blew up.
        es.transport.close()

    return found