mirror of
https://github.com/josejuanmartinez/libreCatastro.git
synced 2024-06-02 23:05:35 +02:00
Checks whether an address is already present in Elasticsearch and skips it if so. Adds the ELK_DATA environment variable to docker-compose.
This commit is contained in:
parent
c8ec760ef2
commit
d38f0905ee
2
.env
Normal file
2
.env
Normal file
|
@ -0,0 +1,2 @@
|
||||||
|
ELK_DATA=/Users/josejuan.martinez/ELK_DATA
|
||||||
|
#ELK_DATA=/home/ubuntu/ELK_DATA
|
|
@ -10,7 +10,7 @@ services:
|
||||||
- bootstrap.memory_lock=true
|
- bootstrap.memory_lock=true
|
||||||
- "ES_JAVA_OPTS=-Xms512m -Xmx512m"
|
- "ES_JAVA_OPTS=-Xms512m -Xmx512m"
|
||||||
volumes:
|
volumes:
|
||||||
- /home/ubuntu/ELK_data/elasticsearch:/usr/share/elasticsearch/data:rw
|
- ${ELK_DATA}/elasticsearch:/usr/share/elasticsearch/data:rw
|
||||||
hostname: elasticsearch
|
hostname: elasticsearch
|
||||||
ulimits:
|
ulimits:
|
||||||
memlock:
|
memlock:
|
||||||
|
@ -34,7 +34,7 @@ services:
|
||||||
depends_on:
|
depends_on:
|
||||||
- elasticsearch
|
- elasticsearch
|
||||||
volumes:
|
volumes:
|
||||||
- /home/ubuntu/ELK_data/logstash/csv:/shared:rw
|
- ${ELK_DATA}/logstash/csv:/shared:rw
|
||||||
hostname: logstash
|
hostname: logstash
|
||||||
networks:
|
networks:
|
||||||
- ELK
|
- ELK
|
||||||
|
|
|
@ -4,6 +4,7 @@
|
||||||
import json
|
import json
|
||||||
from abc import abstractmethod
|
from abc import abstractmethod
|
||||||
|
|
||||||
|
from dotmap import DotMap
|
||||||
from elasticsearch import Elasticsearch
|
from elasticsearch import Elasticsearch
|
||||||
|
|
||||||
from src.settings import config
|
from src.settings import config
|
||||||
|
@ -53,11 +54,15 @@ class CadasterEntry:
|
||||||
return res
|
return res
|
||||||
|
|
||||||
def from_elasticsearch(self):
|
def from_elasticsearch(self):
|
||||||
res = None
|
res = False
|
||||||
es = Elasticsearch()
|
es = Elasticsearch()
|
||||||
try:
|
try:
|
||||||
query = '{"query":{"bool":{"must":[{"match":{"cadaster":"' + self.cadaster + '"}}],"must_not":[],"should":[]}},"from":0,"size":10,"sort":[],"aggs":{}}'
|
query = '{"query":{"bool":{"must":[{"match":{"cadaster":"' + self.cadaster + '"}}],"must_not":[],"should":[]}},"from":0,"size":10,"sort":[],"aggs":{}}'
|
||||||
res = es.search(index=config['elasticsearch-index'], body=query)
|
res = es.search(index=config['elasticsearch-index'], body=query)
|
||||||
|
hits = DotMap(res).hits.total
|
||||||
|
if hits == DotMap():
|
||||||
|
hits = 0
|
||||||
|
res = (hits > 0)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error(e)
|
logger.error(e)
|
||||||
|
|
||||||
|
|
|
@ -14,6 +14,7 @@ from src.librecatastro.scrapping.scrappers.scrapper_html import ScrapperHTML
|
||||||
from src.settings import config
|
from src.settings import config
|
||||||
|
|
||||||
from src.utils.cadastro_logger import CadastroLogger
|
from src.utils.cadastro_logger import CadastroLogger
|
||||||
|
from src.utils.elasticsearch_utils import ElasticSearchUtils
|
||||||
|
|
||||||
'''Logger'''
|
'''Logger'''
|
||||||
logger = CadastroLogger(__name__).logger
|
logger = CadastroLogger(__name__).logger
|
||||||
|
@ -70,6 +71,11 @@ class ParserHTML(Parser):
|
||||||
if tv == DotMap() or nv == DotMap():
|
if tv == DotMap() or nv == DotMap():
|
||||||
continue
|
continue
|
||||||
|
|
||||||
|
if ElasticSearchUtils.check_if_address_present("{} {} {}".format(tv, address, nv), prov_name, city_name):
|
||||||
|
logger.debug("Skipping {} {} {} {} {} because it's been already scrapped.".format(tv, address, nv,
|
||||||
|
prov_name, city_name))
|
||||||
|
continue
|
||||||
|
|
||||||
num_scrapping_fails = 10
|
num_scrapping_fails = 10
|
||||||
counter = 1
|
counter = 1
|
||||||
while num_scrapping_fails > 0:
|
while num_scrapping_fails > 0:
|
||||||
|
|
|
@ -15,6 +15,7 @@ from src.utils.cadastro_logger import CadastroLogger
|
||||||
|
|
||||||
from dotmap import DotMap
|
from dotmap import DotMap
|
||||||
|
|
||||||
|
from src.utils.elasticsearch_utils import ElasticSearchUtils
|
||||||
from src.utils.list_utils import ListUtils
|
from src.utils.list_utils import ListUtils
|
||||||
|
|
||||||
'''Logger'''
|
'''Logger'''
|
||||||
|
@ -135,6 +136,11 @@ class ParserXML(Parser):
|
||||||
if tv == DotMap() or nv == DotMap():
|
if tv == DotMap() or nv == DotMap():
|
||||||
continue
|
continue
|
||||||
|
|
||||||
|
if ElasticSearchUtils.check_if_address_present("{} {} {}".format(tv, address, nv), prov_name, city_name):
|
||||||
|
logger.debug("Skipping {} {} {} {} {} because it's been already scrapped.".format(tv, address, nv,
|
||||||
|
prov_name, city_name))
|
||||||
|
continue
|
||||||
|
|
||||||
num_scrapping_fails = 10
|
num_scrapping_fails = 10
|
||||||
counter = 1
|
counter = 1
|
||||||
while num_scrapping_fails > 0:
|
while num_scrapping_fails > 0:
|
||||||
|
|
|
@ -8,6 +8,7 @@ root_path = os.path.dirname(os.path.abspath(__file__))
|
||||||
config = {
|
config = {
|
||||||
"separator": "####",
|
"separator": "####",
|
||||||
"elasticsearch-index": "cadaster",
|
"elasticsearch-index": "cadaster",
|
||||||
|
"elasticsearch-doc": "cadaster_doc",
|
||||||
"error_log_file": os.path.join(root_path, 'logs', 'log'),
|
"error_log_file": os.path.join(root_path, 'logs', 'log'),
|
||||||
"tracking_log_file": os.path.join(root_path, 'logs', 'track'),
|
"tracking_log_file": os.path.join(root_path, 'logs', 'track'),
|
||||||
"scale": 10000,
|
"scale": 10000,
|
||||||
|
|
|
@ -10,13 +10,13 @@ class ParserHTMLTests(unittest.TestCase):
|
||||||
cadaster_list = ParserHTML.process_search_by_coordinates(-3.47600944027389, 40.5370635727521)
|
cadaster_list = ParserHTML.process_search_by_coordinates(-3.47600944027389, 40.5370635727521)
|
||||||
self.assertEqual(len(cadaster_list), 14)
|
self.assertEqual(len(cadaster_list), 14)
|
||||||
for cadaster in cadaster_list:
|
for cadaster in cadaster_list:
|
||||||
self.assertIsNotNone(cadaster.from_elasticsearch())
|
self.assertTrue(cadaster.from_elasticsearch())
|
||||||
|
|
||||||
def test_search_by_provinces_creates_and_stores_in_elasticsearch(self):
|
def test_search_by_provinces_creates_and_stores_in_elasticsearch(self):
|
||||||
cadaster_list = ParserHTML.process_search_by_provinces(['MADRID'], max_times=1)
|
cadaster_list = ParserHTML.process_search_by_provinces(['MADRID'], max_times=1)
|
||||||
self.assertEqual(len(cadaster_list), 14)
|
self.assertEqual(len(cadaster_list), 14)
|
||||||
for cadaster in cadaster_list:
|
for cadaster in cadaster_list:
|
||||||
self.assertIsNotNone(cadaster.from_elasticsearch())
|
self.assertTrue(cadaster.from_elasticsearch())
|
||||||
|
|
||||||
def test_search_site_lot_is_set(self):
|
def test_search_site_lot_is_set(self):
|
||||||
cadaster_list = ScrapperHTML.scrap_cadaster('45134A02500003')
|
cadaster_list = ScrapperHTML.scrap_cadaster('45134A02500003')
|
||||||
|
|
|
@ -9,13 +9,13 @@ class ParserXMLTests(unittest.TestCase):
|
||||||
cadaster_list = ParserXML.process_search_by_coordinates(-3.47600944027389, 40.5370635727521)
|
cadaster_list = ParserXML.process_search_by_coordinates(-3.47600944027389, 40.5370635727521)
|
||||||
self.assertEqual(len(cadaster_list), 14)
|
self.assertEqual(len(cadaster_list), 14)
|
||||||
for cadaster in cadaster_list:
|
for cadaster in cadaster_list:
|
||||||
self.assertIsNotNone(cadaster.from_elasticsearch())
|
self.assertTrue(cadaster.from_elasticsearch())
|
||||||
|
|
||||||
def test_search_by_provinces_creates_and_stores_in_elasticsearch(self):
|
def test_search_by_provinces_creates_and_stores_in_elasticsearch(self):
|
||||||
cadaster_list = ParserXML.process_search_by_provinces(['MADRID'], max_times=1)
|
cadaster_list = ParserXML.process_search_by_provinces(['MADRID'], max_times=1)
|
||||||
self.assertEqual(len(cadaster_list), 1)
|
self.assertEqual(len(cadaster_list), 1)
|
||||||
for cadaster in cadaster_list:
|
for cadaster in cadaster_list:
|
||||||
self.assertIsNotNone(cadaster.from_elasticsearch())
|
self.assertTrue(cadaster.from_elasticsearch())
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
|
|
|
@ -36,7 +36,7 @@ class ScrapperXMLTests(unittest.TestCase):
|
||||||
sub_entry = ScrapperXML.get_cadaster_entries_by_cadaster(prov_name, city_name, cadaster)
|
sub_entry = ScrapperXML.get_cadaster_entries_by_cadaster(prov_name, city_name, cadaster)
|
||||||
cadaster_entry = CadasterEntryXML(sub_entry)
|
cadaster_entry = CadasterEntryXML(sub_entry)
|
||||||
cadaster_entry.to_elasticsearch()
|
cadaster_entry.to_elasticsearch()
|
||||||
self.assertIsNotNone(cadaster_entry.from_elasticsearch())
|
self.assertTrue(cadaster_entry.from_elasticsearch())
|
||||||
sleep(config['sleep_time'])
|
sleep(config['sleep_time'])
|
||||||
counter += 1
|
counter += 1
|
||||||
self.assertEqual(counter, 2)
|
self.assertEqual(counter, 2)
|
||||||
|
@ -57,7 +57,7 @@ class ScrapperXMLTests(unittest.TestCase):
|
||||||
sub_entry = ScrapperXML.get_cadaster_entries_by_cadaster(prov_name, city_name, cadaster)
|
sub_entry = ScrapperXML.get_cadaster_entries_by_cadaster(prov_name, city_name, cadaster)
|
||||||
cadaster_entry = CadasterEntryXML(sub_entry)
|
cadaster_entry = CadasterEntryXML(sub_entry)
|
||||||
cadaster_entry.to_elasticsearch()
|
cadaster_entry.to_elasticsearch()
|
||||||
self.assertIsNotNone(cadaster_entry.from_elasticsearch())
|
self.assertTrue(cadaster_entry.from_elasticsearch())
|
||||||
sleep(config['sleep_time'])
|
sleep(config['sleep_time'])
|
||||||
|
|
||||||
def test_no_es_pt_pu_creates_entry_in_elasticsearch(self):
|
def test_no_es_pt_pu_creates_entry_in_elasticsearch(self):
|
||||||
|
@ -76,7 +76,7 @@ class ScrapperXMLTests(unittest.TestCase):
|
||||||
sub_entry = ScrapperXML.get_cadaster_entries_by_cadaster(prov_name, city_name, cadaster)
|
sub_entry = ScrapperXML.get_cadaster_entries_by_cadaster(prov_name, city_name, cadaster)
|
||||||
cadaster_entry = CadasterEntryXML(sub_entry)
|
cadaster_entry = CadasterEntryXML(sub_entry)
|
||||||
cadaster_entry.to_elasticsearch()
|
cadaster_entry.to_elasticsearch()
|
||||||
self.assertIsNotNone(cadaster_entry.from_elasticsearch())
|
self.assertTrue(cadaster_entry.from_elasticsearch())
|
||||||
sleep(config['sleep_time'])
|
sleep(config['sleep_time'])
|
||||||
|
|
||||||
def test_no_es_pt_pu_creates_entry_in_elasticsearch_2(self):
|
def test_no_es_pt_pu_creates_entry_in_elasticsearch_2(self):
|
||||||
|
@ -96,7 +96,7 @@ class ScrapperXMLTests(unittest.TestCase):
|
||||||
sub_entry = ScrapperXML.get_cadaster_entries_by_cadaster(prov_name, city_name, cadaster)
|
sub_entry = ScrapperXML.get_cadaster_entries_by_cadaster(prov_name, city_name, cadaster)
|
||||||
cadaster_entry = CadasterEntryXML(sub_entry)
|
cadaster_entry = CadasterEntryXML(sub_entry)
|
||||||
cadaster_entry.to_elasticsearch()
|
cadaster_entry.to_elasticsearch()
|
||||||
self.assertIsNotNone(cadaster_entry.from_elasticsearch())
|
self.assertTrue(cadaster_entry.from_elasticsearch())
|
||||||
sleep(config['sleep_time'])
|
sleep(config['sleep_time'])
|
||||||
|
|
||||||
def test_multiparcela_coord_creates_n_entries(self):
|
def test_multiparcela_coord_creates_n_entries(self):
|
||||||
|
|
|
@ -23,13 +23,13 @@ class CoordinatesSearcherTests(unittest.TestCase):
|
||||||
cadaster_list = self.search_random_until_x_times_found_by_html(5)
|
cadaster_list = self.search_random_until_x_times_found_by_html(5)
|
||||||
for cadaster in cadaster_list:
|
for cadaster in cadaster_list:
|
||||||
cadaster.to_elasticsearch()
|
cadaster.to_elasticsearch()
|
||||||
self.assertIsNotNone(cadaster.from_elasticsearch())
|
self.assertTrue(cadaster.from_elasticsearch())
|
||||||
|
|
||||||
def test_search_random_until_1_is_stored_in_elasticsearch(self):
|
def test_search_random_until_1_is_stored_in_elasticsearch(self):
|
||||||
cadaster_list = self.search_random_until_x_times_found_by_html(1)
|
cadaster_list = self.search_random_until_x_times_found_by_html(1)
|
||||||
for cadaster in cadaster_list:
|
for cadaster in cadaster_list:
|
||||||
cadaster.to_elasticsearch()
|
cadaster.to_elasticsearch()
|
||||||
self.assertIsNotNone(cadaster.from_elasticsearch())
|
self.assertTrue(cadaster.from_elasticsearch())
|
||||||
|
|
||||||
def test_loading_point_is_in_polygon_returns_true(self):
|
def test_loading_point_is_in_polygon_returns_true(self):
|
||||||
polygon = GeoPolygon(os.path.join(config['coordinates_path'], 'spain_polygon.json'))
|
polygon = GeoPolygon(os.path.join(config['coordinates_path'], 'spain_polygon.json'))
|
||||||
|
|
0
src/tests/utils/__init__.py
Normal file
0
src/tests/utils/__init__.py
Normal file
59
src/tests/utils/elasticsearch_tests.py
Normal file
59
src/tests/utils/elasticsearch_tests.py
Normal file
|
@ -0,0 +1,59 @@
|
||||||
|
import unittest
|
||||||
|
from datetime import datetime
|
||||||
|
from time import sleep
|
||||||
|
|
||||||
|
from dotmap import DotMap
|
||||||
|
|
||||||
|
from src.librecatastro.domain.address import Address
|
||||||
|
from src.librecatastro.domain.cadaster_entry.cadaster_entry import CadasterEntry
|
||||||
|
from src.utils.elasticsearch_utils import ElasticSearchUtils
|
||||||
|
|
||||||
|
|
||||||
|
class ElasticSearchTests(unittest.TestCase):
|
||||||
|
|
||||||
|
def setup_environment(self):
|
||||||
|
ElasticSearchUtils.remove_index()
|
||||||
|
sleep(5)
|
||||||
|
ElasticSearchUtils.create_index()
|
||||||
|
sleep(5)
|
||||||
|
|
||||||
|
def insert_stores_document_in_elasticsearch(self):
|
||||||
|
|
||||||
|
cadaster = DotMap()
|
||||||
|
cadaster.address = Address("CL TESTTEST 17 03005 AJALVIR (MURCIA)")
|
||||||
|
cadaster.cadaster = "AAAAA"
|
||||||
|
cadaster.type = "Urbano"
|
||||||
|
cadaster.use = "Religioso"
|
||||||
|
cadaster.surface = "100m2"
|
||||||
|
cadaster.year = "1970"
|
||||||
|
cadaster.location = None
|
||||||
|
cadaster.gsurface = "1200m2"
|
||||||
|
cadaster.constructions = None
|
||||||
|
cadaster.picture = None
|
||||||
|
cadaster.timestamp = str(datetime.now())
|
||||||
|
|
||||||
|
cadaster_entry = CadasterEntry(cadaster)
|
||||||
|
|
||||||
|
cadaster_entry.to_elasticsearch()
|
||||||
|
|
||||||
|
sleep(5)
|
||||||
|
|
||||||
|
self.assertTrue(cadaster_entry.from_elasticsearch())
|
||||||
|
|
||||||
|
def search_retrieves_document_from_elasticsearch(self):
|
||||||
|
res = ElasticSearchUtils.check_if_address_present("CL TESTTEST 17", "AJALVIR", "MURCIA")
|
||||||
|
self.assertTrue(res)
|
||||||
|
|
||||||
|
def search_does_not_retrieve_document_from_elasticsearch(self):
|
||||||
|
res = ElasticSearchUtils.check_if_address_present("CL TESTTEST 25", "AJALVIR", "MURCIA")
|
||||||
|
self.assertFalse(res)
|
||||||
|
|
||||||
|
def test_run_tests(self):
|
||||||
|
self.setup_environment()
|
||||||
|
self.insert_stores_document_in_elasticsearch()
|
||||||
|
self.search_retrieves_document_from_elasticsearch()
|
||||||
|
self.search_does_not_retrieve_document_from_elasticsearch()
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == '__main__':
|
||||||
|
unittest.main()
|
|
@ -1,5 +1,7 @@
|
||||||
|
from dotmap import DotMap
|
||||||
from elasticsearch import Elasticsearch
|
from elasticsearch import Elasticsearch
|
||||||
|
|
||||||
|
from src.settings import config
|
||||||
from src.utils.cadastro_logger import CadastroLogger
|
from src.utils.cadastro_logger import CadastroLogger
|
||||||
|
|
||||||
logger = CadastroLogger(__name__).logger
|
logger = CadastroLogger(__name__).logger
|
||||||
|
@ -66,26 +68,55 @@ class ElasticSearchUtils:
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
logger.debug("Creating 'cadaster' index...")
|
logger.debug("Creating 'cadaster' index...")
|
||||||
res = es.indices.create(index='cadaster', body=request_body)
|
try:
|
||||||
logger.debug(res)
|
res = es.indices.create(index='cadaster', body=request_body)
|
||||||
|
logger.debug(res)
|
||||||
|
except Exception as e:
|
||||||
|
logger.debug(e)
|
||||||
|
|
||||||
|
es.transport.close()
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def remove_index():
|
def remove_index():
|
||||||
es = Elasticsearch()
|
es = Elasticsearch()
|
||||||
logger.debug("Deleting 'cadaster' index...")
|
logger.debug("Deleting 'cadaster' index...")
|
||||||
res = es.indices.delete(index='cadaster', ignore=[400, 404])
|
try:
|
||||||
logger.debug(res)
|
res = es.indices.delete(index='cadaster', ignore=[400, 404])
|
||||||
|
logger.debug(res)
|
||||||
|
except Exception as e:
|
||||||
|
logger.debug(e)
|
||||||
|
|
||||||
|
es.transport.close()
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def create_index_companies():
|
def check_if_address_present(address, city_name, province_name):
|
||||||
|
res = False
|
||||||
|
query = {"query":
|
||||||
|
{"bool":
|
||||||
|
{"must":
|
||||||
|
[{"prefix":
|
||||||
|
{"address.full_address.keyword":"{}".format(address)}},
|
||||||
|
{"match":
|
||||||
|
{"address.province.keyword":"{}".format(province_name)}},
|
||||||
|
{"match":{"address.city.keyword":"{}".format(city_name)}}],
|
||||||
|
"must_not":[],
|
||||||
|
"should":[]}},
|
||||||
|
"from":0,
|
||||||
|
"size":11,
|
||||||
|
"sort":[],
|
||||||
|
"aggs":{}}
|
||||||
es = Elasticsearch()
|
es = Elasticsearch()
|
||||||
request_body = {
|
try:
|
||||||
"settings": {
|
res = es.search(config['elasticsearch-index'], config['elasticsearch-doc'], query)
|
||||||
"number_of_shards": 5,
|
hits = DotMap(res).hits.total
|
||||||
"number_of_replicas": 1
|
if hits == DotMap():
|
||||||
},
|
hits = 0
|
||||||
}
|
res = (hits > 0)
|
||||||
logger.debug("Creating 'borme' index...")
|
logger.debug("Found in ES: {}".format(hits))
|
||||||
res = es.indices.create(index='borme', body=request_body)
|
except Exception as e:
|
||||||
logger.debug(res)
|
logger.debug(e)
|
||||||
|
|
||||||
|
es.transport.close()
|
||||||
|
|
||||||
|
return res
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue
Block a user