Checks whether an address is already present in Elasticsearch and skips it if so. Adds the ELK_DATA environment variable to docker-compose.
This commit is contained in:
parent
c8ec760ef2
commit
d38f0905ee
|
@ -0,0 +1,2 @@
|
|||
ELK_DATA=/Users/josejuan.martinez/ELK_DATA
|
||||
#ELK_DATA=/home/ubuntu/ELK_DATA
|
|
@ -10,7 +10,7 @@ services:
|
|||
- bootstrap.memory_lock=true
|
||||
- "ES_JAVA_OPTS=-Xms512m -Xmx512m"
|
||||
volumes:
|
||||
- /home/ubuntu/ELK_data/elasticsearch:/usr/share/elasticsearch/data:rw
|
||||
- ${ELK_DATA}/elasticsearch:/usr/share/elasticsearch/data:rw
|
||||
hostname: elasticsearch
|
||||
ulimits:
|
||||
memlock:
|
||||
|
@ -34,7 +34,7 @@ services:
|
|||
depends_on:
|
||||
- elasticsearch
|
||||
volumes:
|
||||
- /home/ubuntu/ELK_data/logstash/csv:/shared:rw
|
||||
- ${ELK_DATA}/logstash/csv:/shared:rw
|
||||
hostname: logstash
|
||||
networks:
|
||||
- ELK
|
||||
|
|
|
@ -4,6 +4,7 @@
|
|||
import json
|
||||
from abc import abstractmethod
|
||||
|
||||
from dotmap import DotMap
|
||||
from elasticsearch import Elasticsearch
|
||||
|
||||
from src.settings import config
|
||||
|
@ -53,11 +54,15 @@ class CadasterEntry:
|
|||
return res
|
||||
|
||||
def from_elasticsearch(self):
|
||||
res = None
|
||||
res = False
|
||||
es = Elasticsearch()
|
||||
try:
|
||||
query = '{"query":{"bool":{"must":[{"match":{"cadaster":"' + self.cadaster + '"}}],"must_not":[],"should":[]}},"from":0,"size":10,"sort":[],"aggs":{}}'
|
||||
res = es.search(index=config['elasticsearch-index'], body=query)
|
||||
hits = DotMap(res).hits.total
|
||||
if hits == DotMap():
|
||||
hits = 0
|
||||
res = (hits > 0)
|
||||
except Exception as e:
|
||||
logger.error(e)
|
||||
|
||||
|
|
|
@ -14,6 +14,7 @@ from src.librecatastro.scrapping.scrappers.scrapper_html import ScrapperHTML
|
|||
from src.settings import config
|
||||
|
||||
from src.utils.cadastro_logger import CadastroLogger
|
||||
from src.utils.elasticsearch_utils import ElasticSearchUtils
|
||||
|
||||
'''Logger'''
|
||||
logger = CadastroLogger(__name__).logger
|
||||
|
@ -70,6 +71,11 @@ class ParserHTML(Parser):
|
|||
if tv == DotMap() or nv == DotMap():
|
||||
continue
|
||||
|
||||
if ElasticSearchUtils.check_if_address_present("{} {} {}".format(tv, address, nv), prov_name, city_name):
|
||||
logger.debug("Skipping {} {} {} {} {} because it's been already scrapped.".format(tv, address, nv,
|
||||
prov_name, city_name))
|
||||
continue
|
||||
|
||||
num_scrapping_fails = 10
|
||||
counter = 1
|
||||
while num_scrapping_fails > 0:
|
||||
|
|
|
@ -15,6 +15,7 @@ from src.utils.cadastro_logger import CadastroLogger
|
|||
|
||||
from dotmap import DotMap
|
||||
|
||||
from src.utils.elasticsearch_utils import ElasticSearchUtils
|
||||
from src.utils.list_utils import ListUtils
|
||||
|
||||
'''Logger'''
|
||||
|
@ -135,6 +136,11 @@ class ParserXML(Parser):
|
|||
if tv == DotMap() or nv == DotMap():
|
||||
continue
|
||||
|
||||
if ElasticSearchUtils.check_if_address_present("{} {} {}".format(tv, address, nv), prov_name, city_name):
|
||||
logger.debug("Skipping {} {} {} {} {} because it's been already scrapped.".format(tv, address, nv,
|
||||
prov_name, city_name))
|
||||
continue
|
||||
|
||||
num_scrapping_fails = 10
|
||||
counter = 1
|
||||
while num_scrapping_fails > 0:
|
||||
|
|
|
@ -8,6 +8,7 @@ root_path = os.path.dirname(os.path.abspath(__file__))
|
|||
config = {
|
||||
"separator": "####",
|
||||
"elasticsearch-index": "cadaster",
|
||||
"elasticsearch-doc": "cadaster_doc",
|
||||
"error_log_file": os.path.join(root_path, 'logs', 'log'),
|
||||
"tracking_log_file": os.path.join(root_path, 'logs', 'track'),
|
||||
"scale": 10000,
|
||||
|
|
|
@ -10,13 +10,13 @@ class ParserHTMLTests(unittest.TestCase):
|
|||
cadaster_list = ParserHTML.process_search_by_coordinates(-3.47600944027389, 40.5370635727521)
|
||||
self.assertEqual(len(cadaster_list), 14)
|
||||
for cadaster in cadaster_list:
|
||||
self.assertIsNotNone(cadaster.from_elasticsearch())
|
||||
self.assertTrue(cadaster.from_elasticsearch())
|
||||
|
||||
def test_search_by_provinces_creates_and_stores_in_elasticsearch(self):
    """Search the MADRID province once via the HTML parser and check the results are stored.

    Expects 14 entries and asserts that ``from_elasticsearch()`` is truthy for
    each of them. Only ``assertTrue`` is kept: the earlier ``assertIsNotNone``
    issued a second lookup per entry and is strictly weaker.
    """
    cadaster_list = ParserHTML.process_search_by_provinces(['MADRID'], max_times=1)
    self.assertEqual(len(cadaster_list), 14)
    for cadaster in cadaster_list:
        self.assertTrue(cadaster.from_elasticsearch())
|
||||
|
||||
def test_search_site_lot_is_set(self):
|
||||
cadaster_list = ScrapperHTML.scrap_cadaster('45134A02500003')
|
||||
|
|
|
@ -9,13 +9,13 @@ class ParserXMLTests(unittest.TestCase):
|
|||
cadaster_list = ParserXML.process_search_by_coordinates(-3.47600944027389, 40.5370635727521)
|
||||
self.assertEqual(len(cadaster_list), 14)
|
||||
for cadaster in cadaster_list:
|
||||
self.assertIsNotNone(cadaster.from_elasticsearch())
|
||||
self.assertTrue(cadaster.from_elasticsearch())
|
||||
|
||||
def test_search_by_provinces_creates_and_stores_in_elasticsearch(self):
    """Search the MADRID province once via the XML parser and check the result is stored.

    Expects a single entry and asserts that ``from_elasticsearch()`` is truthy
    for it. Only ``assertTrue`` is kept: the earlier ``assertIsNotNone`` issued
    a second lookup per entry and is strictly weaker.
    """
    cadaster_list = ParserXML.process_search_by_provinces(['MADRID'], max_times=1)
    self.assertEqual(len(cadaster_list), 1)
    for cadaster in cadaster_list:
        self.assertTrue(cadaster.from_elasticsearch())
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
|
|
|
@ -36,7 +36,7 @@ class ScrapperXMLTests(unittest.TestCase):
|
|||
sub_entry = ScrapperXML.get_cadaster_entries_by_cadaster(prov_name, city_name, cadaster)
|
||||
cadaster_entry = CadasterEntryXML(sub_entry)
|
||||
cadaster_entry.to_elasticsearch()
|
||||
self.assertIsNotNone(cadaster_entry.from_elasticsearch())
|
||||
self.assertTrue(cadaster_entry.from_elasticsearch())
|
||||
sleep(config['sleep_time'])
|
||||
counter += 1
|
||||
self.assertEqual(counter, 2)
|
||||
|
@ -57,7 +57,7 @@ class ScrapperXMLTests(unittest.TestCase):
|
|||
sub_entry = ScrapperXML.get_cadaster_entries_by_cadaster(prov_name, city_name, cadaster)
|
||||
cadaster_entry = CadasterEntryXML(sub_entry)
|
||||
cadaster_entry.to_elasticsearch()
|
||||
self.assertIsNotNone(cadaster_entry.from_elasticsearch())
|
||||
self.assertTrue(cadaster_entry.from_elasticsearch())
|
||||
sleep(config['sleep_time'])
|
||||
|
||||
def test_no_es_pt_pu_creates_entry_in_elasticsearch(self):
|
||||
|
@ -76,7 +76,7 @@ class ScrapperXMLTests(unittest.TestCase):
|
|||
sub_entry = ScrapperXML.get_cadaster_entries_by_cadaster(prov_name, city_name, cadaster)
|
||||
cadaster_entry = CadasterEntryXML(sub_entry)
|
||||
cadaster_entry.to_elasticsearch()
|
||||
self.assertIsNotNone(cadaster_entry.from_elasticsearch())
|
||||
self.assertTrue(cadaster_entry.from_elasticsearch())
|
||||
sleep(config['sleep_time'])
|
||||
|
||||
def test_no_es_pt_pu_creates_entry_in_elasticsearch_2(self):
|
||||
|
@ -96,7 +96,7 @@ class ScrapperXMLTests(unittest.TestCase):
|
|||
sub_entry = ScrapperXML.get_cadaster_entries_by_cadaster(prov_name, city_name, cadaster)
|
||||
cadaster_entry = CadasterEntryXML(sub_entry)
|
||||
cadaster_entry.to_elasticsearch()
|
||||
self.assertIsNotNone(cadaster_entry.from_elasticsearch())
|
||||
self.assertTrue(cadaster_entry.from_elasticsearch())
|
||||
sleep(config['sleep_time'])
|
||||
|
||||
def test_multiparcela_coord_creates_n_entries(self):
|
||||
|
|
|
@ -23,13 +23,13 @@ class CoordinatesSearcherTests(unittest.TestCase):
|
|||
cadaster_list = self.search_random_until_x_times_found_by_html(5)
|
||||
for cadaster in cadaster_list:
|
||||
cadaster.to_elasticsearch()
|
||||
self.assertIsNotNone(cadaster.from_elasticsearch())
|
||||
self.assertTrue(cadaster.from_elasticsearch())
|
||||
|
||||
def test_search_random_until_1_is_stored_in_elasticsearch(self):
    """Search random coordinates until one hit is found, store it and look it up again.

    Each returned entry is written with ``to_elasticsearch()`` and must then
    make ``from_elasticsearch()`` truthy. Only ``assertTrue`` is kept: the
    earlier ``assertIsNotNone`` issued a second lookup and is strictly weaker.
    """
    cadaster_list = self.search_random_until_x_times_found_by_html(1)
    for cadaster in cadaster_list:
        cadaster.to_elasticsearch()
        self.assertTrue(cadaster.from_elasticsearch())
|
||||
|
||||
def test_loading_point_is_in_polygon_returns_true(self):
|
||||
polygon = GeoPolygon(os.path.join(config['coordinates_path'], 'spain_polygon.json'))
|
||||
|
|
|
@ -0,0 +1,59 @@
|
|||
import unittest
|
||||
from datetime import datetime
|
||||
from time import sleep
|
||||
|
||||
from dotmap import DotMap
|
||||
|
||||
from src.librecatastro.domain.address import Address
|
||||
from src.librecatastro.domain.cadaster_entry.cadaster_entry import CadasterEntry
|
||||
from src.utils.elasticsearch_utils import ElasticSearchUtils
|
||||
|
||||
|
||||
class ElasticSearchTests(unittest.TestCase):
    """Integration checks for storing entries in Elasticsearch and finding them by address.

    The steps build on each other, so the helper methods deliberately lack the
    ``test`` prefix; ``test_run_tests`` is the single discovered test and runs
    them in order.
    """

    def setup_environment(self):
        """Remove and recreate the index, pausing five seconds after each step."""
        ElasticSearchUtils.remove_index()
        sleep(5)
        ElasticSearchUtils.create_index()
        sleep(5)

    def insert_stores_document_in_elasticsearch(self):
        """Store one hand-built entry and assert it can be read back."""
        record = DotMap()
        fields = {
            "address": Address("CL TESTTEST 17 03005 AJALVIR (MURCIA)"),
            "cadaster": "AAAAA",
            "type": "Urbano",
            "use": "Religioso",
            "surface": "100m2",
            "year": "1970",
            "location": None,
            "gsurface": "1200m2",
            "constructions": None,
            "picture": None,
            "timestamp": str(datetime.now()),
        }
        for name, value in fields.items():
            setattr(record, name, value)

        entry = CadasterEntry(record)
        entry.to_elasticsearch()

        # Pause before querying -- presumably so the new document becomes searchable.
        sleep(5)

        self.assertTrue(entry.from_elasticsearch())

    def search_retrieves_document_from_elasticsearch(self):
        """The address stored by the insert step must be reported as present."""
        found = ElasticSearchUtils.check_if_address_present("CL TESTTEST 17", "AJALVIR", "MURCIA")
        self.assertTrue(found)

    def search_does_not_retrieve_document_from_elasticsearch(self):
        """A street number that was never stored must be reported as absent."""
        found = ElasticSearchUtils.check_if_address_present("CL TESTTEST 25", "AJALVIR", "MURCIA")
        self.assertFalse(found)

    def test_run_tests(self):
        """Run the dependent steps in order: reset, insert, positive lookup, negative lookup."""
        steps = (
            self.setup_environment,
            self.insert_stores_document_in_elasticsearch,
            self.search_retrieves_document_from_elasticsearch,
            self.search_does_not_retrieve_document_from_elasticsearch,
        )
        for step in steps:
            step()
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
unittest.main()
|
|
@ -1,5 +1,7 @@
|
|||
from dotmap import DotMap
|
||||
from elasticsearch import Elasticsearch
|
||||
|
||||
from src.settings import config
|
||||
from src.utils.cadastro_logger import CadastroLogger
|
||||
|
||||
logger = CadastroLogger(__name__).logger
|
||||
|
@ -66,26 +68,55 @@ class ElasticSearchUtils:
|
|||
}
|
||||
}
|
||||
logger.debug("Creating 'cadaster' index...")
|
||||
res = es.indices.create(index='cadaster', body=request_body)
|
||||
logger.debug(res)
|
||||
try:
|
||||
res = es.indices.create(index='cadaster', body=request_body)
|
||||
logger.debug(res)
|
||||
except Exception as e:
|
||||
logger.debug(e)
|
||||
|
||||
es.transport.close()
|
||||
|
||||
@staticmethod
def remove_index():
    """Delete the 'cadaster' index on a best-effort basis.

    Any failure is logged at debug level instead of being raised, and the
    client transport is closed whether or not the deletion succeeded.
    """
    es = Elasticsearch()
    logger.debug("Deleting 'cadaster' index...")
    try:
        # A single, guarded delete: 400/404 responses are ignored by the client,
        # any other failure lands in the handler below.
        res = es.indices.delete(index='cadaster', ignore=[400, 404])
        logger.debug(res)
    except Exception as e:
        logger.debug(e)
    finally:
        es.transport.close()
|
||||
|
||||
@staticmethod
def check_if_address_present(address, city_name, province_name):
    """Tell whether Elasticsearch already holds an entry for an address.

    The query requires all of the following at once:
    ``address.full_address.keyword`` starts with ``address`` (prefix query),
    ``address.province.keyword`` matches ``province_name`` and
    ``address.city.keyword`` matches ``city_name``.

    :param address: leading part of the full address to look for
    :param city_name: city the address belongs to
    :param province_name: province the address belongs to
    :return: True when at least one document matched; False when nothing
             matched or when the lookup failed (the failure is only logged)
    """
    found = False
    query = {
        "query": {
            "bool": {
                "must": [
                    {"prefix": {"address.full_address.keyword": "{}".format(address)}},
                    {"match": {"address.province.keyword": "{}".format(province_name)}},
                    {"match": {"address.city.keyword": "{}".format(city_name)}},
                ],
                "must_not": [],
                "should": [],
            }
        },
        "from": 0,
        "size": 11,
        "sort": [],
        "aggs": {},
    }

    es = Elasticsearch()
    try:
        # Keyword arguments keep the call independent of the client's
        # positional parameter order.
        response = es.search(index=config['elasticsearch-index'],
                             doc_type=config['elasticsearch-doc'],
                             body=query)
        hits = DotMap(response).hits.total
        if hits == DotMap():
            # Missing key: DotMap yields an empty DotMap instead of raising.
            hits = 0
        elif isinstance(hits, DotMap):
            # Elasticsearch 7+ reports the total as {"value": n, "relation": ...}.
            hits = hits.value
        found = hits > 0
        logger.debug("Found in ES: {}".format(hits))
    except Exception as e:
        # The raw response is kept in its own variable, so a failure here
        # leaves the result at False rather than at a truthy response object.
        logger.debug(e)
    finally:
        es.transport.close()

    return found
|
||||
|
||||
|
|
Loading…
Reference in New Issue