Checks if an address is already present in Elasticsearch and skips it. Adds an ENV var to docker-compose

This commit is contained in:
J 2019-09-23 13:01:05 +02:00
parent c8ec760ef2
commit d38f0905ee
13 changed files with 137 additions and 27 deletions

2
.env Normal file
View File

@ -0,0 +1,2 @@
ELK_DATA=/Users/josejuan.martinez/ELK_DATA
#ELK_DATA=/home/ubuntu/ELK_DATA

View File

@ -10,7 +10,7 @@ services:
- bootstrap.memory_lock=true
- "ES_JAVA_OPTS=-Xms512m -Xmx512m"
volumes:
- /home/ubuntu/ELK_data/elasticsearch:/usr/share/elasticsearch/data:rw
- ${ELK_DATA}/elasticsearch:/usr/share/elasticsearch/data:rw
hostname: elasticsearch
ulimits:
memlock:
@ -34,7 +34,7 @@ services:
depends_on:
- elasticsearch
volumes:
- /home/ubuntu/ELK_data/logstash/csv:/shared:rw
- ${ELK_DATA}/logstash/csv:/shared:rw
hostname: logstash
networks:
- ELK

View File

@ -4,6 +4,7 @@
import json
from abc import abstractmethod
from dotmap import DotMap
from elasticsearch import Elasticsearch
from src.settings import config
@ -53,11 +54,15 @@ class CadasterEntry:
return res
def from_elasticsearch(self):
res = None
res = False
es = Elasticsearch()
try:
query = '{"query":{"bool":{"must":[{"match":{"cadaster":"' + self.cadaster + '"}}],"must_not":[],"should":[]}},"from":0,"size":10,"sort":[],"aggs":{}}'
res = es.search(index=config['elasticsearch-index'], body=query)
hits = DotMap(res).hits.total
if hits == DotMap():
hits = 0
res = (hits > 0)
except Exception as e:
logger.error(e)

View File

@ -14,6 +14,7 @@ from src.librecatastro.scrapping.scrappers.scrapper_html import ScrapperHTML
from src.settings import config
from src.utils.cadastro_logger import CadastroLogger
from src.utils.elasticsearch_utils import ElasticSearchUtils
'''Logger'''
logger = CadastroLogger(__name__).logger
@ -70,6 +71,11 @@ class ParserHTML(Parser):
if tv == DotMap() or nv == DotMap():
continue
if ElasticSearchUtils.check_if_address_present("{} {} {}".format(tv, address, nv), prov_name, city_name):
logger.debug("Skipping {} {} {} {} {} because it's been already scrapped.".format(tv, address, nv,
prov_name, city_name))
continue
num_scrapping_fails = 10
counter = 1
while num_scrapping_fails > 0:

View File

@ -15,6 +15,7 @@ from src.utils.cadastro_logger import CadastroLogger
from dotmap import DotMap
from src.utils.elasticsearch_utils import ElasticSearchUtils
from src.utils.list_utils import ListUtils
'''Logger'''
@ -135,6 +136,11 @@ class ParserXML(Parser):
if tv == DotMap() or nv == DotMap():
continue
if ElasticSearchUtils.check_if_address_present("{} {} {}".format(tv, address, nv), prov_name, city_name):
logger.debug("Skipping {} {} {} {} {} because it's been already scrapped.".format(tv, address, nv,
prov_name, city_name))
continue
num_scrapping_fails = 10
counter = 1
while num_scrapping_fails > 0:

View File

@ -8,6 +8,7 @@ root_path = os.path.dirname(os.path.abspath(__file__))
config = {
"separator": "####",
"elasticsearch-index": "cadaster",
"elasticsearch-doc": "cadaster_doc",
"error_log_file": os.path.join(root_path, 'logs', 'log'),
"tracking_log_file": os.path.join(root_path, 'logs', 'track'),
"scale": 10000,

View File

@ -10,13 +10,13 @@ class ParserHTMLTests(unittest.TestCase):
cadaster_list = ParserHTML.process_search_by_coordinates(-3.47600944027389, 40.5370635727521)
self.assertEqual(len(cadaster_list), 14)
for cadaster in cadaster_list:
self.assertIsNotNone(cadaster.from_elasticsearch())
self.assertTrue(cadaster.from_elasticsearch())
def test_search_by_provinces_creates_and_stores_in_elasticsearch(self):
cadaster_list = ParserHTML.process_search_by_provinces(['MADRID'], max_times=1)
self.assertEqual(len(cadaster_list), 14)
for cadaster in cadaster_list:
self.assertIsNotNone(cadaster.from_elasticsearch())
self.assertTrue(cadaster.from_elasticsearch())
def test_search_site_lot_is_set(self):
cadaster_list = ScrapperHTML.scrap_cadaster('45134A02500003')

View File

@ -9,13 +9,13 @@ class ParserXMLTests(unittest.TestCase):
cadaster_list = ParserXML.process_search_by_coordinates(-3.47600944027389, 40.5370635727521)
self.assertEqual(len(cadaster_list), 14)
for cadaster in cadaster_list:
self.assertIsNotNone(cadaster.from_elasticsearch())
self.assertTrue(cadaster.from_elasticsearch())
def test_search_by_provinces_creates_and_stores_in_elasticsearch(self):
cadaster_list = ParserXML.process_search_by_provinces(['MADRID'], max_times=1)
self.assertEqual(len(cadaster_list), 1)
for cadaster in cadaster_list:
self.assertIsNotNone(cadaster.from_elasticsearch())
self.assertTrue(cadaster.from_elasticsearch())
if __name__ == '__main__':

View File

@ -36,7 +36,7 @@ class ScrapperXMLTests(unittest.TestCase):
sub_entry = ScrapperXML.get_cadaster_entries_by_cadaster(prov_name, city_name, cadaster)
cadaster_entry = CadasterEntryXML(sub_entry)
cadaster_entry.to_elasticsearch()
self.assertIsNotNone(cadaster_entry.from_elasticsearch())
self.assertTrue(cadaster_entry.from_elasticsearch())
sleep(config['sleep_time'])
counter += 1
self.assertEqual(counter, 2)
@ -57,7 +57,7 @@ class ScrapperXMLTests(unittest.TestCase):
sub_entry = ScrapperXML.get_cadaster_entries_by_cadaster(prov_name, city_name, cadaster)
cadaster_entry = CadasterEntryXML(sub_entry)
cadaster_entry.to_elasticsearch()
self.assertIsNotNone(cadaster_entry.from_elasticsearch())
self.assertTrue(cadaster_entry.from_elasticsearch())
sleep(config['sleep_time'])
def test_no_es_pt_pu_creates_entry_in_elasticsearch(self):
@ -76,7 +76,7 @@ class ScrapperXMLTests(unittest.TestCase):
sub_entry = ScrapperXML.get_cadaster_entries_by_cadaster(prov_name, city_name, cadaster)
cadaster_entry = CadasterEntryXML(sub_entry)
cadaster_entry.to_elasticsearch()
self.assertIsNotNone(cadaster_entry.from_elasticsearch())
self.assertTrue(cadaster_entry.from_elasticsearch())
sleep(config['sleep_time'])
def test_no_es_pt_pu_creates_entry_in_elasticsearch_2(self):
@ -96,7 +96,7 @@ class ScrapperXMLTests(unittest.TestCase):
sub_entry = ScrapperXML.get_cadaster_entries_by_cadaster(prov_name, city_name, cadaster)
cadaster_entry = CadasterEntryXML(sub_entry)
cadaster_entry.to_elasticsearch()
self.assertIsNotNone(cadaster_entry.from_elasticsearch())
self.assertTrue(cadaster_entry.from_elasticsearch())
sleep(config['sleep_time'])
def test_multiparcela_coord_creates_n_entries(self):

View File

@ -23,13 +23,13 @@ class CoordinatesSearcherTests(unittest.TestCase):
cadaster_list = self.search_random_until_x_times_found_by_html(5)
for cadaster in cadaster_list:
cadaster.to_elasticsearch()
self.assertIsNotNone(cadaster.from_elasticsearch())
self.assertTrue(cadaster.from_elasticsearch())
def test_search_random_until_1_is_stored_in_elasticsearch(self):
cadaster_list = self.search_random_until_x_times_found_by_html(1)
for cadaster in cadaster_list:
cadaster.to_elasticsearch()
self.assertIsNotNone(cadaster.from_elasticsearch())
self.assertTrue(cadaster.from_elasticsearch())
def test_loading_point_is_in_polygon_returns_true(self):
polygon = GeoPolygon(os.path.join(config['coordinates_path'], 'spain_polygon.json'))

View File

View File

@ -0,0 +1,59 @@
import unittest
from datetime import datetime
from time import sleep
from dotmap import DotMap
from src.librecatastro.domain.address import Address
from src.librecatastro.domain.cadaster_entry.cadaster_entry import CadasterEntry
from src.utils.elasticsearch_utils import ElasticSearchUtils
class ElasticSearchTests(unittest.TestCase):
    """Integration scenario for the address-presence lookup in Elasticsearch.

    The helper methods deliberately lack the ``test_`` prefix, so unittest
    does not collect them individually; ``test_run_tests`` drives them in a
    fixed order because each step relies on the state left by the previous
    one.
    """

    def setup_environment(self):
        """Drop and recreate the index, pausing five seconds after each call."""
        # NOTE(review): the pauses presumably give Elasticsearch time to
        # apply each index operation -- confirm.
        ElasticSearchUtils.remove_index()
        sleep(5)
        ElasticSearchUtils.create_index()
        sleep(5)

    def insert_stores_document_in_elasticsearch(self):
        """Store a hand-built cadaster entry and assert it can be found."""
        record = DotMap()
        record.address = Address("CL TESTTEST 17 03005 AJALVIR (MURCIA)")
        record.cadaster = "AAAAA"
        record.type = "Urbano"
        record.use = "Religioso"
        record.surface = "100m2"
        record.year = "1970"
        record.location = None
        record.gsurface = "1200m2"
        record.constructions = None
        record.picture = None
        record.timestamp = str(datetime.now())

        entry = CadasterEntry(record)
        entry.to_elasticsearch()
        # Pause before querying for the entry that was just stored.
        sleep(5)
        self.assertTrue(entry.from_elasticsearch())

    def search_retrieves_document_from_elasticsearch(self):
        """The address stored by the insert step must be reported present."""
        self.assertTrue(
            ElasticSearchUtils.check_if_address_present(
                "CL TESTTEST 17", "AJALVIR", "MURCIA"
            )
        )

    def search_does_not_retrieve_document_from_elasticsearch(self):
        """A street number that was never stored must be reported absent."""
        self.assertFalse(
            ElasticSearchUtils.check_if_address_present(
                "CL TESTTEST 25", "AJALVIR", "MURCIA"
            )
        )

    def test_run_tests(self):
        """Run the scenario steps in the order they depend on each other."""
        steps = (
            self.setup_environment,
            self.insert_stores_document_in_elasticsearch,
            self.search_retrieves_document_from_elasticsearch,
            self.search_does_not_retrieve_document_from_elasticsearch,
        )
        for step in steps:
            step()


if __name__ == '__main__':
    unittest.main()

View File

@ -1,5 +1,7 @@
from dotmap import DotMap
from elasticsearch import Elasticsearch
from src.settings import config
from src.utils.cadastro_logger import CadastroLogger
logger = CadastroLogger(__name__).logger
@ -66,26 +68,55 @@ class ElasticSearchUtils:
}
}
logger.debug("Creating 'cadaster' index...")
res = es.indices.create(index='cadaster', body=request_body)
logger.debug(res)
try:
res = es.indices.create(index='cadaster', body=request_body)
logger.debug(res)
except Exception as e:
logger.debug(e)
es.transport.close()
@staticmethod
def remove_index():
es = Elasticsearch()
logger.debug("Deleting 'cadaster' index...")
res = es.indices.delete(index='cadaster', ignore=[400, 404])
logger.debug(res)
try:
res = es.indices.delete(index='cadaster', ignore=[400, 404])
logger.debug(res)
except Exception as e:
logger.debug(e)
es.transport.close()
@staticmethod
def create_index_companies():
def check_if_address_present(address, city_name, province_name):
res = False
query = {"query":
{"bool":
{"must":
[{"prefix":
{"address.full_address.keyword":"{}".format(address)}},
{"match":
{"address.province.keyword":"{}".format(province_name)}},
{"match":{"address.city.keyword":"{}".format(city_name)}}],
"must_not":[],
"should":[]}},
"from":0,
"size":11,
"sort":[],
"aggs":{}}
es = Elasticsearch()
request_body = {
"settings": {
"number_of_shards": 5,
"number_of_replicas": 1
},
}
logger.debug("Creating 'borme' index...")
res = es.indices.create(index='borme', body=request_body)
logger.debug(res)
try:
res = es.search(config['elasticsearch-index'], config['elasticsearch-doc'], query)
hits = DotMap(res).hits.total
if hits == DotMap():
hits = 0
res = (hits > 0)
logger.debug("Found in ES: {}".format(hits))
except Exception as e:
logger.debug(e)
es.transport.close()
return res