mirror of
https://github.com/josejuanmartinez/libreCatastro.git
synced 2024-07-06 15:22:28 +02:00
Adds first prototype version with fully working tests.
This commit is contained in:
commit
b7a778d55c
21
docker-compose.yml
Normal file
21
docker-compose.yml
Normal file
@ -0,0 +1,21 @@
|
||||
# ./docker-compose.yml
|
||||
|
||||
version: '3'
|
||||
|
||||
services:
|
||||
elasticsearch:
|
||||
image: docker.elastic.co/elasticsearch/elasticsearch:6.3.2
|
||||
environment:
|
||||
- cluster.name=docker-cluster
|
||||
- bootstrap.memory_lock=true
|
||||
- "ES_JAVA_OPTS=-Xms512m -Xmx512m"
|
||||
ulimits:
|
||||
memlock:
|
||||
soft: -1
|
||||
hard: -1
|
||||
ports:
|
||||
- "9200:9200"
|
||||
kibana:
|
||||
image: docker.elastic.co/kibana/kibana:6.3.2
|
||||
ports:
|
||||
- "5601:5601"
|
2
requirements.txt
Normal file
2
requirements.txt
Normal file
@ -0,0 +1,2 @@
|
||||
beautifulsoup4==4.8.0
|
||||
elasticsearch>=6.0.0,<7.0.0
|
0
src/__init__.py
Normal file
0
src/__init__.py
Normal file
0
src/librecatastro/__init__.py
Normal file
0
src/librecatastro/__init__.py
Normal file
173
src/librecatastro/catastro_scrapper.py
Normal file
173
src/librecatastro/catastro_scrapper.py
Normal file
@ -0,0 +1,173 @@
|
||||
import random
|
||||
import re
|
||||
from time import sleep
|
||||
from urllib.request import urlopen
|
||||
from xml.etree import ElementTree
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
from src.librecatastro.domain.cadaster import Cadaster
|
||||
from src.settings import config
|
||||
from src.utils.ontology_converter import OntologyConverter
|
||||
|
||||
'''Constants'''
|
||||
|
||||
# Bounds of the area scanned by the scrap_* methods, stored as integers equal
# to the coordinate multiplied by SCALE. Each tuple is ordered (max, min) and
# is meant to be indexed with the MAX / MIN constants below.
LONGITUDE = (4289603, -18024300) # *1000000
LATITUDE = (43769200, 27725500) # *1000000

# Divisor that turns the integer bounds above back into decimal coordinates.
SCALE = 1000000
# Number of characters dropped from the right of the scaled coordinate string
# before it is placed in the request URL.
TRUNCATE_RIGHT = 4

# Tuple positions inside LONGITUDE / LATITUDE.
MAX = 0
MIN = 1

# Coordinate lookup; formatted with (x, y) in scrap_coord.
URL = "http://ovc.catastro.meh.es/ovcservweb/ovcswlocalizacionrc/ovccoordenadas.asmx/Consulta_RCCOOR?SRS=EPSG:4230&Coordenada_X={}&Coordenada_Y={}"
# Parcel page; formatted with the first 7 and the next 7 characters of the
# cadastral reference in scrap_cadaster.
URL_REF = "https://www1.sedecatastro.gob.es/CYCBienInmueble/OVCListaBienes.aspx?rc1={}&rc2={}"
# Single-property page; formatted in scrap_cadaster_full_code with the full
# reference (twice) followed by its characters [0:2] and [2:5].
URL_REF_FULL = "https://www1.sedecatastro.gob.es/CYCBienInmueble/OVCConCiud.aspx?RefC={}&RCCompleta={}&del={}&mun={}"

# Header labels searched for in the property table by parse_html_parcela; the
# same strings become the keys of the dictionary handed to Cadaster.
field_names = (u'Referencia catastral', u'Localización', u'Clase', u'Uso principal', u'Superficie construida', u'Año construcción')
|
||||
|
||||
|
||||
class CadastroScrapper:
    """Scrapper class for Cadastro Web.

    All methods are static. ``scrap_coord`` resolves a coordinate to a
    cadastral reference, ``scrap_cadaster`` / ``scrap_cadaster_full_code``
    download the matching HTML page and ``parse_html_parcela`` turns that
    page into a ``Cadaster`` object.
    """

    def __init__(self):
        pass

    @staticmethod
    def _truncate_coordinate(value, scale, truncate_right):
        """Return ``value / scale`` as a decimal string, truncated toward zero.

        Works on integers only, so the result does not depend on how a float
        happens to be printed (``str(40.47)[:-4]`` would yield ``'4'``).

        :param value: integer coordinate, already multiplied by ``scale``
        :param scale: power of ten the coordinate was multiplied by
        :param truncate_right: number of least-significant decimals to drop
        :return: e.g. ``'-3.68'`` for ``(-3680000, 1000000, 4)``
        """
        sign = '-' if value < 0 else ''
        whole, fraction = divmod(abs(value), scale)
        # A scale of 10**k carries k decimal digits.
        kept_digits = len(str(scale)) - 1 - truncate_right
        if kept_digits <= 0:
            return '{}{}'.format(sign, whole)
        kept = fraction // 10 ** truncate_right
        return '{}{}.{:0{}d}'.format(sign, whole, kept, kept_digits)

    @staticmethod
    def _as_list(result):
        """Normalise a ``scrap_coord`` result (None, one entry or a list) to a list."""
        if result is None:
            return []
        if isinstance(result, list):
            return result
        return [result]

    @staticmethod
    def _fetch(url):
        """Download ``url`` and return the raw body, closing the connection."""
        with urlopen(url) as response:
            return response.read()

    @staticmethod
    def scrap_all():
        """Query every coordinate of the LONGITUDE x LATITUDE grid.

        The grid advances by the smallest step that survives truncation, and
        coordinates are passed to ``scrap_coord`` scaled and in its
        (longitude, latitude) order.
        """
        step = 10 ** TRUNCATE_RIGHT
        for lon in range(LONGITUDE[MIN], LONGITUDE[MAX], step):
            for lat in range(LATITUDE[MIN], LATITUDE[MAX], step):
                CadastroScrapper.scrap_coord(
                    CadastroScrapper._truncate_coordinate(lon, SCALE, TRUNCATE_RIGHT),
                    CadastroScrapper._truncate_coordinate(lat, SCALE, TRUNCATE_RIGHT))

    @staticmethod
    def scrap_results_linear(times):
        """Walk the grid in order until ``times`` entries have been found.

        :param times: number of cadaster entries to collect
        :return: list with at most ``times`` entries (fewer only if the whole
            grid was exhausted first)
        """
        results = []
        # Stepping by one would request the same truncated coordinate
        # 10**TRUNCATE_RIGHT times in a row.
        step = 10 ** TRUNCATE_RIGHT
        for x in range(LONGITUDE[MIN], LONGITUDE[MAX], step):
            for y in range(LATITUDE[MIN], LATITUDE[MAX], step):
                x_scaled = CadastroScrapper._truncate_coordinate(x, SCALE, TRUNCATE_RIGHT)
                y_scaled = CadastroScrapper._truncate_coordinate(y, SCALE, TRUNCATE_RIGHT)
                result = CadastroScrapper.scrap_coord(x_scaled, y_scaled)
                # A multi-parcel coordinate yields a list; keep results flat.
                results.extend(CadastroScrapper._as_list(result))
                if len(results) >= times:
                    return results[:times]
                # Pause between requests.
                sleep(5)
        return results

    @staticmethod
    def scrap_results_random(times):
        """Probe random coordinates until ``times`` entries have been found.

        :param times: number of cadaster entries to collect
        :return: flat list with exactly ``times`` entries (empty when
            ``times`` is not positive)
        """
        results = []
        while len(results) < times:
            x = random.randrange(LONGITUDE[MIN], LONGITUDE[MAX])
            y = random.randrange(LATITUDE[MIN], LATITUDE[MAX])

            x_scaled = CadastroScrapper._truncate_coordinate(x, SCALE, TRUNCATE_RIGHT)
            y_scaled = CadastroScrapper._truncate_coordinate(y, SCALE, TRUNCATE_RIGHT)
            cadaster_entry = CadastroScrapper.scrap_coord(x_scaled, y_scaled)

            # A multi-parcel coordinate yields a list; keep results flat.
            results.extend(CadastroScrapper._as_list(cadaster_entry))
            if len(results) >= times:
                break
            # Pause between requests.
            sleep(5)

        # A multi-parcel hit may overshoot the requested amount.
        results = results[:max(times, 0)]

        print("====PROCESSING FINISHED====")
        print("Results found: {}".format(len(results)))
        print(results)
        return results

    @staticmethod
    def parse_html_parcela(parsed_html, x=None, y=None):
        """Build a ``Cadaster`` from a parsed single-property page.

        :param parsed_html: BeautifulSoup document of the property page
        :param x: longitude stored with the entry (optional)
        :param y: latitude stored with the entry (optional)
        :return: the ``Cadaster`` entry, or None when the page has no
            ``ctl00_Contenido_tblInmueble`` table
        """
        description = parsed_html.find(id='ctl00_Contenido_tblInmueble')
        if description is None:
            return None

        descriptive_data = {u'Longitud': x, u'Latitud': y}

        for field in description.find_all('div'):
            field_header = field.find('span')
            if field_header is None:
                # A div without a header span carries no labelled field.
                continue
            header_text = field_header.text

            for field_name in field_names:
                if field_name not in header_text:
                    continue
                field_value = field.find('label', {"class": "control-label black text-left"})
                if field_value is None:
                    continue

                text = field_value.text.strip()
                if header_text == u'Referencia catastral':
                    # Keep only the reference: drop whatever follows the first
                    # regular or non-breaking space.
                    text = text.split(' ')[0].split('\xa0')[0]
                elif header_text == u'Localización':
                    # Preserve the line break of the address as the configured
                    # separator so Address can split it again.
                    text = (field_value.encode_contents().decode('utf-8')
                            .replace('<br/>', config['separator'])
                            .replace('<br>', config['separator']))
                descriptive_data[field_name] = text

        cadaster_entry = Cadaster(descriptive_data)
        print(cadaster_entry.to_json())
        return cadaster_entry

    @staticmethod
    def scrap_cadaster_full_code(full_cadaster, x=None, y=None):
        """Download and parse the page of one full cadastral reference.

        :param full_cadaster: full reference; its characters [0:2] and [2:5]
            are sent as the ``del`` and ``mun`` URL parameters
        :param x: longitude stored with the entry (optional)
        :param y: latitude stored with the entry (optional)
        :return: whatever ``parse_html_parcela`` returns for the page
        """
        delimitacion = full_cadaster[0:2]
        municipio = full_cadaster[2:5]
        url_ref = URL_REF_FULL.format(full_cadaster, full_cadaster, delimitacion, municipio)
        print("-->FULL URL for cadastral data: {}".format(url_ref))
        html = CadastroScrapper._fetch(url_ref).decode('utf-8')
        parsed_html = BeautifulSoup(html, features="html.parser")
        return CadastroScrapper.parse_html_parcela(parsed_html, x, y)

    @staticmethod
    def scrap_cadaster(cadaster, x=None, y=None):
        """Download the page for a cadastral reference and parse it.

        :param cadaster: reference whose first 14 characters select the page
        :param x: longitude stored with the entry (optional)
        :param y: latitude stored with the entry (optional)
        :return: a single ``Cadaster`` when the page describes one property,
            otherwise a list of ``Cadaster`` entries (one per listed property)
        """
        rc_1 = cadaster[0:7]
        rc_2 = cadaster[7:14]
        url_ref = URL_REF.format(rc_1, rc_2)
        print("-->URL for cadastral data: {}".format(url_ref))
        html = CadastroScrapper._fetch(url_ref).decode('utf-8')
        parsed_html = BeautifulSoup(html, features="html.parser")

        if parsed_html.find(id='ctl00_Contenido_tblInmueble') is not None:
            return CadastroScrapper.parse_html_parcela(parsed_html, x, y)

        # No property table: the page lists several properties instead.
        print("Multiparcela found!")
        cadasters = []
        all_cadasters = parsed_html.find_all("div", {"id": re.compile('heading[0-9]+')})
        print("->Parcelas found: {}".format(len(all_cadasters)))
        for partial_cadaster in all_cadasters:
            partial_cadaster_ref = partial_cadaster.find("b")
            if partial_cadaster_ref is None:
                continue
            print("-->Partial cadaster: {}".format(partial_cadaster_ref.text))
            partial_cadaster_text = partial_cadaster_ref.text.strip()
            entry = CadastroScrapper.scrap_cadaster_full_code(partial_cadaster_text, x, y)
            if entry is not None:
                cadasters.append(entry)
        return cadasters

    @staticmethod
    def scrap_coord(x, y):
        """Resolve a coordinate to its cadaster entry.

        :param x: longitude, as placed in the ``Coordenada_X`` URL parameter
        :param y: latitude, as placed in the ``Coordenada_Y`` URL parameter
        :return: the result of ``scrap_cadaster`` for the reference found at
            the coordinate, or None when the response holds no reference
        """
        print("====Longitude: {} Latitude: {}====".format(x, y))
        url = URL.format(x, y)
        print("-->URL for coordinates: {}".format(url))
        root = ElementTree.fromstring(CadastroScrapper._fetch(url))

        pc_path = ("{http://www.catastro.meh.es/}coordenadas//{http://www.catastro.meh.es/}coord//"
                   "{http://www.catastro.meh.es/}pc//{http://www.catastro.meh.es/}")
        pc1 = root.find(pc_path + "pc1")
        pc2 = root.find(pc_path + "pc2")
        # Empty elements have text None, which cannot be joined.
        if pc1 is None or pc2 is None or not pc1.text or not pc2.text:
            return None

        print("-->FOUND!")
        cadaster = ''.join([pc1.text, pc2.text])
        return CadastroScrapper.scrap_cadaster(cadaster, x, y)
|
0
src/librecatastro/domain/__init__.py
Normal file
0
src/librecatastro/domain/__init__.py
Normal file
85
src/librecatastro/domain/address.py
Normal file
85
src/librecatastro/domain/address.py
Normal file
@ -0,0 +1,85 @@
|
||||
import json
|
||||
import re
|
||||
|
||||
from src.settings import config
|
||||
|
||||
|
||||
class Address:
    """Postal address split into street, postcode, city and province.

    The raw text is expected to hold two lines joined by
    ``config['separator']``: the first is treated as the street, the second
    is searched for a 5-digit postcode and a trailing "(PROVINCE)".
    Parts that are not present are left as None instead of raising.
    """

    def __init__(self, address):
        """Parse ``address`` and populate every attribute.

        :param address: full address text, lines joined by the separator
        """
        self.full_address = address
        print("Full address: {}".format(self.full_address))
        print("Separator: {}".format(config['separator']))

        self.first_line = None
        self.second_line = None
        self.street = None
        self.cp = None
        self.city = None
        self.province_parentheses = None
        self.province = None

        self.first_line = self.get_first_line()
        self.second_line = self.get_second_line()

        self.street = self.get_street()
        self.cp = self.get_cp()
        self.province_parentheses, self.province = self.get_province()
        self.city = self.get_city()

    def get_first_line(self):
        """Return the text before the first separator (all of it if none)."""
        if self.first_line is not None:
            return self.first_line
        # partition treats the separator literally and copes with its absence.
        return self.full_address.partition(config['separator'])[0]

    def get_second_line(self):
        """Return the text after the first separator ('' if there is none)."""
        if self.second_line is not None:
            return self.second_line
        return self.full_address.partition(config['separator'])[2]

    def get_street(self):
        """Return the street, which is the first address line."""
        return self.get_first_line()

    def get_cp(self):
        """Return the first 5-digit run of the second line, or None."""
        if self.cp is not None:
            return self.cp
        cp = re.search(r'[0-9]{5}', self.get_second_line())
        return cp.group(0) if cp else None

    def get_city(self):
        """Return the second line without postcode and "(PROVINCE)" suffix."""
        if self.city is not None:
            return self.city

        city_text = self.get_second_line()
        province_parentheses = self.get_province()[0]
        if province_parentheses is not None:
            city_text = city_text.replace(province_parentheses, '')
        cp = self.get_cp()
        if cp is not None:
            city_text = city_text.replace(cp, '')
        return city_text.strip()

    def get_province(self):
        """Return ``("(PROVINCE)", "PROVINCE")`` taken from the second line.

        :return: tuple of the parenthesised text and its inner text, or
            ``(None, None)`` when the second line has no parentheses
        """
        if self.province_parentheses is not None and self.province is not None:
            return self.province_parentheses, self.province

        province = re.search(r'\(([^)]+)\)', self.get_second_line())
        if province is None:
            return None, None
        return province.group(0), province.group(1)

    def to_json(self):
        """Serialise the instance attributes as an indented JSON string."""
        return json.dumps(self, default=lambda o: o.__dict__,
                          sort_keys=True, indent=4)
|
37
src/librecatastro/domain/cadaster.py
Normal file
37
src/librecatastro/domain/cadaster.py
Normal file
@ -0,0 +1,37 @@
|
||||
import json
|
||||
|
||||
from datetime import datetime
|
||||
from elasticsearch import Elasticsearch
|
||||
|
||||
from src.librecatastro.domain.address import Address
|
||||
from src.librecatastro.domain.location import Location
|
||||
from src.settings import config
|
||||
|
||||
|
||||
class Cadaster:
    """One scraped cadaster entry, storable in Elasticsearch."""

    def __init__(self, dict):
        """Build the entry from the scraped field dictionary.

        The parameter keeps its historical name although it shadows the
        builtin; it is only used as a mapping inside this method.

        :param dict: mapping keyed by the scraped labels. ``Localización``
            and ``Referencia catastral`` are required; ``Clase``,
            ``Uso principal``, ``Superficie construida``,
            ``Año construcción``, ``Longitud`` and ``Latitud`` are optional.
        :raises KeyError: when a required key is missing
        """
        self.address = Address(dict[u'Localización'])
        self.cadaster = dict[u'Referencia catastral']
        self.type = dict.get(u'Clase')
        self.use = dict.get(u'Uso principal')
        self.surface = dict.get(u'Superficie construida')
        self.year = dict.get(u'Año construcción')
        if u'Longitud' in dict and u'Latitud' in dict:
            self.location = Location(dict[u'Longitud'], dict[u'Latitud'])
        else:
            self.location = None
        self.timestamp = str(datetime.now())

    @staticmethod
    def _search_body(cadaster):
        """Return the search request matching documents by cadaster reference.

        Built as a dictionary so the reference is escaped by the JSON
        serialiser instead of being pasted into a query string.
        """
        return {
            "query": {
                "bool": {
                    "must": [{"match": {"cadaster": cadaster}}],
                    "must_not": [],
                    "should": [],
                }
            },
            "from": 0,
            "size": 10,
            "sort": [],
            "aggs": {},
        }

    def to_json(self):
        """Serialise the entry (nested objects included) as indented JSON."""
        return json.dumps(self, default=lambda o: o.__dict__,
                          sort_keys=True, indent=4)

    def to_elasticsearch(self):
        """Index this entry under its cadaster reference and return the response.

        Uses the index named by ``config['elasticsearch-index']`` and the
        document type ``cadaster_doc``.
        """
        es = Elasticsearch()
        res = es.index(index=config['elasticsearch-index'], doc_type='cadaster_doc',
                       id=self.cadaster, body=self.to_json())
        print(res)
        return res

    def from_elasticsearch(self):
        """Search the configured index for this reference and return the response."""
        es = Elasticsearch()
        res = es.search(index=config['elasticsearch-index'],
                        body=Cadaster._search_body(self.cadaster))
        print(res)
        return res
|
8
src/librecatastro/domain/location.py
Normal file
8
src/librecatastro/domain/location.py
Normal file
@ -0,0 +1,8 @@
|
||||
class Location:
    """Longitude / latitude pair attached to a cadaster entry."""

    def __init__(self, longitude, latitude):
        """Store the coordinates as given (numbers, numeric strings or None)."""
        self.lon = longitude
        self.lat = latitude

    def to_json(self):
        """Return ``{"location": {"lon": ..., "lat": ...}}`` as a JSON string.

        Coordinates are converted to float; a missing coordinate is emitted
        as ``null``. The text is produced by the JSON serialiser because a
        ``str.format`` template with literal braces cannot be formatted.

        :raises ValueError: when a coordinate is not None and not numeric
        """
        import json  # local import: this module has no import section

        lon = float(self.lon) if self.lon is not None else None
        lat = float(self.lat) if self.lat is not None else None
        return json.dumps({"location": {"lon": lon, "lat": lat}})
|
4
src/settings.py
Normal file
4
src/settings.py
Normal file
@ -0,0 +1,4 @@
|
||||
# Settings shared by the scraper and the domain classes.
config = {
    # Marker the scraper substitutes for the <br> tags of a scraped address;
    # Address splits the text into its two lines on it.
    "separator": "####",
    # Name of the Elasticsearch index Cadaster entries are written to and
    # searched in.
    "elasticsearch-index": "cadaster"
}
|
0
src/templates/__init__.py
Normal file
0
src/templates/__init__.py
Normal file
5
src/templates/individual_address.xml
Normal file
5
src/templates/individual_address.xml
Normal file
@ -0,0 +1,5 @@
|
||||
<owl:NamedIndividual rdf:about="http://semantic-datahub.taiger.io/ontologies/Address/####ADDRESS####">
|
||||
<rdf:type rdf:resource="http://semantic-datahub.taiger.io/ontologies/Address"/>
|
||||
<rdfs:label>####ADDRESS####</rdfs:label>
|
||||
<cadaster:located_in rdf:resource="####CITY####"/>
|
||||
</owl:NamedIndividual>
|
4
src/templates/individual_cadaster.xml
Normal file
4
src/templates/individual_cadaster.xml
Normal file
@ -0,0 +1,4 @@
|
||||
<owl:NamedIndividual rdf:about="http://semantic-datahub.taiger.io/ontologies/Cadaster/####CADASTER####">
|
||||
<rdf:type rdf:resource="http://semantic-datahub.taiger.io/ontologies/Cadaster"/>
|
||||
<rdfs:label>####CADASTER####</rdfs:label>
|
||||
</owl:NamedIndividual>
|
5
src/templates/individual_city.xml
Normal file
5
src/templates/individual_city.xml
Normal file
@ -0,0 +1,5 @@
|
||||
<owl:NamedIndividual rdf:about="http://semantic-datahub.taiger.io/ontologies/City/####CITY####">
|
||||
<rdf:type rdf:resource="http://semantic-datahub.taiger.io/ontologies/City"/>
|
||||
<rdfs:label>####CITY####</rdfs:label>
|
||||
<cadaster:located_in rdf:resource="####PROVINCE####"/>
|
||||
</owl:NamedIndividual>
|
5
src/templates/individual_coord.xml
Normal file
5
src/templates/individual_coord.xml
Normal file
@ -0,0 +1,5 @@
|
||||
<owl:NamedIndividual rdf:about="http://semantic-datahub.taiger.io/ontologies/GeoCoordinates/####COORDINATES####">
|
||||
<rdf:type rdf:resource="http://semantic-datahub.taiger.io/ontologies/GeoCoordinates"/>
|
||||
<rdfs:label>####COORDINATES####</rdfs:label>
|
||||
<cadaster:located_in rdf:resource="####ADDRESS####"/>
|
||||
</owl:NamedIndividual>
|
5
src/templates/individual_province.xml
Normal file
5
src/templates/individual_province.xml
Normal file
@ -0,0 +1,5 @@
|
||||
<owl:NamedIndividual rdf:about="http://semantic-datahub.taiger.io/ontologies/Province/####PROVINCE####">
|
||||
<rdf:type rdf:resource="http://semantic-datahub.taiger.io/ontologies/Province"/>
|
||||
<rdfs:label>####PROVINCE####</rdfs:label>
|
||||
<cadaster:mentioned_in rdf:resource="http://semantic-datahub.taiger.io/ontologies/Cadaster/####CADASTER####"/>
|
||||
</owl:NamedIndividual>
|
96
src/templates/ontology.owl
Normal file
96
src/templates/ontology.owl
Normal file
@ -0,0 +1,96 @@
|
||||
<?xml version="1.0"?>
|
||||
<rdf:RDF xmlns:owl="http://www.w3.org/2002/07/owl#"
|
||||
xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
|
||||
xmlns:xml="http://www.w3.org/XML/1998/namespace"
|
||||
xmlns:xsd="http://www.w3.org/2001/XMLSchema#"
|
||||
xmlns:rdfs="http://www.w3.org/2000/01/rdf-schema#"
|
||||
xmlns:skos="http://www.w3.org/2004/02/skos/core#"
|
||||
xmlns:terms="http://purl.org/dc/terms/"
xmlns:cadaster="http://semantic-datahub.taiger.io/ontologies/">
|
||||
<owl:Ontology rdf:about="http://semantic-datahub.taiger.io/ontologies/cadaster">
|
||||
</owl:Ontology>
|
||||
|
||||
|
||||
|
||||
<!--
|
||||
///////////////////////////////////////////////////////////////////////////////////////
|
||||
//
|
||||
// Classes
|
||||
//
|
||||
///////////////////////////////////////////////////////////////////////////////////////
|
||||
-->
|
||||
|
||||
<!-- OUR TOP CLASSES -->
|
||||
<owl:Class rdf:about="http://semantic-datahub.taiger.io/ontologies/Thing">
|
||||
<rdfs:label>Thing</rdfs:label>
|
||||
</owl:Class>
|
||||
|
||||
<owl:Class rdf:about="http://semantic-datahub.taiger.io/ontologies/Cadaster">
|
||||
<rdfs:subClassOf rdf:resource="http://semantic-datahub.taiger.io/ontologies/Thing"/>
|
||||
<rdfs:label>Cadaster</rdfs:label>
|
||||
</owl:Class>
|
||||
|
||||
<owl:Class rdf:about="http://semantic-datahub.taiger.io/ontologies/Address">
|
||||
<rdfs:subClassOf rdf:resource="http://semantic-datahub.taiger.io/ontologies/Thing"/>
|
||||
<rdfs:label>Address</rdfs:label>
|
||||
</owl:Class>
|
||||
|
||||
<owl:Class rdf:about="http://semantic-datahub.taiger.io/ontologies/Province">
|
||||
<rdfs:subClassOf rdf:resource="http://semantic-datahub.taiger.io/ontologies/Thing"/>
|
||||
<rdfs:label>Province</rdfs:label>
|
||||
</owl:Class>
|
||||
|
||||
<owl:Class rdf:about="http://semantic-datahub.taiger.io/ontologies/City">
|
||||
<rdfs:subClassOf rdf:resource="http://semantic-datahub.taiger.io/ontologies/Thing"/>
|
||||
<rdfs:label>City</rdfs:label>
|
||||
</owl:Class>
|
||||
|
||||
<owl:Class rdf:about="http://semantic-datahub.taiger.io/ontologies/GeoCoordinates">
|
||||
<rdfs:subClassOf rdf:resource="http://semantic-datahub.taiger.io/ontologies/Thing"/>
|
||||
<rdfs:label>Geographical Coordinates</rdfs:label>
|
||||
</owl:Class>
|
||||
|
||||
<!--
|
||||
///////////////////////////////////////////////////////////////////////////////////////
|
||||
//
|
||||
// Individuals
|
||||
//
|
||||
///////////////////////////////////////////////////////////////////////////////////////
|
||||
-->
|
||||
|
||||
|
||||
####INDIVIDUALS####
|
||||
|
||||
|
||||
|
||||
<!--
|
||||
///////////////////////////////////////////////////////////////////////////////////////
|
||||
//
|
||||
// Annotation properties
|
||||
//
|
||||
///////////////////////////////////////////////////////////////////////////////////////
|
||||
-->
|
||||
|
||||
<!-- Left empty -->
|
||||
|
||||
|
||||
|
||||
<!--
|
||||
///////////////////////////////////////////////////////////////////////////////////////
|
||||
//
|
||||
// Object Properties
|
||||
//
|
||||
///////////////////////////////////////////////////////////////////////////////////////
|
||||
-->
|
||||
|
||||
<owl:ObjectProperty rdf:about="http://semantic-datahub.taiger.io/ontologies/mentioned_in">
|
||||
</owl:ObjectProperty>
|
||||
|
||||
<owl:ObjectProperty rdf:about="http://semantic-datahub.taiger.io/ontologies/located_in">
|
||||
</owl:ObjectProperty>
|
||||
|
||||
<owl:ObjectProperty rdf:about="http://semantic-datahub.taiger.io/ontologies/registers">
|
||||
</owl:ObjectProperty>
|
||||
|
||||
|
||||
<!-- Here, for each field of the document, if it has a parent... -->
|
||||
</rdf:RDF>
|
0
src/tests/__init__.py
Normal file
0
src/tests/__init__.py
Normal file
83
src/tests/scrapper_tests.py
Normal file
83
src/tests/scrapper_tests.py
Normal file
@ -0,0 +1,83 @@
|
||||
import unittest
|
||||
|
||||
from src.librecatastro.catastro_scrapper import CadastroScrapper
|
||||
from src.utils.elasticsearch_utils import ElasticSearchUtils
|
||||
from src.utils.ontology_converter import OntologyConverter
|
||||
|
||||
|
||||
class MyTestCase(unittest.TestCase):
    """Integration tests for the scraper, Elasticsearch storage and ontology export.

    Every test calls the real scraper and/or an Elasticsearch client, so the
    remote services and an Elasticsearch instance must be reachable.
    """

    def _assert_stored(self, entry):
        """Index ``entry`` and check that searching for it returns a response."""
        entry.to_elasticsearch()
        self.assertIsNotNone(entry.from_elasticsearch())

    def test_remove_index_elasticsearch_works(self):
        # Passes as long as removing the index does not raise.
        ElasticSearchUtils.remove_index()
        assert True

    def test_create_index_elasticsearch_works(self):
        # Passes as long as creating the index does not raise.
        ElasticSearchUtils.create_index()
        assert True

    def test_coordinate_creates_cadaster(self):
        entry = CadastroScrapper.scrap_coord(-3.68, 40.47)
        self.assertEqual(entry.cadaster, '2302909VK4820A0001GK')

    def test_coordinate_creates_cadaster_and_stores_in_elasticsearch(self):
        self._assert_stored(CadastroScrapper.scrap_coord(-3.68, 40.47))

    def test_cadaster_full_creates_cadaster(self):
        address = CadastroScrapper.scrap_cadaster('0083101WK2008S0001PD').address
        self.assertEqual(address.city, 'ALMONACID DEL MARQUESADO')
        self.assertEqual(address.province, 'CUENCA')

    def test_cadaster_half_creates_cadaster(self):
        address = CadastroScrapper.scrap_cadaster('0183001WK2008S').address
        self.assertEqual(address.city, 'ALMONACID DEL MARQUESADO')
        self.assertEqual(address.province, 'CUENCA')

    def test_cadaster_half_creates_cadaster_2(self):
        entry = CadastroScrapper.scrap_cadaster('21012A03100046')
        self.assertEqual(entry.address.province, 'HUELVA')

    def test_cadaster_no_cp_creates_cadaster(self):
        address = CadastroScrapper.scrap_cadaster('06145A00500028').address
        self.assertIsNone(address.cp)
        self.assertEqual(address.province, 'BADAJOZ')

    def test_cadaster_multiparcela_returns_list(self):
        entries = CadastroScrapper.scrap_cadaster('22282A00900547')
        self.assertEqual(len(entries), 2)

    def test_cadaster_is_stored_in_elasticsearch(self):
        self._assert_stored(CadastroScrapper.scrap_cadaster('0183001WK2008S'))

    def scrap_random_until_x_times_found(self, times):
        """Scrape random coordinates, check ``times`` results came back and return them."""
        found = CadastroScrapper.scrap_results_random(times)
        self.assertEqual(len(found), times)
        return found

    def test_scrap_random_until_5_found(self):
        self.scrap_random_until_x_times_found(5)

    def test_scrap_random_until_5_is_stores_in_elasticsearch(self):
        for entry in self.scrap_random_until_x_times_found(5):
            self._assert_stored(entry)

    def test_scrap_random_until_1_is_stores_in_elasticsearch(self):
        for entry in self.scrap_random_until_x_times_found(1):
            self._assert_stored(entry)

    def test_create_ontology_with_one_scrap_result(self):
        converter = OntologyConverter()
        scraped = [CadastroScrapper.scrap_coord(-3.68, 40.47)]
        print(converter.cadastro_dict_to_ontology(scraped))
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
unittest.main()
|
0
src/utils/__init__.py
Normal file
0
src/utils/__init__.py
Normal file
35
src/utils/elasticsearch_utils.py
Normal file
35
src/utils/elasticsearch_utils.py
Normal file
@ -0,0 +1,35 @@
|
||||
from elasticsearch import Elasticsearch
|
||||
|
||||
|
||||
class ElasticSearchUtils:
    """Helpers that create and delete the Elasticsearch index for cadaster data."""

    def __init__(self):
        pass

    @staticmethod
    def create_index(index='cadaster'):
        """Drop ``index`` if it exists and create it again from scratch.

        The new index has 5 shards, 1 replica and maps the ``location``
        property of ``cadaster_doc`` documents as a ``geo_point``.

        :param index: index name; defaults to ``'cadaster'``
        """
        ElasticSearchUtils.remove_index(index)
        es = Elasticsearch()
        request_body = {
            "settings": {
                "number_of_shards": 5,
                "number_of_replicas": 1
            },
            "mappings": {
                "cadaster_doc": {
                    "properties": {
                        "location": {
                            "type": "geo_point"
                        }
                    }
                }
            }
        }
        print("Creating '{}' index...".format(index))
        es.indices.create(index=index, body=request_body)

    @staticmethod
    def remove_index(index='cadaster'):
        """Delete ``index`` and print the response.

        HTTP 400 and 404 responses are ignored, so removing an index that
        does not exist is not an error.

        :param index: index name; defaults to ``'cadaster'``
        """
        es = Elasticsearch()
        res = es.indices.delete(index=index, ignore=[400, 404])
        print(res)
|
77
src/utils/ontology_converter.py
Normal file
77
src/utils/ontology_converter.py
Normal file
@ -0,0 +1,77 @@
|
||||
import copy
|
||||
import re
|
||||
|
||||
|
||||
class OntologyConverter:
    """Fill the OWL/XML templates with scraped cadaster data.

    Templates contain ``####NAME####`` placeholders that are replaced with
    the values of each entry.
    """

    def __init__(self):
        """Load the ontology template and the five individual templates.

        Files are read from the ``templates`` directory next to this
        module's package directory, so loading does not depend on the
        current working directory.
        """
        import os  # local import: the module header only imports copy and re

        templates_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)),
                                     os.pardir, "templates")

        def read_template(name):
            with open(os.path.join(templates_dir, name), encoding="utf-8") as template_file:
                return template_file.read()

        self.ont_template = read_template("ontology.owl")
        self.city_template = read_template("individual_city.xml")
        self.province_template = read_template("individual_province.xml")
        self.coord_template = read_template("individual_coord.xml")
        self.address_template = read_template("individual_address.xml")
        self.cadaster_template = read_template("individual_cadaster.xml")

    def cadastro_dict_to_ontology(self, cadastro_list):
        """Return the ontology text with one group of individuals per entry.

        :param cadastro_list: iterable of mappings accepted by
            ``instantiate_individual``
        :return: the ontology template with ``####INDIVIDUALS####`` replaced
        """
        individuals = [self.instantiate_individual(entry) for entry in cadastro_list]
        # Later entries are placed before earlier ones, the order this method
        # has always produced.
        return self.ont_template.replace("####INDIVIDUALS####", ''.join(reversed(individuals)))

    def instantiate_individual(self, cadastro_entry):
        """Render the individuals for one entry.

        ``Referencia catastral`` produces a cadaster individual.
        ``Localización`` produces province, city and address individuals.
        The province is the text inside the parentheses and the city is what
        follows the 5-digit postcode with the "(PROVINCE)" part removed.
        When the address has no postcode the city cannot be isolated, so the
        whole address minus the province is used instead. When it has no
        province, no province individual is produced.

        :param cadastro_entry: mapping of scraped label to value
        :return: concatenated XML of the rendered individuals
        """
        # Looked up first so province links do not depend on key order.
        cadaster = cadastro_entry.get('Referencia catastral', '')

        parts = []
        for header, value in cadastro_entry.items():
            if header == 'Referencia catastral':
                parts.append(self.cadaster_template.replace("####CADASTER####", value))
            elif header == 'Localización':
                cp = re.search(r'[0-9]{5}', value)
                remainder = value[cp.end():] if cp else value

                province = re.search(r'\(([^\)]+)\)', remainder)
                if province:
                    province_text = province.group(1)
                    city_text = remainder.replace(province.group(0), '').strip()
                    province_txt = self.province_template.replace("####CADASTER####", cadaster)
                    parts.append(province_txt.replace("####PROVINCE####", province_text))
                else:
                    province_text = ''
                    city_text = remainder.strip()

                city_txt = self.city_template.replace("####CITY####", city_text)
                parts.append(city_txt.replace("####PROVINCE####", province_text))

                address_txt = self.address_template.replace("####ADDRESS####", value)
                parts.append(address_txt.replace("####CITY####", city_text))

        individuals = ''.join(parts)
        print(individuals)
        return individuals
|
||||
|
||||
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user