libreCatastro/src/librecatastro/scrapping/scrappers/scrapper_html.py

100 lines
3.6 KiB
Python

import re
from time import sleep
from urllib.request import urlopen
from bs4 import BeautifulSoup
from src.librecatastro.scrapping.scrapper import Scrapper
from src.settings import config
from src.utils.cadastro_logger import CadastroLogger
'''Logger'''
logger = CadastroLogger(__name__).logger
class ScrapperHTML(Scrapper):
"""HTML Catastro Scrapper"""
URL = "http://ovc.catastro.meh.es/ovcservweb/ovcswlocalizacionrc/ovccoordenadas.asmx/Consulta_RCCOOR?SRS=EPSG:4226&Coordenada_X={}&Coordenada_Y={}"
URL_REF = "https://www1.sedecatastro.gob.es/CYCBienInmueble/OVCListaBienes.aspx?rc1={}&rc2={}"
URL_REF_FULL = "https://www1.sedecatastro.gob.es/CYCBienInmueble/OVCConCiud.aspx?RefC={}&RCCompleta={}&del={}&mun={}"
def __init__(self):
super().__init__()
@classmethod
def scrap_coord(cls, x, y):
logger.debug("====Longitude: {} Latitude: {}====".format(x, y))
url = cls.URL.format(x, y)
logger.debug("URL for coordinates: {}".format(url))
f = urlopen(url)
sleep(config['sleep_time'])
return f.read()
@classmethod
def scrap_cadaster_full_code(cls, full_cadaster, delimitacion, municipio):
url_ref = cls.URL_REF_FULL.format(full_cadaster, full_cadaster, delimitacion, municipio)
logger.debug("-->FULL URL for cadastral data: {}".format(url_ref))
f_ref = urlopen(url_ref)
data_ref = f_ref.read()
html = str(data_ref.decode('utf-8'))
parsed_html = BeautifulSoup(html, features="html.parser")
sleep(config['sleep_time'])
return parsed_html
@classmethod
def scrap_cadaster(cls, cadaster, delimitacion=None, municipio=None, pictures=False):
rc_1 = cadaster[0:7]
rc_2 = cadaster[7:14]
url_ref = cls.URL_REF.format(rc_1, rc_2)
logger.debug("URL for cadastral data: {}".format(url_ref))
f_ref = urlopen(url_ref)
data_ref = f_ref.read()
sleep(config['sleep_time'])
html = str(data_ref.decode('utf-8'))
parsed_html = BeautifulSoup(html, features="html.parser")
if delimitacion is None:
delimitacion_search = re.search(r'del=([0-9]+)&', html)
if delimitacion_search:
delimitacion = delimitacion_search.group(1)
if municipio is None:
municipio_search = re.search(r'mun=([0-9]+)&', html)
if municipio_search:
municipio = municipio_search.group(1)
description = parsed_html.find(id='ctl00_Contenido_tblInmueble')
picture = None
if pictures:
picture = cls.scrap_site_picture(delimitacion, municipio, ''.join([rc_1, rc_2]))
sleep(config['sleep_time'])
htmls = []
if description is None:
# Multiparcela
logger.debug("Multiparcela found!")
''' Multiparcela with multiple cadasters '''
all_cadasters = parsed_html.findAll("div", {"id": re.compile('heading[0-9]+')})
logger.debug("->Parcelas found: {}".format(len(all_cadasters)))
for partial_cadaster in all_cadasters:
partial_cadaster_ref = partial_cadaster.find("b")
logger.debug("-->Partial cadaster: {}".format(partial_cadaster_ref.text))
partial_cadaster_text = partial_cadaster_ref.text.strip()
parsed_html = ScrapperHTML.scrap_cadaster_full_code(partial_cadaster_text, delimitacion, municipio)
htmls.append((parsed_html, picture))
sleep(config['sleep_time'])
else:
# Parcela
htmls.append((parsed_html, picture))
return htmls