Adds --listprovinces and --listcities options to main

J 2019-09-20 19:52:24 +02:00
parent d5b280f6eb
commit f186186477
11 changed files with 69 additions and 35 deletions

View File

@@ -3,8 +3,8 @@ import argparse
 from src.librecatastro.scrapping.format.scrapper_html import ScrapperHTML
 from src.librecatastro.scrapping.format.scrapper_xml import ScrapperXML
-from src.librecatastro.scrapping.source.coordinates_input import CoordinatesInput
-from src.librecatastro.scrapping.source.provinces_input import ProvincesInput
+from src.librecatastro.scrapping.searchers.coordinates_search import CoordinatesSearch
+from src.librecatastro.scrapping.searchers.provinces_search import ProvincesSearch
 from src.settings import config
 if __name__ == "__main__":
@@ -15,7 +15,10 @@ if __name__ == "__main__":
     parser.add_argument('--sleep', action='store', dest='sleep', type=int, default=5)
     parser.add_argument('--html', dest='html', default=False, action='store_true')
     parser.add_argument('--scale', action='store', dest='scale', type=int, default=10000)
-    parser.add_argument('--pictures', action='store_true', dest='pictures', default='False')
+    parser.add_argument('--pictures', action='store_true', dest='pictures', default=False)
+    parser.add_argument('--startcity', action='store', dest='startcity', default='')
+    parser.add_argument('--listprovinces', action='store_true', dest='listprovinces', default=False)
+    parser.add_argument('--listcities', action='store', nargs=1, dest='listcities', default=[])
     args = parser.parse_args(sys.argv[1:])
@@ -30,8 +33,17 @@ if __name__ == "__main__":
     filenames = args.filenames
     pictures = args.pictures
     provinces = args.provinces
+    startcity = args.startcity
+    if args.listprovinces:
+        ProvincesSearch.list_provinces()
+        exit(0)
+    if len(args.listcities) == 1:
+        ProvincesSearch.list_cities(args.listcities[0])
+        exit(0)
     if args.coords:
-        CoordinatesInput.scrap_coordinates(scrapper, filenames, pictures)
+        CoordinatesSearch.scrap_coordinates(scrapper, filenames, pictures)
     else:
-        ProvincesInput.scrap_provinces(scrapper, provinces, pictures)
+        ProvincesSearch.scrap_provinces(scrapper, provinces, pictures, startcity)
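
For illustration, a minimal standalone sketch (not part of the commit) of how the three new flags parse; note that nargs=1 wraps the --listcities value in a list, which is why the code above reads args.listcities[0]:

import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--startcity', action='store', dest='startcity', default='')
parser.add_argument('--listprovinces', action='store_true', dest='listprovinces', default=False)
parser.add_argument('--listcities', action='store', nargs=1, dest='listcities', default=[])

args = parser.parse_args(['--listcities', 'MADRID'])
print(args.listprovinces)  # False: the store_true flag was not passed
print(args.listcities)     # ['MADRID']: nargs=1 always yields a one-element list
print(args.startcity)      # '': the default, meaning no city is skipped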

View File

@@ -28,7 +28,7 @@ class Address:
         self.site = None
         self.lot = None
-        ''' NLP search '''
+        ''' NLP searchers '''
         self.first_line = self.get_first_line()
         self.second_line = self.get_second_line()

View File: src/librecatastro/scrapping/format/scrapper_html.py

@@ -60,9 +60,9 @@ class ScrapperHTML(Scrapper):
         return results
     @classmethod
-    def scrap_provinces(cls, prov_list, pictures=False):
+    def scrap_provinces(cls, prov_list, pictures=False, start_from=''):
-        for prov_name, prov_num, city_name, city_num, address, tv, nv in cls.get_address_iter(prov_list):
+        for prov_name, prov_num, city_name, city_num, address, tv, nv in cls.get_address_iter(prov_list, start_from):
             if tv == DotMap() or nv == DotMap():
                 continue
@@ -151,7 +151,6 @@ class ScrapperHTML(Scrapper):
         parsed_html = BeautifulSoup(html, features="html.parser")
         return ScrapperHTML.parse_html_parcela(parsed_html, x, y, picture)
     @classmethod
     def scrap_cadaster(cls, cadaster, delimitacion=None, municipio=None, x=None, y=None, pictures=False):
         rc_1 = cadaster[0:7]
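
For context, rc_1 keeps the first seven characters of the cadastral reference. A hedged sketch with a made-up but well-formed 20-character Spanish reference; the 7+7 split below is an assumption about how the rest of this method slices it, not something this hunk shows:

cadaster = '9872023VH5797S0001WX'  # hypothetical referencia catastral
rc_1 = cadaster[0:7]               # '9872023'  (first parcel block)
rc_2 = cadaster[7:14]              # 'VH5797S'  (assumed second block, not shown here)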

View File: src/librecatastro/scrapping/format/scrapper_xml.py

@@ -93,8 +93,8 @@ class ScrapperXML(Scrapper):
         return results
     @classmethod
-    def scrap_provinces(cls, prov_list, pictures=False):
-        for prov_name, prov_num, city_name, city_num, address, tv, nv in cls.get_address_iter(prov_list):
+    def scrap_provinces(cls, prov_list, pictures=False, start_from=''):
+        for prov_name, prov_num, city_name, city_num, address, tv, nv in cls.get_address_iter(prov_list, start_from):
             if tv == DotMap() or nv == DotMap():
                 continue
@@ -103,7 +103,7 @@ class ScrapperXML(Scrapper):
             while num_scrapping_fails > 0:
                 try:
                     cadaster = cls.get_cadaster_by_address(prov_name, city_name, tv, nv, counter)
-                    res = cls.process_xml_by_address(cadaster, prov_name, city_name, tv, nv)
+                    res = cls.process_xml_by_address(cadaster, prov_name, city_name, tv, nv, pictures)
                     if len(res) < 1:
                         num_scrapping_fails -= 1
                     else:
@@ -133,7 +133,7 @@ class ScrapperXML(Scrapper):
                     sleep(config['sleep_time'])
     @classmethod
-    def process_xml_by_address(cls, numerero_map, prov_name, city_name, tv, nv):
+    def process_xml_by_address(cls, numerero_map, prov_name, city_name, tv, nv, pictures=False):
         results = []
         if numerero_map.consulta_numerero.lerr.err.cod != DotMap():
             return results
@@ -177,8 +177,8 @@ class ScrapperXML(Scrapper):
             prov_num = entry_map.consulta_dnp.bico.bi.dt.loine.cp
             city_num = entry_map.consulta_dnp.bico.bi.dt.loine.cm
-            if prov_num != DotMap() and city_num != DotMap():
-                picture = cls.scrap_site_picture(prov_num, city_num, cadaster_num)
+            if pictures and prov_num != DotMap() and city_num != DotMap():
+                picture = cls.scrap_site_picture(prov_num, city_num, cadaster_num)
             # Parcela
             cadaster_entry = CadasterEntryXML(entry_map, lon, lat, picture)
@@ -200,7 +200,7 @@ class ScrapperXML(Scrapper):
                 prov_num = entry_map.consulta_dnp.lrcdnp.rcdnp.loine.cp
                 city_num = entry_map.consulta_dnp.lrcdnp.rcdnp.loine.cm
-                if prov_num != DotMap() and city_num != DotMap():
+                if pictures and prov_num != DotMap() and city_num != DotMap():
                     picture = cls.scrap_site_picture(prov_num, city_num, cadaster)
                 cadaster_entry = CadasterEntryXML(sub_entry, lon, lat, picture)
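
Both hunks above apply the same gating: the new pictures argument is checked first, so the extra request for the site picture is skipped entirely unless the caller opted in. A minimal sketch, assuming picture is initialized (e.g. to None) earlier in the method, outside the lines shown here:

picture = None
if pictures and prov_num != DotMap() and city_num != DotMap():
    # `pictures` short-circuits the condition, so no picture request
    # is issued unless --pictures was passed on the command line
    picture = cls.scrap_site_picture(prov_num, city_num, cadaster)
cadaster_entry = CadasterEntryXML(sub_entry, lon, lat, picture)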

View File: src/librecatastro/scrapping/scrapper.py

@@ -70,7 +70,7 @@ class Scrapper:
         return DotMap(xmltodict.parse(xml, process_namespaces=False, xml_attribs=False))
     @classmethod
-    def get_address_iter(cls, prov_list=None):
+    def get_address_iter(cls, prov_list=None, start_from=''):
         """Scraps properties by addresses"""
         if prov_list is None:
@@ -102,6 +102,10 @@ class Scrapper:
             if city_name == DotMap() or city_num == DotMap():
                 continue
+            if start_from != '' and city_name != start_from:
+                logger.debug("Skipping {}".format(city_name))
+                continue
+
             addresses = cls.get_addresses(prov_name, city_name).consulta_callejero.callejero.calle
             if addresses == DotMap():
                 logger.error("No addresses available right now (Service is down?)")
@@ -165,6 +169,7 @@ class Scrapper:
         response = requests.get(url, params=params)
         xml = response.content
         return DotMap(xmltodict.parse(xml, process_namespaces=False, xml_attribs=False))
+    @classmethod
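
Note the semantics of the new check: as far as the lines shown here go, it does not resume from start_from onward; it skips every city whose name is not an exact match. A standalone sketch of that filter with made-up city names:

cities = ['ALCOBENDAS', 'GETAFE', 'MADRID']
start_from = 'MADRID'
for city_name in cities:
    if start_from != '' and city_name != start_from:
        continue  # 'ALCOBENDAS' and 'GETAFE' are skipped (and logged)
    print(city_name)  # only the exact match 'MADRID' is processed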

View File: src/librecatastro/scrapping/search.py (renamed from input.py)

@@ -1,3 +1,3 @@
-class Input:
+class Search:
     def __init__(self):
         pass

View File: src/librecatastro/scrapping/searchers/coordinates_search.py (renamed from source/coordinates_input.py)

@@ -5,7 +5,7 @@ import random
 from time import sleep
 from src.librecatastro.domain.geometry.geo_polygon import GeoPolygon
-from src.librecatastro.scrapping.input import Input
+from src.librecatastro.scrapping.search import Search
 from src.settings import config
 from src.utils.cadastro_logger import CadastroLogger
 from src.utils.list_utils import ListUtils
@@ -14,12 +14,12 @@ from src.utils.list_utils import ListUtils
 logger = CadastroLogger(__name__).logger
-class CoordinatesInput(Input):
+class CoordinatesSearch(Search):
     def __init__(self):
         super().__init__()
     @classmethod
-    def scrap_coordinates(cls, scrapper, filenames, pictures):
+    def scrap_coordinates(cls, scrapper, filenames, pictures=False):
         for r, d, files in os.walk(config['coordinates_path']):
             for file in files:
@@ -31,12 +31,12 @@ class CoordinatesInput(Input):
                 try:
                     polygon = GeoPolygon(os.path.join(config['coordinates_path'], file))
-                    CoordinatesInput.scrap_polygon(scrapper, polygon, pictures)
+                    CoordinatesSearch.scrap_polygon(scrapper, polygon, pictures)
                 except:
                     logger.error("{} is not formatted properly. Please take a look at the examples.".format(file))
     @classmethod
-    def scrap_polygon(cls, scrapper, polygon, pictures):
+    def scrap_polygon(cls, scrapper, polygon, pictures=False):
         bb = polygon.get_bounding_box()
         lon_min = int(bb[0] * config['scale'])
         lon_max = int(bb[2] * config['scale'])
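
The bounding box is quantized to an integer grid by multiplying each float coordinate by config['scale'] (default 10000 per the --scale option in the main diff above). A short sketch with made-up coordinates:

scale = 10000                   # matches the --scale default
bb = (-3.8, 40.3, -3.5, 40.6)   # hypothetical (lon_min, lat_min, lon_max, lat_max)
lon_min = int(bb[0] * scale)    # -38000
lon_max = int(bb[2] * scale)    # -35000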

View File: src/librecatastro/scrapping/searchers/provinces_search.py (new file)

@@ -0,0 +1,28 @@
+import json
+
+from dotmap import DotMap
+
+from src.librecatastro.scrapping.scrapper import Scrapper
+from src.librecatastro.scrapping.search import Search
+from src.utils.cadastro_logger import CadastroLogger
+
+'''Logger'''
+logger = CadastroLogger(__name__).logger
+
+class ProvincesSearch(Search):
+    def __init__(self):
+        super().__init__()
+
+    @classmethod
+    def scrap_provinces(cls, scrapper, prov_list, pictures=False, start_from=''):
+        scrapper.scrap_provinces(prov_list, pictures, start_from)
+
+    @classmethod
+    def list_provinces(cls):
+        logger.debug(DotMap.pprint(Scrapper.get_provinces()))
+        return
+
+    @classmethod
+    def list_cities(cls, prov_name):
+        logger.debug(DotMap.pprint(Scrapper.get_cities(prov_name)))
+        return
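
A hypothetical usage of the two new helpers, assuming province names as the Catastro service returns them (e.g. 'MADRID'); both dump the service response via DotMap's pprint from inside a logger.debug call:

from src.librecatastro.scrapping.searchers.provinces_search import ProvincesSearch

ProvincesSearch.list_provinces()       # pretty-prints every province
ProvincesSearch.list_cities('MADRID')  # pretty-prints the cities of one province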

View File: src/librecatastro/scrapping/source/provinces_input.py (deleted)

@@ -1,10 +0,0 @@
-from src.librecatastro.scrapping.input import Input
-
-
-class ProvincesInput(Input):
-    def __init__(self):
-        super().__init__()
-
-    @classmethod
-    def scrap_provinces(cls, scrapper, prov_list, pictures=False):
-        scrapper.scrap_provinces(prov_list, pictures)

View File

@@ -3,7 +3,7 @@ import unittest
 from src.librecatastro.domain.geometry.geo_polygon import GeoPolygon
 from src.librecatastro.scrapping.format.scrapper_html import ScrapperHTML
-from src.librecatastro.scrapping.source.coordinates_input import CoordinatesInput
+from src.librecatastro.scrapping.searchers.coordinates_search import CoordinatesSearch
 from src.settings import config
 from src.utils.elasticsearch_utils import ElasticSearchUtils
@@ -89,7 +89,7 @@ class ScrapperHTMLTests(unittest.TestCase):
     def scrap_random_until_x_times_found(self, times):
         polygon = GeoPolygon(os.path.join(config['coordinates_path'], 'spain_polygon.json'))
         coord = polygon.get_bounding_box()
-        cadaster_list = CoordinatesInput.scrap_results_random_x_times(times, int(coord[0]*config['scale']), int(coord[2]*config['scale']), int(coord[1]*config['scale']), int(coord[3]*config['scale']), ScrapperHTML)
+        cadaster_list = CoordinatesSearch.scrap_results_random_x_times(times, int(coord[0] * config['scale']), int(coord[2] * config['scale']), int(coord[1] * config['scale']), int(coord[3] * config['scale']), ScrapperHTML)
         self.assertTrue(len(cadaster_list) >= times)
         return cadaster_list