From 5f55a17d99a0078085d96e55a9f4825dece5ae6f Mon Sep 17 00:00:00 2001 From: Emanuel Schmid <51439563+emanuel-schmid@users.noreply.github.com> Date: Thu, 18 Jan 2024 14:50:22 +0100 Subject: [PATCH] api-client get_dataset_file (#821) * api_client: introduce get_dataset_file * add test and changlog entry * update tutorial and tests * api_client.py in black --- CHANGELOG.md | 1 + climada/engine/unsequa/test/test_unsequa.py | 7 +- climada/test/test_api_client.py | 10 + climada/test/test_plot.py | 6 +- climada/util/api_client.py | 458 +++++++++++++------- doc/tutorial/climada_util_api_client.ipynb | 29 ++ 6 files changed, 347 insertions(+), 164 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index d508439bd..55f11a0d6 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -12,6 +12,7 @@ Code freeze date: YYYY-MM-DD ### Added +- Convenience method `api_client.Client.get_dataset_file`, combining `get_dataset_info` and `download_dataset`, returning a single file objet. [#821](https://github.com/CLIMADA-project/climada_python/pull/821) - Read and Write methods to and from csv files for the `DiscRates` class. [#818](ttps://github.com/CLIMADA-project/climada_python/pull/818) ### Changed diff --git a/climada/engine/unsequa/test/test_unsequa.py b/climada/engine/unsequa/test/test_unsequa.py index cd1912a18..56462d9d9 100755 --- a/climada/engine/unsequa/test/test_unsequa.py +++ b/climada/engine/unsequa/test/test_unsequa.py @@ -40,12 +40,9 @@ TEST_UNC_OUTPUT_IMPACT, TEST_UNC_OUTPUT_COSTBEN) from climada.util.api_client import Client -apiclient = Client() -ds = apiclient.get_dataset_info(name=TEST_UNC_OUTPUT_IMPACT, status='test_dataset') -_target_dir, [test_unc_output_impact] = apiclient.download_dataset(ds) -ds = apiclient.get_dataset_info(name=TEST_UNC_OUTPUT_COSTBEN, status='test_dataset') -_target_dir, [test_unc_output_costben] = apiclient.download_dataset(ds) +test_unc_output_impact = Client().get_dataset_file(name=TEST_UNC_OUTPUT_IMPACT, status='test_dataset') +test_unc_output_costben = Client().get_dataset_file(name=TEST_UNC_OUTPUT_COSTBEN, status='test_dataset') def impf_dem(x_paa=1, x_mdd=1): diff --git a/climada/test/test_api_client.py b/climada/test/test_api_client.py index 9e8b11141..916b2ef95 100644 --- a/climada/test/test_api_client.py +++ b/climada/test/test_api_client.py @@ -213,6 +213,16 @@ def test_get_litpop_fail(self): self.assertIn(" can only query single countries. 
Download the data for multiple countries individually and concatenate ", str(cm.exception)) + def test_get_dataset_file(self): + client = Client() + with tempfile.TemporaryDirectory() as temp_dir: + single_file = client.get_dataset_file( + name='test_imp_mat', status='test_dataset', # get_dataset_info arguments + target_dir=Path(temp_dir), organize_path=False, # download_dataset arguments + ) + self.assertTrue(single_file.is_file()) + self.assertEqual(list(Path(temp_dir).iterdir()), [single_file]) + def test_multi_filter(self): client = Client() testds = client.list_dataset_infos(data_type='storm_europe') diff --git a/climada/test/test_plot.py b/climada/test/test_plot.py index 888a696ab..607fefdea 100644 --- a/climada/test/test_plot.py +++ b/climada/test/test_plot.py @@ -35,9 +35,9 @@ from climada.util.constants import HAZ_DEMO_MAT, ENT_DEMO_TODAY, TEST_UNC_OUTPUT_COSTBEN from climada.util.api_client import Client -apiclient = Client() -ds = apiclient.get_dataset_info(name=TEST_UNC_OUTPUT_COSTBEN, status='test_dataset') -_target_dir, [test_unc_output_costben] = apiclient.download_dataset(ds) + +test_unc_output_costben = Client().get_dataset_file(name=TEST_UNC_OUTPUT_COSTBEN, status='test_dataset') + class TestPlotter(unittest.TestCase): """Test plot functions.""" diff --git a/climada/util/api_client.py b/climada/util/api_client.py index 9bdf0537c..df9593582 100644 --- a/climada/util/api_client.py +++ b/climada/util/api_client.py @@ -48,6 +48,7 @@ class Download(Model): """Database entry keeping track of downloaded files from the CLIMADA data API""" + url = CharField() path = CharField(unique=True) startdownload = DateTimeField() @@ -55,6 +56,7 @@ class Download(Model): class Meta: """SQL database and table definition.""" + database = DB class Failed(Exception): @@ -66,50 +68,54 @@ class Failed(Exception): @dataclass -class FileInfo(): +class FileInfo: """file data from CLIMADA data API.""" - uuid:str - url:str - file_name:str - file_format:str - file_size:int - check_sum:str + + uuid: str + url: str + file_name: str + file_format: str + file_size: int + check_sum: str @dataclass -class DataTypeInfo(): +class DataTypeInfo: """data type meta data from CLIMADA data API.""" - data_type:str - data_type_group:str + + data_type: str + data_type_group: str status: str - description:str - properties:list # of dict - key_reference:list = None - version_notes:list = None + description: str + properties: list # of dict + key_reference: list = None + version_notes: list = None @dataclass -class DataTypeShortInfo(): +class DataTypeShortInfo: """data type name and group from CLIMADA data API.""" - data_type:str - data_type_group:str + + data_type: str + data_type_group: str @dataclass -class DatasetInfo(): +class DatasetInfo: """dataset data from CLIMADA data API.""" - uuid:str - data_type:DataTypeShortInfo - name:str - version:str - status:str - properties:dict - files:list # of FileInfo - doi:str - description:str + + uuid: str + data_type: DataTypeShortInfo + name: str + version: str + status: str + properties: dict + files: list # of FileInfo + doi: str + description: str license: str - activation_date:str - expiration_date:str + activation_date: str + expiration_date: str @staticmethod def from_json(jsono): @@ -125,8 +131,10 @@ def from_json(jsono): DatasetInfo """ dataset = DatasetInfo(**jsono) - dataset.data_type = DataTypeShortInfo(data_type=dataset.data_type['data_type'], - data_type_group=dataset.data_type['data_type_group']) + dataset.data_type = DataTypeShortInfo( + 
data_type=dataset.data_type["data_type"], + data_type_group=dataset.data_type["data_type_group"], + ) dataset.files = [FileInfo(uuid=dataset.uuid, **filo) for filo in dataset.files] return dataset @@ -149,8 +157,10 @@ def checksize(local_path, fileinfo): if not local_path.is_file(): raise Download.Failed(f"{str(local_path)} is not a file") if local_path.stat().st_size != fileinfo.file_size: - raise Download.Failed(f"{str(local_path)} has the wrong size:" - f"{local_path.stat().st_size} instead of {fileinfo.file_size}") + raise Download.Failed( + f"{str(local_path)} has the wrong size:" + f"{local_path.stat().st_size} instead of {fileinfo.file_size}" + ) def checkhash(local_path, fileinfo): @@ -171,10 +181,11 @@ def checkhash(local_path, fileinfo): raise NotImplementedError("sanity check by hash sum needs to be implemented yet") -class Cacher(): +class Cacher: """Utility class handling cached results from http requests, to enable the API Client working in offline mode. """ + def __init__(self, cache_enabled): """Constructor of Cacher. @@ -183,15 +194,17 @@ def __init__(self, cache_enabled): cache_enabled : bool, None Default: None, in this case the value is taken from CONFIG.data_api.cache_enabled. """ - self.enabled = (CONFIG.data_api.cache_enabled.bool() - if cache_enabled is None else cache_enabled) + self.enabled = ( + CONFIG.data_api.cache_enabled.bool() + if cache_enabled is None + else cache_enabled + ) self.cachedir = CONFIG.data_api.cache_dir.dir() if self.enabled else None @staticmethod def _make_key(*args, **kwargs): - as_text = '\t'.join( - [str(a) for a in args] + - [f"{k}={kwargs[k]}" for k in sorted(kwargs.keys())] + as_text = "\t".join( + [str(a) for a in args] + [f"{k}={kwargs[k]}" for k in sorted(kwargs.keys())] ) md5h = hashlib.md5() md5h.update(as_text.encode()) @@ -212,7 +225,7 @@ def store(self, result, *args, **kwargs): """ _key = Cacher._make_key(*args, **kwargs) try: - with Path(self.cachedir, _key).open('w', encoding='utf-8') as flp: + with Path(self.cachedir, _key).open("w", encoding="utf-8") as flp: json.dump(result, flp) except (OSError, ValueError): pass @@ -234,15 +247,15 @@ def fetch(self, *args, **kwargs): """ _key = Cacher._make_key(*args, **kwargs) try: - with Path(self.cachedir, _key).open(encoding='utf-8') as flp: + with Path(self.cachedir, _key).open(encoding="utf-8") as flp: return json.load(flp) except (OSError, ValueError): return None -class Client(): - """Python wrapper around REST calls to the CLIMADA data API server. - """ +class Client: + """Python wrapper around REST calls to the CLIMADA data API server.""" + MAX_WAITING_PERIOD = 6 UNLIMITED = 100000 DOWNLOAD_TIMEOUT = 3600 @@ -319,29 +332,36 @@ def _request_200(self, url, params=None): else: # try to restore previous results from an identical request if not self.cache.enabled: - raise Client.NoConnection("there is no internet connection and the client does" - " not cache results.") + raise Client.NoConnection( + "there is no internet connection and the client does" + " not cache results." + ) cached_result = self.cache.fetch(url, **params) if not cached_result: - raise Client.NoConnection("there is no internet connection and the client has not" - " found any cached result for this request.") - LOGGER.warning("there is no internet connection but the client has stored the results" - " of this very request sometime in the past.") + raise Client.NoConnection( + "there is no internet connection and the client has not" + " found any cached result for this request." 
+ ) + LOGGER.warning( + "there is no internet connection but the client has stored the results" + " of this very request sometime in the past." + ) return cached_result - @staticmethod def _divide_straight_from_multi(properties): straights, multis = dict(), dict() for k, _v in properties.items(): if _v is None: - straights[k] = '' + straights[k] = "" elif isinstance(_v, str): straights[k] = _v elif isinstance(_v, list): multis[k] = _v else: - raise ValueError("the value of a property must be a string or a list of strings") + raise ValueError( + "the value of a property must be a string or a list of strings" + ) return straights, multis @staticmethod @@ -351,8 +371,9 @@ def _filter_datasets(datasets, multi_props): pdf = pdf[pdf[prop].isin(selection)] return [datasets[i] for i in pdf.index] - def list_dataset_infos(self, data_type=None, name=None, version=None, properties=None, - status='active'): + def list_dataset_infos( + self, data_type=None, name=None, version=None, properties=None, status="active" + ): """Find all datasets matching the given parameters. Parameters @@ -376,13 +397,13 @@ def list_dataset_infos(self, data_type=None, name=None, version=None, properties ------- list of DatasetInfo """ - url = f'{self.url}/dataset/' + url = f"{self.url}/dataset/" params = { - 'data_type': data_type, - 'name': name, - 'version': version, - 'status': '' if status is None else status, - 'limit': Client.UNLIMITED, + "data_type": data_type, + "name": name, + "version": version, + "status": "" if status is None else status, + "limit": Client.UNLIMITED, } if properties: @@ -393,14 +414,17 @@ def list_dataset_infos(self, data_type=None, name=None, version=None, properties if straight_props: params.update(straight_props) - datasets = [DatasetInfo.from_json(ds) for ds in self._request_200(url, params=params)] + datasets = [ + DatasetInfo.from_json(ds) for ds in self._request_200(url, params=params) + ] if datasets and multi_props: return self._filter_datasets(datasets, multi_props) return datasets - def get_dataset_info(self, data_type=None, name=None, version=None, properties=None, - status='active'): + def get_dataset_info( + self, data_type=None, name=None, version=None, properties=None, status="active" + ): """Find the one dataset that matches the given parameters. Parameters @@ -430,19 +454,30 @@ def get_dataset_info(self, data_type=None, name=None, version=None, properties=N NoResult when there is no dataset matching the search parameters """ - jarr = self.list_dataset_infos(data_type=data_type, name=name, version=version, - properties=properties, status=status) + jarr = self.list_dataset_infos( + data_type=data_type, + name=name, + version=version, + properties=properties, + status=status, + ) if len(jarr) > 1: shown = 10 - endofmessage = '' if len(jarr) <= shown else f'\nand {len(jarr)-shown} more' - datasetlist = ',\n* '.join(str(jarr[i]) for i in range(min(shown, len(jarr)))) - raise Client.AmbiguousResult(f"there are {len(jarr)} datasets meeting the requirements:" - f"\n* {datasetlist}{endofmessage}.") + endofmessage = "" if len(jarr) <= shown else f"\nand {len(jarr)-shown} more" + datasetlist = ",\n* ".join( + str(jarr[i]) for i in range(min(shown, len(jarr))) + ) + raise Client.AmbiguousResult( + f"there are {len(jarr)} datasets meeting the requirements:" + f"\n* {datasetlist}{endofmessage}." 
+ ) if len(jarr) < 1: data_info = self.list_dataset_infos(data_type) properties = self.get_property_values(data_info) - raise Client.NoResult("there is no dataset meeting the requirements, the following" - f" property values are available for {data_type}: {properties}") + raise Client.NoResult( + "there is no dataset meeting the requirements, the following" + f" property values are available for {data_type}: {properties}" + ) return jarr[0] def get_dataset_info_by_uuid(self, uuid): @@ -463,7 +498,7 @@ def get_dataset_info_by_uuid(self, uuid): NoResult if the uuid is not valid """ - url = f'{self.url}/dataset/{uuid}/' + url = f"{self.url}/dataset/{uuid}/" return DatasetInfo.from_json(self._request_200(url)) def list_data_type_infos(self, data_type_group=None): @@ -479,9 +514,8 @@ def list_data_type_infos(self, data_type_group=None): ------- list of DataTypeInfo """ - url = f'{self.url}/data_type/' - params = {'data_type_group': data_type_group} \ - if data_type_group else {} + url = f"{self.url}/data_type/" + params = {"data_type_group": data_type_group} if data_type_group else {} return [DataTypeInfo(**jobj) for jobj in self._request_200(url, params=params)] def get_data_type_info(self, data_type): @@ -501,7 +535,7 @@ def get_data_type_info(self, data_type): NoResult if there is no such data type registered """ - url = f'{self.url}/data_type/{quote(data_type)}/' + url = f"{self.url}/data_type/{quote(data_type)}/" return DataTypeInfo(**self._request_200(url)) def _download(self, url, path, replace=False): @@ -529,34 +563,40 @@ def _download(self, url, path, replace=False): and replace is False """ if path.is_dir(): - path /= unquote(url.split('/')[-1]) + path /= unquote(url.split("/")[-1]) if path.is_file() and not replace: raise FileExistsError(path) with requests.get(url, stream=True, timeout=Client.DOWNLOAD_TIMEOUT) as stream: stream.raise_for_status() - with open(path, 'wb') as dump: + with open(path, "wb") as dump: for chunk in stream.iter_content(chunk_size=self.chunk_size): dump.write(chunk) return path def _tracked_download(self, remote_url, local_path): if local_path.is_dir(): - raise ValueError("tracked download requires a path to a file not a directory") + raise ValueError( + "tracked download requires a path to a file not a directory" + ) path_as_str = str(local_path.absolute()) try: - dlf = Download.create(url=remote_url, - path=path_as_str, - startdownload=datetime.utcnow()) + dlf = Download.create( + url=remote_url, path=path_as_str, startdownload=datetime.utcnow() + ) except IntegrityError as ierr: - dlf = Download.get(Download.path==path_as_str) # path is the table's one unique column + dlf = Download.get( + Download.path == path_as_str + ) # path is the table's one unique column if not Path(path_as_str).is_file(): # in case the file has been removed dlf.delete_instance() # delete entry from database return self._tracked_download(remote_url, local_path) # and try again if dlf.url != remote_url: - raise RuntimeError(f"this file ({path_as_str}) has been downloaded from another" - f" url ({dlf.url}), possibly because it belongs to a dataset with" - " a recent version update. Please remove the file or purge the" - " entry from data base before trying again") from ierr + raise RuntimeError( + f"this file ({path_as_str}) has been downloaded from another" + f" url ({dlf.url}), possibly because it belongs to a dataset with" + " a recent version update. 
Please remove the file or purge the" + " entry from data base before trying again" + ) from ierr return dlf try: self._download(url=remote_url, path=local_path, replace=True) @@ -565,7 +605,7 @@ def _tracked_download(self, remote_url, local_path): except Exception: dlf.delete_instance() raise - return Download.get(Download.path==path_as_str) + return Download.get(Download.path == path_as_str) def _download_file(self, local_path, fileinfo, check=checksize, retries=3): """Download a file if it is not already present at the target destination. @@ -595,17 +635,21 @@ def _download_file(self, local_path, fileinfo, check=checksize, retries=3): try: if local_path.is_dir(): local_path /= fileinfo.file_name - downloaded = self._tracked_download(remote_url=fileinfo.url, local_path=local_path) + downloaded = self._tracked_download( + remote_url=fileinfo.url, local_path=local_path + ) if not downloaded.enddownload: - raise Download.Failed(f"A download of {fileinfo.url} via the API Client has been" - " requested before. Either it is still in progress or the" - " process got interrupted. In the former case just wait" - " until the download has finished and try again, in the" - f" latter run `Client.purge_cache_db(Path('{local_path}'))`" - " from Python. If unsure, check your internet connection," - " wait for as long as it takes to download a file of size" - f" {fileinfo.file_size} and try again. If the problem" - " persists, purge the cache db with said call.") + raise Download.Failed( + f"A download of {fileinfo.url} via the API Client has been" + " requested before. Either it is still in progress or the" + " process got interrupted. In the former case just wait" + " until the download has finished and try again, in the" + f" latter run `Client.purge_cache_db(Path('{local_path}'))`" + " from Python. If unsure, check your internet connection," + " wait for as long as it takes to download a file of size" + f" {fileinfo.file_size} and try again. If the problem" + " persists, purge the cache db with said call." + ) try: check(local_path, fileinfo) except Download.Failed as dlf: @@ -617,9 +661,13 @@ def _download_file(self, local_path, fileinfo, check=checksize, retries=3): if retries < 1: raise dle LOGGER.warning("Download failed: %s, retrying...", dle) - time.sleep(Client.MAX_WAITING_PERIOD/retries) - return self._download_file(local_path=local_path, fileinfo=fileinfo, check=check, - retries=retries - 1) + time.sleep(Client.MAX_WAITING_PERIOD / retries) + return self._download_file( + local_path=local_path, + fileinfo=fileinfo, + check=check, + retries=retries - 1, + ) def download_dataset(self, dataset, target_dir=SYSTEM_DIR, organize_path=True): """Download all files from a given dataset to a given directory. 
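For orientation between the hunks above and below: the point of this patch is to collapse the two-step download pattern (`get_dataset_info` followed by `download_dataset`) into a single call. A minimal sketch of both patterns, mirroring the test-module changes earlier in this diff; it assumes network access to the CLIMADA data API and that the `TEST_UNC_OUTPUT_COSTBEN` test dataset holds exactly one file, as the original test code implies:

```python
from climada.util.api_client import Client
from climada.util.constants import TEST_UNC_OUTPUT_COSTBEN

client = Client()

# Two-step pattern used by the test modules before this patch:
# query the dataset info, then download all of its files.
ds = client.get_dataset_info(name=TEST_UNC_OUTPUT_COSTBEN, status="test_dataset")
_target_dir, [test_unc_output_costben] = client.download_dataset(ds)

# One-step pattern introduced by this patch: a single call returning the file's Path.
test_unc_output_costben = client.get_dataset_file(
    name=TEST_UNC_OUTPUT_COSTBEN, status="test_dataset"
)
```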
@@ -684,17 +732,24 @@ def purge_cache_db(local_path): fileinfo : FileInfo file object as retrieved from the data api """ - dlf = Download.get(Download.path==str(local_path.absolute())) + dlf = Download.get(Download.path == str(local_path.absolute())) dlf.delete_instance() @staticmethod def _multi_version(datasets): ddf = pd.DataFrame(datasets) - gdf = ddf.groupby('name').agg({'version': 'nunique'}) + gdf = ddf.groupby("name").agg({"version": "nunique"}) return list(gdf[gdf.version > 1].index) - def get_hazard(self, hazard_type, name=None, version=None, properties=None, - status='active', dump_dir=SYSTEM_DIR): + def get_hazard( + self, + hazard_type, + name=None, + version=None, + properties=None, + status="active", + dump_dir=SYSTEM_DIR, + ): """Queries the data api for hazard datasets of the given type, downloads associated hdf5 files and turns them into a climada.hazard.Hazard object. @@ -725,10 +780,17 @@ def get_hazard(self, hazard_type, name=None, version=None, properties=None, The combined hazard object """ if not hazard_type in HAZ_TYPES: - raise ValueError("Valid hazard types are a subset of CLIMADA hazard types." - f" Currently these types are supported: {HAZ_TYPES}") - dataset = self.get_dataset_info(data_type=hazard_type, name=name, version=version, - properties=properties, status=status) + raise ValueError( + "Valid hazard types are a subset of CLIMADA hazard types." + f" Currently these types are supported: {HAZ_TYPES}" + ) + dataset = self.get_dataset_info( + data_type=hazard_type, + name=name, + version=version, + properties=properties, + status=status, + ) return self.to_hazard(dataset, dump_dir) def to_hazard(self, dataset, dump_dir=SYSTEM_DIR): @@ -750,12 +812,15 @@ def to_hazard(self, dataset, dump_dir=SYSTEM_DIR): climada.hazard.Hazard The combined hazard object """ - target_dir = self._organize_path(dataset, dump_dir) \ - if dump_dir == SYSTEM_DIR else dump_dir + target_dir = ( + self._organize_path(dataset, dump_dir) + if dump_dir == SYSTEM_DIR + else dump_dir + ) hazard_list = [ Hazard.from_hdf5(self._download_file(target_dir, dsf)) for dsf in dataset.files - if dsf.file_format == 'hdf5' + if dsf.file_format == "hdf5" ] if not hazard_list: raise ValueError("no hdf5 files found in dataset") @@ -767,8 +832,15 @@ def to_hazard(self, dataset, dump_dir=SYSTEM_DIR): hazard_concat.check() return hazard_concat - def get_exposures(self, exposures_type, name=None, version=None, properties=None, - status='active', dump_dir=SYSTEM_DIR): + def get_exposures( + self, + exposures_type, + name=None, + version=None, + properties=None, + status="active", + dump_dir=SYSTEM_DIR, + ): """Queries the data api for exposures datasets of the given type, downloads associated hdf5 files and turns them into a climada.entity.exposures.Exposures object. @@ -798,10 +870,17 @@ def get_exposures(self, exposures_type, name=None, version=None, properties=None The combined exposures object """ if not exposures_type in EXP_TYPES: - raise ValueError("Valid exposures types are a subset of CLIMADA exposures types." - f" Currently these types are supported: {EXP_TYPES}") - dataset = self.get_dataset_info(data_type=exposures_type, name=name, version=version, - properties=properties, status=status) + raise ValueError( + "Valid exposures types are a subset of CLIMADA exposures types." 
+ f" Currently these types are supported: {EXP_TYPES}" + ) + dataset = self.get_dataset_info( + data_type=exposures_type, + name=name, + version=version, + properties=properties, + status=status, + ) return self.to_exposures(dataset, dump_dir) def to_exposures(self, dataset, dump_dir=SYSTEM_DIR): @@ -823,12 +902,15 @@ def to_exposures(self, dataset, dump_dir=SYSTEM_DIR): climada.entity.exposures.Exposures The combined exposures object """ - target_dir = self._organize_path(dataset, dump_dir) \ - if dump_dir == SYSTEM_DIR else dump_dir + target_dir = ( + self._organize_path(dataset, dump_dir) + if dump_dir == SYSTEM_DIR + else dump_dir + ) exposures_list = [ Exposures.from_hdf5(self._download_file(target_dir, dsf)) for dsf in dataset.files - if dsf.file_format == 'hdf5' + if dsf.file_format == "hdf5" ] if not exposures_list: raise ValueError("no hdf5 files found in dataset") @@ -839,7 +921,9 @@ def to_exposures(self, dataset, dump_dir=SYSTEM_DIR): exposures_concat.check() return exposures_concat - def get_litpop(self, country=None, exponents=(1,1), version=None, dump_dir=SYSTEM_DIR): + def get_litpop( + self, country=None, exponents=(1, 1), version=None, dump_dir=SYSTEM_DIR + ): """Get a LitPop ``Exposures`` instance on a 150arcsec grid with the default parameters: exponents = (1,1) and fin_mode = 'pc'. @@ -876,25 +960,40 @@ def get_litpop(self, country=None, exponents=(1,1), version=None, dump_dir=SYSTE >>> litpop_comb = LitPop.concat([litpop_aut, litpop_che]) """ properties = { - 'exponents': "".join(['(',str(exponents[0]),',',str(exponents[1]),')'])} + "exponents": "".join(["(", str(exponents[0]), ",", str(exponents[1]), ")"]) + } if country is None: - properties['spatial_coverage'] = 'global' + properties["spatial_coverage"] = "global" elif isinstance(country, str): - properties['country_name'] = pycountry.countries.lookup(country).name + properties["country_name"] = pycountry.countries.lookup(country).name elif isinstance(country, list): if len(set(country)) > 1: - raise ValueError("``get_litpop`` can only query single countries. Download the" - " data for multiple countries individually and concatenate the" - " objects using ``LitPop.concat``") - properties['country_name'] = [pycountry.countries.lookup(c).name for c in country] + raise ValueError( + "``get_litpop`` can only query single countries. 
Download the" + " data for multiple countries individually and concatenate the" + " objects using ``LitPop.concat``" + ) + properties["country_name"] = [ + pycountry.countries.lookup(c).name for c in country + ] else: raise ValueError("country must be string") - return self.get_exposures(exposures_type='litpop', properties=properties, version=version, - dump_dir=dump_dir) + return self.get_exposures( + exposures_type="litpop", + properties=properties, + version=version, + dump_dir=dump_dir, + ) - def get_centroids(self, res_arcsec_land=150, res_arcsec_ocean=1800, - extent=(-180, 180, -60, 60), country=None, version=None, - dump_dir=SYSTEM_DIR): + def get_centroids( + self, + res_arcsec_land=150, + res_arcsec_ocean=1800, + extent=(-180, 180, -60, 60), + country=None, + version=None, + dump_dir=SYSTEM_DIR, + ): """Get centroids from teh API Parameters @@ -922,14 +1021,21 @@ def get_centroids(self, res_arcsec_land=150, res_arcsec_ocean=1800, """ properties = { - 'res_arcsec_land': str(res_arcsec_land), - 'res_arcsec_ocean': str(res_arcsec_ocean), - 'extent': '(-180, 180, -90, 90)' + "res_arcsec_land": str(res_arcsec_land), + "res_arcsec_ocean": str(res_arcsec_ocean), + "extent": "(-180, 180, -90, 90)", } - dataset = self.get_dataset_info('centroids', version=version, properties=properties) - target_dir = self._organize_path(dataset, dump_dir) \ - if dump_dir == SYSTEM_DIR else dump_dir - centroids = Centroids.from_hdf5(self._download_file(target_dir, dataset.files[0])) + dataset = self.get_dataset_info( + "centroids", version=version, properties=properties + ) + target_dir = ( + self._organize_path(dataset, dump_dir) + if dump_dir == SYSTEM_DIR + else dump_dir + ) + centroids = Centroids.from_hdf5( + self._download_file(target_dir, dataset.files[0]) + ) if country: reg_id = pycountry.countries.lookup(country).numeric centroids = centroids.select(reg_id=int(reg_id), extent=extent) @@ -939,8 +1045,9 @@ def get_centroids(self, res_arcsec_land=150, res_arcsec_ocean=1800, return centroids @staticmethod - def get_property_values(dataset_infos, known_property_values=None, - exclude_properties=None): + def get_property_values( + dataset_infos, known_property_values=None, exclude_properties=None + ): """Returns a dictionnary of possible values for properties of a data type, optionally given known property values. 
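The hunks above only reformat `get_litpop`, `get_centroids`, and `get_property_values`, but their signatures show how they are meant to be combined. A usage sketch based on those signatures, assuming network access to the CLIMADA data API and that the corresponding datasets are available on the server:

```python
from climada.util.api_client import Client

client = Client()

# Discover which property values are available for a data type
litpop_infos = client.list_dataset_infos(data_type="litpop")
litpop_properties = client.get_property_values(litpop_infos)

# LitPop exposures for a single country on the default 150 arcsec grid
litpop_che = client.get_litpop(country="Switzerland", exponents=(1, 1))

# Matching centroids, cropped to the same country within the default extent
centroids_che = client.get_centroids(country="Switzerland")
```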
@@ -962,7 +1069,7 @@ def get_property_values(dataset_infos, known_property_values=None, of possibles property values """ if exclude_properties is None: - exclude_properties = ['date_creation', 'climada_version'] + exclude_properties = ["date_creation", "climada_version"] ppdf = pd.DataFrame([ds.properties for ds in dataset_infos]) if known_property_values: @@ -996,11 +1103,22 @@ def into_datasets_df(dataset_infos): ppdf = pd.DataFrame([ds.properties for ds in dataset_infos]) dtdf = pd.DataFrame([pd.Series(dt) for dt in dsdf.data_type]) - return dtdf.loc[:, [c for c in dtdf.columns - if c not in ['description', 'properties']]].join( - dsdf.loc[:, [c for c in dsdf.columns - if c not in ['data_type', 'properties', 'files']]]).join( - ppdf) + return ( + dtdf.loc[ + :, [c for c in dtdf.columns if c not in ["description", "properties"]] + ] + .join( + dsdf.loc[ + :, + [ + c + for c in dsdf.columns + if c not in ["data_type", "properties", "files"] + ], + ] + ) + .join(ppdf) + ) @staticmethod def into_files_df(dataset_infos): @@ -1016,8 +1134,9 @@ def into_files_df(dataset_infos): pandas.DataFrame of the files' informations including dataset informations """ - return Client.into_datasets_df(dataset_infos) \ - .merge(pd.DataFrame([dsfile for ds in dataset_infos for dsfile in ds.files])) + return Client.into_datasets_df(dataset_infos).merge( + pd.DataFrame([dsfile for ds in dataset_infos for dsfile in ds.files]) + ) def purge_cache(self, target_dir=SYSTEM_DIR, keep_testfiles=True): """Removes downloaded dataset files from the given directory if they have been downloaded @@ -1039,13 +1158,17 @@ def purge_cache(self, target_dir=SYSTEM_DIR, keep_testfiles=True): """ # collect urls from datasets that should not be removed - test_datasets = self.list_dataset_infos(status='test_dataset') if keep_testfiles else [] + test_datasets = ( + self.list_dataset_infos(status="test_dataset") if keep_testfiles else [] + ) test_urls = set( - file_info.url for ds_info in test_datasets for file_info in ds_info.files) + file_info.url for ds_info in test_datasets for file_info in ds_info.files + ) - active_datasets = self.list_dataset_infos(status='active', version='newest') + active_datasets = self.list_dataset_infos(status="active", version="newest") active_urls = set( - file_info.url for ds_info in active_datasets for file_info in ds_info.files) + file_info.url for ds_info in active_datasets for file_info in ds_info.files + ) not_to_be_removed = test_urls.union(active_urls) @@ -1071,4 +1194,27 @@ def rm_empty_dirs(directory: Path): directory.rmdir() except OSError: # raised when the directory is not empty pass + rm_empty_dirs(target_dir) + + def get_dataset_file(self, **kwargs): + """Convenience method. Combines ``get_dataset`` and ``download_dataset``. + Returns the path to a single file if the dataset has only one, + otherwise throws an error. 
+ + Parameters + ---------- + **kwargs + arguments for get_dataset and download_dataset + + Returns + ------- + Path + """ + download_arguments = { + "target_dir": kwargs.pop("target_dir", SYSTEM_DIR), + "organize_path": kwargs.pop("organize_path", True), + } + dsi = self.get_dataset_info(**kwargs) + _, [test_file] = self.download_dataset(dsi, **download_arguments) + return test_file diff --git a/doc/tutorial/climada_util_api_client.ipynb b/doc/tutorial/climada_util_api_client.ipynb index 9cfaf97bf..f0990768b 100644 --- a/doc/tutorial/climada_util_api_client.ipynb +++ b/doc/tutorial/climada_util_api_client.ipynb @@ -1204,6 +1204,35 @@ "ds_files[0], ds_files[0].is_file()" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "If the dataset contains only one file (which is most commonly the case) this file can also be downloaded and accessed in a single step, using the `get_dataset_file` method:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "WindowsPath('C:/Users/me/climada/data/exposures/litpop/LitPop_pop_150arcsec_SGS/v1/LitPop_pop_150arcsec_SGS.hdf5')" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "from climada.util.api_client import Client\n", + "Client().get_dataset_file(\n", + " data_type='litpop',\n", + " properties={'country_name': 'South Georgia and the South Sandwich Islands', 'fin_mode': 'pop'})" + ] + }, { "cell_type": "markdown", "metadata": {},
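To round off the new convenience method shown above, here is how it is exercised by the unit test added in this patch; a sketch assuming the `test_imp_mat` test dataset is reachable on the API server:

```python
import tempfile
from pathlib import Path

from climada.util.api_client import Client

client = Client()
with tempfile.TemporaryDirectory() as temp_dir:
    single_file = client.get_dataset_file(
        # these keyword arguments are forwarded to get_dataset_info ...
        name="test_imp_mat", status="test_dataset",
        # ... while these are popped off and passed to download_dataset
        target_dir=Path(temp_dir), organize_path=False,
    )
    assert single_file.is_file()
```

`target_dir` and `organize_path` default to `SYSTEM_DIR` and `True`, matching `download_dataset`; the unpacking inside `get_dataset_file` fails unless the dataset holds exactly one file, which is what "otherwise throws an error" in the docstring refers to.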