Test search_data_catalogs #251

Merged: 18 commits, Sep 13, 2023
Changes from all commits
5 changes: 3 additions & 2 deletions HISTORY.rst
@@ -12,18 +12,19 @@ Announcements

New features and enhancements
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-* N/A
+* Added the ability to search for simulations that reach a given warming level. (:pull:`251`).

Breaking changes
^^^^^^^^^^^^^^^^
* N/A

Bug fixes
^^^^^^^^^
-* N/A
+* Fixed a bug in ``xs.search_data_catalogs`` when searching for fixed fields and specific experiments/members. (:pull:`251`).

Internal changes
^^^^^^^^^^^^^^^^
+* Continued work on adding tests. (:pull:`251`).
* Fixed pre-commit's pretty-format-json so it ignores notebooks. (:pull:`254`).
* Fixed the labeler so docs/CI isn't automatically added for contributions by new collaborators. (:pull:`254`).
* Made it so that `tests` are no longer treated as an installable package. (:pull:`248`).
6 changes: 4 additions & 2 deletions docs/notebooks/1_catalog.ipynb
@@ -254,7 +254,7 @@
"- `allow_conversion` is used to allow searching for calculable variables, in the case where the requested variable would not be available.\n",
"- `restrict_resolution` is used to limit the results to the finest or coarsest resolution available for each source.\n",
"- `restrict_members` is used to limit the results to a maximum number of realizations for each source.\n",
-"- `restrict_warming_level` is used to limit the results to only datasets that are present in the csv used for calculating warming levels.\n",
+"- `restrict_warming_level` is used to limit the results to only datasets that are present in the csv used for calculating warming levels. You can also pass a dict to verify that a given warming level is reached.\n",
"\n",
"Note that compared to `search`, the result of `search_data_catalogs` is a dictionary with one entry per unique ID. A given unique ID might contain multiple datasets as per `intake-esm`'s definition, because it groups catalog lines per *id - domain - processing_level - xrfreq*. Thus, it would separate model data that exists at different frequencies.\n",
"\n",
@@ -390,7 +390,9 @@
" data_catalogs=[f\"{Path().absolute()}/samples/pangeo-cmip6.json\"],\n",
" variables_and_freqs=variables_and_freqs,\n",
" match_hist_and_fut=True,\n",
-" restrict_warming_level=True, # In this case all models exist in our database, so nothing gets eliminated.\n",
+" restrict_warming_level={\n",
+" \"wl\": 2\n",
+" }, # SSP126 gets eliminated, since it doesn't reach +2°C by 2100.\n",
")\n",
"\n",
"cat_sim"
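Besides the dict form used in the cell above, the new tests below also exercise the boolean form of `restrict_warming_level` and a higher threshold. A minimal sketch contrasting the two forms, under the same assumption about the sample catalog's location:

from pathlib import Path

import xscen as xs

catalog = f"{Path().absolute()}/samples/pangeo-cmip6.json"

# Boolean form: keep only datasets that appear in the CSV used for
# computing warming levels, without checking any particular threshold.
cat_listed = xs.search_data_catalogs(
    data_catalogs=[catalog],
    variables_and_freqs={"tasmax": "D"},
    restrict_warming_level=True,
)

# Dict form: additionally require that the given warming level is reached.
cat_wl4 = xs.search_data_catalogs(
    data_catalogs=[catalog],
    variables_and_freqs={"tasmax": "D"},
    restrict_warming_level={"wl": 4},
)

# The stricter +4°C requirement can only remove IDs, never add them.
assert len(cat_wl4) <= len(cat_listed)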
2 changes: 1 addition & 1 deletion tests/conftest.py
@@ -26,7 +26,7 @@ def remove_data_folder():
request.addfinalizer(remove_data_folder)


-@pytest.mark.requires_docs
+@pytest.mark.requires_netcdf
@pytest.fixture(scope="session")
def samplecat():
"""Generate a sample catalog with the tutorial netCDFs."""
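A side note on the marker swapped above: pytest only treats custom markers such as `requires_netcdf` as known once they are declared, and whether xscen already declares this one is not visible in this diff. A generic sketch of one way to register it in `conftest.py`, not taken from this PR:

# Generic sketch, not from this PR: declare the custom marker so that
# `pytest --strict-markers` accepts `@pytest.mark.requires_netcdf`.
def pytest_configure(config):
    config.addinivalue_line(
        "markers",
        "requires_netcdf: tests that rely on the tutorial netCDF files",
    )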
262 changes: 262 additions & 0 deletions tests/test_extract.py
@@ -1,12 +1,274 @@
from copy import deepcopy

import numpy as np
import pandas as pd
import pytest
from conftest import notebooks
from xclim.testing.helpers import test_timeseries as timeseries

import xscen as xs


class TestSearchDataCatalogs:
cat = xs.DataCatalog(notebooks / "samples" / "pangeo-cmip6.json")

@pytest.mark.parametrize(
"variables_and_freqs, other_arg",
[
({"tasmin": "D"}, None),
({"sftlf": "fx"}, "other"),
({"tasmin": "D", "sftlf": "fx"}, "exclusion"),
],
)
def test_basic(self, variables_and_freqs, other_arg):
out = xs.search_data_catalogs(
data_catalogs=self.cat,
variables_and_freqs=variables_and_freqs,
other_search_criteria={"experiment": ["ssp585"]}
if other_arg == "other"
else None,
exclusions={"member": "r2.*"} if other_arg == "exclusion" else None,
)
        assert len(out) == (13 if other_arg is None else 2 if other_arg == "other" else 6)

@pytest.mark.parametrize(
"periods, coverage_kwargs",
[
([["2020", "2030"], ["2035", "2040"]], None),
([["1900", "2030"], ["2035", "2040"]], None),
([["2020", "2130"]], {"coverage": 0.70}),
],
)
def test_periods(self, periods, coverage_kwargs):
out = xs.search_data_catalogs(
data_catalogs=self.cat,
variables_and_freqs={"tasmin": "D"},
periods=periods,
coverage_kwargs=coverage_kwargs,
)
assert len(out) == (0 if periods[0] == ["1900", "2030"] else 5)

def test_ids(self):
out = xs.search_data_catalogs(
data_catalogs=deepcopy(self.cat),
variables_and_freqs={"tasmin": "D"},
id_columns=["source"],
)
assert len(out) == 3
assert len(out["NorESM2-MM"].df) == 5

@pytest.mark.parametrize("allow_resampling", [True, False])
def test_allow_resampling(self, allow_resampling):
out = xs.search_data_catalogs(
data_catalogs=deepcopy(self.cat),
variables_and_freqs={"tasmin": "YS"},
allow_resampling=allow_resampling,
)
assert len(out) == (13 if allow_resampling else 0)

@pytest.mark.parametrize(
"restrict_warming_level",
[
True,
{"wl": 2, "ignore_member": True},
{"wl": 4},
],
)
def test_warminglevel(self, restrict_warming_level):
cat = deepcopy(self.cat)
new_line = deepcopy(cat.df.iloc[13])
new_line["experiment"] = "ssp245"
new_line["id"] = xs.catalog.generate_id(new_line.to_frame().T).iloc[0]
cat.esmcat._df = pd.concat([cat.df, new_line.to_frame().T], ignore_index=True)

out = xs.search_data_catalogs(
data_catalogs=cat,
variables_and_freqs={"tasmax": "D"},
restrict_warming_level=restrict_warming_level,
)
if isinstance(restrict_warming_level, bool):
assert len(out) == 5
elif restrict_warming_level == {"wl": 2, "ignore_member": True}:
assert len(out) == 5
elif restrict_warming_level == {"wl": 4}:
assert len(out) == 2

@pytest.mark.parametrize("restrict_resolution", [None, "finest", "coarsest"])
def test_restrict_resolution(self, restrict_resolution):
cat = deepcopy(self.cat)
for i in range(2):
new_line = deepcopy(cat.df.iloc[0])
new_line["mip_era"] = "CMIP5"
new_line["activity"] = "CORDEX"
new_line["institution"] = "CCCma"
new_line["driving_model"] = "CanESM2"
new_line["source"] = "CRCM5"
new_line["experiment"] = "rcp85"
new_line["member"] = "r1i1p1"
new_line["domain"] = "NAM-22" if i == 0 else "NAM-11"
new_line["frequency"] = "day"
new_line["xrfreq"] = "D"
new_line["variable"] = ("tasmin",)
new_line["id"] = xs.catalog.generate_id(new_line.to_frame().T).iloc[0]

cat.esmcat._df = pd.concat(
[cat.df, new_line.to_frame().T], ignore_index=True
)

out = xs.search_data_catalogs(
data_catalogs=cat,
variables_and_freqs={"tasmin": "D"},
other_search_criteria={
"source": ["GFDL-CM4", "CRCM5"],
"experiment": ["ssp585", "rcp85"],
},
restrict_resolution=restrict_resolution,
)
if restrict_resolution is None:
assert len(out) == 4
elif restrict_resolution == "finest":
assert len(out) == 2
assert any("NAM-11" in x for x in out)
assert any("_gr1" in x for x in out)
elif restrict_resolution == "coarsest":
assert len(out) == 2
assert any("NAM-22" in x for x in out)
assert any("_gr2" in x for x in out)

@pytest.mark.parametrize("restrict_members", [None, {"ordered": 2}])
def test_restrict_members(self, restrict_members):
out = xs.search_data_catalogs(
data_catalogs=self.cat,
variables_and_freqs={"tasmin": "D"},
other_search_criteria={
"source": ["NorESM2-LM"],
"experiment": ["historical"],
},
restrict_members=restrict_members,
)
assert len(out) == (3 if restrict_members is None else 2)
if restrict_members is not None:
assert all(
o in out.keys()
for o in [
"CMIP_NCC_NorESM2-LM_historical_r1i1p1f1_gn",
"CMIP_NCC_NorESM2-LM_historical_r2i1p1f1_gn",
]
)

# Make sure that those with fewer members are still returned
assert (
len(
xs.search_data_catalogs(
data_catalogs=self.cat,
variables_and_freqs={"tasmin": "D"},
other_search_criteria={
"source": ["GFDL-CM4"],
"experiment": ["ssp585"],
"domain": "gr1",
},
restrict_members=restrict_members,
)
)
== 1
)

@pytest.mark.parametrize("allow_conversion", [True, False])
def test_allow_conversion(self, allow_conversion):
out = xs.search_data_catalogs(
data_catalogs=self.cat,
variables_and_freqs={"evspsblpot": "D"},
other_search_criteria={
"institution": ["NOAA-GFDL"],
"experiment": ["ssp585"],
},
allow_conversion=allow_conversion,
)
assert len(out) == (2 if allow_conversion else 0)
if allow_conversion:
assert all(
v in out[list(out.keys())[0]].unique("variable")
for v in ["tasmin", "tasmax"]
)
assert "tas" not in out[list(out.keys())[0]].unique("variable")

def test_no_match(self):
out = xs.search_data_catalogs(
data_catalogs=self.cat,
variables_and_freqs={"tas": "YS"},
allow_resampling=False,
)
assert isinstance(out, dict)
assert len(out) == 0
out = xs.search_data_catalogs(
data_catalogs=self.cat,
variables_and_freqs={"tas": "D"},
other_search_criteria={"experiment": "not_real"},
)
assert isinstance(out, dict)
assert len(out) == 0

def test_input_types(self, samplecat):
data_catalogs_2 = notebooks / "samples" / "pangeo-cmip6.json"

assert (
xs.search_data_catalogs(
data_catalogs=[samplecat, data_catalogs_2],
variables_and_freqs={"tas": "D"},
other_search_criteria={
"experiment": "ssp585",
"source": "NorESM.*",
"member": "r1i1p1f1",
},
).keys()
== xs.search_data_catalogs(
data_catalogs=[samplecat, self.cat],
variables_and_freqs={"tas": "D"},
other_search_criteria={
"experiment": "ssp585",
"source": "NorESM.*",
"member": "r1i1p1f1",
},
).keys()
)

def test_match_histfut(self):
out = xs.search_data_catalogs(
data_catalogs=self.cat,
variables_and_freqs={"tasmin": "D"},
other_search_criteria={"experiment": "ssp585", "source": "GFDL-CM4"},
match_hist_and_fut=True,
)
k = list(out.keys())[0]
assert str(sorted(out[k].unique("date_start"))[0]) == "1985-01-01 00:00:00"
assert str(sorted(out[k].unique("date_start"))[1]) == "2015-01-01 00:00:00"

def test_fx(self):
cat = deepcopy(self.cat)
new_line = deepcopy(cat.df.iloc[0])
new_line["id"] = new_line["id"].replace(
new_line["experiment"], "another_experiment"
)
new_line["experiment"] = "another_experiment"
cat.esmcat._df = pd.concat([cat.df, new_line.to_frame().T], ignore_index=True)

with pytest.warns(
UserWarning,
match="doesn't have the fixed field sftlf, but it can be acquired from ",
):
out = xs.search_data_catalogs(
data_catalogs=cat,
variables_and_freqs={"sftlf": "fx"},
other_search_criteria={"experiment": "another_experiment"},
)
assert len(out) == 1
k = list(out.keys())[0]
np.testing.assert_array_equal(
out[k].df["experiment"],
"another_experiment",
)


class TestGetWarmingLevel:
def test_list(self):
out = xs.get_warming_level(
(remainder of tests/test_extract.py not shown)