diff --git a/tests/test_dataset.py b/tests/test_dataset.py
index ed838095..35d91121 100644
--- a/tests/test_dataset.py
+++ b/tests/test_dataset.py
@@ -1,6 +1,7 @@
 import pathlib
 import warnings
 
+import cftime
 import numpy as np
 import pytest
 import xarray as xr
@@ -55,59 +56,86 @@ def test_non_cf_compliant_time_is_decoded(self):
         # Generate an expected dataset with decoded non-CF compliant time units.
         expected = generate_dataset(cf_compliant=True, has_bounds=True)
-        expected_time_data = np.array(
-            [
-                "2000-01-01T00:00:00.000000000",
-                "2000-02-01T00:00:00.000000000",
-                "2000-03-01T00:00:00.000000000",
-                "2000-04-01T00:00:00.000000000",
-                "2000-05-01T00:00:00.000000000",
-                "2000-06-01T00:00:00.000000000",
-                "2000-07-01T00:00:00.000000000",
-                "2000-08-01T00:00:00.000000000",
-                "2000-09-01T00:00:00.000000000",
-                "2000-10-01T00:00:00.000000000",
-                "2000-11-01T00:00:00.000000000",
-                "2000-12-01T00:00:00.000000000",
-                "2001-01-01T00:00:00.000000000",
-                "2001-02-01T00:00:00.000000000",
-                "2001-03-01T00:00:00.000000000",
-            ],
-            dtype="datetime64[ns]",
-        )
         expected["time"] = xr.DataArray(
             name="time",
-            data=expected_time_data,
+            data=np.array(
+                [
+                    cftime.datetime(2000, 1, 1),
+                    cftime.datetime(2000, 2, 1),
+                    cftime.datetime(2000, 3, 1),
+                    cftime.datetime(2000, 4, 1),
+                    cftime.datetime(2000, 5, 1),
+                    cftime.datetime(2000, 6, 1),
+                    cftime.datetime(2000, 7, 1),
+                    cftime.datetime(2000, 8, 1),
+                    cftime.datetime(2000, 9, 1),
+                    cftime.datetime(2000, 10, 1),
+                    cftime.datetime(2000, 11, 1),
+                    cftime.datetime(2000, 12, 1),
+                    cftime.datetime(2001, 1, 1),
+                    cftime.datetime(2001, 2, 1),
+                    cftime.datetime(2001, 3, 1),
+                ],
+            ),
             dims="time",
-            attrs={
-                "units": "months since 2000-01-01",
-                "calendar": "standard",
-                "axis": "T",
-                "long_name": "time",
-                "standard_name": "time",
-                "bounds": "time_bnds",
-            },
         )
-        expected.time_bnds.data[:] = np.array(
-            [
-                ["1999-12-16T12:00:00.000000000", "2000-01-16T12:00:00.000000000"],
-                ["2000-01-16T12:00:00.000000000", "2000-02-15T12:00:00.000000000"],
-                ["2000-02-15T12:00:00.000000000", "2000-03-16T12:00:00.000000000"],
-                ["2000-03-16T12:00:00.000000000", "2000-04-16T00:00:00.000000000"],
-                ["2000-04-16T00:00:00.000000000", "2000-05-16T12:00:00.000000000"],
-                ["2000-05-16T12:00:00.000000000", "2000-06-16T00:00:00.000000000"],
-                ["2000-06-16T00:00:00.000000000", "2000-07-16T12:00:00.000000000"],
-                ["2000-07-16T12:00:00.000000000", "2000-08-16T12:00:00.000000000"],
-                ["2000-08-16T12:00:00.000000000", "2000-09-16T00:00:00.000000000"],
-                ["2000-09-16T00:00:00.000000000", "2000-10-16T12:00:00.000000000"],
-                ["2000-10-16T12:00:00.000000000", "2000-11-16T00:00:00.000000000"],
-                ["2000-11-16T00:00:00.000000000", "2000-12-16T12:00:00.000000000"],
-                ["2000-12-16T12:00:00.000000000", "2001-01-16T12:00:00.000000000"],
-                ["2001-01-16T12:00:00.000000000", "2001-02-15T00:00:00.000000000"],
-                ["2001-02-15T00:00:00.000000000", "2001-03-15T00:00:00.000000000"],
-            ],
-            dtype="datetime64[ns]",
+        expected["time_bnds"] = xr.DataArray(
+            name="time_bnds",
+            data=np.array(
+                [
+                    [
+                        cftime.datetime(1999, 12, 16, 12),
+                        cftime.datetime(2000, 1, 16, 12),
+                    ],
+                    [
+                        cftime.datetime(2000, 1, 16, 12),
+                        cftime.datetime(2000, 2, 15, 12),
+                    ],
+                    [
+                        cftime.datetime(2000, 2, 15, 12),
+                        cftime.datetime(2000, 3, 16, 12),
+                    ],
+                    [cftime.datetime(2000, 3, 16, 12), cftime.datetime(2000, 4, 16, 0)],
+                    [cftime.datetime(2000, 4, 16, 0), cftime.datetime(2000, 5, 16, 12)],
+                    [cftime.datetime(2000, 5, 16, 12), cftime.datetime(2000, 6, 16, 0)],
+                    [cftime.datetime(2000, 6, 16, 0), cftime.datetime(2000, 7, 16, 12)],
+                    [
+                        cftime.datetime(2000, 7, 16, 12),
+                        cftime.datetime(2000, 8, 16, 12),
+                    ],
+                    [cftime.datetime(2000, 8, 16, 12), cftime.datetime(2000, 9, 16, 0)],
+                    [
+                        cftime.datetime(2000, 9, 16, 0),
+                        cftime.datetime(2000, 10, 16, 12),
+                    ],
+                    [
+                        cftime.datetime(2000, 10, 16, 12),
+                        cftime.datetime(2000, 11, 16, 0),
+                    ],
+                    [
+                        cftime.datetime(2000, 11, 16, 0),
+                        cftime.datetime(2000, 12, 16, 12),
+                    ],
+                    [
+                        cftime.datetime(2000, 12, 16, 12),
+                        cftime.datetime(2001, 1, 16, 12),
+                    ],
+                    [cftime.datetime(2001, 1, 16, 12), cftime.datetime(2001, 2, 15, 0)],
+                    [cftime.datetime(2001, 2, 15, 0), cftime.datetime(2001, 3, 15, 0)],
+                ]
+            ),
+            dims=["time", "bnds"],
+            attrs={"xcdat_bounds": "True"},
         )
+
+        expected.time.attrs = {
+            "units": "months since 2000-01-01",
+            "calendar": "standard",
+            "axis": "T",
+            "long_name": "time",
+            "standard_name": "time",
+            "bounds": "time_bnds",
+        }
         expected.time.encoding = {
             # Set source as result source because it changes every test run.
             "source": result.time.encoding["source"],
@@ -182,67 +210,91 @@ def test_non_cf_compliant_time_is_decoded(self):
         ds1.to_netcdf(self.file_path1)
         ds2.to_netcdf(self.file_path2)
 
-        result = open_mfdataset(
-            [self.file_path1, self.file_path2],
-            data_var="ts",
-        )
+        result = open_mfdataset([self.file_path1, self.file_path2], data_var="ts")
 
-        # Generate an expected dataset, which is a combination of both datasets
-        # with decoded time units and coordinate bounds.
+        # Generate an expected dataset with decoded non-CF compliant time units.
         expected = generate_dataset(cf_compliant=True, has_bounds=True)
-        expected_time_data = np.array(
-            [
-                "2000-01-01T00:00:00.000000000",
-                "2000-02-01T00:00:00.000000000",
-                "2000-03-01T00:00:00.000000000",
-                "2000-04-01T00:00:00.000000000",
-                "2000-05-01T00:00:00.000000000",
-                "2000-06-01T00:00:00.000000000",
-                "2000-07-01T00:00:00.000000000",
-                "2000-08-01T00:00:00.000000000",
-                "2000-09-01T00:00:00.000000000",
-                "2000-10-01T00:00:00.000000000",
-                "2000-11-01T00:00:00.000000000",
-                "2000-12-01T00:00:00.000000000",
-                "2001-01-01T00:00:00.000000000",
-                "2001-02-01T00:00:00.000000000",
-                "2001-03-01T00:00:00.000000000",
-            ],
-            dtype="datetime64[ns]",
-        )
+
         expected["time"] = xr.DataArray(
             name="time",
-            data=expected_time_data,
+            data=np.array(
+                [
+                    cftime.datetime(2000, 1, 1),
+                    cftime.datetime(2000, 2, 1),
+                    cftime.datetime(2000, 3, 1),
+                    cftime.datetime(2000, 4, 1),
+                    cftime.datetime(2000, 5, 1),
+                    cftime.datetime(2000, 6, 1),
+                    cftime.datetime(2000, 7, 1),
+                    cftime.datetime(2000, 8, 1),
+                    cftime.datetime(2000, 9, 1),
+                    cftime.datetime(2000, 10, 1),
+                    cftime.datetime(2000, 11, 1),
+                    cftime.datetime(2000, 12, 1),
+                    cftime.datetime(2001, 1, 1),
+                    cftime.datetime(2001, 2, 1),
+                    cftime.datetime(2001, 3, 1),
+                ],
+            ),
             dims="time",
-            attrs={
-                "units": "months since 2000-01-01",
-                "calendar": "standard",
-                "axis": "T",
-                "long_name": "time",
-                "standard_name": "time",
-                "bounds": "time_bnds",
-            },
         )
-        expected.time_bnds.data[:] = np.array(
-            [
-                ["1999-12-16T12:00:00.000000000", "2000-01-16T12:00:00.000000000"],
-                ["2000-01-16T12:00:00.000000000", "2000-02-15T12:00:00.000000000"],
-                ["2000-02-15T12:00:00.000000000", "2000-03-16T12:00:00.000000000"],
-                ["2000-03-16T12:00:00.000000000", "2000-04-16T00:00:00.000000000"],
-                ["2000-04-16T00:00:00.000000000", "2000-05-16T12:00:00.000000000"],
-                ["2000-05-16T12:00:00.000000000", "2000-06-16T00:00:00.000000000"],
-                ["2000-06-16T00:00:00.000000000", "2000-07-16T12:00:00.000000000"],
-                ["2000-07-16T12:00:00.000000000", "2000-08-16T12:00:00.000000000"],
-                ["2000-08-16T12:00:00.000000000", "2000-09-16T00:00:00.000000000"],
-                ["2000-09-16T00:00:00.000000000", "2000-10-16T12:00:00.000000000"],
-                ["2000-10-16T12:00:00.000000000", "2000-11-16T00:00:00.000000000"],
-                ["2000-11-16T00:00:00.000000000", "2000-12-16T12:00:00.000000000"],
-                ["2000-12-16T12:00:00.000000000", "2001-01-16T12:00:00.000000000"],
-                ["2001-01-16T12:00:00.000000000", "2001-02-15T00:00:00.000000000"],
-                ["2001-02-15T00:00:00.000000000", "2001-03-15T00:00:00.000000000"],
-            ],
-            dtype="datetime64[ns]",
+        expected["time_bnds"] = xr.DataArray(
+            name="time_bnds",
+            data=np.array(
+                [
+                    [
+                        cftime.datetime(1999, 12, 16, 12),
+                        cftime.datetime(2000, 1, 16, 12),
+                    ],
+                    [
+                        cftime.datetime(2000, 1, 16, 12),
+                        cftime.datetime(2000, 2, 15, 12),
+                    ],
+                    [
+                        cftime.datetime(2000, 2, 15, 12),
+                        cftime.datetime(2000, 3, 16, 12),
+                    ],
+                    [cftime.datetime(2000, 3, 16, 12), cftime.datetime(2000, 4, 16, 0)],
+                    [cftime.datetime(2000, 4, 16, 0), cftime.datetime(2000, 5, 16, 12)],
+                    [cftime.datetime(2000, 5, 16, 12), cftime.datetime(2000, 6, 16, 0)],
+                    [cftime.datetime(2000, 6, 16, 0), cftime.datetime(2000, 7, 16, 12)],
+                    [
+                        cftime.datetime(2000, 7, 16, 12),
+                        cftime.datetime(2000, 8, 16, 12),
+                    ],
+                    [cftime.datetime(2000, 8, 16, 12), cftime.datetime(2000, 9, 16, 0)],
+                    [
+                        cftime.datetime(2000, 9, 16, 0),
+                        cftime.datetime(2000, 10, 16, 12),
+                    ],
+                    [
+                        cftime.datetime(2000, 10, 16, 12),
+                        cftime.datetime(2000, 11, 16, 0),
+                    ],
+                    [
+                        cftime.datetime(2000, 11, 16, 0),
+                        cftime.datetime(2000, 12, 16, 12),
+                    ],
+                    [
+                        cftime.datetime(2000, 12, 16, 12),
+                        cftime.datetime(2001, 1, 16, 12),
+                    ],
+                    [cftime.datetime(2001, 1, 16, 12), cftime.datetime(2001, 2, 15, 0)],
+                    [cftime.datetime(2001, 2, 15, 0), cftime.datetime(2001, 3, 15, 0)],
+                ]
+            ),
+            dims=["time", "bnds"],
+            attrs={"xcdat_bounds": "True"},
         )
+
+        expected.time.attrs = {
+            "units": "months since 2000-01-01",
+            "calendar": "standard",
+            "axis": "T",
+            "long_name": "time",
+            "standard_name": "time",
+            "bounds": "time_bnds",
+        }
         expected.time.encoding = {
             # Set source as result source because it changes every test run.
             "source": result.time.encoding["source"],
@@ -383,7 +435,7 @@ def setup(self):
                 "axis": "T",
                 "long_name": "time",
                 "standard_name": "time",
-                "calendar": "noleap",
+                # The calendar attr is specified by each test.
             },
         )
         time_bnds = xr.DataArray(
@@ -414,6 +466,8 @@ def test_raises_error_if_function_is_called_on_already_decoded_cf_compliant_data
     def test_decodes_months_with_a_reference_date_at_the_start_of_the_month(self):
         ds = self.ds.copy()
+        calendar = "standard"
+        ds.time.attrs["calendar"] = calendar
         ds.time.attrs["units"] = "months since 2000-01-01"
 
         result = decode_non_cf_time(ds)
@@ -422,8 +476,11 @@
                 "time": xr.DataArray(
                     name="time",
                     data=np.array(
-                        ["2000-02-01", "2000-03-01", "2000-04-01"],
-                        dtype="datetime64",
+                        [
+                            cftime.datetime(2000, 2, 1, calendar=calendar),
+                            cftime.datetime(2000, 3, 1, calendar=calendar),
+                            cftime.datetime(2000, 4, 1, calendar=calendar),
+                        ],
                     ),
                     dims=["time"],
                    attrs=ds.time.attrs,
@@ -432,11 +489,19 @@
                     name="time_bnds",
                     data=np.array(
                         [
-                            ["2000-01-01", "2000-02-01"],
-                            ["2000-02-01", "2000-03-01"],
-                            ["2000-03-01", "2000-04-01"],
+                            [
+                                cftime.datetime(2000, 1, 1, calendar=calendar),
+                                cftime.datetime(2000, 2, 1, calendar=calendar),
+                            ],
+                            [
+                                cftime.datetime(2000, 2, 1, calendar=calendar),
+                                cftime.datetime(2000, 3, 1, calendar=calendar),
+                            ],
+                            [
+                                cftime.datetime(2000, 3, 1, calendar=calendar),
+                                cftime.datetime(2000, 4, 1, calendar=calendar),
+                            ],
                         ],
-                        dtype="datetime64",
                     ),
                     dims=["time", "bnds"],
                     attrs=ds.time_bnds.attrs,
@@ -450,7 +515,7 @@ def test_decodes_months_with_a_reference_date_at_the_start_of_the_month(self):
             "dtype": np.dtype(np.int64),
             "original_shape": expected.time.data.shape,
             "units": ds.time.attrs["units"],
-            "calendar": ds.time.attrs["calendar"],
+            "calendar": calendar,
         }
         expected.time_bnds.encoding = ds.time_bnds.encoding
         assert result.time.encoding == expected.time.encoding
@@ -458,6 +523,8 @@ def test_decodes_months_with_a_reference_date_at_the_middle_of_the_month(self):
         ds = self.ds.copy()
+        calendar = "standard"
+        ds.time.attrs["calendar"] = calendar
         ds.time.attrs["units"] = "months since 2000-01-15"
 
         result = decode_non_cf_time(ds)
@@ -466,8 +533,11 @@
                 "time": xr.DataArray(
                     name="time",
                     data=np.array(
-                        ["2000-02-15", "2000-03-15", "2000-04-15"],
-                        dtype="datetime64",
+                        [
+                            cftime.datetime(2000, 2, 15, calendar=calendar),
+                            cftime.datetime(2000, 3, 15, calendar=calendar),
+                            cftime.datetime(2000, 4, 15, calendar=calendar),
+                        ],
                     ),
                     dims=["time"],
                     attrs=ds.time.attrs,
@@ -476,11 +546,19 @@
                     name="time_bnds",
                     data=np.array(
                         [
-                            ["2000-01-15", "2000-02-15"],
-                            ["2000-02-15", "2000-03-15"],
-                            ["2000-03-15", "2000-04-15"],
+                            [
+                                cftime.datetime(2000, 1, 15, calendar=calendar),
+                                cftime.datetime(2000, 2, 15, calendar=calendar),
+                            ],
+                            [
+                                cftime.datetime(2000, 2, 15, calendar=calendar),
+                                cftime.datetime(2000, 3, 15, calendar=calendar),
+                            ],
+                            [
+                                cftime.datetime(2000, 3, 15, calendar=calendar),
+                                cftime.datetime(2000, 4, 15, calendar=calendar),
+                            ],
                         ],
-                        dtype="datetime64",
                     ),
                     dims=["time", "bnds"],
                     attrs=ds.time_bnds.attrs,
@@ -502,6 +580,8 @@ def test_decodes_months_with_a_reference_date_at_the_end_of_the_month(self):
         ds = self.ds.copy()
+        calendar = "standard"
+        ds.time.attrs["calendar"] = calendar
         ds.time.attrs["units"] = "months since 1999-12-31"
 
         result = decode_non_cf_time(ds)
@@ -510,8 +590,11 @@
                 "time": xr.DataArray(
                     name="time",
                     data=np.array(
-                        ["2000-01-31", "2000-02-29", "2000-03-31"],
-                        dtype="datetime64",
+                        [
+                            cftime.datetime(2000, 1, 31, calendar=calendar),
+                            cftime.datetime(2000, 2, 29, calendar=calendar),
+                            cftime.datetime(2000, 3, 31, calendar=calendar),
+                        ],
                     ),
                     dims=["time"],
                     attrs=ds.time.attrs,
@@ -520,11 +603,19 @@
                     name="time_bnds",
                     data=np.array(
                         [
-                            ["1999-12-31", "2000-01-31"],
-                            ["2000-01-31", "2000-02-29"],
-                            ["2000-02-29", "2000-03-31"],
+                            [
+                                cftime.datetime(1999, 12, 31, calendar=calendar),
+                                cftime.datetime(2000, 1, 31, calendar=calendar),
+                            ],
+                            [
+                                cftime.datetime(2000, 1, 31, calendar=calendar),
+                                cftime.datetime(2000, 2, 29, calendar=calendar),
+                            ],
+                            [
+                                cftime.datetime(2000, 2, 29, calendar=calendar),
+                                cftime.datetime(2000, 3, 31, calendar=calendar),
+                            ],
                         ],
-                        dtype="datetime64",
                     ),
                     dims=["time", "bnds"],
                     attrs=ds.time_bnds.attrs,
@@ -546,16 +637,22 @@ def test_decodes_months_with_a_reference_date_on_a_leap_year(self):
         ds = self.ds.copy()
+        calendar = "standard"
+        ds.time.attrs["calendar"] = calendar
         ds.time.attrs["units"] = "months since 2000-02-29"
 
         result = decode_non_cf_time(ds)
+
        expected = xr.Dataset(
             {
                 "time": xr.DataArray(
                     name="time",
                     data=np.array(
-                        ["2000-03-29", "2000-04-29", "2000-05-29"],
-                        dtype="datetime64",
+                        [
+                            cftime.datetime(2000, 3, 29, calendar=calendar),
+                            cftime.datetime(2000, 4, 29, calendar=calendar),
+                            cftime.datetime(2000, 5, 29, calendar=calendar),
+                        ],
                     ),
                     dims=["time"],
                     attrs=ds.time.attrs,
@@ -564,11 +661,19 @@
                     name="time_bnds",
                     data=np.array(
                         [
-                            ["2000-02-29", "2000-03-29"],
-                            ["2000-03-29", "2000-04-29"],
-                            ["2000-04-29", "2000-05-29"],
+                            [
+                                cftime.datetime(2000, 2, 29, calendar=calendar),
+                                cftime.datetime(2000, 3, 29, calendar=calendar),
+                            ],
+                            [
+                                cftime.datetime(2000, 3, 29, calendar=calendar),
+                                cftime.datetime(2000, 4, 29, calendar=calendar),
+                            ],
+                            [
+                                cftime.datetime(2000, 4, 29, calendar=calendar),
+                                cftime.datetime(2000, 5, 29, calendar=calendar),
+                            ],
                         ],
-                        dtype="datetime64",
                     ),
                     dims=["time", "bnds"],
                     attrs=ds.time_bnds.attrs,
@@ -590,16 +695,23 @@ def test_decodes_years_with_a_reference_date_at_the_middle_of_the_year(self):
         ds = self.ds.copy()
+
+        calendar = "standard"
+        ds.time.attrs["calendar"] = calendar
         ds.time.attrs["units"] = "years since 2000-06-01"
 
         result = decode_non_cf_time(ds)
+
         expected = xr.Dataset(
             {
                 "time": xr.DataArray(
                     name="time",
                     data=np.array(
-                        ["2001-06-01", "2002-06-01", "2003-06-01"],
-                        dtype="datetime64",
+                        [
+                            cftime.datetime(2001, 6, 1, calendar=calendar),
+                            cftime.datetime(2002, 6, 1, calendar=calendar),
+                            cftime.datetime(2003, 6, 1, calendar=calendar),
+                        ],
                     ),
                     dims=["time"],
                     attrs=ds.time.attrs,
@@ -608,11 +720,19 @@
                     name="time_bnds",
                     data=np.array(
                         [
-                            ["2000-06-01", "2001-06-01"],
-                            ["2001-06-01", "2002-06-01"],
-                            ["2002-06-01", "2003-06-01"],
+                            [
+                                cftime.datetime(2000, 6, 1, calendar=calendar),
+                                cftime.datetime(2001, 6, 1, calendar=calendar),
+                            ],
+                            [
+                                cftime.datetime(2001, 6, 1, calendar=calendar),
+                                cftime.datetime(2002, 6, 1, calendar=calendar),
+                            ],
+                            [
+                                cftime.datetime(2002, 6, 1, calendar=calendar),
+                                cftime.datetime(2003, 6, 1, calendar=calendar),
+                            ],
                         ],
-                        dtype="datetime64",
                     ),
                     dims=["time", "bnds"],
                     attrs=ds.time_bnds.attrs,
@@ -634,36 +754,50 @@ def test_decodes_years_with_a_reference_date_on_a_leap_year(self):
         ds = self.ds.copy()
+
+        calendar = "standard"
+        ds.time.attrs["calendar"] = calendar
         ds.time.attrs["units"] = "years since 2000-02-29"
 
         result = decode_non_cf_time(ds)
+
         expected = xr.Dataset(
             {
                 "time": xr.DataArray(
                     name="time",
-                    data=[
-                        np.datetime64("2001-02-28"),
-                        np.datetime64("2002-02-28"),
-                        np.datetime64("2003-02-28"),
-                    ],
+                    data=np.array(
+                        [
+                            cftime.datetime(2001, 2, 28, calendar=calendar),
+                            cftime.datetime(2002, 2, 28, calendar=calendar),
+                            cftime.datetime(2003, 2, 28, calendar=calendar),
+                        ],
+                    ),
                     dims=["time"],
+                    attrs=ds.time.attrs,
                 ),
                 "time_bnds": xr.DataArray(
                     name="time_bnds",
                     data=np.array(
                         [
-                            ["2000-02-29", "2001-02-28"],
-                            ["2001-02-28", "2002-02-28"],
-                            ["2002-02-28", "2003-02-28"],
+                            [
+                                cftime.datetime(2000, 2, 29, calendar=calendar),
+                                cftime.datetime(2001, 2, 28, calendar=calendar),
+                            ],
+                            [
+                                cftime.datetime(2001, 2, 28, calendar=calendar),
+                                cftime.datetime(2002, 2, 28, calendar=calendar),
+                            ],
+                            [
+                                cftime.datetime(2002, 2, 28, calendar=calendar),
+                                cftime.datetime(2003, 2, 28, calendar=calendar),
+                            ],
                         ],
-                        dtype="datetime64",
                     ),
                     dims=["time", "bnds"],
                     attrs=ds.time_bnds.attrs,
                 ),
             }
         )
-        expected.time.attrs = ds.time.attrs
 
         assert result.identical(expected)
@@ -934,23 +1068,27 @@ def callable(ds):
         expected = ds.copy().isel(time=slice(0, 1))
         expected["time"] = xr.DataArray(
             name="time",
-            data=np.array(
-                ["2000-01-01"],
-                dtype="datetime64",
-            ),
+            data=np.array([cftime.datetime(2000, 1, 1)]),
             dims=["time"],
         )
         expected["time_bnds"] = xr.DataArray(
             name="time_bnds",
             data=np.array(
-                [["1999-12-01", "2000-01-01"]],
-                dtype="datetime64",
+                [[cftime.datetime(1999, 12, 1), cftime.datetime(2000, 1, 1)]],
             ),
             dims=["time", "bnds"],
         )
+        expected.time.attrs = {
+            "units": "months since 2000-01-01",
+            "calendar": "standard",
+            "axis": "T",
+            "long_name": "time",
+            "standard_name": "time",
+            "bounds": "time_bnds",
+        }
+
+        expected.time_bnds.attrs = {"xcdat_bounds": "True"}
 
-        expected.time.attrs = ds.time.attrs
-        expected.time_bnds.attrs = ds.time_bnds.attrs
         assert result.identical(expected)
diff --git a/xcdat/dataset.py b/xcdat/dataset.py
index 75916c75..fc18707d 100644
--- a/xcdat/dataset.py
+++ b/xcdat/dataset.py
@@ -313,13 +313,20 @@ def decode_non_cf_time(dataset: xr.Dataset) -> xr.Dataset:
     time_bounds = ds.get(time.attrs.get("bounds"), None)
     units_attr = time.attrs.get("units")
 
+    # Default to "standard" if no calendar attr is set, per the cftime docs:
+    # "If the calendar kwarg is set to a blank string ('') or None
+    # (the default is 'standard') the instance will not be calendar-aware and
+    # some methods will not work".
+    # Source: https://unidata.github.io/cftime/api.html#cftime.datetime
+    calendar_attr = time.attrs.get("calendar", "standard")
+
     try:
         units, ref_date = _split_time_units_attr(units_attr)
     except ValueError:
         logger.warning(
             "Attempted to decode non-CF compliant time coordinates, but the units "
-            f"({units_attr}) is not in a supported format ('months since...' or "
-            "'months since...'). Returning dataset with existing time coordinates."
+            f"attr ('{units_attr}') is not in a supported format ('months since...' or "
+            "'years since...'). Returning dataset with the original time coordinates."
         )
         return ds
@@ -329,7 +336,9 @@ def decode_non_cf_time(dataset: xr.Dataset) -> xr.Dataset:
     # `rd.relativedelta`.
     ref_dt_obj = datetime.strptime(ref_date, "%Y-%m-%d")
     data = [ref_dt_obj + rd.relativedelta(**{units: offset}) for offset in time.data]
-    data = [cftime.datetime(t.year, t.month, t.day) for t in data]
+    data = [
+        cftime.datetime(t.year, t.month, t.day, calendar=calendar_attr) for t in data
+    ]
 
     decoded_time = xr.DataArray(
         name=time.name,
@@ -356,6 +365,18 @@ def decode_non_cf_time(dataset: xr.Dataset) -> xr.Dataset:
             for [lower, upper] in time_bounds.data
         ]
 
+        data_bounds = [
+            [
+                cftime.datetime(
+                    lower.year, lower.month, lower.day, calendar=calendar_attr
+                ),
+                cftime.datetime(
+                    upper.year, upper.month, upper.day, calendar=calendar_attr
+                ),
+            ]
+            for [lower, upper] in data_bounds
+        ]
+
         decoded_time_bnds = xr.DataArray(
             name=time_bounds.name,
             data=data_bounds,
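For reference, a minimal usage sketch of the calendar-aware decoding this patch adds. This is not part of the diff; it assumes the xcdat version from this patch, that cf_xarray can identify the time axis from the attrs below, and that the dataset `ds` is purely illustrative:

    # Hypothetical example: decoding non-CF "months since ..." time coordinates.
    import numpy as np
    import xarray as xr

    from xcdat.dataset import decode_non_cf_time

    ds = xr.Dataset(
        coords={
            "time": xr.DataArray(
                data=np.array([0, 1, 2], dtype="int64"),
                dims="time",
                attrs={
                    "units": "months since 2000-01-01",
                    "calendar": "noleap",  # now forwarded to cftime.datetime
                    "axis": "T",
                    "long_name": "time",
                    "standard_name": "time",
                },
            )
        }
    )

    result = decode_non_cf_time(ds)
    # The coordinates are now cftime.datetime objects with calendar="noleap".
    # If the "calendar" attr were missing, the decoder would fall back to
    # "standard", per the comment added in this diff.
    print(result.time.values)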