Minor updates according to review
ealerskans committed Jan 24, 2025
1 parent 0ecfcca commit 97ee6dd
Showing 4 changed files with 10 additions and 38 deletions.
2 changes: 1 addition & 1 deletion README.md
@@ -357,7 +357,7 @@ The `inputs` section defines the source datasets to extract data from. Each sour
- `stack_variables_by_var_name`: stack the dimension into the new dimension, and also stack the variable name into the new variable name. This is useful when you have multiple variables with the same dimensions that you want to stack into a single variable.
- `derived_variables`: defines the variables to be derived from the variables available in the source dataset. This should be a dictionary where each key is the name of the variable to be derived and the value defines a dictionary with the following additional information. See also the 'Derived Variables' section for more details.
- `function`: the function used to derive a variable. This should be a string with the full namespace of the function, e.g. `mllam_data_prep.ops.derived_variables.physical_field.calculate_toa_radiation`.
-  - `kwargs`: arguments to `function`. This is a dictionary where each key is the named argument to `function` and each value is the input to the function. Here we distinguish between values to be extracted/selected from the input dataset and values supplied by the users themselves. Arguments with values to be extracted from the input dataset need to be prefixed with "input_dataset." to distinguish them from other arguments. See the 'Derived Variables' section for more details.
+  - `kwargs`: arguments to `function`. This is a dictionary where each key is the named argument to `function` and each value is the input to the function. Here we distinguish between values to be extracted/selected from the input dataset and values supplied by the users themselves. Arguments with values to be extracted from the input dataset need to be prefixed with "ds_input." to distinguish them from other arguments. See the 'Derived Variables' section for more details.

#### Derived Variables
Variables that are not part of the source dataset but can be derived from variables in the source dataset can also be included. They should be defined in their own section, called `derived_variables` as illustrated in the example config above and in the example config file [example.danra.yaml](example.danra.yaml).
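
For illustration, a `derived_variables` entry in the YAML config might look like the sketch below. The source-dataset name, derived-variable name, and `kwargs` names here are hypothetical; see [example.danra.yaml](example.danra.yaml) for a complete, working configuration.

```yaml
inputs:
  danra_surface:                    # hypothetical source dataset name
    # ... path, dims, variables, etc. for this source ...
    derived_variables:
      toa_radiation:                # hypothetical name for the derived variable
        function: mllam_data_prep.ops.derived_variables.physical_field.calculate_toa_radiation
        kwargs:
          # values prefixed with "ds_input." are extracted from the input dataset
          lat: ds_input.lat
          lon: ds_input.lon
          time: ds_input.time
```

Arguments without the `ds_input.` prefix would be passed through unchanged as user-supplied values.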
4 changes: 2 additions & 2 deletions mllam_data_prep/ops/chunking.py
@@ -7,7 +7,7 @@

def check_chunk_size(ds, chunks):
"""
-Check the chunk size and warn if it exceed CHUNK_MAX_SIZE_WARNING.
+Check the chunk size and warn if it exceeds CHUNK_MAX_SIZE_WARNING.
Parameters
----------
@@ -45,7 +45,7 @@ def check_chunk_size(ds, chunks):

def chunk_dataset(ds, chunks):
"""
-Check the chunk size and chunk dataset.
+Check the chunk size and chunk the dataset.
Parameters
----------
36 changes: 4 additions & 32 deletions mllam_data_prep/ops/derive_variable/main.py
@@ -20,7 +20,7 @@

def derive_variable(ds, derived_variable, chunking):
"""
-Load the dataset, and derive the specified variables
+Derive a variable using the `function` and `kwargs` of `derived_variable`.
Parameters
---------
@@ -118,9 +118,9 @@ def derive_variable(ds, derived_variable, chunking):
derived_field.attrs.update(derived_field_attrs)

# Return any dropped/reset coordinates
-derived_field = _return_dropped_coordinates(
-    derived_field, ds_subset, required_coordinates, chunks
-)
+for req_coord in required_coordinates:
+    if req_coord in chunks:
+        derived_field.coords[req_coord] = ds_subset[req_coord]

# Align the derived field to the output dataset dimensions (if necessary)
derived_field = _align_derived_variable(derived_field, ds, target_dims)
@@ -219,34 +219,6 @@ def _check_and_get_required_attributes(field, expected_attributes):
return attrs


-def _return_dropped_coordinates(field, ds, required_coordinates, chunks):
-    """
-    Return the coordinates that have been dropped/reset.
-
-    Parameters
-    ----------
-    field: xr.DataArray
-        Derived variable
-    ds: xr.Dataset
-        Dataset with required coordinatwes
-    required_coordinates: List[str]
-        List of coordinates required for the derived variable
-    chunks: Dict[str, int]
-        Dictionary with keys as dimensions to be chunked and
-        chunk sizes as the values
-
-    Returns
-    -------
-    field: xr.DataArray
-        Derived variable, now also with dropped coordinates returned
-    """
-    for req_coord in required_coordinates:
-        if req_coord in chunks:
-            field.coords[req_coord] = ds[req_coord]
-
-    return field


def _align_derived_variable(field, ds, target_dims):
"""
Align a derived variable to the target dimensions (ignoring non-dimension coordinates).
6 changes: 3 additions & 3 deletions mllam_data_prep/ops/subsetting.py
@@ -1,8 +1,8 @@
def extract_variable(ds, var_name, coords_to_sample=dict()):
"""
-Extract specified variable from the provided the input dataset. If
-coordinates for subsetting are defines, then subset the variable along
-them and check coordinate units
+Extract specified variable from the provided input dataset. If
+coordinates for subsetting are defined, then subset the variable along
+them and check coordinate units.
Parameters
----------
