Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add add_location warnings #1295

Closed
117 changes: 117 additions & 0 deletions add_location_nan_zero_warnings.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,117 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"# Pip install necessary libraries\n",
"#%pip install echopype s3fs boto3==1.34.51 numpy==1.24.4 xarray==2022.12.0"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"# Import necessary libraries\n",
"import s3fs\n",
"import boto3\n",
"import echopype as ep\n",
"from botocore import UNSIGNED\n",
"from botocore.config import Config"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"# Grab from S3 and parse EK60 \n",
"s3 = boto3.client('s3', config=Config(signature_version=UNSIGNED))\n",
"s3_file_system = s3fs.S3FileSystem(anon=True)\n",
"\n",
"bucket_name = 'noaa-wcsd-pds'\n",
"ship_name = 'Albatross_Iv'\n",
"cruise_name = 'AL0403'\n",
"sensor_name = 'EK60'\n",
"file_name = \"L0010-D20040416-T094042-EK60.raw\"\n",
"\n",
"raw_file_s3_path = f\"s3://{bucket_name}/data/raw/{ship_name}/{cruise_name}/{sensor_name}/{file_name}\"\n",
"echodata = ep.open_raw(raw_file_s3_path, sonar_model=sensor_name, use_swap=True, storage_options={'anon': True})"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Latitude Values Pre Interp (lowest 5): [ 0. 0. 43.68361 43.68362333 43.68362333]\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"2024-04-04 21:06:21,791:echopype.consolidate.api:WARNING: The echodata[\"Platform\"][\"latitude\"] array contains zeros. Interpolation may be negatively impacted, so the user should handle these values.\n",
"2024-04-04 21:06:21,792:echopype.consolidate.api:WARNING: The echodata[\"Platform\"][\"longitude\"] array contains zeros. Interpolation may be negatively impacted, so the user should handle these values.\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Latitude Values Post Interp (lowest 5): [19.57221436 23.97964969 43.68361412 43.68363238 43.68365699]\n"
]
}
],
"source": [
"# Turn on Echopype Verbosity\n",
"ep.utils.log.verbose(override=False)\n",
"\n",
"# Print out pre interpolated latitudes\n",
"latitude = echodata.platform.latitude.values\n",
"latitude_values = latitude.copy()\n",
"latitude_values.sort()\n",
"print(\"Latitude Values Pre Interp (lowest 5):\", latitude_values[:5])\n",
"\n",
"ds_sv = ep.calibrate.compute_Sv(echodata)\n",
"ds_sv_location = ep.consolidate.add_location(ds_sv, echodata)\n",
"\n",
"# Print out post interpolated latitudes\n",
"latitude_2 = ds_sv_location.latitude.values\n",
"latitude_values_2 = latitude_2.copy()\n",
"latitude_values_2.sort()\n",
"print(\"Latitude Values Post Interp (lowest 5):\", latitude_values_2[:5])"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "echopype",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.18"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
18 changes: 18 additions & 0 deletions echopype/consolidate/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,11 +9,14 @@
from ..echodata import EchoData
from ..echodata.simrad import retrieve_correct_beam_group
from ..utils.io import validate_source_ds_da
from ..utils.log import _init_logger
from ..utils.prov import add_processing_level
from .split_beam_angle import add_angle_to_ds, get_angle_complex_samples, get_angle_power_samples

POSITION_VARIABLES = ["latitude", "longitude"]

logger = _init_logger(__name__)


def swap_dims_channel_frequency(ds: xr.Dataset) -> xr.Dataset:
"""
Expand Down Expand Up @@ -177,6 +180,21 @@ def sel_interp(var, time_dim_name):
if "longitude" not in echodata["Platform"] or echodata["Platform"]["longitude"].isnull().all():
raise ValueError("Coordinate variables not present or all nan")

# Check if any latitude/longitude value is NaN/0
contains_nan_lat = np.isnan(echodata["Platform"]["latitude"].values).any()
contains_nan_lon = np.isnan(echodata["Platform"]["longitude"].values).any()
contains_zero_lat = (echodata["Platform"]["latitude"].values == 0).any()
contains_zero_lon = (echodata["Platform"]["longitude"].values == 0).any()
interp_msg = "Interpolation may be negatively impacted, so the user should handle these values."
if contains_nan_lat:
logger.warning(f'The echodata["Platform"]["latitude"] array contains NaNs. {interp_msg}')
if contains_nan_lon:
logger.warning(f'The echodata["Platform"]["longitude"] array contains NaNs. {interp_msg}')
if contains_zero_lat:
logger.warning(f'The echodata["Platform"]["latitude"] array contains zeros. {interp_msg}')
if contains_zero_lon:
logger.warning(f'The echodata["Platform"]["longitude"] array contains zeros. {interp_msg}')

interp_ds = ds.copy()
time_dim_name = list(echodata["Platform"]["longitude"].dims)[0]
interp_ds["latitude"] = sel_interp("latitude", time_dim_name)
Expand Down
127 changes: 78 additions & 49 deletions echopype/mask/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -105,10 +105,26 @@ def _validate_and_collect_mask_input(
# the coordinate sequence matters, so fix the tuple form
allowed_dims = [
("ping_time", "range_sample"),
("ping_time", "depth"),
("channel", "ping_time", "range_sample"),
("channel", "ping_time", "depth"),
]
if mask[mask_ind].dims not in allowed_dims:
raise ValueError("All masks must have dimensions ('ping_time', 'range_sample')!")
raise ValueError(
"Masks must have one of the following dimensions: "
"('ping_time', 'range_sample'), ('ping_time', 'depth'), "
"('channel', 'ping_time', 'range_sample'), "
"('channel', 'ping_time', 'depth')"
)

# Check for the channel dimension consistency
channel_dim_shapes = set()
for mask_indiv in mask:
if "channel" in mask_indiv.dims:
for mask_chan_ind in range(len(mask_indiv["channel"])):
channel_dim_shapes.add(mask_indiv.isel(channel=mask_chan_ind).shape)
if len(channel_dim_shapes) > 1:
raise ValueError("All masks must have the same shape in the 'channel' dimension.")

else:
if not isinstance(storage_options_mask, dict):
Expand All @@ -128,7 +144,7 @@ def _validate_and_collect_mask_input(


def _check_var_name_fill_value(
source_ds: xr.Dataset, var_name: str, fill_value: Union[int, float, np.ndarray, xr.DataArray]
source_ds: xr.Dataset, var_name: str, fill_value: Union[int, float, xr.DataArray]
) -> Union[int, float, np.ndarray, xr.DataArray]:
"""
Ensures that the inputs ``var_name`` and ``fill_value`` for the function
Expand All @@ -140,12 +156,12 @@ def _check_var_name_fill_value(
A Dataset that contains the variable ``var_name``
var_name: str
The variable name in ``source_ds`` that the mask should be applied to
fill_value: int or float or np.ndarray or xr.DataArray
fill_value: int, float, or xr.DataArray
Specifies the value(s) at false indices

Returns
-------
fill_value: int or float or np.ndarray or xr.DataArray
fill_value: int, float, or xr.DataArray
fill_value with sanitized dimensions

Raises
Expand All @@ -167,17 +183,12 @@ def _check_var_name_fill_value(
raise ValueError("The Dataset source_ds does not contain the variable var_name!")

# check the type of fill_value
if not isinstance(fill_value, (int, float, np.ndarray, xr.DataArray)):
raise TypeError(
"The input fill_value must be of type int or " "float or np.ndarray or xr.DataArray!"
)
if not isinstance(fill_value, (int, float, xr.DataArray)):
raise TypeError("The input fill_value must be of type int, float, or xr.DataArray!")

# make sure that fill_values is the same shape as var_name
if isinstance(fill_value, (np.ndarray, xr.DataArray)):
if isinstance(fill_value, xr.DataArray):
fill_value = fill_value.data.squeeze() # squeeze out length=1 channel dimension
elif isinstance(fill_value, np.ndarray):
fill_value = fill_value.squeeze() # squeeze out length=1 channel dimension
if isinstance(fill_value, xr.DataArray):
fill_value = fill_value.data.squeeze() # squeeze out length=1 channel dimension

source_ds_shape = (
source_ds[var_name].isel(channel=0).shape
Expand Down Expand Up @@ -248,37 +259,50 @@ def apply_mask(
source_ds: Union[xr.Dataset, str, pathlib.Path],
mask: Union[xr.DataArray, str, pathlib.Path, List[Union[xr.DataArray, str, pathlib.Path]]],
var_name: str = "Sv",
fill_value: Union[int, float, np.ndarray, xr.DataArray] = np.nan,
fill_value: Union[int, float, xr.DataArray] = np.nan,
storage_options_ds: dict = {},
storage_options_mask: Union[dict, List[dict]] = {},
) -> xr.Dataset:
"""
Applies the provided mask(s) to the Sv variable ``var_name``
in the provided Dataset ``source_ds``.

The code allows for these 3 cases of `source_ds` and `mask` dimensions:

1) No channel in both source ds and mask, but they have matching ping time and depth
dimensions.
2) Source ds and mask both have matching channel, ping time, and depth dimensions.
3) Source ds has channel dimension and mask doesn't, but they have matching ping
time and depth dimensions.

If a user only wants to apply masks to a subset of the channels in source ds,
they could put 1s/0s for all data in the channels that are not masked.

Parameters
----------
source_ds: xr.Dataset, str, or pathlib.Path
Points to a Dataset that contains the variable the mask should be applied to
mask: xr.DataArray, str, pathlib.Path, or a list of these datatypes
The mask(s) to be applied.
Can be a single input or list that corresponds to a DataArray or a path.
Each entry in the list must have dimensions ``('ping_time', 'range_sample')``.
Multi-channel masks are not currently supported.
Can be an individual input or a list that corresponds to a DataArray or a path.
Each individual input or entry in the list must contain dimensions
``('ping_time', 'range_sample')`` or dimensions ``('ping_time', 'depth')``.
The mask can also contain the dimension ``channel``.
If a path is provided this should point to a zarr or netcdf file with only
one data variable in it.
If the input ``mask`` is a list, a logical AND will be used to produce the final
mask that will be applied to ``var_name``.
var_name: str, default="Sv"
The Sv variable name in ``source_ds`` that the mask should be applied to.
This variable needs to have coordinates ``ping_time`` and ``range_sample``,
and can optionally also have coordinate ``channel``.
This variable needs to have coordinates ``('ping_time', 'range_sample')`` or
coordinates ``('ping_time', 'depth')``, and can optionally also have coordinate
``channel``.
In the case of a multi-channel Sv data variable, the ``mask`` will be broadcast
to all channels.
fill_value: int, float, np.ndarray, or xr.DataArray, default=np.nan
fill_value: int, float, or xr.DataArray, default=np.nan
Value(s) at masked indices.
If ``fill_value`` is of type ``np.ndarray`` or ``xr.DataArray``,
it must have the same shape as each entry of ``mask``.
If ``fill_value`` is of type ``xr.DataArray`` it must have the same shape as each
entry of ``mask``.
storage_options_ds: dict, default={}
Any additional parameters for the storage backend, corresponding to the
path provided for ``source_ds``
Expand All @@ -305,43 +329,49 @@ def apply_mask(

# Obtain final mask to be applied to var_name
if isinstance(mask, list):
# perform a logical AND element-wise operation across the masks
final_mask = np.logical_and.reduce(mask)
# Broadcast all input masks together before combining them
broadcasted_masks = xr.broadcast(*mask)

# Perform a logical AND element-wise operation across the masks
final_mask = np.logical_and.reduce(broadcasted_masks)

# xr.where has issues with attrs when final_mask is an array, so we make it a DataArray
final_mask = xr.DataArray(final_mask, coords=mask[0].coords)
final_mask = xr.DataArray(final_mask, coords=broadcasted_masks[0].coords)
else:
final_mask = mask

# Sanity check: final_mask should be of the same shape as source_ds[var_name]
# along the ping_time and range_sample dimensions
def get_ch_shape(da):
return da.isel(channel=0).shape if "channel" in da.dims else da.shape

# Below operate on the actual data array to be masked
# Operate on the actual data array to be masked
source_da = source_ds[var_name]

source_da_shape = get_ch_shape(source_da)
final_mask_shape = get_ch_shape(final_mask)

if final_mask_shape != source_da_shape:
# The final_mask should be of the same shape as source_ds[var_name]
# along the ping_time and range_sample dimensions.
source_da_chan_shape = (
source_da.isel(channel=0).shape if "channel" in source_da.dims else source_da.shape
)
final_mask_chan_shape = (
final_mask.isel(channel=0).shape if "channel" in final_mask.dims else final_mask.shape
)
if final_mask_chan_shape != source_da_chan_shape:
raise ValueError(
f"The final constructed mask is not of the same shape as source_ds[{var_name}] "
"along the ping_time and range_sample dimensions!"
"along the ping_time, and range_sample dimensions!"
)

# final_mask is always an xr.DataArray with at most length=1 channel dimension
if "channel" in final_mask.dims:
final_mask = final_mask.isel(channel=0)

# Make sure fill_value and final_mask are expanded in dimensions
if "channel" in source_da.dims:
if isinstance(fill_value, np.ndarray):
fill_value = np.array([fill_value] * source_da["channel"].size)
final_mask = np.array([final_mask.data] * source_da["channel"].size)
# If final_mask has dim channel then source_da must have dim channel
if "channel" in final_mask.dims and "channel" not in source_da.dims:
raise ValueError(
"The final constructed mask has dim channel, "
f"so source_ds[{var_name}] must have dim channel."
)
# If final_mask and source_da both have channel dimension, then they must
# have the same number of channels.
elif "channel" in final_mask.dims and "channel" in source_da.dims:
if len(final_mask["channel"]) != len(source_da["channel"]):
raise ValueError(
f"If both the final constructed mask and source_ds[{var_name}] "
"have channel then they must have matching channel dimensions."
)

# Apply the mask to var_name
# Somehow keep_attrs=True errors out here, so will attach later
var_name_masked = xr.where(final_mask, x=source_da, y=fill_value)

# Obtain a shallow copy of source_ds
Expand All @@ -356,12 +386,11 @@ def get_ch_shape(da):
_variable_prov_attrs(output_ds[var_name], mask)
)

# Attribute handling
process_type = "mask"
prov_dict = echopype_prov_attrs(process_type=process_type)
prov_dict[f"{process_type}_function"] = "mask.apply_mask"

output_ds = output_ds.assign_attrs(prov_dict)

output_ds = insert_input_processing_level(output_ds, input_ds=source_ds)

return output_ds
Expand Down
Loading