Address tabulation performance #163

Merged · 6 commits · Oct 12, 2023
Changes from all commits

2 changes: 1 addition & 1 deletion README.MD
@@ -15,7 +15,7 @@ for comparison are included. The comparisons are based on scoring
philosophies for three statistical data types including categorical,
continuous, and probabilistic.

-See the full documentation [here](noaa-owp.github.io/gval/).
+See the full documentation [here](https://noaa-owp.github.io/gval/).

WARNING:

2 changes: 1 addition & 1 deletion docs/markdown/01_INTRO.MD
@@ -7,7 +7,7 @@ GVAL (pronounced "g-val") is a high-level Python framework to evaluate the skill

GVAL is intended to work on raster and vector files as xarray and geopandas objects, respectively. Abilities to prepare or homogenize maps for comparison are included. The comparisons are based on scoring philosophies for three statistical data types including categorical, continuous, and probabilistic.

-See the full documentation [here](noaa-owp.github.io/gval/).
+See the full documentation [here](https://noaa-owp.github.io/gval/).

WARNING:
- Our current public API and output formats are likely to change in the future.
2 changes: 1 addition & 1 deletion docs/sphinx/PYPI_README.MD
@@ -15,7 +15,7 @@ for comparison are included. The comparisons are based on scoring
philosophies for three statistical data types including categorical,
continuous, and probabilistic.

-See the full documentation [here](noaa-owp.github.io/gval/).
+See the full documentation [here](https://noaa-owp.github.io/gval/).

WARNING:

2 changes: 2 additions & 0 deletions docs/sphinx/SPHINX_README.MD
@@ -15,6 +15,8 @@ for comparison are included. The comparisons are based on scoring
philosophies for three statistical data types including categorical,
continuous, and probabilistic.

+See the full documentation [here](https://noaa-owp.github.io/gval/).

WARNING:

- Our current public API and output formats are likely to change in the
22 changes: 12 additions & 10 deletions docs/sphinx/SphinxContinuousTutorial.ipynb

Large diffs are not rendered by default.

44 changes: 19 additions & 25 deletions docs/sphinx/SphinxMulticatTutorial.ipynb

Large diffs are not rendered by default.

74 changes: 39 additions & 35 deletions docs/sphinx/SphinxTutorial.ipynb

Large diffs are not rendered by default.

22 changes: 12 additions & 10 deletions notebooks/Continuous Comparison Tutorial.ipynb

Large diffs are not rendered by default.

44 changes: 19 additions & 25 deletions notebooks/Multi-Class Categorical Statistics.ipynb

Large diffs are not rendered by default.

76 changes: 40 additions & 36 deletions notebooks/Tutorial.ipynb

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion pyproject.toml
@@ -12,7 +12,7 @@ authors = [
requires-python = ">=3.8"
keywords = ["geospatial", "evaluations"]
license = {text = "MIT"}
-version = "0.2.2"
+version = "0.2.3"
dynamic = ["readme", "dependencies"]


105 changes: 37 additions & 68 deletions src/gval/accessors/gval_xarray.py
@@ -21,6 +21,7 @@
from gval.comparison.compute_categorical_metrics import _compute_categorical_metrics
from gval.comparison.compute_continuous_metrics import _compute_continuous_metrics
from gval.attributes.attributes import _attribute_tracking_xarray
from gval.utils.loading_datasets import _parse_string_attributes
from gval.utils.schemas import Crosstab_df, Metrics_df, AttributeTrackingDf
from gval.utils.visualize import _map_plot
from gval.comparison.pairing_functions import difference
@@ -40,7 +41,7 @@ class GVALXarray:
"""

def __init__(self, xarray_obj):
self._obj = xarray_obj
self._obj = _parse_string_attributes(xarray_obj)
self.data_type = type(xarray_obj)
self.agreement_map_format = "raster"

@@ -232,12 +233,7 @@ def categorical_compare(
)

crosstab_df = candidate.gval.compute_crosstab(
benchmark_map=benchmark,
allow_candidate_values=allow_candidate_values,
allow_benchmark_values=allow_benchmark_values,
exclude_value=exclude_value,
comparison_function=comparison_function,
subsampling_df=subsampling_df,
agreement_map=agreement_map, subsampling_df=subsampling_df
)

metrics_df = _compute_categorical_metrics(
@@ -250,6 +246,7 @@
sampling_average=subsampling_average,
)

vector_agreement = self.agreement_map_format == "vector"
if attribute_tracking:
results = self.__handle_attribute_tracking(
candidate_map=candidate,
@@ -263,11 +260,17 @@
else:
attributes_df = results

del candidate, benchmark
agreement_map = (
agreement_map.gval.vectorize_data()
if vector_agreement
else agreement_map
)

return agreement_map, crosstab_df, metrics_df, attributes_df

del candidate, benchmark
agreement_map = (
agreement_map.gval.vectorize_data() if vector_agreement else agreement_map
)

return agreement_map, crosstab_df, metrics_df

@@ -378,9 +381,7 @@ def continuous_compare(

# If sampling_df return type gives three values assign all vars results, otherwise only agreement map results
agreement_map, candidate_map, benchmark_map = (
results
if subsampling_df is not None
else (results, self._obj, benchmark_map)
results if subsampling_df is not None else (results, candidate, benchmark)
)

metrics_df = _compute_continuous_metrics(
@@ -405,8 +406,12 @@
else:
attributes_df = results

del candidate_map, benchmark_map

return agreement_map, metrics_df, attributes_df

del candidate_map, benchmark_map

return agreement_map, metrics_df

def homogenize(
@@ -549,6 +554,7 @@ def compute_agreement_map(
allow_benchmark_values=allow_benchmark_values,
nodata=nodata,
encode_nodata=encode_nodata,
continuous=continuous,
)

# Preserve sampled maps for continuous statistics, otherwise delete
@@ -558,9 +564,6 @@
else:
del candidate_copy, benchmark_copy

if self.agreement_map_format == "vector":
agreement_map = agreement_map.gval.vectorize_data()

agreement_maps.append(agreement_map)

if subsampling_df is not None:
Expand All @@ -575,40 +578,21 @@ def compute_agreement_map(

return agreement_maps[0]

@Comparison.comparison_function_from_string
def compute_crosstab(
self,
benchmark_map: Union[xr.Dataset, xr.DataArray],
allow_candidate_values: Optional[Iterable[Number]] = None,
allow_benchmark_values: Optional[Iterable[Number]] = None,
exclude_value: Optional[Number] = None,
comparison_function: Optional[
Union[Callable, nb.np.ufunc.dufunc.DUFunc, np.ufunc, np.vectorize, str]
] = "szudzik",
pairing_dict: Optional[Dict[Tuple[Number, Number], Number]] = None,
agreement_map: Optional[
Union[xr.DataArray, xr.Dataset, Iterable[Union[xr.DataArray, xr.Dataset]]]
] = None,
subsampling_df: Optional[gpd.GeoDataFrame] = None,
) -> DataFrame[Crosstab_df]:
"""
Crosstab 2 or 3-dimensional xarray DataArray to produce Crosstab DataFrame.

Parameters
----------
benchmark_map : Union[xr.Dataset, xr.DataArray]
agreement_map : Union[xr.Dataset, xr.DataArray], default = None
Benchmark map, {dimension}-dimensional.
allow_candidate_values : Optional[Iterable[Union[int,float]]], default = None
Sequence of values in candidate to include in crosstab. Remaining values are excluded.
allow_benchmark_values : Optional[Iterable[Union[int,float]]], default = None
Sequence of values in benchmark to include in crosstab. Remaining values are excluded.
exclude_value : Optional[Number], default = None
Value to exclude from crosstab. This could be used to denote a no data value if masking wasn't used. By default, NaNs are not cross-tabulated.
comparison_function : Optional[Union[Callable, nb.np.ufunc.dufunc.DUFunc, np.ufunc, np.vectorize, str]], default = "szudzik"
Function to compute agreement values. If None, then no agreement values are computed.
pairing_dict: Optional[Dict[Tuple[Number, Number], Number]], default = None
When "pairing_dict" is used for the comparison_function argument, a pairing dictionary can be passed by user. A pairing dictionary is structured as `{(c, b) : a}` where `(c, b)` is a tuple of the candidate and benchmark value pairing, respectively, and `a` is the value for the agreement array to be used for this pairing.

If None is passed for pairing_dict, the allow_candidate_values and allow_benchmark_values arguments are required. For this case, the pairings in these two iterables will be paired in the order provided and an agreement value will be assigned to each pairing starting with 0 and ending with the number of possible pairings.

A pairing dictionary can be used by the user to note which values to allow and which to ignore for comparisons. It can also be used to decide how nans are handled for cases where either the candidate and benchmark maps have nans or both.
subsampling_df: Optional[gpd.GeoDataFrame], default = None
DataFrame with spatial geometries and method types to subsample

@@ -618,38 +602,23 @@ def compute_crosstab(
DataFrame[Crosstab_df]
Crosstab DataFrame
"""
self.check_same_type(benchmark_map)

results = (
subsample(
candidate=self._obj,
benchmark=benchmark_map,
subsampling_df=subsampling_df,
# Use self if agreement_map argument is not provided otherwise use agreement_map parameter
if agreement_map is not None:
agreement_map = (
agreement_map if isinstance(agreement_map, list) else [agreement_map]
)
if subsampling_df is not None
else [[self._obj, benchmark_map]]
)
else:
agreement_map = [self._obj]

# Create cross-tabulation table for each agreement map and concatenate them
crosstabs = []
for idx, (candidate, benchmark) in enumerate(results):
if isinstance(self._obj, xr.Dataset):
crosstab = _crosstab_Datasets(
candidate,
benchmark,
allow_candidate_values,
allow_benchmark_values,
exclude_value,
comparison_function,
)
else:
crosstab = _crosstab_DataArrays(
candidate,
benchmark,
allow_candidate_values,
allow_benchmark_values,
exclude_value,
comparison_function,
)
for idx, agreement in enumerate(agreement_map):
crosstab = (
_crosstab_Datasets(agreement)
if isinstance(self._obj, xr.Dataset)
else _crosstab_DataArrays(agreement)
)

if subsampling_df is not None:
crosstab.insert(
@@ -728,7 +697,7 @@ def cat_plot(
legend_labels: list = None,
plot_bands: Union[str, list] = "all",
colorbar_label: Union[str, list] = "",
basemap: xyzservices.lib.TileProvider = cx.providers.Stamen.Terrain,
basemap: xyzservices.lib.TileProvider = cx.providers.OpenStreetMap.Mapnik,
):
"""
Plots categorical Map for xarray object
@@ -775,7 +744,7 @@ def cont_plot(
figsize: Tuple[int, int] = None,
plot_bands: Union[str, list] = "all",
colorbar_label: Union[str, list] = "",
basemap: xyzservices.lib.TileProvider = cx.providers.Stamen.Terrain,
basemap: xyzservices.lib.TileProvider = cx.providers.OpenStreetMap.Mapnik,
):
"""
Plots categorical Map for xarray object
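
For orientation, here is a minimal sketch of how the accessor workflow reads after this change: the crosstab is now tabulated from the agreement map, which carries a `pairing_dictionary` in its attributes, rather than by re-pairing the candidate and benchmark rasters. The file paths and the exact keyword names for `homogenize` and `compute_agreement_map` are assumptions for illustration; only the `compute_crosstab(agreement_map=...)` call is taken from this diff.

```python
# Sketch only: the tabulation workflow implied by this PR. Paths are hypothetical.
import rioxarray as rxr
import gval  # noqa: F401 -- importing gval registers the .gval accessor

candidate = rxr.open_rasterio("candidate.tif", mask_and_scale=True)
benchmark = rxr.open_rasterio("benchmark.tif", mask_and_scale=True)

# Align spatial metadata so the maps can be compared cell by cell
candidate, benchmark = candidate.gval.homogenize(benchmark_map=benchmark)

# The agreement map now stores its pairing dictionary as an attribute
agreement = candidate.gval.compute_agreement_map(benchmark_map=benchmark)

# compute_crosstab tabulates the agreement map directly instead of
# re-pairing candidate and benchmark values
crosstab_df = candidate.gval.compute_crosstab(agreement_map=agreement)
```
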
53 changes: 51 additions & 2 deletions src/gval/comparison/agreement.py
@@ -16,16 +16,17 @@

from typing import Iterable, Optional, Union, Tuple, Callable, Dict
from numbers import Number
from itertools import product

import numpy as np
import xarray as xr
import numba as nb

import dask

from gval.comparison.pairing_functions import (
_make_pairing_dict_fn,
)
from gval.utils.loading_datasets import _handle_xarray_memory
from gval.utils.loading_datasets import _handle_xarray_memory, _check_dask_array


def _compute_agreement_map(
Expand All @@ -39,6 +40,7 @@ def _compute_agreement_map(
allow_benchmark_values: Optional[Iterable[Number]] = None,
nodata: Optional[Number] = None,
encode_nodata: Optional[bool] = False,
continuous: Optional[bool] = False,
) -> Union[xr.DataArray, xr.Dataset]:
"""
Computes agreement map as xarray from candidate and benchmark xarray's.
@@ -179,6 +181,53 @@ def _manage_information_loss(agreement_map, crs, nodata, encode_nodata, dtype):
comparison_function, *ufunc_args, **apply_ufunc_kwargs
)

is_dask = _check_dask_array(candidate_map)

def get_unique_values(candidate, benchmark):
unique_c = (
dask.array.unique(candidate.data).compute()
if is_dask
else np.unique(candidate)
)
unique_b = (
dask.array.unique(benchmark.data).compute()
if is_dask
else np.unique(benchmark)
)

return unique_c, unique_b

# Add pairing dictionary and reverse pairing dictionary to agreement map attributes
if pairing_dict is not None and not continuous:
agreement_map.attrs["pairing_dictionary"] = pairing_dict

if pairing_dict is None and not continuous:
if allow_candidate_values is None or allow_benchmark_values is None:
if isinstance(candidate_map, xr.Dataset):
for idx, var in enumerate(candidate_map.data_vars):
agreement_map[var].attrs["pairing_dictionary"] = {
(x, y): comparison_function(x, y)
for x, y in product(
*get_unique_values(candidate_map[var], benchmark_map[var])
)
}

if idx == 0:
agreement_map.attrs

else:
agreement_map.attrs["pairing_dictionary"] = {
(x, y): comparison_function(x, y)
for x, y in product(
*get_unique_values(candidate_map, benchmark_map)
)
}
else:
agreement_map.attrs["pairing_dictionary"] = {
(x, y): comparison_function(x, y)
for x, y in product(allow_candidate_values, allow_benchmark_values)
}

if isinstance(candidate_map, xr.DataArray):
agreement_map = _manage_information_loss(
agreement_map=agreement_map,
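
To show what the new `pairing_dictionary` attribute contains, here is a small standalone illustration of the construction added above: each (candidate, benchmark) value combination is mapped to its agreement value through the comparison function. The `szudzik` function below is a standard reference implementation included only so the snippet runs; gval's own pairing functions live in `gval.comparison.pairing_functions`, and the sample values are made up.

```python
# Illustration only: build a pairing dictionary like the one the agreement
# map now stores in its attrs. Sample values are fabricated.
from itertools import product

import numpy as np


def szudzik(a: int, b: int) -> int:
    # Standard Szudzik pairing: a unique non-negative integer per (a, b) pair
    return a * a + a + b if a >= b else b * b + a


# Stand-ins for the unique values found in the candidate and benchmark maps
candidate_values = np.unique([0, 1, 2, 2]).tolist()
benchmark_values = np.unique([0, 2]).tolist()

pairing_dictionary = {
    (c, b): szudzik(c, b) for c, b in product(candidate_values, benchmark_values)
}
print(pairing_dictionary)
# {(0, 0): 0, (0, 2): 4, (1, 0): 2, (1, 2): 5, (2, 0): 6, (2, 2): 8}
```
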
8 changes: 7 additions & 1 deletion src/gval/comparison/compute_categorical_metrics.py
@@ -51,7 +51,13 @@ def _handle_positive_negative_categories(

# finds the unique values in the sample's candidate and benchmark values
unique_values = set(
crosstab_df.loc[:, ["candidate_values", "benchmark_values"]].to_numpy().ravel()
[
item
for item in crosstab_df.loc[:, ["candidate_values", "benchmark_values"]]
.to_numpy()
.ravel()
if not isinstance(item, list)
]
)

# this checks that user passed positive or negative categories exist in sample df
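
A self-contained illustration of the filtering change above: when subsampling leaves list-valued cells in the crosstab, those entries are now skipped while collecting the unique candidate and benchmark categories. The DataFrame below is fabricated purely for demonstration.

```python
# Illustration only: list-valued cells (which can appear when subsampling is
# used) are excluded from the set of unique category values.
import pandas as pd

crosstab_df = pd.DataFrame(
    {
        "candidate_values": [1, 2, [1, 2]],  # hypothetical list-valued cell
        "benchmark_values": [1, 2, 2],
    }
)

unique_values = set(
    item
    for item in crosstab_df.loc[:, ["candidate_values", "benchmark_values"]]
    .to_numpy()
    .ravel()
    if not isinstance(item, list)
)

print(unique_values)  # {1, 2}
```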