Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Property tests with hypothesis #348

Merged
merged 58 commits into from
Jun 30, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
58 commits
Select commit Hold shift + click to select a range
d8934c0
Property tests with hypothesis
dcherian Mar 30, 2024
6310ce3
skip on minimal env
dcherian Mar 31, 2024
926a95e
fix typing
dcherian Mar 31, 2024
4cbb562
fix test
dcherian Mar 31, 2024
d91234c
fix mypy
dcherian Mar 31, 2024
6ef6987
remove docstring
dcherian Mar 31, 2024
60f11b8
try again
dcherian Mar 31, 2024
c230b94
fix again
dcherian Mar 31, 2024
b77f573
more fix
dcherian Apr 1, 2024
698206e
fix tests
dcherian Apr 1, 2024
e91b5d7
Try fix
dcherian Apr 1, 2024
b0c550b
some debug logging instead of info
dcherian Apr 1, 2024
a9097c2
try `int8`
dcherian Apr 1, 2024
28db1dc
Update casting behaviour
dcherian Apr 1, 2024
750675e
More dtypes
dcherian Apr 1, 2024
3c0871b
Complex fixes
dcherian Apr 1, 2024
2dce87c
Revert "try `int8`"
dcherian Apr 1, 2024
624f9e1
fix dtype
dcherian Apr 1, 2024
95087c4
skip complex var, std
dcherian Apr 1, 2024
4529685
Start fixing timedelta64
dcherian Apr 1, 2024
ffeac6f
fix casting
dcherian Apr 1, 2024
4694eb5
exclude timedelta64, datetime64
dcherian Apr 1, 2024
a709063
tweak
dcherian Apr 1, 2024
2d58fe3
filter out too_slow
dcherian Apr 11, 2024
b0cde16
update hypothesis cache
dcherian Apr 11, 2024
534d890
fix
dcherian Apr 11, 2024
97c7e4c
fix more.
dcherian Apr 11, 2024
0f4f5c5
update caching strategy
dcherian Apr 11, 2024
771d269
WIP
dcherian Apr 11, 2024
268e17c
Skip float16
dcherian Apr 24, 2024
57fa6cb
Attempt to increase numerical stability of var, std
dcherian Apr 24, 2024
f185704
update tolerances
dcherian Apr 24, 2024
ae17a37
Merge branch 'main' into hypothesis
dcherian Apr 24, 2024
93627ad
fix
dcherian Apr 24, 2024
673396c
update action
dcherian Apr 24, 2024
0420e4a
fixes
dcherian Apr 24, 2024
95462fe
Trim CI
dcherian Apr 24, 2024
6cb1780
Cast to int64 instead of intp
dcherian Apr 24, 2024
ec767df
Merge branch 'main' into hypothesis
dcherian Apr 24, 2024
1340b00
revert?
dcherian Apr 24, 2024
d143a98
[revert]
dcherian Apr 24, 2024
a02d947
try again
dcherian Apr 24, 2024
485bd7e
debug logging
dcherian Apr 24, 2024
35ff742
Revert "try again"
dcherian Apr 24, 2024
9536a95
adapt
dcherian Apr 24, 2024
ce9eac6
Revert "Revert "try again""
dcherian Apr 24, 2024
7eb0d80
Fix cast
dcherian Jun 29, 2024
c5bebe1
remove prints
dcherian Jun 29, 2024
43195ce
Revert "[revert]"
dcherian Jun 29, 2024
b60a0a2
info -> debug
dcherian Jun 30, 2024
63867ef
Fix quantiles
dcherian Jun 30, 2024
b63a20b
bring back notes
dcherian Jun 30, 2024
da8fb41
Small opt
dcherian Jun 30, 2024
d3ce5d2
Just skip var, std
dcherian Jun 30, 2024
79790f7
Fix mypy
dcherian Jun 30, 2024
240587f
no-redef
dcherian Jun 30, 2024
4259634
Merge branch 'main' into hypothesis
dcherian Jun 30, 2024
a1ddad4
try again
dcherian Jun 30, 2024
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
25 changes: 23 additions & 2 deletions .github/workflows/ci.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,8 @@ concurrency:
cancel-in-progress: true

jobs:
build:
name: Build (${{ matrix.python-version }}, ${{ matrix.os }})
test:
name: Test (${{ matrix.python-version }}, ${{ matrix.os }})
runs-on: ${{ matrix.os }}
defaults:
run:
Expand Down Expand Up @@ -48,7 +48,19 @@ jobs:
- name: Install flox
run: |
python -m pip install --no-deps -e .

# https://github.com/actions/cache/blob/main/tips-and-workarounds.md#update-a-cache
- name: Restore cached hypothesis directory
id: restore-hypothesis-cache
uses: actions/cache/restore@v4
with:
path: .hypothesis/
key: cache-hypothesis-${{ runner.os }}-${{ matrix.python-version }}-${{ github.run_id }}
restore-keys: |
cache-hypothesis-${{ runner.os }}-${{ matrix.python-version }}-

- name: Run Tests
id: status
run: |
pytest -n auto --cov=./ --cov-report=xml
- name: Upload code coverage to Codecov
Expand All @@ -60,6 +72,15 @@ jobs:
name: codecov-umbrella
fail_ci_if_error: false

# explicitly save the cache so it gets updated, also do this even if it fails.
- name: Save cached hypothesis directory
id: save-hypothesis-cache
if: always() && steps.status.outcome != 'skipped'
uses: actions/cache/save@v4
with:
path: .hypothesis/
key: cache-hypothesis-${{ runner.os }}-${{ matrix.python-version }}-${{ github.run_id }}

optional-deps:
name: ${{ matrix.env }}
runs-on: "ubuntu-latest"
Expand Down
1 change: 1 addition & 0 deletions ci/environment.yml
Original file line number Diff line number Diff line change
Expand Up @@ -25,3 +25,4 @@ dependencies:
- toolz
- numba
- numbagg>=0.3
- hypothesis
35 changes: 30 additions & 5 deletions flox/aggregate_npg.py
Original file line number Diff line number Diff line change
Expand Up @@ -109,11 +109,28 @@ def _len(group_idx, array, engine, *, func, axis=-1, size=None, fill_value=None,
nanlen = partial(_len, func="nanlen")


def _var_std_wrapper(group_idx, array, engine, *, axis=-1, **kwargs):
    """Run a variance-like reduction on data shifted by each group's first value.

    Subtracting a per-group offset before aggregating is an attempt to
    increase numerical stability; see
    https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance

    Parameters
    ----------
    group_idx
        Group labels handed to the backend ``aggregate`` call; also used to
        index the last axis of the per-group offsets so they line up with
        ``array`` for the subtraction.
    array
        Values to reduce. Must expose ``dtype`` and ``astype``.
    engine
        Passed to ``_get_aggregate`` to select the aggregation backend.
    axis
        Axis forwarded to both ``aggregate`` calls.
    **kwargs
        Forwarded unchanged to the final ``aggregate`` call (this is where
        ``func`` arrives when the wrapper is bound with ``partial``).

    Returns
    -------
    The result of the backend's ``aggregate`` call on the shifted data.
    """
    # Cast any unsigned types first so the subtraction below cannot wrap around.
    # The target dtype is derived from the dtype alone: indexing ``array[0]``
    # would raise IndexError on empty input and, under NumPy < 2 value-based
    # casting, would make the promotion depend on the first element's value.
    signed_dtype = np.result_type(array.dtype, np.int8)
    array = array.astype(signed_dtype, copy=False)

    backend = _get_aggregate(engine)
    first = backend.aggregate(group_idx, array, func="nanfirst", axis=axis)
    # Broadcast each group's first value back onto its members and remove it.
    array = array - first[..., group_idx]
    return backend.aggregate(group_idx, array, axis=axis, **kwargs)


# Bind the ``func`` keyword for each variance / standard-deviation reduction;
# ``_var_std_wrapper`` forwards it to the backend's ``aggregate`` call.
var = partial(_var_std_wrapper, func="var")
nanvar = partial(_var_std_wrapper, func="nanvar")
std = partial(_var_std_wrapper, func="std")
nanstd = partial(_var_std_wrapper, func="nanstd")


def median(group_idx, array, engine, *, axis=-1, size=None, fill_value=None, dtype=None):
return npg.aggregate_numpy.aggregate(
group_idx,
array,
func=partial(_casting_wrapper, np.median, dtype=array.dtype),
func=partial(_casting_wrapper, np.median, dtype=np.result_type(array.dtype)),
axis=axis,
size=size,
fill_value=fill_value,
Expand All @@ -125,7 +142,7 @@ def nanmedian(group_idx, array, engine, *, axis=-1, size=None, fill_value=None,
return npg.aggregate_numpy.aggregate(
group_idx,
array,
func=partial(_casting_wrapper, np.nanmedian, dtype=array.dtype),
func=partial(_casting_wrapper, np.nanmedian, dtype=np.result_type(array.dtype)),
axis=axis,
size=size,
fill_value=fill_value,
Expand All @@ -137,7 +154,11 @@ def quantile(group_idx, array, engine, *, q, axis=-1, size=None, fill_value=None
return npg.aggregate_numpy.aggregate(
group_idx,
array,
func=partial(_casting_wrapper, partial(np.quantile, q=q), dtype=array.dtype),
func=partial(
_casting_wrapper,
partial(np.quantile, q=q),
dtype=np.result_type(dtype, array.dtype),
),
axis=axis,
size=size,
fill_value=fill_value,
Expand All @@ -149,7 +170,11 @@ def nanquantile(group_idx, array, engine, *, q, axis=-1, size=None, fill_value=N
return npg.aggregate_numpy.aggregate(
group_idx,
array,
func=partial(_casting_wrapper, partial(np.nanquantile, q=q), dtype=array.dtype),
func=partial(
_casting_wrapper,
partial(np.nanquantile, q=q),
dtype=np.result_type(dtype, array.dtype),
),
axis=axis,
size=size,
fill_value=fill_value,
Expand All @@ -163,7 +188,7 @@ def mode_(array, nan_policy, dtype):
# npg splits `array` into object arrays for each group
# scipy.stats.mode does not like that
# here we cast back
return mode(array.astype(dtype, copy=False), nan_policy=nan_policy, axis=-1).mode
return mode(array.astype(dtype, copy=False), nan_policy=nan_policy, axis=-1, keepdims=True).mode


def mode(group_idx, array, engine, *, axis=-1, size=None, fill_value=None, dtype=None):
Expand Down
25 changes: 20 additions & 5 deletions flox/aggregations.py
Original file line number Diff line number Diff line change
Expand Up @@ -123,14 +123,27 @@ def _normalize_dtype(dtype: DTypeLike, array_dtype: np.dtype, fill_value=None) -
return dtype


def _maybe_promote_int(dtype) -> np.dtype:
# https://numpy.org/doc/stable/reference/generated/numpy.prod.html
# The dtype of a is used by default unless a has an integer dtype of less precision
# than the default platform integer.
if not isinstance(dtype, np.dtype):
dtype = np.dtype(dtype)
if dtype.kind == "i":
dtype = np.result_type(dtype, np.intp)
elif dtype.kind == "u":
dtype = np.result_type(dtype, np.uintp)
return dtype


def _get_fill_value(dtype, fill_value):
"""Returns dtype appropriate infinity. Returns +Inf equivalent for None."""
if fill_value == dtypes.INF or fill_value is None:
return dtypes.get_pos_infinity(dtype, max_for_int=True)
if fill_value == dtypes.NINF:
return dtypes.get_neg_infinity(dtype, min_for_int=True)
if fill_value == dtypes.NA:
if np.issubdtype(dtype, np.floating):
if np.issubdtype(dtype, np.floating) or np.issubdtype(dtype, np.complexfloating):
return np.nan
# This is madness, but npg checks that fill_value is compatible
# with array dtype even if the fill_value is never used.
Expand Down Expand Up @@ -524,10 +537,10 @@ def _pick_second(*x):
# Support statistical quantities only blockwise
# The parallel versions will be approximate and are hard to implement!
median = Aggregation(
name="median", fill_value=dtypes.NA, chunk=None, combine=None, final_dtype=np.float64
name="median", fill_value=dtypes.NA, chunk=None, combine=None, final_dtype=np.floating
)
nanmedian = Aggregation(
name="nanmedian", fill_value=dtypes.NA, chunk=None, combine=None, final_dtype=np.float64
name="nanmedian", fill_value=dtypes.NA, chunk=None, combine=None, final_dtype=np.floating
)


Expand All @@ -540,15 +553,15 @@ def quantile_new_dims_func(q) -> tuple[Dim]:
fill_value=dtypes.NA,
chunk=None,
combine=None,
final_dtype=np.float64,
final_dtype=np.floating,
new_dims_func=quantile_new_dims_func,
)
nanquantile = Aggregation(
name="nanquantile",
fill_value=dtypes.NA,
chunk=None,
combine=None,
final_dtype=np.float64,
final_dtype=np.floating,
new_dims_func=quantile_new_dims_func,
)
mode = Aggregation(name="mode", fill_value=dtypes.NA, chunk=None, combine=None)
Expand Down Expand Up @@ -618,6 +631,8 @@ def _initialize_aggregation(
)

final_dtype = _normalize_dtype(dtype_ or agg.dtype_init["final"], array_dtype, fill_value)
if agg.name not in ["min", "max", "nanmin", "nanmax"]:
final_dtype = _maybe_promote_int(final_dtype)
agg.dtype = {
"user": dtype, # Save to automatically choose an engine
"final": final_dtype,
Expand Down
34 changes: 17 additions & 17 deletions flox/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -403,12 +403,12 @@ def invert(x) -> tuple[np.ndarray, ...]:

# 2. Every group is contained to one block, use blockwise here.
if bitmask.shape[CHUNK_AXIS] == 1 or (chunks_per_label == 1).all():
logger.info("find_group_cohorts: blockwise is preferred.")
logger.debug("find_group_cohorts: blockwise is preferred.")
return "blockwise", chunks_cohorts

# 3. Perfectly chunked so there is only a single cohort
if len(chunks_cohorts) == 1:
logger.info("Only found a single cohort. 'map-reduce' is preferred.")
logger.debug("Only found a single cohort. 'map-reduce' is preferred.")
return "map-reduce", chunks_cohorts if merge else {}

# 4. Our dataset has chunksize one along the axis,
Expand All @@ -418,7 +418,7 @@ def invert(x) -> tuple[np.ndarray, ...]:
# 6. Existing cohorts don't overlap, great for time grouping with perfect chunking
no_overlapping_cohorts = (np.bincount(np.concatenate(tuple(chunks_cohorts.keys()))) == 1).all()
if one_group_per_chunk or single_chunks or no_overlapping_cohorts:
logger.info("find_group_cohorts: cohorts is preferred, chunking is perfect.")
logger.debug("find_group_cohorts: cohorts is preferred, chunking is perfect.")
return "cohorts", chunks_cohorts

# We'll use containment to measure degree of overlap between labels.
Expand Down Expand Up @@ -451,7 +451,7 @@ def invert(x) -> tuple[np.ndarray, ...]:
# 7. Groups seem fairly randomly distributed, use "map-reduce".
if sparsity > MAX_SPARSITY_FOR_COHORTS:
if not merge:
logger.info(
logger.debug(
"find_group_cohorts: bitmask sparsity={}, merge=False, choosing 'map-reduce'".format( # noqa
sparsity
)
Expand Down Expand Up @@ -480,7 +480,7 @@ def invert(x) -> tuple[np.ndarray, ...]:
containment.eliminate_zeros()

# Iterate over labels, beginning with those with most chunks
logger.info("find_group_cohorts: merging cohorts")
logger.debug("find_group_cohorts: merging cohorts")
order = np.argsort(containment.sum(axis=LABEL_AXIS))[::-1]
merged_cohorts = {}
merged_keys = set()
Expand Down Expand Up @@ -1957,7 +1957,7 @@ def _validate_reindex(
any_by_dask: bool,
is_dask_array: bool,
) -> bool | None:
logger.info("Entering _validate_reindex: reindex is {}".format(reindex)) # noqa
# logger.debug("Entering _validate_reindex: reindex is {}".format(reindex)) # noqa

all_numpy = not is_dask_array and not any_by_dask
if reindex is True and not all_numpy:
Expand All @@ -1972,7 +1972,7 @@ def _validate_reindex(

if reindex is None:
if method is None:
logger.info("Leaving _validate_reindex: method = None, returning None")
# logger.debug("Leaving _validate_reindex: method = None, returning None")
return None

if all_numpy:
Expand All @@ -1999,7 +1999,7 @@ def _validate_reindex(
reindex = True

assert isinstance(reindex, bool)
logger.info("Leaving _validate_reindex: reindex is {}".format(reindex)) # noqa
logger.debug("Leaving _validate_reindex: reindex is {}".format(reindex)) # noqa

return reindex

Expand Down Expand Up @@ -2165,24 +2165,24 @@ def _choose_method(
method: T_MethodOpt, preferred_method: T_Method, agg: Aggregation, by, nax: int
) -> T_Method:
if method is None:
logger.info("_choose_method: method is None")
logger.debug("_choose_method: method is None")
if agg.chunk == (None,):
if preferred_method != "blockwise":
raise ValueError(
f"Aggregation {agg.name} is only supported for `method='blockwise'`, "
"but the chunking is not right."
)
logger.info("_choose_method: choosing 'blockwise'")
logger.debug("_choose_method: choosing 'blockwise'")
return "blockwise"

if nax != by.ndim:
logger.info("_choose_method: choosing 'map-reduce'")
logger.debug("_choose_method: choosing 'map-reduce'")
return "map-reduce"

if _is_arg_reduction(agg) and preferred_method == "blockwise":
return "cohorts"

logger.info("_choose_method: choosing preferred_method={}".format(preferred_method)) # noqa
logger.debug(f"_choose_method: choosing preferred_method={preferred_method}") # noqa
return preferred_method
else:
return method
Expand All @@ -2194,7 +2194,7 @@ def _choose_engine(by, agg: Aggregation):
not_arg_reduce = not _is_arg_reduction(agg)

if agg.name in ["quantile", "nanquantile", "median", "nanmedian"]:
logger.info(f"_choose_engine: Choosing 'flox' since {agg.name}")
logger.debug(f"_choose_engine: Choosing 'flox' since {agg.name}")
return "flox"

# numbagg only supports nan-skipping reductions
Expand All @@ -2206,14 +2206,14 @@ def _choose_engine(by, agg: Aggregation):
if agg.name in ["all", "any"] or (
not_arg_reduce and has_blockwise_nan_skipping and dtype is None
):
logger.info("_choose_engine: Choosing 'numbagg'")
logger.debug("_choose_engine: Choosing 'numbagg'")
return "numbagg"

if not_arg_reduce and (not is_duck_dask_array(by) and _issorted(by)):
logger.info("_choose_engine: Choosing 'flox'")
logger.debug("_choose_engine: Choosing 'flox'")
return "flox"
else:
logger.info("_choose_engine: Choosing 'numpy'")
logger.debug("_choose_engine: Choosing 'numpy'")
return "numpy"


Expand Down Expand Up @@ -2389,7 +2389,7 @@ def groupby_reduce(
if not is_duck_array(array):
array = np.asarray(array)
is_bool_array = np.issubdtype(array.dtype, bool)
array = array.astype(int) if is_bool_array else array
array = array.astype(np.intp) if is_bool_array else array

isbins = _atleast_1d(isbin, nby)

Expand Down
4 changes: 4 additions & 0 deletions flox/xrdtypes.py
Original file line number Diff line number Diff line change
Expand Up @@ -123,6 +123,10 @@ def get_neg_infinity(dtype, min_for_int=False):
-------
fill_value : negative infinity value corresponding to this dtype.
"""

if np.issubdtype(dtype, (np.timedelta64, np.datetime64)):
return dtype.type(np.iinfo(np.int64).min + 1)

if issubclass(dtype.type, np.floating):
return -np.inf

Expand Down
27 changes: 14 additions & 13 deletions flox/xrutils.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,19 +10,6 @@
import pandas as pd
from packaging.version import Version

try:
import cftime
except ImportError:
cftime = None


try:
import dask.array

dask_array_type = dask.array.Array
except ImportError:
dask_array_type = () # type: ignore[assignment, misc]


def module_available(module: str, minversion: Optional[str] = None) -> bool:
"""Checks whether a module is installed without importing it.
Expand Down Expand Up @@ -55,6 +42,20 @@ def module_available(module: str, minversion: Optional[str] = None) -> bool:
from numpy.core.numeric import normalize_axis_index # type: ignore[attr-defined]


try:
import cftime
except ImportError:
cftime = None


try:
import dask.array

dask_array_type = dask.array.Array
except ImportError:
dask_array_type = () # type: ignore[assignment, misc]


def asarray(data, xp=np):
return data if is_duck_array(data) else xp.asarray(data)

Expand Down
Loading
Loading