diff --git a/.git_archival.txt b/.git_archival.txt new file mode 100644 index 000000000..3994ec0a8 --- /dev/null +++ b/.git_archival.txt @@ -0,0 +1,4 @@ +node: $Format:%H$ +node-date: $Format:%cI$ +describe-name: $Format:%(describe:tags=true)$ +ref-names: $Format:%D$ diff --git a/.github/workflows/benchmarks.yml b/.github/workflows/benchmarks.yml index 5c6e03a2e..9521b7b68 100644 --- a/.github/workflows/benchmarks.yml +++ b/.github/workflows/benchmarks.yml @@ -22,12 +22,13 @@ jobs: fetch-depth: 0 - name: Set up conda environment - uses: mamba-org/provision-with-micromamba@v13 + uses: mamba-org/setup-micromamba@v1 with: environment-file: ci/environment.yml environment-name: flox-tests - cache-env: true - # extra-specs: | + init-shell: bash + cache-environment: true + # create-args: | # python="${{ matrix.python-version }}" # - name: Setup some dependencies diff --git a/.github/workflows/ci-additional.yaml b/.github/workflows/ci-additional.yaml index 605e20cf0..9449d9290 100644 --- a/.github/workflows/ci-additional.yaml +++ b/.github/workflows/ci-additional.yaml @@ -25,7 +25,7 @@ jobs: - uses: actions/checkout@v3 with: fetch-depth: 2 - - uses: xarray-contrib/ci-trigger@v1.1 + - uses: xarray-contrib/ci-trigger@v1.2 id: detect-trigger with: keyword: "[skip-ci]" @@ -53,14 +53,15 @@ jobs: echo "TODAY=$(date +'%Y-%m-%d')" >> $GITHUB_ENV - name: Setup micromamba - uses: mamba-org/provision-with-micromamba@34071ca7df4983ccd272ed0d3625818b27b70dcc + uses: mamba-org/setup-micromamba@v1 with: environment-file: ${{env.CONDA_ENV_FILE}} environment-name: flox-tests - extra-specs: | - python=${{env.PYTHON_VERSION}} - cache-env: true + init-shell: bash + cache-environment: true cache-env-key: "${{runner.os}}-${{runner.arch}}-py${{env.PYTHON_VERSION}}-${{env.TODAY}}-${{hashFiles(env.CONDA_ENV_FILE)}}" + create-args: | + python=${{ env.PYTHON_VERSION }} - name: Install flox run: | @@ -71,7 +72,15 @@ jobs: conda list - name: Run doctests run: | - python -m pytest --doctest-modules 
flox --ignore flox/tests + python -m pytest --doctest-modules flox --ignore flox/tests --cov=./ --cov-report=xml + - name: Upload code coverage to Codecov + uses: codecov/codecov-action@v3.1.4 + with: + file: ./coverage.xml + flags: unittests + env_vars: RUNNER_OS + name: codecov-umbrella + fail_ci_if_error: false mypy: name: Mypy @@ -94,15 +103,16 @@ jobs: run: | echo "TODAY=$(date +'%Y-%m-%d')" >> $GITHUB_ENV - name: Setup micromamba - uses: mamba-org/provision-with-micromamba@34071ca7df4983ccd272ed0d3625818b27b70dcc + uses: mamba-org/setup-micromamba@v1 with: environment-file: ${{env.CONDA_ENV_FILE}} - environment-name: xarray-tests - extra-specs: | - python=${{env.PYTHON_VERSION}} - cache-env: true + environment-name: flox-tests + init-shell: bash + cache-environment: true cache-env-key: "${{runner.os}}-${{runner.arch}}-py${{env.PYTHON_VERSION}}-${{env.TODAY}}-${{hashFiles(env.CONDA_ENV_FILE)}}" - - name: Install xarray + create-args: | + python=${{ env.PYTHON_VERSION }} + - name: Install flox run: | python -m pip install --no-deps -e . 
- name: Version info @@ -115,4 +125,13 @@ jobs: - name: Run mypy run: | - python -m mypy --install-types --non-interactive + python -m mypy --install-types --non-interactive --cobertura-xml-report mypy_report + + - name: Upload mypy coverage to Codecov + uses: codecov/codecov-action@v3.1.4 + with: + file: mypy_report/cobertura.xml + flags: mypy + env_vars: PYTHON_VERSION + name: codecov-umbrella + fail_ci_if_error: false diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index 7133527c0..75aafc4b6 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -24,7 +24,7 @@ jobs: strategy: fail-fast: false matrix: - os: ["ubuntu-latest"] + os: ["ubuntu-latest", "windows-latest"] python-version: ["3.8", "3.10"] steps: - uses: actions/checkout@v3 @@ -34,22 +34,22 @@ jobs: run: | echo "PYTHON_VERSION=${{ matrix.python-version }}" >> $GITHUB_ENV - name: Set up conda environment - uses: mamba-org/provision-with-micromamba@v13 + uses: mamba-org/setup-micromamba@v1 with: environment-file: ci/environment.yml environment-name: flox-tests - cache-env: true - extra-specs: | - python="${{ matrix.python-version }}" + init-shell: bash + cache-environment: true + create-args: | + python=${{ matrix.python-version }} - name: Install flox run: | - python -m pip install -e . - conda list + python -m pip install --no-deps -e . - name: Run Tests run: | pytest -n auto --cov=./ --cov-report=xml - name: Upload code coverage to Codecov - uses: codecov/codecov-action@v3.1.1 + uses: codecov/codecov-action@v3.1.4 with: file: ./coverage.xml flags: unittests @@ -78,38 +78,28 @@ jobs: with: fetch-depth: 0 # Fetch all history for all branches and tags. 
- name: Set up conda environment - uses: mamba-org/provision-with-micromamba@v13 + uses: mamba-org/setup-micromamba@v1 with: - environment-file: ci/${{ matrix.env }}.yml + environment-file: ci/environment.yml environment-name: flox-tests - cache-env: true - extra-specs: | - python="${{ matrix.python-version }}" + init-shell: bash + cache-environment: true + create-args: | + python=${{ matrix.python-version }} - name: Install flox run: | python -m pip install --no-deps -e . - name: Run tests run: | - python -m pytest -n auto - - upstream-dev: - name: upstream-dev - runs-on: ubuntu-latest - defaults: - run: - shell: bash -l {0} - steps: - - uses: actions/checkout@v3 - - name: Set up conda environment - uses: mamba-org/provision-with-micromamba@v13 + python -m pytest -n auto --cov=./ --cov-report=xml + - name: Upload code coverage to Codecov + uses: codecov/codecov-action@v3.1.4 with: - environment-file: ci/upstream-dev-env.yml - environment-name: flox-tests - extra-specs: | - python="3.10" - - name: Run Tests - run: | - pytest -n 2 + file: ./coverage.xml + flags: unittests + env_vars: RUNNER_OS + name: codecov-umbrella + fail_ci_if_error: false xarray-groupby: name: xarray-groupby @@ -123,13 +113,14 @@ jobs: repository: 'pydata/xarray' fetch-depth: 0 # Fetch all history for all branches and tags. - name: Set up conda environment - uses: mamba-org/provision-with-micromamba@v13 + uses: mamba-org/setup-micromamba@v1 with: environment-file: ci/requirements/environment.yml environment-name: xarray-tests - cache-env: true - extra-specs: | - python="3.10" + init-shell: bash + cache-environment: true + create-args: | + python=3.10 - name: Install xarray run: | python -m pip install --no-deps . 
diff --git a/.github/workflows/testpypi-release.yaml b/.github/workflows/testpypi-release.yaml new file mode 100644 index 000000000..1ed0a4a56 --- /dev/null +++ b/.github/workflows/testpypi-release.yaml @@ -0,0 +1,89 @@ +name: Build and Upload to TestPyPI + +on: + push: + branches: + - "main" + pull_request: + types: [opened, reopened, synchronize, labeled] + branches: + - "*" + workflow_dispatch: + +# no need for concurrency limits + +jobs: + build-artifacts: + if: ${{ (contains(github.event.pull_request.labels.*.name, 'test-build') && github.event_name == 'pull_request') || github.event_name == 'workflow_dispatch' }} + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v3 + with: + fetch-depth: 0 + + - uses: actions/setup-python@v4 + name: Install Python + with: + python-version: "3.10" + + - name: Install dependencies + run: | + python -m pip install --upgrade pip + python -m pip install build twine + python -m pip install tomli tomli_w + + # - name: Disable local versions + # run: | + # python .github/workflows/configure-testpypi-version.py pyproject.toml + # git update-index --assume-unchanged pyproject.toml + # cat pyproject.toml + + - name: Build tarball and wheels + run: | + git clean -xdf + python -m build + + - name: Check built artifacts + run: | + python -m twine check --strict dist/* + if [ -f dist/flox-999.tar.gz ]; then + echo "❌ INVALID VERSION NUMBER" + exit 1 + else + echo "✅ Looks good" + fi + + - uses: actions/upload-artifact@v3 + with: + name: releases + path: dist + + test-built-dist: + needs: build-artifacts + runs-on: ubuntu-latest + steps: + - uses: actions/setup-python@v4 + name: Install Python + with: + python-version: "3.10" + - uses: actions/download-artifact@v3 + with: + name: releases + path: dist + - name: List contents of built dist + run: | + ls -ltrh + ls -ltrh dist + + - name: Verify the built dist/wheel is valid + run: | + python -m pip install --upgrade pip + python -m pip install dist/flox*.whl + # python -m 
cf_xarray.scripts.print_versions + + # - name: Publish package to TestPyPI + # uses: pypa/gh-action-pypi-publish@v1.6.4 + # with: + # password: ${{ secrets.TESTPYPI_TOKEN }} + # repository_url: https://test.pypi.org/legacy/ + # verbose: true diff --git a/.github/workflows/upstream-dev-ci.yaml b/.github/workflows/upstream-dev-ci.yaml new file mode 100644 index 000000000..0de92037f --- /dev/null +++ b/.github/workflows/upstream-dev-ci.yaml @@ -0,0 +1,64 @@ +name: CI Upstream +on: + push: + branches: + - main + pull_request: + types: [opened, reopened, synchronize, labeled] + branches: + - main + schedule: + - cron: "0 0 * * *" # Daily “At 00:00” UTC + workflow_dispatch: # allows you to trigger the workflow run manually + +concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: true + +jobs: + upstream-dev: + name: upstream-dev + runs-on: ubuntu-latest + if: ${{ (contains(github.event.pull_request.labels.*.name, 'test-upstream') && github.event_name == 'pull_request') || github.event_name == 'workflow_dispatch' || github.event_name == 'schedule' }}  # 'schedule' must be allowed here, otherwise the nightly cron run is skipped and the issue-reporting step below can never fire + defaults: + run: + shell: bash -l {0} + strategy: + fail-fast: false + matrix: + python-version: ["3.10"] + steps: + - uses: actions/checkout@v3 + with: + fetch-depth: 0 # Fetch all history for all branches and tags. + - name: Set environment variables + run: | + echo "PYTHON_VERSION=${{ matrix.python-version }}" >> $GITHUB_ENV + - name: Set up conda environment + uses: mamba-org/setup-micromamba@v1 + with: + environment-file: ci/upstream-dev-env.yml + environment-name: flox-tests + init-shell: bash + cache-environment: true + create-args: >- + python=${{ matrix.python-version }} + pytest-reportlog + - name: Install flox + run: | + python -m pip install --no-deps -e . 
+ - name: Run Tests + if: success() + id: status + run: | + pytest -rf -n auto --cov=./ --cov-report=xml \ + --report-log output-${{ matrix.python-version }}-log.jsonl + - name: Generate and publish the report + if: | + failure() + && steps.status.outcome == 'failure' + && github.event_name == 'schedule' + && github.repository_owner == 'xarray-contrib' + uses: xarray-contrib/issue-from-pytest-log@v1 + with: + log-path: output-${{ matrix.python-version }}-log.jsonl diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index b76d51d01..e9b1e9d6b 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -2,36 +2,55 @@ ci: autoupdate_schedule: quarterly repos: + - repo: https://github.com/astral-sh/ruff-pre-commit + # Ruff version. + rev: 'v0.0.276' + hooks: + - id: ruff + args: ["--fix"] + - repo: https://github.com/pre-commit/pre-commit-hooks - rev: v4.3.0 + rev: v4.4.0 hooks: + - id: check-yaml - id: trailing-whitespace - id: end-of-file-fixer - id: check-docstring-first - repo: https://github.com/psf/black - rev: 22.8.0 + rev: 23.3.0 hooks: - id: black - - repo: https://github.com/PyCQA/flake8 - rev: 5.0.4 + - repo: https://github.com/executablebooks/mdformat + rev: 0.7.16 + hooks: + - id: mdformat + additional_dependencies: + - mdformat-black + - mdformat-myst + + - repo: https://github.com/nbQA-dev/nbQA + rev: 1.7.0 + hooks: + - id: nbqa-black + - id: nbqa-ruff + args: [--fix] + + - repo: https://github.com/kynan/nbstripout + rev: 0.6.1 hooks: - - id: flake8 + - id: nbstripout + args: [--extra-keys=metadata.kernelspec metadata.language_info.version] - - repo: https://github.com/PyCQA/isort - rev: 5.10.1 + - repo: https://github.com/codespell-project/codespell + rev: v2.2.5 hooks: - - id: isort + - id: codespell + additional_dependencies: + - tomli - - repo: https://github.com/deathbeds/prenotebook - rev: f5bdb72a400f1a56fe88109936c83aa12cc349fa + - repo: https://github.com/abravalheri/validate-pyproject + rev: v0.13 hooks: - - id: prenotebook 
- args: - [ - '--keep-output', - '--keep-metadata', - '--keep-execution-count', - '--keep-empty', - ] + - id: validate-pyproject diff --git a/MANIFEST.in b/MANIFEST.in deleted file mode 100644 index 8f1266ef8..000000000 --- a/MANIFEST.in +++ /dev/null @@ -1,6 +0,0 @@ -include LICENSE -include README.md - -recursive-include flox *.py -recursive-exclude * __pycache__ -recursive-exclude * *.py[co] diff --git a/README.md b/README.md index d99afaa9c..b7f986242 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,4 @@ -[![GitHub Workflow CI Status](https://img.shields.io/github/workflow/status/xarray-contrib/flox/CI?logo=github&style=flat)](https://github.com/xarray-contrib/flox/actions) +[![GitHub Workflow CI Status](https://img.shields.io/github/actions/workflow/status/xarray-contrib/flox/ci.yaml?branch=main&logo=github&style=flat)](https://github.com/xarray-contrib/flox/actions) [![pre-commit.ci status](https://results.pre-commit.ci/badge/github/xarray-contrib/flox/main.svg)](https://results.pre-commit.ci/latest/github/xarray-contrib/flox/main) [![image](https://img.shields.io/codecov/c/github/xarray-contrib/flox.svg?style=flat)](https://codecov.io/gh/xarray-contrib/flox) [![Documentation Status](https://readthedocs.org/projects/flox/badge/?version=latest)](https://flox.readthedocs.io/en/latest/?badge=latest) @@ -14,10 +14,10 @@ This project explores strategies for fast GroupBy reductions with dask.array. It used to be called `dask_groupby` It was motivated by -1. Dask Dataframe GroupBy - [blogpost](https://blog.dask.org/2019/10/08/df-groupby) -2. [numpy_groupies](https://github.com/ml31415/numpy-groupies) in Xarray - [issue](https://github.com/pydata/xarray/issues/4473) +1. Dask Dataframe GroupBy + [blogpost](https://blog.dask.org/2019/10/08/df-groupby) +1. 
[numpy_groupies](https://github.com/ml31415/numpy-groupies) in Xarray + [issue](https://github.com/pydata/xarray/issues/4473) (See a [presentation](https://docs.google.com/presentation/d/1YubKrwu9zPHC_CzVBhvORuQBW-z148BvX3Ne8XcvWsQ/edit?usp=sharing) @@ -26,22 +26,23 @@ about this package, from the Pangeo Showcase). ## Acknowledgements This work was funded in part by + 1. NASA-ACCESS 80NSSC18M0156 "Community tools for analysis of NASA Earth Observing System -Data in the Cloud" (PI J. Hamman, NCAR), -2. NASA-OSTFL 80NSSC22K0345 "Enhancing analysis of NASA data with the open-source Python Xarray Library" (PIs Scott Henderson, University of Washington; Deepak Cherian, NCAR; Jessica Scheick, University of New Hampshire), and -3. [NCAR's Earth System Data Science Initiative](https://ncar.github.io/esds/). + Data in the Cloud" (PI J. Hamman, NCAR), +1. NASA-OSTFL 80NSSC22K0345 "Enhancing analysis of NASA data with the open-source Python Xarray Library" (PIs Scott Henderson, University of Washington; Deepak Cherian, NCAR; Jessica Scheick, University of New Hampshire), and +1. [NCAR's Earth System Data Science Initiative](https://ncar.github.io/esds/). It was motivated by [very](https://github.com/pangeo-data/pangeo/issues/266) [very](https://github.com/pangeo-data/pangeo/issues/271) [many](https://github.com/dask/distributed/issues/2602) [discussions](https://github.com/pydata/xarray/issues/2237) in the [Pangeo](https://pangeo.io) community. ## API There are two main functions -1. `flox.groupby_reduce(dask_array, by_dask_array, "mean")` - "pure" dask array interface -1. `flox.xarray.xarray_reduce(xarray_object, by_dataarray, "mean")` - "pure" xarray interface; though [work is ongoing](https://github.com/pydata/xarray/pull/5734) to integrate this - package in xarray. +1. `flox.groupby_reduce(dask_array, by_dask_array, "mean")` + "pure" dask array interface +1. 
`flox.xarray.xarray_reduce(xarray_object, by_dataarray, "mean")` + "pure" xarray interface; though [work is ongoing](https://github.com/pydata/xarray/pull/5734) to integrate this + package in xarray. ## Implementation @@ -53,21 +54,21 @@ See [the documentation](https://flox.readthedocs.io/en/latest/implementation.htm It also allows you to specify a custom Aggregation (again inspired by dask.dataframe), though this might not be fully functional at the moment. See `aggregations.py` for examples. -``` python - mean = Aggregation( - # name used for dask tasks - name="mean", - # operation to use for pure-numpy inputs - numpy="mean", - # blockwise reduction - chunk=("sum", "count"), - # combine intermediate results: sum the sums, sum the counts - combine=("sum", "sum"), - # generate final result as sum / count - finalize=lambda sum_, count: sum_ / count, - # Used when "reindexing" at combine-time - fill_value=0, - # Used when any member of `expected_groups` is not found - final_fill_value=np.nan, - ) +```python +mean = Aggregation( + # name used for dask tasks + name="mean", + # operation to use for pure-numpy inputs + numpy="mean", + # blockwise reduction + chunk=("sum", "count"), + # combine intermediate results: sum the sums, sum the counts + combine=("sum", "sum"), + # generate final result as sum / count + finalize=lambda sum_, count: sum_ / count, + # Used when "reindexing" at combine-time + fill_value=0, + # Used when any member of `expected_groups` is not found + final_fill_value=np.nan, +) ``` diff --git a/asv_bench/benchmarks/README_CI.md b/asv_bench/benchmarks/README_CI.md index 9d86cc257..f306736ab 100644 --- a/asv_bench/benchmarks/README_CI.md +++ b/asv_bench/benchmarks/README_CI.md @@ -1,7 +1,9 @@ # Benchmark CI + + ## How it works @@ -10,39 +12,39 @@ The `asv` suite can be run for any PR on GitHub Actions (check workflow `.github We use `asv continuous` to run the job, which runs a relative performance measurement. 
This means that there's no state to be saved and that regressions are only caught in terms of performance ratio (absolute numbers are available but they are not useful since we do not use stable hardware over time). `asv continuous` will: -* Compile `scikit-image` for _both_ commits. We use `ccache` to speed up the process, and `mamba` is used to create the build environments. -* Run the benchmark suite for both commits, _twice_ (since `processes=2` by default). -* Generate a report table with performance ratios: - * `ratio=1.0` -> performance didn't change. - * `ratio<1.0` -> PR made it slower. - * `ratio>1.0` -> PR made it faster. +- Compile `scikit-image` for _both_ commits. We use `ccache` to speed up the process, and `mamba` is used to create the build environments. +- Run the benchmark suite for both commits, _twice_ (since `processes=2` by default). +- Generate a report table with performance ratios: + - `ratio=1.0` -> performance didn't change. + - `ratio<1.0` -> PR made it slower. + - `ratio>1.0` -> PR made it faster. Due to the sensitivity of the test, we cannot guarantee that false positives are not produced. In practice, values between `(0.7, 1.5)` are to be considered part of the measurement noise. When in doubt, running the benchmark suite one more time will provide more information about the test being a false positive or not. ## Running the benchmarks on GitHub Actions 1. On a PR, add the label `run-benchmark`. -2. The CI job will be started. Checks will appear in the usual dashboard panel above the comment box. -3. If more commits are added, the label checks will be grouped with the last commit checks _before_ you added the label. -4. Alternatively, you can always go to the `Actions` tab in the repo and [filter for `workflow:Benchmark`](https://github.com/scikit-image/scikit-image/actions?query=workflow%3ABenchmark). Your username will be assigned to the `actor` field, so you can also filter the results with that if you need it. +1. 
The CI job will be started. Checks will appear in the usual dashboard panel above the comment box. +1. If more commits are added, the label checks will be grouped with the last commit checks _before_ you added the label. +1. Alternatively, you can always go to the `Actions` tab in the repo and [filter for `workflow:Benchmark`](https://github.com/scikit-image/scikit-image/actions?query=workflow%3ABenchmark). Your username will be assigned to the `actor` field, so you can also filter the results with that if you need it. ## The artifacts The CI job will also generate an artifact. This is the `.asv/results` directory compressed in a zip file. Its contents include: -* `fv-xxxxx-xx/`. A directory for the machine that ran the suite. It contains three files: - * `.json`, `.json`: the benchmark results for each commit, with stats. - * `machine.json`: details about the hardware. -* `benchmarks.json`: metadata about the current benchmark suite. -* `benchmarks.log`: the CI logs for this run. -* This README. +- `fv-xxxxx-xx/`. A directory for the machine that ran the suite. It contains three files: + - `.json`, `.json`: the benchmark results for each commit, with stats. + - `machine.json`: details about the hardware. +- `benchmarks.json`: metadata about the current benchmark suite. +- `benchmarks.log`: the CI logs for this run. +- This README. ## Re-running the analysis Although the CI logs should be enough to get an idea of what happened (check the table at the end), one can use `asv` to run the analysis routines again. 1. Uncompress the artifact contents in the repo, under `.asv/results`. This is, you should see `.asv/results/benchmarks.log`, not `.asv/results/something_else/benchmarks.log`. Write down the machine directory name for later. -2. Run `asv show` to see your available results. You will see something like this: +1. Run `asv show` to see your available results. 
You will see something like this: ``` $> asv show @@ -115,8 +117,10 @@ To minimize the time required to run the full suite, we trimmed the parameter ma ```python from . import _skip_slow # this function is defined in benchmarks.__init__ + def time_something_slow(): pass + time_something.setup = _skip_slow ``` diff --git a/asv_bench/benchmarks/cohorts.py b/asv_bench/benchmarks/cohorts.py index 2c19c881f..dbfbe8cd5 100644 --- a/asv_bench/benchmarks/cohorts.py +++ b/asv_bench/benchmarks/cohorts.py @@ -42,9 +42,9 @@ def track_num_layers(self): )[0] return len(result.dask.layers) - track_num_tasks.unit = "tasks" - track_num_tasks_optimized.unit = "tasks" - track_num_layers.unit = "layers" + track_num_tasks.unit = "tasks" # type: ignore[attr-defined] # Lazy + track_num_tasks_optimized.unit = "tasks" # type: ignore[attr-defined] # Lazy + track_num_layers.unit = "layers" # type: ignore[attr-defined] # Lazy class NWMMidwest(Cohorts): @@ -92,8 +92,8 @@ def setup(self, *args, **kwargs): by = (self.time.dt.month.values, self.time.dt.hour.values) ret = flox.core._factorize_multiple( by, - expected_groups=(pd.Index(np.arange(1, 13)), pd.Index(np.arange(1, 25))), - by_is_dask=False, + (pd.Index(np.arange(1, 13)), pd.Index(np.arange(1, 25))), + False, reindex=False, ) # Add one so the rechunk code is simpler and makes sense diff --git a/asv_bench/benchmarks/combine.py b/asv_bench/benchmarks/combine.py index 2da0b1392..27600685f 100644 --- a/asv_bench/benchmarks/combine.py +++ b/asv_bench/benchmarks/combine.py @@ -1,3 +1,6 @@ +from functools import partial +from typing import Any + import numpy as np import flox @@ -7,26 +10,31 @@ N = 1000 +def _get_combine(combine): + if combine == "grouped": + return partial(flox.core._grouped_combine, engine="numpy") + else: + return partial(flox.core._simple_combine, reindex=False) + + class Combine: def setup(self, *args, **kwargs): raise NotImplementedError - @parameterized("kind", ("cohorts", "mapreduce")) - def time_combine(self, kind): - 
flox.core._grouped_combine( + @parameterized(("kind", "combine"), (("reindexed", "not_reindexed"), ("grouped", "simple"))) + def time_combine(self, kind, combine): + _get_combine(combine)( getattr(self, f"x_chunk_{kind}"), **self.kwargs, keepdims=True, - engine="numpy", ) - @parameterized("kind", ("cohorts", "mapreduce")) - def peakmem_combine(self, kind): - flox.core._grouped_combine( + @parameterized(("kind", "combine"), (("reindexed", "not_reindexed"), ("grouped", "simple"))) + def peakmem_combine(self, kind, combine): + _get_combine(combine)( getattr(self, f"x_chunk_{kind}"), **self.kwargs, keepdims=True, - engine="numpy", ) @@ -36,8 +44,8 @@ class Combine1d(Combine): this is for reducting along a single dimension """ - def setup(self, *args, **kwargs): - def construct_member(groups): + def setup(self, *args, **kwargs) -> None: + def construct_member(groups) -> dict[str, Any]: return { "groups": groups, "intermediates": [ @@ -47,7 +55,7 @@ def construct_member(groups): } # motivated by - self.x_chunk_mapreduce = [ + self.x_chunk_not_reindexed = [ construct_member(groups) for groups in [ np.array((1, 2, 3, 4)), @@ -57,5 +65,12 @@ def construct_member(groups): * 2 ] - self.x_chunk_cohorts = [construct_member(groups) for groups in [np.array((1, 2, 3, 4))] * 4] - self.kwargs = {"agg": flox.aggregations.mean, "axis": (3,)} + self.x_chunk_reindexed = [ + construct_member(groups) for groups in [np.array((1, 2, 3, 4))] * 4 + ] + self.kwargs = { + "agg": flox.aggregations._initialize_aggregation( + "sum", "float64", np.float64, 0, 0, {} + ), + "axis": (3,), + } diff --git a/asv_bench/benchmarks/reduce.py b/asv_bench/benchmarks/reduce.py index 0ed38e9ba..89c58e0bf 100644 --- a/asv_bench/benchmarks/reduce.py +++ b/asv_bench/benchmarks/reduce.py @@ -1,13 +1,15 @@ import numpy as np import numpy_groupies as npg +import pandas as pd import flox from . 
import parameterized N = 1000 -funcs = ["sum", "nansum", "mean", "nanmean", "max"] +funcs = ["sum", "nansum", "mean", "nanmean", "max", "var", "nanvar", "count"] engines = ["flox", "numpy", "numbagg"] +expected_groups = [None, pd.IntervalIndex.from_breaks([1, 2, 4])] class ChunkReduce: @@ -31,24 +33,26 @@ def setup(self, *args, **kwargs): raise NotImplementedError - @parameterized("func, engine", [funcs, engines]) - def time_reduce(self, func, engine): + @parameterized("func, engine, expected_groups", [funcs, engines, expected_groups]) + def time_reduce(self, func, engine, expected_groups): flox.groupby_reduce( self.array, self.labels, func=func, engine=engine, axis=self.axis, + expected_groups=expected_groups, ) - @parameterized("func, engine", [funcs, engines]) - def peakmem_reduce(self, func, engine): + @parameterized("func, engine, expected_groups", [funcs, engines, expected_groups]) + def peakmem_reduce(self, func, engine, expected_groups): flox.groupby_reduce( self.array, self.labels, func=func, engine=engine, axis=self.axis, + expected_groups=expected_groups, ) diff --git a/ci/docs.yml b/ci/docs.yml index 9cdfb38e5..c3359e4f1 100644 --- a/ci/docs.yml +++ b/ci/docs.yml @@ -7,7 +7,7 @@ dependencies: - xarray - numpy>=1.20 - numpydoc - - numpy_groupies + - numpy_groupies>=0.9.19 - toolz - matplotlib-base - myst-parser @@ -16,5 +16,6 @@ dependencies: - furo - ipykernel - jupyter + - sphinx-codeautolink - pip: - - git+https://github.com/xarray-contrib/flox + - -e .. 
diff --git a/ci/environment.yml b/ci/environment.yml index f50aa1b1c..cd96707ce 100644 --- a/ci/environment.yml +++ b/ci/environment.yml @@ -9,14 +9,16 @@ dependencies: - netcdf4 - pandas - numpy>=1.20 + - lxml # for mypy coverage report - matplotlib - pip - pytest - pytest-cov + - pytest-pretty - pytest-xdist - xarray - pre-commit - - numpy_groupies>=0.9.15 + - numpy_groupies>=0.9.19 - pooch - toolz - numba diff --git a/ci/minimal-requirements.yml b/ci/minimal-requirements.yml index 882c8d1fb..eeb075194 100644 --- a/ci/minimal-requirements.yml +++ b/ci/minimal-requirements.yml @@ -7,9 +7,10 @@ dependencies: - pip - pytest - pytest-cov + - pytest-pretty - pytest-xdist - numpy==1.20 - - numpy_groupies==0.9.15 + - numpy_groupies==0.9.19 - pandas - pooch - toolz diff --git a/ci/no-dask.yml b/ci/no-dask.yml index 31ce0ade3..8876a84cf 100644 --- a/ci/no-dask.yml +++ b/ci/no-dask.yml @@ -9,11 +9,12 @@ dependencies: - pip - pytest - pytest-cov + - pytest-pretty - pytest-xdist - xarray - numpydoc - pre-commit - - numpy_groupies>=0.9.15 + - numpy_groupies>=0.9.19 - pooch - toolz - numba diff --git a/ci/no-xarray.yml b/ci/no-xarray.yml index 25c777fa1..491d7ba8e 100644 --- a/ci/no-xarray.yml +++ b/ci/no-xarray.yml @@ -9,11 +9,12 @@ dependencies: - pip - pytest - pytest-cov + - pytest-pretty - pytest-xdist - dask-core - numpydoc - pre-commit - - numpy_groupies>=0.9.15 + - numpy_groupies>=0.9.19 - pooch - toolz - numba diff --git a/ci/upstream-dev-env.yml b/ci/upstream-dev-env.yml index b44dc45e7..04fd7ce60 100644 --- a/ci/upstream-dev-env.yml +++ b/ci/upstream-dev-env.yml @@ -10,6 +10,7 @@ dependencies: - numba - pytest - pytest-cov + - pytest-pretty - pytest-xdist - pip - pip: diff --git a/codecov.yml b/codecov.yml index aa1da5f32..90c354ae2 100644 --- a/codecov.yml +++ b/codecov.yml @@ -5,6 +5,7 @@ codecov: comment: false ignore: + - 'benchmarks/*.py' - 'tests/*.py' - 'setup.py' diff --git a/docs/diagrams/new-blockwise-annotated.svg 
b/docs/diagrams/new-blockwise-annotated.svg new file mode 100644 index 000000000..15f3f2417 --- /dev/null +++ b/docs/diagrams/new-blockwise-annotated.svg @@ -0,0 +1,1185 @@ + +2022-11-14T16:09:56.206507image/svg+xmlMatplotlib v3.6.0, https://matplotlib.org/ diff --git a/docs/diagrams/new-blockwise.svg b/docs/diagrams/new-blockwise.svg new file mode 100644 index 000000000..3f310ad10 --- /dev/null +++ b/docs/diagrams/new-blockwise.svg @@ -0,0 +1,1563 @@ + + + + + + + + 2022-11-15T16:33:54.164631 + image/svg+xml + + + Matplotlib v3.6.0, https://matplotlib.orgdiff --git a/docs/diagrams/new-cohorts-annotated.svg b/docs/diagrams/new-cohorts-annotated.svg new file mode 100644 index 000000000..e6396328f --- /dev/null +++ b/docs/diagrams/new-cohorts-annotated.svg @@ -0,0 +1,1845 @@ + + + + diff --git a/docs/diagrams/new-cohorts.svg b/docs/diagrams/new-cohorts.svg new file mode 100644 index 000000000..fdbcf2050 --- /dev/null +++ b/docs/diagrams/new-cohorts.svg @@ -0,0 +1,2261 @@ + + + + + + + + 2022-11-14T16:28:28.725615 + image/svg+xml + + + Matplotlib v3.6.0, https://matplotlib.orgdiff --git a/docs/diagrams/new-map-reduce-reindex-False-annotated.svg b/docs/diagrams/new-map-reduce-reindex-False-annotated.svg new file mode 100644 index 000000000..73a45dc82 --- /dev/null +++ b/docs/diagrams/new-map-reduce-reindex-False-annotated.svg @@ -0,0 +1,1887 @@ + + + + diff --git a/docs/diagrams/new-map-reduce-reindex-False.svg b/docs/diagrams/new-map-reduce-reindex-False.svg new file mode 100644 index 000000000..5e03ccee8 --- /dev/null +++ b/docs/diagrams/new-map-reduce-reindex-False.svg @@ -0,0 +1,2382 @@ + + + + + + + + 2022-11-15T21:26:39.966187 + image/svg+xml + + + Matplotlib v3.6.0, https://matplotlib.org/ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/docs/diagrams/new-map-reduce-reindex-True-annotated.svg b/docs/diagrams/new-map-reduce-reindex-True-annotated.svg new file mode 100644 index 000000000..412eaa301 --- /dev/null +++ b/docs/diagrams/new-map-reduce-reindex-True-annotated.svg @@ -0,0 +1,1794 @@ + + + + diff --git a/docs/diagrams/new-map-reduce-reindex-True.svg b/docs/diagrams/new-map-reduce-reindex-True.svg new file mode 100644 index 000000000..97b615190 --- /dev/null +++ b/docs/diagrams/new-map-reduce-reindex-True.svg @@ -0,0 +1,2373 @@ + + + + + + + + 2022-11-15T16:09:02.384660 + image/svg+xml + + + Matplotlib v3.6.0, https://matplotlib.org/ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/docs/diagrams/new-split-apply-combine-annotated.svg b/docs/diagrams/new-split-apply-combine-annotated.svg new file mode 100644 index 000000000..fd38c9fef --- /dev/null +++ b/docs/diagrams/new-split-apply-combine-annotated.svg @@ -0,0 +1,2370 @@ + + + + diff --git a/docs/diagrams/split-apply-combine.svg b/docs/diagrams/split-apply-combine.svg new file mode 100644 index 000000000..a118d8fb3 --- /dev/null +++ b/docs/diagrams/split-apply-combine.svg @@ -0,0 +1,2342 @@ + + + + + + + + 2022-11-26T15:23:21.390398 + image/svg+xml + + + Matplotlib v3.6.2, https://matplotlib.orgdiff --git a/docs/source/aggregations.md b/docs/source/aggregations.md new file mode 100644 index 000000000..e6c10e4ba --- /dev/null +++ b/docs/source/aggregations.md @@ -0,0 +1,45 @@ +# Aggregations + +`flox` implements all common reductions provided by `numpy_groupies` in `aggregations.py`. 
Control this by passing +the `func` kwarg: + +- `"sum"`, `"nansum"` +- `"prod"`, `"nanprod"` +- `"count"` - number of non-NaN elements by group +- `"mean"`, `"nanmean"` +- `"var"`, `"nanvar"` +- `"std"`, `"nanstd"` +- `"argmin"` +- `"argmax"` +- `"first"` +- `"last"` + +```{tip} +We would like to add support for `cumsum`, `cumprod` ([issue](https://github.com/xarray-contrib/flox/issues/91)). Contributions are welcome! +``` + +## Custom Aggregations + +`flox` also allows you to specify a custom Aggregation (again inspired by dask.dataframe), +though this might not be fully functional at the moment. See `aggregations.py` for examples. + +See the ["Custom Aggregations"](user-stories/custom-aggregations.ipynb) user story for a more user-friendly example. + +```python +mean = Aggregation( + # name used for dask tasks + name="mean", + # operation to use for pure-numpy inputs + numpy="mean", + # blockwise reduction + chunk=("sum", "count"), + # combine intermediate results: sum the sums, sum the counts + combine=("sum", "sum"), + # generate final result as sum / count + finalize=lambda sum_, count: sum_ / count, + # Used when "reindexing" at combine-time + fill_value=0, + # Used when any member of `expected_groups` is not found + final_fill_value=np.nan, +) +``` diff --git a/docs/source/api.rst b/docs/source/api.rst index 5c0d21e63..b0d5e0aa7 100644 --- a/docs/source/api.rst +++ b/docs/source/api.rst @@ -9,7 +9,7 @@ Functions .. autosummary:: :toctree: generated/ - ~core.groupby_reduce + groupby_reduce xarray.xarray_reduce Rechunking @@ -18,8 +18,8 @@ Rechunking .. 
autosummary:: :toctree: generated/ - ~core.rechunk_for_blockwise - ~core.rechunk_for_cohorts + rechunk_for_blockwise + rechunk_for_cohorts xarray.rechunk_for_blockwise xarray.rechunk_for_cohorts @@ -30,7 +30,7 @@ Visualization :toctree: generated/ visualize.draw_mesh - visualize.visualize_groups + visualize.visualize_groups_1d visualize.visualize_cohorts_2d Aggregation Objects diff --git a/docs/source/arrays.md b/docs/source/arrays.md new file mode 100644 index 000000000..cf41c6fa1 --- /dev/null +++ b/docs/source/arrays.md @@ -0,0 +1,14 @@ +# Duck Array Support + +Aggregating over other array types will work if the array type supports one of the following methods: [ufunc.reduceat](https://numpy.org/doc/stable/reference/generated/numpy.ufunc.reduceat.html) or [ufunc.at](https://numpy.org/doc/stable/reference/generated/numpy.ufunc.at.html). + +| Reduction | `engine="numpy"` | `engine="flox"` | +| ------------------------------ | ---------------- | ----------------- | +| sum, nansum | bincount | add.reduceat | +| mean, nanmean | bincount | add.reduceat | +| var, nanvar | bincount | add.reduceat | +| std, nanstd | bincount | add.reduceat | +| count | bincount | add.reduceat | +| prod | multiply.at | multiply.reduceat | +| max, nanmax, argmax, nanargmax | maximum.at | maximum.reduceat | +| min, nanmin, argmin, nanargmin | minimum.at | minimum.reduceat | diff --git a/docs/source/conf.py b/docs/source/conf.py index 8790c9502..80412ba23 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -1,4 +1,3 @@ -# -*- coding: utf-8 -*- # # complexity documentation build configuration file, created by # sphinx-quickstart on Tue Jul 9 22:26:36 2013. 
@@ -40,11 +39,14 @@ "numpydoc", "sphinx.ext.napoleon", "myst_nb", + "sphinx_codeautolink", ] +codeautolink_concat_default = True + extlinks = { - "issue": ("https://github.com/xarray-contrib/flox/issues/%s", "GH#"), - "pr": ("https://github.com/xarray-contrib/flox/pull/%s", "GH#"), + "issue": ("https://github.com/xarray-contrib/flox/issues/%s", "GH#%s"), + "pr": ("https://github.com/xarray-contrib/flox/pull/%s", "PR#%s"), } templates_path = ["_templates"] @@ -61,6 +63,7 @@ # Myst_nb options nb_execution_excludepatterns = ["climatology-hourly.ipynb"] nb_execution_raise_on_error = True +nb_execution_mode = "cache" # The version info for the project you're documenting, acts as replacement for # |version| and |release|, also used in various other places throughout the @@ -95,13 +98,34 @@ # show_authors = False # The name of the Pygments (syntax highlighting) style to use. -pygments_style = "sphinx" +pygments_style = "igor" # -- Options for HTML output --------------------------------------------------- html_theme = "furo" +# Theme options are theme-specific and customize the look and feel of a theme +# further. For a list of options available for each theme, see the +# documentation. +css_vars = { + "admonition-font-size": "0.9rem", + "font-size--small": "92%", + "font-size--small--2": "87.5%", +} +html_theme_options = dict( + sidebar_hide_name=True, + light_css_variables=css_vars, + dark_css_variables=css_vars, +) + +html_context = { + "github_user": "xarray-contrib", + "github_repo": "flox", + "github_version": "main", + "doc_path": "doc", +} + # Theme options are theme-specific and customize the look and feel of a theme # further. For a list of options available for each theme, see the # documentation. 
@@ -174,7 +198,7 @@ "numpy": ("https://numpy.org/doc/stable", None), # "numba": ("https://numba.pydata.org/numba-doc/latest", None), "dask": ("https://docs.dask.org/en/latest", None), - "xarray": ("http://xarray.pydata.org/en/stable/", None), + "xarray": ("https://docs.xarray.dev/en/stable/", None), } autosummary_generate = True diff --git a/docs/source/custom.md b/docs/source/custom.md deleted file mode 100644 index dca975529..000000000 --- a/docs/source/custom.md +++ /dev/null @@ -1,26 +0,0 @@ -# Custom reductions - -`flox` implements all common reductions provided by `numpy_groupies` in `aggregations.py`. -It also allows you to specify a custom Aggregation (again inspired by dask.dataframe), -though this might not be fully functional at the moment. See `aggregations.py` for examples. - -See the ["Custom Aggregations"](user-stories/custom-aggregations.ipynb) user story for a more user-friendly example. - -```python - mean = Aggregation( - # name used for dask tasks - name="mean", - # operation to use for pure-numpy inputs - numpy="mean", - # blockwise reduction - chunk=("sum", "count"), - # combine intermediate results: sum the sums, sum the counts - combine=("sum", "sum"), - # generate final result as sum / count - finalize=lambda sum_, count: sum_ / count, - # Used when "reindexing" at combine-time - fill_value=0, - # Used when any member of `expected_groups` is not found - final_fill_value=np.nan, - ) -``` diff --git a/docs/source/engines.md b/docs/source/engines.md new file mode 100644 index 000000000..867979d13 --- /dev/null +++ b/docs/source/engines.md @@ -0,0 +1,26 @@ +(engines)= + +# Engines + +`flox` provides multiple options, using the `engine` kwarg, for computing the core GroupBy reduction on numpy or other array types other than dask. + +1. `engine="numpy"` wraps `numpy_groupies.aggregate_numpy`. 
This uses indexing tricks and functions like `np.bincount`, or the ufunc `.at` methods + (e.g. `np.maximum.at`) to provide reasonably performant aggregations. +1. `engine="numba"` wraps `numpy_groupies.aggregate_numba`. This uses `numba` kernels for the core aggregation. +1. `engine="flox"` uses the `ufunc.reduceat` method after first argsorting the array so that all group members occur sequentially. This was copied from + a [gist by Stephan Hoyer](https://gist.github.com/shoyer/f538ac78ae904c936844) + +See [](arrays) for more details. + +## Tradeoffs + +For the common case of reducing a nD array by a 1D array of group labels (e.g. `groupby("time.month")`), `engine="flox"` *can* be faster. + +The reason is that `numpy_groupies` converts all groupby problems to a 1D problem, which can involve [some overhead](https://github.com/ml31415/numpy-groupies/pull/46). +It is possible to optimize this a bit in `flox` or `numpy_groupies`, but the work has not been done yet. +The advantage of `engine="numpy"` is that it tends to work for more array types, since it appears to be more common to implement `np.bincount`, and not `np.add.reduceat`. + +```{tip} +Other potential engines we could add are [`numbagg`](https://github.com/numbagg/numbagg) ([stalled PR here](https://github.com/xarray-contrib/flox/pull/72)) and [`datashader`](https://github.com/xarray-contrib/flox/issues/142). +Both use numba for high-performance aggregations. Contributions or discussion are very welcome! +``` diff --git a/docs/source/implementation.md b/docs/source/implementation.md index ae0db3539..f3a2a87f7 100644 --- a/docs/source/implementation.md +++ b/docs/source/implementation.md @@ -1,17 +1,31 @@ -# Algorithms +(algorithms)= -`flox` outsources the core GroupBy operation to the vectorized implementations in -[numpy_groupies](https://github.com/ml31415/numpy-groupies). 
Constructing -an efficient groupby reduction with dask is hard, and depends on how the -groups are distributed amongst the blocks of an array. `flox` implements 4 strategies for -grouped reductions, each is appropriate for a particular distribution of groups -among the blocks of a dask array. +# Parallel Algorithms -Switch between the various strategies by passing `method` to either {py:func}`flox.core.groupby_reduce` -or `xarray_reduce`. +`flox` outsources the core GroupBy operation to the vectorized implementations controlled by the +[`engine` kwarg](engines.md). Applying these implementations on a parallel array type like dask +can be hard. Performance strongly depends on how the groups are distributed amongst the blocks of an array. +`flox` implements 4 strategies for grouped reductions, each is appropriate for a particular distribution of groups +among the blocks of a dask array. Switch between the various strategies by passing `method` +and/or `reindex` to either {py:func}`flox.groupby_reduce` or {py:func}`flox.xarray.xarray_reduce`. -First we describe xarray's current strategy +Your options are: + +1. [`method="map-reduce"` with `reindex=False`](map-reindex-false) +1. [`method="map-reduce"` with `reindex=True`](map-reindex-True) +1. [`method="blockwise"`](method-blockwise) +1. [`method="cohorts"`](method-cohorts) + +The most appropriate strategy for your problem will depend on the chunking of your dataset, +and the distribution of group labels across those chunks. + +```{tip} +Currently these strategies are implemented for dask. We would like to generalize to other parallel array types +as appropriate (e.g. Ramba, cubed, arkouda). Please open an issue to discuss if you are interested. +``` + +(xarray-split)= ## Background: Xarray's current GroupBy strategy @@ -21,7 +35,13 @@ labels (i.e. you cannot use this strategy to group by a dask array). 
Schematically, this looks like (colors indicate group labels; separated groups of colors indicate different blocks of an array): -![xarray-current-strategy](../diagrams/split-reduce.png) + +```{image} ../diagrams/new-split-apply-combine-annotated.svg +--- +alt: xarray-current-strategy +width: 100% +--- +``` The first step is to extract all members of a group, which involves a *lot* of communication and is quite expensive (in dataframe terminology, this is a "shuffle"). @@ -30,89 +50,181 @@ big datasets. ## `method="map-reduce"` -The first idea is to use the "map-reduce" strategy (inspired by `dask.dataframe`). - ![map-reduce-strategy-schematic](/../diagrams/map-reduce.png) +The "map-reduce" strategy is inspired by `dask.dataframe.groupby`. The GroupBy reduction is first applied blockwise. Those intermediate results are combined by concatenating to form a new array which is then reduced -again. The combining of intermediate results uses dask\'s `_tree_reduce` +again. The combining of intermediate results uses dask's `_tree_reduce` till all group results are in one block. At that point the result is -\"finalized\" and returned to the user. +"finalized" and returned to the user. + +### General Tradeoffs + +1. This approach works well when either the initial blockwise reduction is effective, or if the + reduction at the first combine step is effective. Here "effective" means we have multiple members of a single + group in a block so the blockwise application of groupby-reduce actually reduces values and releases some memory. +1. One downside is that the final result will only have one chunk along the new group axis. +1. We have two choices for how to construct the intermediate arrays. See below. + +(map-reindex-True)= -*Tradeoffs*: -1. Allows grouping by a dask array so group labels need not be known at graph construction - time. -1. Works well when either the initial blockwise reduction is effective, or if the - reduction at the first combine step is effective. 
"effective" means we actually - reduce values and release some memory. + +### `reindex=True` + +If we know all the group labels, we can reindex to them right at the blockwise step (`reindex=True`). This matches `dask.array.histogram` and +`xhistogram`, where the bin edges, or group labels of the output, are known. The downside is the potential for large memory use +if the number of output groups is much larger than the number of groups in a block. + +```{image} ../diagrams/new-map-reduce-reindex-True-annotated.svg +--- +alt: map-reduce-reindex-True-strategy-schematic +width: 100% +--- +``` + +(map-reindex-False)= + +### `reindex=False` + +We can `reindex` at the combine stage to groups present in the blocks being combined (`reindex=False`). This can limit memory use at the cost +of a performance reduction due to extra copies of the intermediate data during reindexing. + +```{image} ../diagrams/new-map-reduce-reindex-False-annotated.svg +--- +alt: map-reduce-reindex-False-strategy-schematic +width: 100% +--- +``` + +This approach allows grouping by a dask array so group labels can be discovered at compute time, similar to `dask.dataframe.groupby`. + +### Example + +For example, consider `groupby("time.month")` with monthly frequency data and chunksize of 4 along `time`. +![cohorts-schematic](/../diagrams/cohorts-month-chunk4.png) +With `reindex=True`, each block will become 3x its original size at the blockwise step: input blocks have 4 timesteps while output block +has a value for all 12 months. One could use `reindex=False` to control memory usage but also see [`method="cohorts"`](method-cohorts) below. + +(method-blockwise)= ## `method="blockwise"` -One case where `"map-reduce"` doesn't work well is the case of "resampling" reductions. An -example here is resampling from daily frequency to monthly frequency data: `da.resample(time="M").mean()` +One case where `method="map-reduce"` doesn't work well is the case of "resampling" reductions. 
An +example here is resampling from daily frequency to monthly frequency data: `da.resample(time="M").mean()` For resampling type reductions, + 1. Group members occur sequentially (all days in January 2001 occur one after the other) -2. All groups are roughly equal length (31 days in January but 28 in most Februaries) -3. All members in a group are next to each other (if the time series is sorted, which it +1. Groups are not all of exactly equal length (31 days in January but 28 in most Februaries) +1. All members in a group are next to each other (if the time series is sorted, which it usually is). +1. Because there can be a large number of groups, concatenating results for all groups in a single chunk could be catastrophic. -In this case, it makes sense to use `dask.dataframe` resample strategy which is to rechunk +In this case, it makes sense to use `dask.dataframe`'s resample strategy which is to rechunk using {py:func}`flox.rechunk_for_blockwise` so that all members of a group are in a single block. Then, the groupby operation can be applied blockwise. -![blockwise-strategy-schematic](/../diagrams/blockwise.png) +```{image} ../diagrams/new-blockwise-annotated.svg +--- +alt: blockwise-strategy-schematic +width: 100% +--- +``` *Tradeoffs* + 1. Only works for certain groupings. 1. Group labels must be known at graph construction time, so this only works for numpy arrays 1. Currently the rechunking is only implemented for 1D arrays (being motivated by time resampling), but a nD generalization seems possible. -1. Works better when multiple groups are already in a single block; so that the intial +1. Only can use the `blockwise` strategy for grouping by `nD` arrays. +1. Works better when multiple groups are already in a single block; so that the initial rechunking only involves a small amount of communication. +(method-cohorts)= + ## `method="cohorts"` -We can combine all of the above ideas for cases where members from different groups tend to occur close to each other. 
+The `map-reduce` strategy is quite effective but can involve some unnecessary communication. It may be possible to exploit +patterns in how group labels are distributed across chunks (similar to `method="blockwise"` above). Two cases are illustrative: + +1. Group labels can be *approximately-periodic*: e.g. `time.dayofyear` (period 365 or 366) or `time.month` (period 12). + Consider our earlier example, `groupby("time.month")` with monthly frequency data and chunksize of 4 along `time`. + ![cohorts-schematic](/../diagrams/cohorts-month-chunk4.png) + Because a chunksize of 4 evenly divides the number of groups (12) all we need to do is index out blocks + 0, 3, 7 and then apply the `"map-reduce"` strategy to form the final result for months Jan-Apr. Repeat for the + remaining groups of months (May-Aug; Sep-Dec) and then concatenate. + +1. Groups can be *spatially localized* like the blockwise case above, for example grouping by country administrative boundaries like + counties or districts. In this case, concatenating the result for the northwesternmost county or district and the southeasternmost + district can involve a lot of wasteful communication (again depending on chunking). + +For such cases, we can adapt xarray's shuffling or subsetting strategy by indexing out "cohorts" or group labels +that tend to occur next to each other. + +### A motivating example : time grouping + One example is the construction of "climatologies" which is a climate science term for something like `groupby("time.month")` ("monthly climatology") or `groupby("time.dayofyear")` ("daily climatology"). In these cases, -1. Groups occur sequentially (day 2 is always after day 1; and February is always after January) -2. Groups are approximately periodic (some years have 365 days and others have 366) -The idea here is to copy xarray's subsetting strategy but instead index out "cohorts" or group labels -that tend to occur next to each other. +1. 
Groups occur sequentially (day 2 is always after day 1; and February is always after January) +1. Groups are approximately periodic (some years have 365 days and others have 366) -Consider this example of monthly average data; where 4 months are present in a single block (i.e. chunksize=4) +Consider our earlier example, `groupby("time.month")` with monthly frequency data and chunksize of 4 along `time`. ![cohorts-schematic](/../diagrams/cohorts-month-chunk4.png) -Because a chunksize of 4 evenly divides the number of groups (12) all we need to do is index out blocks +With `method="map-reduce", reindex=True`, each block will become 3x its original size at the blockwise step: input blocks have 4 timesteps while output block +has a value for all 12 months. Note that the blockwise groupby-reduction *does not reduce* the data since there is only one element in each +group. In addition, since `map-reduce` will make the final result have only one chunk of size 12 along the new `month` +dimension, the final result has chunk sizes 3x that of the input, which may not be ideal. + +However, because a chunksize of 4 evenly divides the number of groups (12) all we need to do is index out blocks 0, 3, 7 and then apply the `"map-reduce"` strategy to form the final result for months Jan-Apr. Repeat for the -remaining groups of months (May-Aug; Sep-Dec) and then concatenate. +remaining groups of months (May-Aug; Sep-Dec) and then concatenate. This is the essence of `method="cohorts"`. + +### Summary + +We can generalize this idea for more complicated problems (inspired by the `split_out` kwarg in `dask.dataframe.groupby`). +We first apply the groupby-reduction blockwise, then split and reindex blocks to create a new array with which we complete the reduction +using `map-reduce`. Because the split or shuffle step occurs after the blockwise reduction, we *sometimes* communicate a significantly smaller +amount of data than if we split or shuffled the input array. 
+ +```{image} /../diagrams/new-cohorts-annotated.svg +--- +alt: cohorts-strategy-schematic +width: 100% +--- +``` + +### Tradeoffs + +1. Group labels must be known at graph construction time, so this only works for numpy arrays. +1. This does require more tasks and a more complicated graph, but the communication overhead can be significantly lower. +1. The detection of "cohorts" is currently slow but could be improved. +1. The extra effort of detecting cohorts and multiple copying of intermediate blocks may be worthwhile only if the chunk sizes are small + relative to the approximate period of group labels, or small relative to the size of spatially localized groups. + +### Example : sensitivity to chunking + +One annoyance is that if the chunksize doesn't evenly divide the number of groups, we still end up splitting a number of chunks. +Consider our earlier example, `groupby("time.month")` with monthly frequency data and chunksize of 4 along `time`. +![cohorts-schematic](/../diagrams/cohorts-month-chunk4.png) `flox` can find these cohorts, below it identifies the cohorts with labels `1,2,3,4`; `5,6,7,8`, and `9,10,11,12`. -``` python ->>> flox.core.find_group_cohorts(labels, array.chunks[-1])) + +```python +>>> flox.core.find_group_cohorts(labels, array.chunks[-1]).values() [[[1, 2, 3, 4], [5, 6, 7, 8], [9, 10, 11, 12]] # 3 cohorts ``` -For each cohort, it counts the number of blocks that need to be reduced. If `1` then it applies the reduction blockwise. -If > 1; then it uses `"map-reduce"`. -One annoyance is that if the chunksize doesn't evenly divide the number of groups, we still end up splitting a number of chunks. -For example, when `chunksize=5` +Now consider `chunksize=5`. 
![cohorts-schematic](/../diagrams/cohorts-month-chunk5.png) -``` python ->>> flox.core.find_group_cohorts(labels, array.chunks[-1])) +```python +>>> flox.core.find_group_cohorts(labels, array.chunks[-1]).values() [[1], [2, 3], [4, 5], [6], [7, 8], [9, 10], [11], [12]] # 8 cohorts ``` -We find 8 cohorts (note the original xarray strategy is equivalent to constructing 12 cohorts). -It's possible that some initial rechunking makes the situation better (just rechunk from 5-4), but it isn't an obvious improvement. +We find 8 cohorts (note the original xarray strategy is equivalent to constructing 12 cohorts). +In this case, it seems better to rechunk to a size of `4` along `time`. If you have ideas for improving this case, please open an issue. -*Tradeoffs* -1. Generalizes well; when there's exactly one groups per chunk, this replicates Xarray's - strategy which is optimal. For resampling type reductions, as long as the array - is chunked appropriately ({py:func}`flox.core.rechunk_for_blockwise`, {py:func}`flox.xarray.rechunk_for_blockwise`), `method="cohorts"` is equivalent to `method="blockwise"`! -1. Group labels must be known at graph construction time, so this only works for numpy arrays -1. Currenltly implemented for grouping by 1D arrays. An nD generalization seems possible, - but hard? 
+### Example : spatial grouping diff --git a/docs/source/index.md b/docs/source/index.md index cf4c5c3ef..9fdd470c2 100644 --- a/docs/source/index.md +++ b/docs/source/index.md @@ -1,54 +1,73 @@ # flox: fast & furious GroupBy reductions for `dask.array` -## Overview - -[![GitHub Workflow CI Status](https://img.shields.io/github/workflow/status/xarray-contrib/flox/CI?logo=github&style=flat)](https://github.com/xarray-contrib/flox/actions) -[![GitHub Workflow Code Style Status](https://img.shields.io/github/workflow/status/xarray-contrib/flox/code-style?label=Code%20Style&style=flat)](https://github.com/xarray-contrib/flox/actions) +[![GitHub Workflow CI Status](https://img.shields.io/github/actions/workflow/status/xarray-contrib/flox/ci.yaml?branch=main&logo=github&style=flat)](https://github.com/xarray-contrib/flox/actions) +[![pre-commit.ci status](https://results.pre-commit.ci/badge/github/xarray-contrib/flox/main.svg)](https://results.pre-commit.ci/latest/github/xarray-contrib/flox/main) [![image](https://img.shields.io/codecov/c/github/xarray-contrib/flox.svg?style=flat)](https://codecov.io/gh/xarray-contrib/flox) +[![Documentation Status](https://readthedocs.org/projects/flox/badge/?version=latest)](https://flox.readthedocs.io/en/latest/?badge=latest) + [![PyPI](https://img.shields.io/pypi/v/flox.svg?style=flat)](https://pypi.org/project/flox/) [![Conda-forge](https://img.shields.io/conda/vn/conda-forge/flox.svg?style=flat)](https://anaconda.org/conda-forge/flox) -This project explores strategies for fast GroupBy reductions with dask.array. It used to be called `dask_groupby`. It was motivated by +[![NASA-80NSSC18M0156](https://img.shields.io/badge/NASA-80NSSC18M0156-blue)](https://earthdata.nasa.gov/esds/competitive-programs/access/pangeo-ml) +[![NASA-80NSSC22K0345](https://img.shields.io/badge/NASA-80NSSC22K0345-blue)](https://science.nasa.gov/open-science-overview) -1. Dask Dataframe GroupBy - [blogpost](https://blog.dask.org/2019/10/08/df-groupby) -2. 
numpy_groupies in Xarray - [issue](https://github.com/pydata/xarray/issues/4473) +## Overview + +`flox` mainly provides strategies for fast GroupBy reductions with dask.array. `flox` uses the MapReduce paradigm (or a "tree reduction") +to run the GroupBy operation in a parallel-native way totally avoiding a sort or shuffle operation. It was motivated by + +1. Dask Dataframe GroupBy + [blogpost](https://blog.dask.org/2019/10/08/df-groupby) +1. numpy_groupies in Xarray + [issue](https://github.com/pydata/xarray/issues/4473) See a presentation ([video](https://discourse.pangeo.io/t/november-17-2021-flox-fast-furious-groupby-reductions-with-dask-at-pangeo-scale/2016), [slides](https://docs.google.com/presentation/d/1YubKrwu9zPHC_CzVBhvORuQBW-z148BvX3Ne8XcvWsQ/edit?usp=sharing)) about this package, from the Pangeo Showcase. +## Why flox? + +1. {py:func}`flox.groupby_reduce` [wraps](engines.md) the `numpy-groupies` package for performant Groupby reductions on nD arrays. +1. {py:func}`flox.groupby_reduce` provides [parallel-friendly strategies](implementation.md) for GroupBy reductions by wrapping `numpy-groupies` for dask arrays. +1. `flox` [integrates with xarray](xarray.md) to provide more performant Groupby and Resampling operations. +1. {py:func}`flox.xarray.xarray_reduce` [extends](xarray.md) Xarray's GroupBy operations allowing lazy grouping by dask arrays, grouping by multiple arrays, + as well as combining categorical grouping and histogram-style binning operations using multiple variables. +1. `flox` also provides utility functions for rechunking both dask arrays and Xarray objects along a single dimension using the group labels as a guide: + 1. To rechunk for blockwise operations: {py:func}`flox.rechunk_for_blockwise`, {py:func}`flox.xarray.rechunk_for_blockwise`. + 1. To rechunk so that "cohorts", or groups of labels, tend to occur in the same chunks: {py:func}`flox.rechunk_for_cohorts`, {py:func}`flox.xarray.rechunk_for_cohorts`. 
+ ## Installing -``` shell +```shell $ pip install flox ``` -``` shell +```shell $ conda install -c conda-forge flox ``` -## API +## Acknowledgements -There are two main functions -1. {py:func}`flox.core.groupby_reduce` - "pure" dask array interface -1. {py:func}`flox.xarray.xarray_reduce` - "pure" xarray interface; though [work is ongoing](https://github.com/pydata/xarray/pull/5734) to integrate this - package in xarray. +This work was funded in part by -## Acknowledgements +1. NASA-ACCESS 80NSSC18M0156 "Community tools for analysis of NASA Earth Observing System + Data in the Cloud" (PI J. Hamman), +1. NASA-OSTFL 80NSSC22K0345 "Enhancing analysis of NASA data with the open-source Python Xarray Library" (PIs Scott Henderson, University of Washington; + Deepak Cherian, NCAR; Jessica Scheick, University of New Hampshire), and +1. [NCAR's Earth System Data Science Initiative](https://ncar.github.io/esds/). -This work was funded in part by NASA-ACCESS 80NSSC18M0156 "Community tools for analysis of NASA Earth Observing System -Data in the Cloud" (PI J. Hamman), and [NCAR's Earth System Data Science Initiative](https://ncar.github.io/esds/). It was motivated by many discussions in the [Pangeo](https://pangeo.io) community. ## Contents + ```{eval-rst} .. toctree:: :maxdepth: 1 + intro.md + aggregations.md + engines.md + arrays.md implementation.md - custom.md - api.rst + xarray.md user-stories.md + api.rst ``` diff --git a/docs/source/intro.md b/docs/source/intro.md new file mode 100644 index 000000000..4660b0cf9 --- /dev/null +++ b/docs/source/intro.md @@ -0,0 +1,186 @@ +--- +jupytext: + text_representation: + format_name: myst +kernelspec: + display_name: Python 3 + name: python3 +--- + +```{eval-rst} +.. 
currentmodule:: flox +``` + +# 10 minutes to flox + +## GroupBy single variable + +```{code-cell} +import numpy as np +import xarray as xr + +from flox.xarray import xarray_reduce + +labels = xr.DataArray( + [1, 2, 3, 1, 2, 3, 0, 0, 0], + dims="x", + name="label", +) +labels +``` + +### With numpy + +```{code-cell} +da = xr.DataArray( + np.ones((9,)), dims="x", name="array" +) +``` + +Apply the reduction using {py:func}`flox.xarray.xarray_reduce` specifying the reduction operation in `func` + +```{code-cell} +xarray_reduce(da, labels, func="sum") +``` + +### With dask + +Let's first chunk `da` and `labels` + +```{code-cell} +da_chunked = da.chunk(x=2) +labels_chunked = labels.chunk(x=3) +``` + +Grouping a dask array by a numpy array is unchanged + +```{code-cell} +xarray_reduce(da_chunked, labels, func="sum") +``` + +When grouping **by** a dask array, we need to specify the "expected group labels" on the output so we can construct the result DataArray. +Without the `expected_groups` kwarg, an error is raised + +```{code-cell} +--- +tags: [raises-exception] +--- +xarray_reduce(da_chunked, labels_chunked, func="sum") +``` + +Now we specify `expected_groups`: + +```{code-cell} +dask_result = xarray_reduce( + da_chunked, labels_chunked, func="sum", expected_groups=[0, 1, 2, 3], +) +dask_result +``` + +Note that any group labels not present in `expected_groups` will be ignored. +You can also provide `expected_groups` for the pure numpy GroupBy. 
+ +```{code-cell} +numpy_result = xarray_reduce( + da, labels, func="sum", expected_groups=[0, 1, 2, 3], +) +numpy_result +``` + +The two are identical: + +```{code-cell} +numpy_result.identical(dask_result) +``` + +## Binning by a single variable + +For binning, specify the bin edges in `expected_groups` using {py:class}`pandas.IntervalIndex`: + +```{code-cell} +import pandas as pd + +xarray_reduce( + da, + labels, + func="sum", + expected_groups=pd.IntervalIndex.from_breaks([0.5, 1.5, 2.5, 6]), +) +``` + +Similarly for dask inputs + +```{code-cell} +xarray_reduce( + da_chunked, + labels_chunked, + func="sum", + expected_groups=pd.IntervalIndex.from_breaks([0.5, 1.5, 2.5, 6]), +) +``` + +For more control over the binning (which edge is closed), pass the appropriate kwarg to {py:class}`pandas.IntervalIndex`: + +```{code-cell} +xarray_reduce( + da_chunked, + labels_chunked, + func="sum", + expected_groups=pd.IntervalIndex.from_breaks([0.5, 1.5, 2.5, 6], closed="left"), +) +``` + +## Grouping by multiple variables + +```{code-cell} +arr = np.ones((4, 12)) +labels1 = np.array(["a", "a", "c", "c", "c", "b", "b", "c", "c", "b", "b", "f"]) +labels2 = np.array([1, 2, 2, 1]) + +da = xr.DataArray( + arr, dims=("x", "y"), coords={"labels2": ("x", labels2), "labels1": ("y", labels1)} +) +da +``` + +To group by multiple variables simply pass them as `*args`: + +```{code-cell} +xarray_reduce(da, "labels1", "labels2", func="sum") +``` + +## Histogramming (Binning by multiple variables) + +An unweighted histogram is simply a groupby over multiple variables with `count`.
+ +```{code-cell} python +arr = np.ones((4, 12)) +labels1 = np.array(np.linspace(0, 10, 12)) +labels2 = np.array([1, 2, 2, 1]) + +da = xr.DataArray( + arr, dims=("x", "y"), coords={"labels2": ("x", labels2), "labels1": ("y", labels1)} +) +da +``` + +Specify bins in `expected_groups` + +```{code-cell} python +xarray_reduce( + da, + "labels1", + "labels2", + func="count", + expected_groups=( + pd.IntervalIndex.from_breaks([-0.5, 4.5, 6.5, 8.9]), # labels1 + pd.IntervalIndex.from_breaks([0.5, 1.5, 1.9]), # labels2 + ), +) +``` + +## Resampling + +Use the xarray interface i.e. `da.resample(time="M").mean()`. + +Optionally pass [`method="blockwise"`](method-blockwise): `da.resample(time="M").mean(method="blockwise")` diff --git a/docs/source/user-stories.md b/docs/source/user-stories.md index 2f190c63b..22b37939e 100644 --- a/docs/source/user-stories.md +++ b/docs/source/user-stories.md @@ -1,4 +1,4 @@ -# User Stories +# Tricks & Stories ```{eval-rst} .. toctree:: diff --git a/docs/source/user-stories/climatology-hourly.ipynb b/docs/source/user-stories/climatology-hourly.ipynb index 5fef37851..b17cdc1aa 100644 --- a/docs/source/user-stories/climatology-hourly.ipynb +++ b/docs/source/user-stories/climatology-hourly.ipynb @@ -25,7 +25,7 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": null, "id": "727f490e-906a-4537-ac5e-3c67985cd6d5", "metadata": {}, "outputs": [ @@ -55,7 +55,14 @@ } ], "source": [ + "import dask.array\n", + "import numpy as np\n", + "import pandas as pd\n", + "import xarray as xr\n", "from dask.distributed import Client\n", + "from distributed import performance_report\n", + "\n", + "import flox.xarray\n", "\n", "# Setup a local cluster.\n", "# By default this sets up 1 worker per core\n", @@ -65,7 +72,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": null, "id": "6085684f-cafa-450c-8448-d5c9c1cbb55f", "metadata": {}, "outputs": [ @@ -84,12 +91,7 @@ "source": [ "%load_ext watermark\n", "\n", - "import 
time\n", "\n", - "import dask.array\n", - "import numpy as np\n", - "import pandas as pd\n", - "import xarray as xr\n", "\n", "%watermark -iv" ] @@ -104,7 +106,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": null, "id": "2aa66559-b2dd-4b46-b32b-f1ce2270c3de", "metadata": {}, "outputs": [ @@ -614,7 +616,7 @@ " tp (time, latitude, longitude) float32 dask.array" ] }, - "execution_count": 3, + "execution_count": null, "metadata": {}, "output_type": "execute_result" } @@ -624,9 +626,7 @@ " {\n", " \"tp\": (\n", " (\"time\", \"latitude\", \"longitude\"),\n", - " dask.array.ones(\n", - " (8760, 721, 1440), chunks=(744, 50, 1440), dtype=np.float32\n", - " ),\n", + " dask.array.ones((8760, 721, 1440), chunks=(744, 50, 1440), dtype=np.float32),\n", " )\n", " },\n", " coords={\"time\": pd.date_range(\"2021-01-01\", \"2021-12-31 23:59\", freq=\"H\")},\n", @@ -644,7 +644,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": null, "id": "ecc77698-5879-4b7c-ad97-891fb104d295", "metadata": {}, "outputs": [ @@ -1162,7 +1162,7 @@ "Dimensions without coordinates: latitude, longitude" ] }, - "execution_count": 4, + "execution_count": null, "metadata": {}, "output_type": "execute_result" } @@ -1181,7 +1181,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": null, "id": "0a3da8e5-863a-4602-9176-0a9adc689563", "metadata": {}, "outputs": [ @@ -1663,31 +1663,27 @@ "Dimensions without coordinates: latitude, longitude" ] }, - "execution_count": 5, + "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "import flox.xarray\n", - "\n", "hourly = flox.xarray.xarray_reduce(ds.tp, ds.time.dt.hour, func=\"mean\")\n", "hourly" ] }, { "cell_type": "code", - "execution_count": 6, + "execution_count": null, "id": "8aa1a641-1ce1-4264-96dc-d11bb1d4ab57", "metadata": {}, "outputs": [], - "source": [ - "from distributed import performance_report" - ] + "source": [] }, { "cell_type": "code", - 
"execution_count": 7, + "execution_count": null, "id": "e37c5aa2-c77a-4d87-8db4-5052c675c42d", "metadata": {}, "outputs": [], @@ -1709,11 +1705,7 @@ } ], "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, + "keep_output": true, "language_info": { "codemirror_mode": { "name": "ipython", @@ -1723,266 +1715,10 @@ "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.9.12" + "pygments_lexer": "ipython3" }, "mystnb": { "execution_mode": "off" - }, - "widgets": { - "application/vnd.jupyter.widget-state+json": { - "state": { - "02bf99615dae4b7b9b2aac23acccc828": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": {} - }, - "06093fd4131d42749c5d32b149d36cbe": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": {} - }, - "23d59a300993407dabc70ab6282460ba": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "description_width": "" - } - }, - "2415e8902a9e4087827ebb98df678028": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "AccordionModel", - "state": { - "_titles": { - "0": "Manual Scaling", - "1": "Adaptive Scaling" - }, - "children": [ - "IPY_MODEL_fce763ee43d44833bfb73dc3ca34d18a", - "IPY_MODEL_5c81a669ef8d4e13921d9b6f3218fbe1" - ], - "layout": "IPY_MODEL_6e424b71aff3457baae281ef596e294a", - "selected_index": null - } - }, - "38ba388c3c144dd4af2d9487f9623f31": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "ButtonStyleModel", - "state": {} - }, - "3eb1fff965764a2aa70f35e59754a6e5": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": 
"DescriptionStyleModel", - "state": { - "description_width": "" - } - }, - "5bbcffef6cc04a6f893e5e8be12de433": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HTMLModel", - "state": { - "layout": "IPY_MODEL_02bf99615dae4b7b9b2aac23acccc828", - "style": "IPY_MODEL_726a881ed9644cd988b37c70dbe1957b", - "value": "
\n
\n
\n
\n

LocalCluster

\n

2b898a97

\n \n \n \n \n \n \n \n \n \n \n \n \n \n\n\n \n
\n Dashboard: http://127.0.0.1:51613/status\n \n Workers: 4\n
\n Total threads: 4\n \n Total memory: 8.00 GiB\n
Status: runningUsing processes: True
\n\n
\n \n

Scheduler Info

\n
\n\n
\n
\n
\n
\n

Scheduler

\n

Scheduler-e88043e1-f96c-408b-828a-6133edf9383e

\n \n \n \n \n \n \n \n \n \n \n \n \n \n
\n Comm: tcp://127.0.0.1:51614\n \n Workers: 4\n
\n Dashboard: http://127.0.0.1:51613/status\n \n Total threads: 4\n
\n Started: 8 minutes ago\n \n Total memory: 8.00 GiB\n
\n
\n
\n\n
\n \n

Workers

\n
\n\n \n
\n
\n
\n
\n \n

Worker: 0

\n
\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n\n \n\n \n\n
\n Comm: tcp://127.0.0.1:51625\n \n Total threads: 1\n
\n Dashboard: http://127.0.0.1:51632/status\n \n Memory: 2.00 GiB\n
\n Nanny: tcp://127.0.0.1:51618\n
\n Local directory: /Users/dcherian/work/python/flox/docs/source/user-stories/dask-worker-space/worker-sha7f1ls\n
\n
\n
\n
\n \n
\n
\n
\n
\n \n

Worker: 1

\n
\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n\n \n\n \n\n
\n Comm: tcp://127.0.0.1:51626\n \n Total threads: 1\n
\n Dashboard: http://127.0.0.1:51639/status\n \n Memory: 2.00 GiB\n
\n Nanny: tcp://127.0.0.1:51619\n
\n Local directory: /Users/dcherian/work/python/flox/docs/source/user-stories/dask-worker-space/worker-o21y4jdf\n
\n
\n
\n
\n \n
\n
\n
\n
\n \n

Worker: 2

\n
\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n\n \n\n \n\n
\n Comm: tcp://127.0.0.1:51631\n \n Total threads: 1\n
\n Dashboard: http://127.0.0.1:51640/status\n \n Memory: 2.00 GiB\n
\n Nanny: tcp://127.0.0.1:51617\n
\n Local directory: /Users/dcherian/work/python/flox/docs/source/user-stories/dask-worker-space/worker-ll8d_5ds\n
\n
\n
\n
\n \n
\n
\n
\n
\n \n

Worker: 3

\n
\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n\n \n\n \n\n
\n Comm: tcp://127.0.0.1:51628\n \n Total threads: 1\n
\n Dashboard: http://127.0.0.1:51638/status\n \n Memory: 2.00 GiB\n
\n Nanny: tcp://127.0.0.1:51620\n
\n Local directory: /Users/dcherian/work/python/flox/docs/source/user-stories/dask-worker-space/worker-t_4kkml1\n
\n
\n
\n
\n \n\n
\n
\n\n
\n
\n
" - } - }, - "5c81a669ef8d4e13921d9b6f3218fbe1": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HBoxModel", - "state": { - "children": [ - "IPY_MODEL_ebd6fbdbb6b149b8b71bc1adf4f98e8f", - "IPY_MODEL_e09755df3cd34c65adef354b74764926", - "IPY_MODEL_6d0480d5ac4243728c2e219060c4d160" - ], - "layout": "IPY_MODEL_b66ab102b9fc4ef69e3eb1a5a78f3211" - } - }, - "6a04758b6a5e4bbf8df42688a433ce7c": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": {} - }, - "6d0480d5ac4243728c2e219060c4d160": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "ButtonModel", - "state": { - "description": "Adapt", - "layout": "IPY_MODEL_f47c6dced9324bfca691f320e4697911", - "style": "IPY_MODEL_38ba388c3c144dd4af2d9487f9623f31" - } - }, - "6e424b71aff3457baae281ef596e294a": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "min_width": "500px" - } - }, - "726a881ed9644cd988b37c70dbe1957b": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "description_width": "" - } - }, - "7d9c070ca8c8451086d0d8f977c3769f": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "description_width": "" - } - }, - "91ba64ec63f74f7dbe2aa552c53368ee": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "VBoxModel", - "state": { - "children": [ - "IPY_MODEL_987a6ab24d514f3f91c40bce527c23cc", - "IPY_MODEL_2415e8902a9e4087827ebb98df678028" - ], - "layout": "IPY_MODEL_6a04758b6a5e4bbf8df42688a433ce7c" - } - }, - "987a6ab24d514f3f91c40bce527c23cc": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": 
"HTMLModel", - "state": { - "layout": "IPY_MODEL_9a05be3122214bcebda0395a1685bd18", - "style": "IPY_MODEL_d427f7f692f947b69e66bdf3a799ffe5", - "value": "\n \n \n \n
Scaling mode: Manual
Workers: 4
\n " - } - }, - "99676d05a3504002a88bbcfa7dca2ab7": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "ButtonStyleModel", - "state": {} - }, - "9a05be3122214bcebda0395a1685bd18": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": {} - }, - "a6fbe3d8ce864b40ac7ff3ed9cc28ee2": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "IntTextModel", - "state": { - "description": "Workers", - "layout": "IPY_MODEL_f47c6dced9324bfca691f320e4697911", - "step": 1, - "style": "IPY_MODEL_3eb1fff965764a2aa70f35e59754a6e5" - } - }, - "b30b5b56bbf24eea8658df01925e7cb9": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": {} - }, - "b66ab102b9fc4ef69e3eb1a5a78f3211": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": {} - }, - "d427f7f692f947b69e66bdf3a799ffe5": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "description_width": "" - } - }, - "e09755df3cd34c65adef354b74764926": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "IntTextModel", - "state": { - "description": "Maximum", - "layout": "IPY_MODEL_f47c6dced9324bfca691f320e4697911", - "step": 1, - "style": "IPY_MODEL_23d59a300993407dabc70ab6282460ba" - } - }, - "ebd6fbdbb6b149b8b71bc1adf4f98e8f": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "IntTextModel", - "state": { - "description": "Minimum", - "layout": "IPY_MODEL_f47c6dced9324bfca691f320e4697911", - "step": 1, - "style": "IPY_MODEL_7d9c070ca8c8451086d0d8f977c3769f" - } - }, - "f3fab32037ec4a8887a9d61036d93eed": { - "model_module": "@jupyter-widgets/controls", - 
"model_module_version": "1.5.0", - "model_name": "ButtonModel", - "state": { - "description": "Scale", - "layout": "IPY_MODEL_f47c6dced9324bfca691f320e4697911", - "style": "IPY_MODEL_99676d05a3504002a88bbcfa7dca2ab7" - } - }, - "f47c6dced9324bfca691f320e4697911": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "width": "150px" - } - }, - "fc1dd8438def4d75acee8602c544248c": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "TabModel", - "state": { - "_titles": { - "0": "Status", - "1": "Scaling" - }, - "children": [ - "IPY_MODEL_5bbcffef6cc04a6f893e5e8be12de433", - "IPY_MODEL_91ba64ec63f74f7dbe2aa552c53368ee" - ], - "layout": "IPY_MODEL_b30b5b56bbf24eea8658df01925e7cb9" - } - }, - "fce763ee43d44833bfb73dc3ca34d18a": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HBoxModel", - "state": { - "children": [ - "IPY_MODEL_a6fbe3d8ce864b40ac7ff3ed9cc28ee2", - "IPY_MODEL_f3fab32037ec4a8887a9d61036d93eed" - ], - "layout": "IPY_MODEL_06093fd4131d42749c5d32b149d36cbe" - } - } - }, - "version_major": 2, - "version_minor": 0 - } } }, "nbformat": 4, diff --git a/docs/source/user-stories/climatology.ipynb b/docs/source/user-stories/climatology.ipynb index 3fd7ae55d..bc867eda4 100644 --- a/docs/source/user-stories/climatology.ipynb +++ b/docs/source/user-stories/climatology.ipynb @@ -22,12 +22,13 @@ "outputs": [], "source": [ "import dask.array\n", - "import flox\n", - "import flox.xarray\n", "import matplotlib.pyplot as plt\n", "import numpy as np\n", "import pandas as pd\n", - "import xarray as xr" + "import xarray as xr\n", + "\n", + "import flox\n", + "import flox.xarray" ] }, { @@ -49,9 +50,7 @@ "oisst = xr.DataArray(\n", " dask.array.ones((14532, 720, 1440), chunks=(20, -1, -1)),\n", " dims=(\"time\", \"lat\", \"lon\"),\n", - " coords={\n", - " \"time\": pd.date_range(\"1981-09-01 12:00\", 
\"2021-06-14 12:00\", freq=\"D\")\n", - " },\n", + " coords={\"time\": pd.date_range(\"1981-09-01 12:00\", \"2021-06-14 12:00\", freq=\"D\")},\n", " name=\"sst\",\n", ")\n", "oisst" @@ -177,7 +176,7 @@ "flox.core.find_group_cohorts(\n", " labels=oisst.time.dt.dayofyear.data,\n", " chunks=(oisst.chunksizes[\"time\"],),\n", - ")" + ").values()" ] }, { @@ -300,7 +299,7 @@ "flox.core.find_group_cohorts(\n", " labels=rechunked.time.dt.dayofyear.data,\n", " chunks=(rechunked.chunksizes[\"time\"],),\n", - ")" + ").values()" ] }, { @@ -319,9 +318,7 @@ "metadata": {}, "outputs": [], "source": [ - "flox.xarray.xarray_reduce(\n", - " rechunked, rechunked.time.dt.dayofyear, func=\"mean\", method=\"cohorts\"\n", - ")" + "flox.xarray.xarray_reduce(rechunked, rechunked.time.dt.dayofyear, func=\"mean\", method=\"cohorts\")" ] }, { @@ -362,11 +359,6 @@ } ], "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, "language_info": { "codemirror_mode": { "name": "ipython", @@ -376,15 +368,7 @@ "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.1" - }, - "widgets": { - "application/vnd.jupyter.widget-state+json": { - "state": {}, - "version_major": 2, - "version_minor": 0 - } + "pygments_lexer": "ipython3" } }, "nbformat": 4, diff --git a/docs/source/user-stories/custom-aggregations.ipynb b/docs/source/user-stories/custom-aggregations.ipynb index d76c20f23..7b4167b98 100644 --- a/docs/source/user-stories/custom-aggregations.ipynb +++ b/docs/source/user-stories/custom-aggregations.ipynb @@ -21,393 +21,19 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": null, "id": "8c6fcc42-b081-44fa-acf7-a95ec4ed75d2", "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "
<xarray.DataArray 'label' (profile: 28397)>\n",
-       "array([3, 2, 4, ..., 4, 1, 6])\n",
-       "Coordinates:\n",
-       "    lat      (profile) float64 -37.18 11.79 -40.99 17.31 ... -20.21 9.844 63.56\n",
-       "    lon      (profile) float64 130.9 53.86 66.59 161.0 ... -140.2 -30.68 -44.73\n",
-       "Dimensions without coordinates: profile
" - ], - "text/plain": [ - "\n", - "array([3, 2, 4, ..., 4, 1, 6])\n", - "Coordinates:\n", - " lat (profile) float64 -37.18 11.79 -40.99 17.31 ... -20.21 9.844 63.56\n", - " lon (profile) float64 130.9 53.86 66.59 161.0 ... -140.2 -30.68 -44.73\n", - "Dimensions without coordinates: profile" - ] - }, - "execution_count": 1, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ - "import flox.xarray\n", - "import matplotlib.pyplot as plt\n", "import numpy as np\n", + "import numpy_groupies as npg\n", "import xarray as xr\n", "\n", + "import flox.xarray\n", + "from flox import Aggregation\n", + "from flox.aggregations import mean\n", + "\n", "# define latitude and longitude bins\n", "binsize = 1.0 # 1Β°x1Β° bins\n", "lon_min, lon_max, lat_min, lat_max = [-180, 180, -65, 65]\n", @@ -447,37 +73,10 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": null, "id": "c0a7f29f-311c-41fd-b03b-33ba7ffccfc6", "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 2, - "metadata": {}, - "output_type": "execute_result" - }, - { - "data": { - "image/png": "\n", - "text/plain": [ - "
" - ] - }, - "metadata": { - "image/png": { - "height": 266, - "width": 378 - }, - "needs_background": "light" - }, - "output_type": "display_data" - } - ], + "outputs": [], "source": [ "binned_mean = flox.xarray.xarray_reduce(\n", " da,\n", @@ -507,36 +106,11 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": null, "id": "574b93ef-dd73-4a98-bd53-69119d5d97c0", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n" - ] - }, - { - "data": { - "text/plain": [ - "mean, fill: [dict_values([, (0, 0)])], dtype: {'mean': , 'intermediate': (None, )}\n", - "chunk: ('sum', 'nanlen')\n", - "combine: ('sum', 'sum')\n", - "aggregate: sum\n", - "finalize: at 0x10966c670>\n", - "min_count: None" - ] - }, - "execution_count": 3, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ - "from flox.aggregations import mean\n", - "\n", "print(type(mean))\n", "mean" ] @@ -622,18 +196,12 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": null, "id": "05b8a1e5-e865-4b25-8540-df5aa6c218e9", "metadata": {}, "outputs": [], "source": [ - "import numpy_groupies as npg\n", - "\n", - "\n", - "def grouped_median(\n", - " group_idx, array, *, axis=-1, size=None, fill_value=None, dtype=None\n", - "):\n", - "\n", + "def grouped_median(group_idx, array, *, axis=-1, size=None, fill_value=None, dtype=None):\n", " return npg.aggregate_numpy.aggregate(\n", " group_idx,\n", " array,\n", @@ -655,29 +223,11 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": null, "id": "07c0fc82-c77b-4472-9de7-3c4a7cf3e07e", "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "median, fill: [dict_values([, (-1,)])], dtype: {'median': None, 'intermediate': (None,)}\n", - "chunk: (None,)\n", - "combine: (None,)\n", - "aggregate: None\n", - "finalize: . 
at 0x10ec78550>\n", - "min_count: None" - ] - }, - "execution_count": 5, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ - "from flox import Aggregation\n", - "\n", "agg_median = Aggregation(\n", " name=\"median\",\n", " numpy=grouped_median,\n", @@ -698,529 +248,10 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": null, "id": "df85a390-99dd-432f-b248-6160935deb52", "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "
<xarray.DataArray 'label' (lat_bins: 129, lon_bins: 359)>\n",
-       "array([[3. , 1. , nan, ..., nan, nan, nan],\n",
-       "       [nan, 4. , 2.5, ..., nan, 0. , 3.5],\n",
-       "       [nan, 2. , nan, ..., 5. , nan, nan],\n",
-       "       ...,\n",
-       "       [nan, 1. , 5. , ..., 2. , nan, 6. ],\n",
-       "       [nan, 3. , 5. , ..., 1. , 2. , 1. ],\n",
-       "       [2. , 6. , 5. , ..., nan, 5. , 6. ]])\n",
-       "Coordinates:\n",
-       "  * lat_bins  (lat_bins) object (-65.0, -64.0] (-64.0, -63.0] ... (63.0, 64.0]\n",
-       "  * lon_bins  (lon_bins) object (-180.0, -179.0] ... (178.0, 179.0]
" - ], - "text/plain": [ - "\n", - "array([[3. , 1. , nan, ..., nan, nan, nan],\n", - " [nan, 4. , 2.5, ..., nan, 0. , 3.5],\n", - " [nan, 2. , nan, ..., 5. , nan, nan],\n", - " ...,\n", - " [nan, 1. , 5. , ..., 2. , nan, 6. ],\n", - " [nan, 3. , 5. , ..., 1. , 2. , 1. ],\n", - " [2. , 6. , 5. , ..., nan, 5. , 6. ]])\n", - "Coordinates:\n", - " * lat_bins (lat_bins) object (-65.0, -64.0] (-64.0, -63.0] ... (63.0, 64.0]\n", - " * lon_bins (lon_bins) object (-180.0, -179.0] ... (178.0, 179.0]" - ] - }, - "execution_count": 6, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "flox.xarray.xarray_reduce(\n", " da,\n", @@ -1235,11 +266,6 @@ } ], "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, "language_info": { "codemirror_mode": { "name": "ipython", @@ -1249,15 +275,7 @@ "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.9.10" - }, - "widgets": { - "application/vnd.jupyter.widget-state+json": { - "state": {}, - "version_major": 2, - "version_minor": 0 - } + "pygments_lexer": "ipython3" } }, "nbformat": 4, diff --git a/docs/source/user-stories/overlaps.md b/docs/source/user-stories/overlaps.md index c854f4c2b..eebcd47a3 100644 --- a/docs/source/user-stories/overlaps.md +++ b/docs/source/user-stories/overlaps.md @@ -19,6 +19,7 @@ globally, as well as over the Atlantic, and the Indo-Pacific. Generally group-by groups. In this example, the "global" group overlaps with the "Indo-Pacific" and "Atlantic" groups. Below we consider a simplified version of this problem. Consider the following labels: + ```{code-cell} import numpy as np import xarray as xr @@ -34,10 +35,12 @@ labels ``` These labels are non-overlapping. 
So when we reduce this data array over those labels along `x` + ```{code-cell} da = xr.ones_like(labels) da ``` + we get (note the reduction over `x` is implicit here): ```{code-cell} @@ -47,6 +50,7 @@ xarray_reduce(da, labels, func="sum") Now let's _also_ calculate the `sum` where `labels` is either `1` or `2`. We could easily compute this using the grouped result but here we use this simple example for illustration. The trick is to add a new dimension with new labels (here `4`) in the appropriate locations. + ```{code-cell} # assign 4 where label == 1 or 2, and -1 otherwise newlabels = xr.where(labels.isin([1, 2]), 4, -1) @@ -59,6 +63,7 @@ expanded Now we reduce over `x` _and_ the new dimension `y` (again implicitly) to get the appropriate sum under `label=4` (and `label=-1`). We can discard the value accumulated under `label=-1` later. + ```{code-cell} xarray_reduce(da, expanded, func="sum") ``` @@ -66,6 +71,7 @@ xarray_reduce(da, expanded, func="sum") This way we compute all the reductions we need, in a single pass over the data. This technique generalizes to more complicated aggregations. The trick is to + - generate appropriate labels - concatenate these new labels along a new dimension (`y`) absent on the object being reduced (`da`), and - reduce over that new dimension in addition to any others. diff --git a/docs/source/xarray.md b/docs/source/xarray.md new file mode 100644 index 000000000..1877079cf --- /dev/null +++ b/docs/source/xarray.md @@ -0,0 +1,32 @@ +(xarray)= + +# Xarray + +Xarray will use flox by default (if installed) for DataArrays containing numpy and dask arrays. The default choice is `method="cohorts"` which generalizes +the best. 
Pass flox-specific kwargs to the specific reduction method: + +```python +ds.groupby("time.month").mean(method="map-reduce", engine="flox") +ds.groupby_bins("lon", bins=[0, 10, 20]).mean(method="map-reduce") +ds.resample(time="M").mean(method="blockwise") +``` + +Xarray's GroupBy operations are currently limited: + +1. One can only group by a single variable. +1. When grouping by a dask array, that array will be computed to discover the unique group labels and their locations. + +These limitations can be avoided by using {py:func}`flox.xarray.xarray_reduce` which allows grouping by multiple variables, lazy grouping by dask variables, +as well as an arbitrary combination of categorical grouping and binning. For example, + +```python +flox.xarray.xarray_reduce( + ds, + ds.time.dt.month, + ds.lon, + func="mean", + expected_groups=[None, [0, 10, 20]], + isbin=[False, True], + method="map-reduce", +) +``` diff --git a/flox/__init__.py b/flox/__init__.py index 5647a4ee0..cc44cbca0 100644 --- a/flox/__init__.py +++ b/flox/__init__.py @@ -5,15 +5,14 @@ from .aggregations import Aggregation # noqa from .core import groupby_reduce, rechunk_for_blockwise, rechunk_for_cohorts # noqa -try: - from importlib.metadata import version as _version -except ImportError: - # if the fallback library is missing, we are doomed. - from importlib_metadata import version as _version # type: ignore[no-redef] -try: - __version__ = _version("flox") -except Exception: - # Local copy or not installed with setuptools. - # Disable minimum version checks on downstream libraries.
+def _get_version(): __version__ = "999" + try: + from ._version import __version__ + except ImportError: + pass + return __version__ + + +__version__ = _get_version() diff --git a/flox/aggregate_flox.py b/flox/aggregate_flox.py index 62a760653..4df3f77a4 100644 --- a/flox/aggregate_flox.py +++ b/flox/aggregate_flox.py @@ -77,7 +77,6 @@ def _nan_grouped_op(group_idx, array, func, fillna, *args, **kwargs): def sum_of_squares(group_idx, array, *, axis=-1, size=None, fill_value=None, dtype=None): - return sum( group_idx, array**2, @@ -107,7 +106,8 @@ def mean(group_idx, array, *, axis=-1, size=None, fill_value=None, dtype=None): if fill_value is None: fill_value = 0 out = sum(group_idx, array, axis=axis, size=size, dtype=dtype, fill_value=fill_value) - out /= nanlen(group_idx, array, size=size, axis=axis, fill_value=0) + with np.errstate(invalid="ignore", divide="ignore"): + out /= nanlen(group_idx, array, size=size, axis=axis, fill_value=0) return out @@ -115,5 +115,6 @@ def nanmean(group_idx, array, *, axis=-1, size=None, fill_value=None, dtype=None if fill_value is None: fill_value = 0 out = nansum(group_idx, array, size=size, axis=axis, dtype=dtype, fill_value=fill_value) - out /= nanlen(group_idx, array, size=size, axis=axis, fill_value=0) + with np.errstate(invalid="ignore", divide="ignore"): + out /= nanlen(group_idx, array, size=size, axis=axis, fill_value=0) return out diff --git a/flox/aggregate_npg.py b/flox/aggregate_npg.py index 8015f67b5..30e0eb257 100644 --- a/flox/aggregate_npg.py +++ b/flox/aggregate_npg.py @@ -9,14 +9,41 @@ def _get_aggregate(engine): def sum_of_squares( - group_idx, array, engine, *, axis=-1, func="sum", size=None, fill_value=None, dtype=None + group_idx, + array, + engine, + *, + axis=-1, + size=None, + fill_value=None, + dtype=None, ): + return _get_aggregate(engine).aggregate( + group_idx, + array, + axis=axis, + func="sumofsquares", + size=size, + fill_value=fill_value, + dtype=dtype, + ) + +def nansum_of_squares( + group_idx, + 
array, + engine, + *, + axis=-1, + size=None, + fill_value=None, + dtype=None, +): return _get_aggregate(engine).aggregate( group_idx, - array**2, + array, axis=axis, - func=func, + func="nansumofsquares", size=size, fill_value=fill_value, dtype=dtype, @@ -55,19 +82,6 @@ def nanprod(group_idx, array, engine, *, axis=-1, size=None, fill_value=None, dt ) -def nansum_of_squares(group_idx, array, engine, *, axis=-1, size=None, fill_value=None, dtype=None): - return sum_of_squares( - group_idx, - array, - engine=engine, - func="nansum", - size=size, - fill_value=fill_value, - axis=axis, - dtype=dtype, - ) - - def _len(group_idx, array, engine, *, func, axis=-1, size=None, fill_value=None, dtype=None): result = _get_aggregate(engine).aggregate( group_idx, diff --git a/flox/aggregations.py b/flox/aggregations.py index a9be99ba0..21ac9925b 100644 --- a/flox/aggregations.py +++ b/flox/aggregations.py @@ -1,12 +1,19 @@ from __future__ import annotations import copy +import warnings from functools import partial +from typing import TYPE_CHECKING, Any, Callable, TypedDict import numpy as np import numpy_groupies as npg +from numpy.typing import DTypeLike -from . import aggregate_flox, aggregate_npg, xrdtypes as dtypes, xrutils +from . import aggregate_flox, aggregate_npg, xrutils +from . import xrdtypes as dtypes + +if TYPE_CHECKING: + FuncTuple = tuple[Callable | str, ...] def _is_arg_reduction(func: str | Aggregation) -> bool: @@ -17,6 +24,17 @@ def _is_arg_reduction(func: str | Aggregation) -> bool: return False +class AggDtypeInit(TypedDict): + final: DTypeLike | None + intermediate: tuple[DTypeLike, ...] + + +class AggDtype(TypedDict): + final: np.dtype + numpy: tuple[np.dtype | type[np.intp], ...] + intermediate: tuple[np.dtype | type[np.intp], ...] 
+ + def generic_aggregate( group_idx, array, @@ -48,7 +66,7 @@ def generic_aggregate( method_ = getattr(aggregate_npg, func) method = partial(method_, engine=engine) except AttributeError: - aggregate = npg.aggregate_np if engine == "numpy" else npg.aggregate_nb + aggregate = aggregate_npg._get_aggregate(engine).aggregate method = partial(aggregate, func=func) else: raise ValueError( @@ -57,12 +75,15 @@ def generic_aggregate( group_idx = np.asarray(group_idx, like=array) - return method( - group_idx, array, axis=axis, size=size, fill_value=fill_value, dtype=dtype, **kwargs - ) + with warnings.catch_warnings(): + warnings.filterwarnings("ignore", r"All-NaN (slice|axis) encountered") + result = method( + group_idx, array, axis=axis, size=size, fill_value=fill_value, dtype=dtype, **kwargs + ) + return result -def _normalize_dtype(dtype, array_dtype, fill_value=None): +def _normalize_dtype(dtype: DTypeLike, array_dtype: np.dtype, fill_value=None) -> np.dtype: if dtype is None: dtype = array_dtype if dtype is np.floating: @@ -108,16 +129,16 @@ def __init__( self, name, *, - numpy=None, - chunk, - combine, - preprocess=None, - aggregate=None, - finalize=None, + numpy: str | FuncTuple | None = None, + chunk: str | FuncTuple | None, + combine: str | FuncTuple | None, + preprocess: Callable | None = None, + aggregate: Callable | None = None, + finalize: Callable | None = None, fill_value=None, final_fill_value=dtypes.NA, dtypes=None, - final_dtype=None, + final_dtype: DTypeLike | None = None, reduction_type="reduce", ): """ @@ -167,15 +188,17 @@ def __init__( self.preprocess = preprocess # Use "chunk_reduce" or "chunk_argreduce" self.reduction_type = reduction_type - self.numpy = (numpy,) if numpy else (self.name,) + self.numpy: FuncTuple = (numpy,) if numpy else (self.name,) # initialize blockwise reduction - self.chunk = _atleast_1d(chunk) + self.chunk: FuncTuple = _atleast_1d(chunk) # how to aggregate results after first round of reduction - self.combine = 
_atleast_1d(combine) + self.combine: FuncTuple = _atleast_1d(combine) + # simpler reductions used with the "simple combine" algorithm + self.simple_combine: tuple[Callable, ...] = () # final aggregation - self.aggregate = aggregate if aggregate else self.combine[0] + self.aggregate: Callable | str = aggregate if aggregate else self.combine[0] # finalize results (see mean) - self.finalize = finalize if finalize else lambda x: x + self.finalize: Callable | None = finalize self.fill_value = {} # This is used for the final reindexing @@ -185,13 +208,15 @@ def __init__( # They should make sense when aggregated together with results from other blocks self.fill_value["intermediate"] = self._normalize_dtype_fill_value(fill_value, "fill_value") - self.dtype = {} - self.dtype[name] = final_dtype - self.dtype["intermediate"] = self._normalize_dtype_fill_value(dtypes, "dtype") + self.dtype_init: AggDtypeInit = { + "final": final_dtype, + "intermediate": self._normalize_dtype_fill_value(dtypes, "dtype"), + } + self.dtype: AggDtype = None # type: ignore # The following are set by _initialize_aggregation - self.finalize_kwargs = {} - self.min_count = None + self.finalize_kwargs: dict[Any, Any] = {} + self.min_count: int = 0 def _normalize_dtype_fill_value(self, value, name): value = _atleast_1d(value) @@ -216,15 +241,15 @@ def __dask_tokenize__(self): self.dtype, ) - def __repr__(self): + def __repr__(self) -> str: return "\n".join( ( - f"{self.name}, fill: {np.unique(self.fill_value.values())}, dtype: {self.dtype}", - f"chunk: {self.chunk}", - f"combine: {self.combine}", - f"aggregate: {self.aggregate}", - f"finalize: {self.finalize}", - f"min_count: {self.min_count}", + f"{self.name!r}, fill: {self.fill_value.values()!r}, dtype: {self.dtype}", + f"chunk: {self.chunk!r}", + f"combine: {self.combine!r}", + f"aggregate: {self.aggregate!r}", + f"finalize: {self.finalize!r}", + f"min_count: {self.min_count!r}", ) ) @@ -252,11 +277,18 @@ def __repr__(self): fill_value=1, 
final_fill_value=dtypes.NA, ) + + +def _mean_finalize(sum_, count): + with np.errstate(invalid="ignore", divide="ignore"): + return sum_ / count + + mean = Aggregation( "mean", chunk=("sum", "nanlen"), combine=("sum", "sum"), - finalize=lambda sum_, count: sum_ / count, + finalize=_mean_finalize, fill_value=(0, 0), dtypes=(None, np.intp), final_dtype=np.floating, @@ -265,7 +297,7 @@ def __repr__(self): "nanmean", chunk=("nansum", "nanlen"), combine=("sum", "sum"), - finalize=lambda sum_, count: sum_ / count, + finalize=_mean_finalize, fill_value=(0, 0), dtypes=(None, np.intp), final_dtype=np.floating, @@ -274,7 +306,8 @@ def __repr__(self): # TODO: fix this for complex numbers def _var_finalize(sumsq, sum_, count, ddof=0): - result = (sumsq - (sum_**2 / count)) / (count - ddof) + with np.errstate(invalid="ignore", divide="ignore"): + result = (sumsq - (sum_**2 / count)) / (count - ddof) result[count <= ddof] = np.nan return result @@ -361,6 +394,10 @@ def _zip_index(array_, idx_): ) +def _pick_second(*x): + return x[1] + + argmax = Aggregation( "argmax", preprocess=argreduce_preprocess, @@ -369,7 +406,7 @@ def _zip_index(array_, idx_): reduction_type="argreduce", fill_value=(dtypes.NINF, 0), final_fill_value=-1, - finalize=lambda *x: x[1], + finalize=_pick_second, dtypes=(None, np.intp), final_dtype=np.intp, ) @@ -382,7 +419,7 @@ def _zip_index(array_, idx_): reduction_type="argreduce", fill_value=(dtypes.INF, 0), final_fill_value=-1, - finalize=lambda *x: x[1], + finalize=_pick_second, dtypes=(None, np.intp), final_dtype=np.intp, ) @@ -393,9 +430,9 @@ def _zip_index(array_, idx_): chunk=("nanmax", "nanargmax"), # order is important combine=("max", "argmax"), reduction_type="argreduce", - fill_value=(dtypes.NINF, -1), + fill_value=(dtypes.NINF, 0), final_fill_value=-1, - finalize=lambda *x: x[1], + finalize=_pick_second, dtypes=(None, np.intp), final_dtype=np.intp, ) @@ -406,9 +443,9 @@ def _zip_index(array_, idx_): chunk=("nanmin", "nanargmin"), # order is 
important combine=("min", "argmin"), reduction_type="argreduce", - fill_value=(dtypes.INF, -1), + fill_value=(dtypes.INF, 0), final_fill_value=-1, - finalize=lambda *x: x[1], + finalize=_pick_second, dtypes=(None, np.intp), final_dtype=np.intp, ) @@ -476,8 +513,8 @@ def _initialize_aggregation( dtype, array_dtype, fill_value, - min_count: int | None, - finalize_kwargs, + min_count: int, + finalize_kwargs: dict[Any, Any] | None, ) -> Aggregation: if not isinstance(func, Aggregation): try: @@ -495,24 +532,30 @@ def _initialize_aggregation( # np.dtype(None) == np.dtype("float64")!!! # so check for not None - if dtype is not None and not isinstance(dtype, np.dtype): - dtype = np.dtype(dtype) + dtype_: np.dtype | None = ( + np.dtype(dtype) if dtype is not None and not isinstance(dtype, np.dtype) else dtype + ) - agg.dtype[func] = _normalize_dtype(dtype or agg.dtype[func], array_dtype, fill_value) - agg.dtype["numpy"] = (agg.dtype[func],) - agg.dtype["intermediate"] = [ - _normalize_dtype(int_dtype, np.result_type(array_dtype, agg.dtype[func]), int_fv) - if int_dtype is None - else int_dtype - for int_dtype, int_fv in zip(agg.dtype["intermediate"], agg.fill_value["intermediate"]) - ] + final_dtype = _normalize_dtype(dtype_ or agg.dtype_init["final"], array_dtype, fill_value) + agg.dtype = { + "final": final_dtype, + "numpy": (final_dtype,), + "intermediate": tuple( + _normalize_dtype(int_dtype, np.result_type(array_dtype, final_dtype), int_fv) + if int_dtype is None + else np.dtype(int_dtype) + for int_dtype, int_fv in zip( + agg.dtype_init["intermediate"], agg.fill_value["intermediate"] + ) + ), + } # Replace sentinel fill values according to dtype agg.fill_value["intermediate"] = tuple( _get_fill_value(dt, fv) for dt, fv in zip(agg.dtype["intermediate"], agg.fill_value["intermediate"]) ) - agg.fill_value[func] = _get_fill_value(agg.dtype[func], agg.fill_value[func]) + agg.fill_value[func] = _get_fill_value(agg.dtype["final"], agg.fill_value[func]) fv = fill_value if 
fill_value is not None else agg.fill_value[agg.name] if _is_arg_reduction(agg): @@ -530,7 +573,7 @@ def _initialize_aggregation( # absent in one block, but present in another block # We set it for numpy to get nansum, nanprod tests to pass # where the identity element is 0, 1 - if min_count is not None: + if min_count > 0: agg.min_count = min_count agg.chunk += ("nanlen",) agg.numpy += ("nanlen",) @@ -539,5 +582,19 @@ def _initialize_aggregation( agg.fill_value["numpy"] += (0,) agg.dtype["intermediate"] += (np.intp,) agg.dtype["numpy"] += (np.intp,) + else: + agg.min_count = 0 + + simple_combine: list[Callable] = [] + for combine in agg.combine: + if isinstance(combine, str): + if combine in ["nanfirst", "nanlast"]: + simple_combine.append(getattr(xrutils, combine)) + else: + simple_combine.append(getattr(np, combine)) + else: + simple_combine.append(combine) + + agg.simple_combine = tuple(simple_combine) return agg diff --git a/flox/core.py b/flox/core.py index 6bd390137..f8f700f99 100644 --- a/flox/core.py +++ b/flox/core.py @@ -4,10 +4,22 @@ import itertools import math import operator +import sys +import warnings from collections import namedtuple from functools import partial, reduce from numbers import Integral -from typing import TYPE_CHECKING, Any, Callable, Dict, Literal, Mapping, Sequence, Union +from typing import ( + TYPE_CHECKING, + Any, + Callable, + Dict, + Literal, + Mapping, + Sequence, + Union, + overload, +) import numpy as np import numpy_groupies as npg @@ -26,9 +38,28 @@ from .xrutils import is_duck_array, is_duck_dask_array, isnull if TYPE_CHECKING: + try: + if sys.version_info < (3, 11): + from typing_extensions import Unpack + else: + from typing import Unpack + except (ModuleNotFoundError, ImportError): + Unpack: Any # type: ignore + import dask.array.Array as DaskArray - T_ExpectedGroups = Union[Sequence, np.ndarray, pd.Index] + T_DuckArray = Union[np.ndarray, DaskArray] # Any ? + T_By = T_DuckArray + T_Bys = tuple[T_By, ...] 
+ T_ExpectIndex = Union[pd.Index] + T_ExpectIndexTuple = tuple[T_ExpectIndex, ...] + T_ExpectIndexOpt = Union[T_ExpectIndex, None] + T_ExpectIndexOptTuple = tuple[T_ExpectIndexOpt, ...] + T_Expect = Union[Sequence, np.ndarray, T_ExpectIndex] + T_ExpectTuple = tuple[T_Expect, ...] + T_ExpectOpt = Union[Sequence, np.ndarray, T_ExpectIndexOpt] + T_ExpectOptTuple = tuple[T_ExpectOpt, ...] + T_ExpectedGroups = Union[T_Expect, T_ExpectOptTuple] T_ExpectedGroupsOpt = Union[T_ExpectedGroups, None] T_Func = Union[str, Callable] T_Funcs = Union[T_Func, Sequence[T_Func]] @@ -39,8 +70,7 @@ T_Dtypes = Union[np.typing.DTypeLike, Sequence[np.typing.DTypeLike], None] T_FillValues = Union[np.typing.ArrayLike, Sequence[np.typing.ArrayLike], None] T_Engine = Literal["flox", "numpy", "numba"] - T_MethodCohorts = Literal["cohorts", "split-reduce"] - T_Method = Literal["map-reduce", "blockwise", T_MethodCohorts] + T_Method = Literal["map-reduce", "blockwise", "cohorts"] T_IsBins = Union[bool | Sequence[bool]] @@ -68,7 +98,11 @@ def _is_minmax_reduction(func: T_Agg) -> bool: ) -def _get_expected_groups(by, sort: bool) -> pd.Index: +def _is_first_last_reduction(func: T_Agg) -> bool: + return isinstance(func, str) and func in ["nanfirst", "nanlast", "first", "last"] + + +def _get_expected_groups(by: T_By, sort: bool) -> T_ExpectIndex: if is_duck_dask_array(by): raise ValueError("Please provide expected_groups if not grouping by a numpy array.") flatby = by.reshape(-1) @@ -76,7 +110,7 @@ def _get_expected_groups(by, sort: bool) -> pd.Index: return _convert_expected_groups_to_index((expected,), isbin=(False,), sort=sort)[0] -def _get_chunk_reduction(reduction_type: str) -> Callable: +def _get_chunk_reduction(reduction_type: Literal["reduce", "argreduce"]) -> Callable: if reduction_type == "reduce": return chunk_reduce elif reduction_type == "argreduce": @@ -120,7 +154,7 @@ def _get_optimal_chunks_for_groups(chunks, labels): firstidx = first_indexes[labels_at_chunk_bounds] newchunkidx = [0] - 
for c, f, l in zip(chunkidx, firstidx, lastidx): + for c, f, l in zip(chunkidx, firstidx, lastidx): # noqa Ξ”f = abs(c - f) Ξ”l = abs(c - l) if c == 0 or newchunkidx[-1] > l: @@ -137,14 +171,14 @@ def _get_optimal_chunks_for_groups(chunks, labels): return tuple(newchunks) -def _unique(a): +def _unique(a: np.ndarray) -> np.ndarray: """Much faster to use pandas unique and sort the results. np.unique sorts before uniquifying and is slow.""" - return np.sort(pd.unique(a)) + return np.sort(pd.unique(a.reshape(-1))) @memoize -def find_group_cohorts(labels, chunks, merge: bool = True): +def find_group_cohorts(labels, chunks, merge: bool = True) -> dict: """ Finds groups labels that occur together aka "cohorts" @@ -161,8 +195,6 @@ def find_group_cohorts(labels, chunks, merge: bool = True): merge : bool, optional Attempt to merge cohorts when one cohort's chunks are a subset of another cohort's chunks. - method : ["split-reduce", "cohorts"], optional - Which method are we using? Returns ------- @@ -179,6 +211,7 @@ def find_group_cohorts(labels, chunks, merge: bool = True): axis = range(-labels.ndim, 0) # Easier to create a dask array and use the .blocks property array = dask.array.ones(tuple(sum(c) for c in chunks), chunks=chunks) + labels = np.broadcast_to(labels, array.shape[-labels.ndim :]) # Iterate over each block and create a new block of same shape with "chunk number" shape = tuple(array.blocks.shape[ax] for ax in axis) @@ -190,8 +223,13 @@ def find_group_cohorts(labels, chunks, merge: bool = True): raveled = labels.reshape(-1) # these are chunks where a label is present label_chunks = pd.Series(which_chunk).groupby(raveled).unique() + # These invert the label_chunks mapping so we know which labels occur together. - chunks_cohorts = tlz.groupby(lambda x: tuple(label_chunks.get(x)), label_chunks.keys()) + def invert(x) -> tuple[np.ndarray, ...]: + arr = label_chunks.get(x) + return tuple(arr) # type: ignore [arg-type] # pandas issue? 
+ + chunks_cohorts = tlz.groupby(invert, label_chunks.keys()) if merge: # First sort by number of chunks occupied by cohort @@ -231,14 +269,14 @@ def find_group_cohorts(labels, chunks, merge: bool = True): def rechunk_for_cohorts( - array, + array: DaskArray, axis: T_Axis, - labels, - force_new_chunk_at, - chunksize=None, - ignore_old_chunks=False, - debug=False, -): + labels: np.ndarray, + force_new_chunk_at: Sequence, + chunksize: int | None = None, + ignore_old_chunks: bool = False, + debug: bool = False, +) -> DaskArray: """ Rechunks array so that each new chunk contains groups that always occur together. @@ -257,7 +295,7 @@ def rechunk_for_cohorts( Labels at which we always start a new chunk. For the example ``labels`` array, this would be `1`. chunksize : int, optional - nominal chunk size. Chunk size is exceded when the label + nominal chunk size. Chunk size is exceeded when the label in ``force_new_chunk_at`` is less than ``chunksize//2`` elements away. If None, uses median chunksize along axis. @@ -326,10 +364,10 @@ def rechunk_for_cohorts( return array.rechunk({axis: newchunks}) -def rechunk_for_blockwise(array, axis: T_Axis, labels): +def rechunk_for_blockwise(array: DaskArray, axis: T_Axis, labels: np.ndarray) -> DaskArray: """ Rechunks array so that group boundaries line up with chunk boundaries, allowing - embarassingly parallel group reductions. + embarrassingly parallel group reductions. This only works when the groups are sequential (e.g. labels = ``[0,0,0,1,1,1,1,2,2]``). 
@@ -349,7 +387,7 @@ def rechunk_for_blockwise(array, axis: T_Axis, labels): DaskArray Rechunked array """ - labels = factorize_((labels,), axis=None)[0] + labels = factorize_((labels,), axes=())[0] chunks = array.chunks[axis] newchunks = _get_optimal_chunks_for_groups(chunks, labels) if newchunks == chunks: @@ -359,9 +397,13 @@ def rechunk_for_blockwise(array, axis: T_Axis, labels): def reindex_( - array: np.ndarray, from_, to, fill_value=None, axis: T_Axis = -1, promote: bool = False + array: np.ndarray, + from_, + to, + fill_value: Any = None, + axis: T_Axis = -1, + promote: bool = False, ) -> np.ndarray: - if not isinstance(to, pd.Index): if promote: to = pd.Index(to) @@ -388,7 +430,7 @@ def reindex_( ) idx = from_.get_indexer(to) indexer = [slice(None, None)] * array.ndim - indexer[axis] = idx # type: ignore + indexer[axis] = idx reindexed = array[tuple(indexer)] if any(idx == -1): if fill_value is None: @@ -416,18 +458,58 @@ def offset_labels(labels: np.ndarray, ngroups: int) -> tuple[np.ndarray, int]: ) # -1 indicates NaNs. preserve these otherwise we aggregate in the wrong groups! offset[labels == -1] = -1 - size: int = math.prod(labels.shape[:-1]) * ngroups # type: ignore + size: int = math.prod(labels.shape[:-1]) * ngroups return offset, size +@overload def factorize_( - by: tuple, - axis: T_AxesOpt, - expected_groups: tuple[pd.Index, ...] = None, + by: T_Bys, + axes: T_Axes, + *, + fastpath: Literal[True], + expected_groups: T_ExpectIndexOptTuple | None = None, + reindex: bool = False, + sort: bool = True, +) -> tuple[np.ndarray, tuple[np.ndarray, ...], tuple[int, ...], int, int, None]: + ... + + +@overload +def factorize_( + by: T_Bys, + axes: T_Axes, + *, + expected_groups: T_ExpectIndexOptTuple | None = None, + reindex: bool = False, + sort: bool = True, + fastpath: Literal[False] = False, +) -> tuple[np.ndarray, tuple[np.ndarray, ...], tuple[int, ...], int, int, FactorProps]: + ... 
+ + +@overload +def factorize_( + by: T_Bys, + axes: T_Axes, + *, + expected_groups: T_ExpectIndexOptTuple | None = None, + reindex: bool = False, + sort: bool = True, + fastpath: bool = False, +) -> tuple[np.ndarray, tuple[np.ndarray, ...], tuple[int, ...], int, int, FactorProps | None]: + ... + + +def factorize_( + by: T_Bys, + axes: T_Axes, + *, + expected_groups: T_ExpectIndexOptTuple | None = None, reindex: bool = False, - sort=True, - fastpath=False, -): + sort: bool = True, + fastpath: bool = False, +) -> tuple[np.ndarray, tuple[np.ndarray, ...], tuple[int, ...], int, int, FactorProps | None]: """ Returns an array of integer codes for groups (and associated data) by wrapping pd.cut and pd.factorize (depending on isbin). @@ -435,9 +517,6 @@ def factorize_( a possibly large results array. Instead we set up the appropriate integer codes (group_idx) so that the results come out in the appropriate order. """ - if not isinstance(by, tuple): - raise ValueError(f"Expected `by` to be a tuple. Received {type(by)} instead") - if expected_groups is None: expected_groups = (None,) * len(by) @@ -446,26 +525,37 @@ def factorize_( for groupvar, expect in zip(by, expected_groups): flat = groupvar.reshape(-1) if isinstance(expect, pd.RangeIndex): - idx = flat + # idx is a view of the original `by` array + # copy here so we don't have a race condition with the + # group_idx[nanmask] = nan_sentinel assignment later + # this is important in shared-memory parallelism with dask + # TODO: figure out how to avoid this + idx = flat.copy() found_groups.append(np.array(expect)) # TODO: fix by using masked integers idx[idx > expect[-1]] = -1 elif isinstance(expect, pd.IntervalIndex): - # when binning we change expected groups to integers marking the interval - # this makes the reindexing logic simpler. - # workaround for https://github.com/pandas-dev/pandas/issues/47614 - # we create breaks and pass that to pd.cut, disallow closed="both" for now. 
if expect.closed == "both": raise NotImplementedError - if groupvar.dtype.kind == "M": - # pd.cut with bins = IntervalIndex[datetime64] doesn't work... - bins = np.concatenate([expect.left.to_numpy(), [expect.right[-1].to_numpy()]]) + bins = np.concatenate([expect.left.to_numpy(), expect.right.to_numpy()[[-1]]]) + + # digitize is 0 or idx.max() for values outside the bounds of all intervals + # make it behave like pd.cut which uses -1: + if len(bins) > 1: + right = expect.closed_right + idx = np.digitize( + flat, + bins=bins.view(np.int64) if bins.dtype.kind == "M" else bins, + right=right, + ) + idx -= 1 + within_bins = flat <= bins.max() if right else flat < bins.max() + idx[~within_bins] = -1 else: - bins = np.concatenate([expect.left.to_numpy(), [expect.right[-1]]]) - # code is -1 for values outside the bounds of all intervals - idx = pd.cut(flat, bins=bins, right=expect.closed_right).codes.copy() - found_groups.append(expect) + idx = np.zeros_like(flat, dtype=np.intp) - 1 + + found_groups.append(np.array(expect)) else: if expect is not None and reindex: sorter = np.argsort(expect) @@ -479,10 +569,10 @@ def factorize_( idx = sorter[(idx,)] idx[mask] = -1 else: - idx, groups = pd.factorize(flat, sort=sort) + idx, groups = pd.factorize(flat, sort=sort) # type: ignore # pandas issue? 
found_groups.append(np.array(groups)) - factorized.append(idx) + factorized.append(idx.reshape(groupvar.shape)) grp_shape = tuple(len(grp) for grp in found_groups) ngroups = math.prod(grp_shape) @@ -492,20 +582,18 @@ def factorize_( # Restore these after the raveling nan_by_mask = reduce(np.logical_or, [(f == -1) for f in factorized]) group_idx[nan_by_mask] = -1 - group_idx = group_idx.reshape(by[0].shape) else: group_idx = factorized[0] if fastpath: - return group_idx.reshape(by[0].shape), found_groups, grp_shape + return group_idx, tuple(found_groups), grp_shape, ngroups, ngroups, None - if np.isscalar(axis) and groupvar.ndim > 1: + if len(axes) == 1 and groupvar.ndim > 1: # Not reducing along all dimensions of by # this is OK because for 3D by and axis=(1,2), # we collapse to a 2D by and axis=-1 offset_group = True group_idx, size = offset_labels(group_idx.reshape(by[0].shape), ngroups) - group_idx = group_idx.reshape(-1) else: size = ngroups offset_group = False @@ -522,7 +610,7 @@ def factorize_( group_idx[nanmask] = nan_sentinel props = FactorProps(offset_group, nan_sentinel, nanmask) - return group_idx, found_groups, grp_shape, ngroups, size, props + return group_idx, tuple(found_groups), grp_shape, ngroups, size, props def chunk_argreduce( @@ -644,35 +732,49 @@ def chunk_reduce( assert len(kwargss) >= nfuncs if isinstance(axis, Sequence): - nax = len(axis) - if nax == 1: - axis = axis[0] + axes: T_Axes = axis + nax = len(axes) else: nax = by.ndim + if axis is None: + axes = () + else: + axes = (axis,) * nax + + assert by.ndim <= array.ndim final_array_shape = array.shape[:-nax] + (1,) * (nax - 1) final_groups_shape = (1,) * (nax - 1) - # when axis is a tuple - # collapse and move reduction dimensions to the end - if isinstance(axis, Sequence) and len(axis) < by.ndim: - by = _collapse_axis(by, len(axis)) - array = _collapse_axis(array, len(axis)) - axis = -1 + if 1 < nax < by.ndim: + # when axis is a tuple + # collapse and move reduction dimensions to the 
end + by = _collapse_axis(by, nax) + array = _collapse_axis(array, nax) + axes = (-1,) + nax = 1 # if indices=[2,2,2], npg assumes groups are (0, 1, 2); # and will return a result that is bigger than necessary # avoid by factorizing again so indices=[2,2,2] is changed to # indices=[0,0,0]. This is necessary when combining block results # factorize can handle strings etc unlike digitize - group_idx, groups, found_groups_shape, _, size, props = factorize_( - (by,), axis, expected_groups=(expected_groups,), reindex=reindex, sort=sort + group_idx, grps, found_groups_shape, _, size, props = factorize_( + (by,), axes, expected_groups=(expected_groups,), reindex=reindex, sort=sort ) - groups = groups[0] + groups = grps[0] + if nax > 1: + needs_broadcast = any( + group_idx.shape[ax] != array.shape[ax] and group_idx.shape[ax] == 1 + for ax in range(-nax, 0) + ) + if needs_broadcast: + group_idx = np.broadcast_to(group_idx, array.shape[-by.ndim :]) # always reshape to 1D along group dimensions newshape = array.shape[: array.ndim - by.ndim] + (math.prod(array.shape[-by.ndim :]),) array = array.reshape(newshape) + group_idx = group_idx.reshape(-1) assert group_idx.ndim == 1 empty = np.all(props.nanmask) @@ -760,7 +862,8 @@ def _finalize_results( """ squeezed = _squeeze_results(results, axis) - if agg.min_count is not None: + min_count = agg.min_count + if min_count > 0: counts = squeezed["intermediates"][-1] squeezed["intermediates"] = squeezed["intermediates"][:-1] @@ -771,8 +874,8 @@ def _finalize_results( else: finalized[agg.name] = agg.finalize(*squeezed["intermediates"], **agg.finalize_kwargs) - if agg.min_count is not None: - count_mask = counts < agg.min_count + if min_count > 0: + count_mask = counts < min_count if count_mask.any(): # For one count_mask.any() prevents promoting bool to dtype(fill_value) unless # necessary @@ -793,7 +896,7 @@ def _finalize_results( else: finalized["groups"] = squeezed["groups"] - finalized[agg.name] = 
finalized[agg.name].astype(agg.dtype[agg.name], copy=False) + finalized[agg.name] = finalized[agg.name].astype(agg.dtype["final"], copy=False) return finalized @@ -819,8 +922,25 @@ def _expand_dims(results: IntermediateDict) -> IntermediateDict: return results +def _find_unique_groups(x_chunk) -> np.ndarray: + from dask.base import flatten + from dask.utils import deepmap + + unique_groups = _unique(np.asarray(tuple(flatten(deepmap(listify_groups, x_chunk))))) + unique_groups = unique_groups[~isnull(unique_groups)] + + if len(unique_groups) == 0: + unique_groups = np.array([np.nan]) + return unique_groups + + def _simple_combine( - x_chunk, agg: Aggregation, axis: T_Axes, keepdims: bool, is_aggregate: bool = False + x_chunk, + agg: Aggregation, + axis: T_Axes, + keepdims: bool, + reindex: bool, + is_aggregate: bool = False, ) -> IntermediateDict: """ 'Simple' combination of blockwise results. @@ -830,17 +950,31 @@ def _simple_combine( 2. _expand_dims was used to insert an extra axis DUMMY_AXIS 3. Here we concatenate along DUMMY_AXIS, and then call the combine function along DUMMY_AXIS - 4. At the final agggregate step, we squeeze out DUMMY_AXIS + 4. 
At the final aggregate step, we squeeze out DUMMY_AXIS """ from dask.array.core import deepfirst + from dask.utils import deepmap - results: IntermediateDict = {"groups": deepfirst(x_chunk)["groups"]} + if not reindex: + # We didn't reindex at the blockwise step + # So now reindex before combining by reducing along DUMMY_AXIS + unique_groups = _find_unique_groups(x_chunk) + x_chunk = deepmap( + partial(reindex_intermediates, agg=agg, unique_groups=unique_groups), x_chunk + ) + else: + unique_groups = deepfirst(x_chunk)["groups"] + + results: IntermediateDict = {"groups": unique_groups} results["intermediates"] = [] axis_ = axis[:-1] + (DUMMY_AXIS,) - for idx, combine in enumerate(agg.combine): + for idx, combine in enumerate(agg.simple_combine): array = _conc2(x_chunk, key1="intermediates", key2=idx, axis=axis_) assert array.ndim >= 2 - result = getattr(np, combine)(array, axis=axis_, keepdims=True) + with warnings.catch_warnings(): + warnings.filterwarnings("ignore", r"All-NaN (slice|axis) encountered") + assert callable(combine) + result = combine(array, axis=axis_, keepdims=True) if is_aggregate: # squeeze out DUMMY_AXIS if this is the last step i.e. 
called from _aggregate result = result.squeeze(axis=DUMMY_AXIS) @@ -848,7 +982,7 @@ def _simple_combine( return results -def _conc2(x_chunk, key1, key2=slice(None), axis: T_Axes = None) -> np.ndarray: +def _conc2(x_chunk, key1, key2=slice(None), axis: T_Axes | None = None) -> np.ndarray: """copied from dask.array.reductions.mean_combine""" from dask.array.core import _concatenate2 from dask.utils import deepmap @@ -863,9 +997,9 @@ def _conc2(x_chunk, key1, key2=slice(None), axis: T_Axes = None) -> np.ndarray: # return concatenate3(mapped) -def reindex_intermediates(x, agg, unique_groups): +def reindex_intermediates(x: IntermediateDict, agg: Aggregation, unique_groups) -> IntermediateDict: new_shape = x["groups"].shape[:-1] + (len(unique_groups),) - newx = {"groups": np.broadcast_to(unique_groups, new_shape)} + newx: IntermediateDict = {"groups": np.broadcast_to(unique_groups, new_shape)} newx["intermediates"] = tuple( reindex_( v, from_=np.atleast_1d(x["groups"].squeeze()), to=pd.Index(unique_groups), fill_value=f @@ -875,7 +1009,7 @@ def reindex_intermediates(x, agg, unique_groups): return newx -def listify_groups(x): +def listify_groups(x: IntermediateDict): return list(np.atleast_1d(x["groups"].squeeze())) @@ -889,7 +1023,6 @@ def _grouped_combine( sort: bool = True, ) -> IntermediateDict: """Combine intermediates step of tree reduction.""" - from dask.base import flatten from dask.utils import deepmap if isinstance(x_chunk, dict): @@ -900,11 +1033,7 @@ def _grouped_combine( # when there's only a single axis of reduction, we can just concatenate later, # reindexing is unnecessary # I bet we can minimize the amount of reindexing for mD reductions too, but it's complicated - unique_groups = _unique(tuple(flatten(deepmap(listify_groups, x_chunk)))) - unique_groups = unique_groups[~isnull(unique_groups)] - if len(unique_groups) == 0: - unique_groups = [np.nan] - + unique_groups = _find_unique_groups(x_chunk) x_chunk = deepmap( partial(reindex_intermediates, agg=agg, 
unique_groups=unique_groups), x_chunk ) @@ -976,7 +1105,7 @@ def _grouped_combine( if array.shape[-1] == 0: # all empty when combined results["intermediates"].append( - np.empty(shape=(1,) * (len(axis) - 1) + (0,), dtype=agg.dtype) + np.empty(shape=(1,) * (len(axis) - 1) + (0,), dtype=dtype) ) results["groups"] = np.empty( shape=(1,) * (len(neg_axis) - 1) + (0,), dtype=groups.dtype @@ -1020,10 +1149,11 @@ def _reduce_blockwise( agg.finalize = None assert agg.finalize_kwargs is not None - finalize_kwargs = agg.finalize_kwargs - if isinstance(finalize_kwargs, Mapping): - finalize_kwargs = (finalize_kwargs,) - finalize_kwargs = finalize_kwargs + ({},) + ({},) + if isinstance(agg.finalize_kwargs, Mapping): + finalize_kwargs_: tuple[dict[Any, Any], ...] = (agg.finalize_kwargs,) + else: + finalize_kwargs_ = agg.finalize_kwargs + finalize_kwargs_ += ({},) + ({},) results = chunk_reduce( array, @@ -1036,7 +1166,7 @@ def _reduce_blockwise( # (see below) fill_value=agg.fill_value["numpy"], dtype=agg.dtype["numpy"], - kwargs=finalize_kwargs, + kwargs=finalize_kwargs_, engine=engine, sort=sort, reindex=reindex, @@ -1051,7 +1181,7 @@ def _reduce_blockwise( return result -def _normalize_indexes(array, flatblocks, blkshape): +def _normalize_indexes(array: DaskArray, flatblocks, blkshape) -> tuple: """ .blocks accessor can only accept one iterable at a time, but can handle multiple slices. 
@@ -1063,7 +1193,7 @@ def _normalize_indexes(array, flatblocks, blkshape): """ unraveled = np.unravel_index(flatblocks, blkshape) - normalized: list[Union[int, np.ndarray, slice]] = [] + normalized: list[int | slice | list[int]] = [] for ax, idx in enumerate(unraveled): i = _unique(idx).squeeze() if i.ndim == 0: @@ -1135,7 +1265,7 @@ def subset_to_blocks( return dask.array.Array(graph, name, chunks, meta=array) -def _extract_unknown_groups(reduced, group_chunks, dtype) -> tuple[DaskArray]: +def _extract_unknown_groups(reduced, dtype) -> tuple[DaskArray]: import dask.array from dask.highlevelgraph import HighLevelGraph @@ -1151,7 +1281,7 @@ def _extract_unknown_groups(reduced, group_chunks, dtype) -> tuple[DaskArray]: dask.array.Array( HighLevelGraph.from_collections(groups_token, layer, dependencies=[reduced]), groups_token, - chunks=group_chunks, + chunks=((np.nan,),), meta=np.array([], dtype=dtype), ), ) @@ -1161,9 +1291,9 @@ def _extract_unknown_groups(reduced, group_chunks, dtype) -> tuple[DaskArray]: def dask_groupby_agg( array: DaskArray, - by: DaskArray | np.ndarray, + by: T_By, agg: Aggregation, - expected_groups: pd.Index | None, + expected_groups: T_ExpectIndexOpt, axis: T_Axes = (), fill_value: Any = None, method: T_Method = "map-reduce", @@ -1172,10 +1302,8 @@ def dask_groupby_agg( sort: bool = True, chunks_cohorts=None, ) -> tuple[DaskArray, tuple[np.ndarray | DaskArray]]: - import dask.array from dask.array.core import slices_from_chunks - from dask.highlevelgraph import HighLevelGraph # I think _tree_reduce expects this assert isinstance(axis, Sequence) @@ -1200,11 +1328,16 @@ def dask_groupby_agg( # chunk numpy arrays like the input array # This removes an extra rechunk-merge layer that would be # added otherwise - by = dask.array.from_array(by, chunks=tuple(array.chunks[ax] for ax in range(-by.ndim, 0))) + chunks = tuple(array.chunks[ax] if by.shape[ax] != 1 else (1,) for ax in range(-by.ndim, 0)) + + by = dask.array.from_array(by, chunks=chunks) 
_, (array, by) = dask.array.unify_chunks(array, inds, by, inds[-by.ndim :]) - # preprocess the array: for argreductions, this zips the index together with the array block - if agg.preprocess: + # preprocess the array: + # - for argreductions, this zips the index together with the array block + # - not necessary for blockwise with argreductions + # - if this is needed later, we can fix this then + if agg.preprocess and method != "blockwise": array = agg.preprocess(array, axis=axis) # 1. We first apply the groupby-reduction blockwise to generate "intermediates" @@ -1219,7 +1352,8 @@ def dask_groupby_agg( # This allows us to discover groups at compute time, support argreductions, lower intermediate # memory usage (but method="cohorts" would also work to reduce memory in some cases) - do_simple_combine = method != "blockwise" and reindex and not _is_arg_reduction(agg) + do_simple_combine = not _is_arg_reduction(agg) + if method == "blockwise": # use the "non dask" code path, but applied blockwise blockwise_method = partial( @@ -1243,10 +1377,13 @@ def dask_groupby_agg( partial( blockwise_method, axis=axis, - expected_groups=None if method in ["split-reduce", "cohorts"] else expected_groups, + expected_groups=None if method == "cohorts" else expected_groups, engine=engine, sort=sort, ), + # output indices are the same as input indices + # Unlike xhistogram, we don't always know what the size of the group + # dimension will be unless reindex=True inds, array, inds, @@ -1256,83 +1393,76 @@ def dask_groupby_agg( dtype=array.dtype, # this is purely for show meta=array._meta, align_arrays=False, - token=f"{name}-chunk-{token}", + name=f"{name}-chunk-{token}", ) - if expected_groups is None: - if is_duck_dask_array(by_input): - expected_groups = None - else: - expected_groups = _get_expected_groups(by_input, sort=sort) - group_chunks: tuple[tuple[Union[int, float], ...]] = ( - (len(expected_groups),) if expected_groups is not None else (np.nan,), - ) + group_chunks: 
tuple[tuple[int | float, ...]] - if method in ["map-reduce", "cohorts", "split-reduce"]: + if method in ["map-reduce", "cohorts"]: combine: Callable[..., IntermediateDict] if do_simple_combine: - combine = _simple_combine + combine = partial(_simple_combine, reindex=reindex) + combine_name = "simple-combine" else: combine = partial(_grouped_combine, engine=engine, sort=sort) + combine_name = "grouped-combine" - # Each chunk of `reduced`` is really a dict mapping - # 1. reduction name to array - # 2. "groups" to an array of group labels - # Note: it does not make sense to interpret axis relative to - # shape of intermediate results after the blockwise call tree_reduce = partial( dask.array.reductions._tree_reduce, - combine=partial(combine, agg=agg), - name=f"{name}-reduce-{method}", + name=f"{name}-reduce-{method}-{combine_name}", dtype=array.dtype, axis=axis, keepdims=True, concatenate=False, ) - aggregate = partial( - _aggregate, combine=combine, agg=agg, fill_value=fill_value, reindex=reindex - ) + aggregate = partial(_aggregate, combine=combine, agg=agg, fill_value=fill_value) + + # Each chunk of `reduced`` is really a dict mapping + # 1. reduction name to array + # 2. 
"groups" to an array of group labels + # Note: it does not make sense to interpret axis relative to + # shape of intermediate results after the blockwise call if method == "map-reduce": reduced = tree_reduce( intermediate, - aggregate=partial(aggregate, expected_groups=expected_groups), + combine=partial(combine, agg=agg), + aggregate=partial(aggregate, expected_groups=expected_groups, reindex=reindex), ) if is_duck_dask_array(by_input) and expected_groups is None: - groups = _extract_unknown_groups(reduced, group_chunks=group_chunks, dtype=by.dtype) + groups = _extract_unknown_groups(reduced, dtype=by.dtype) + group_chunks = ((np.nan,),) else: if expected_groups is None: expected_groups_ = _get_expected_groups(by_input, sort=sort) else: expected_groups_ = expected_groups groups = (expected_groups_.to_numpy(),) + group_chunks = ((len(expected_groups_),),) - elif method in ["cohorts", "split-reduce"]: + elif method == "cohorts": chunks_cohorts = find_group_cohorts( by_input, [array.chunks[ax] for ax in axis], merge=True ) reduced_ = [] groups_ = [] for blks, cohort in chunks_cohorts.items(): + index = pd.Index(cohort) subset = subset_to_blocks(intermediate, blks, array.blocks.shape[-len(axis) :]) - if do_simple_combine: - # reindex so that reindex can be set to True later - reindexed = dask.array.map_blocks( - reindex_intermediates, - subset, - agg=agg, - unique_groups=cohort, - meta=subset._meta, - ) - else: - reindexed = subset - + reindexed = dask.array.map_blocks( + reindex_intermediates, subset, agg=agg, unique_groups=index, meta=subset._meta + ) + # now that we have reindexed, we can set reindex=True explicitlly reduced_.append( tree_reduce( reindexed, - aggregate=partial(aggregate, expected_groups=cohort, reindex=reindex), + combine=partial(combine, agg=agg, reindex=True), + aggregate=partial(aggregate, expected_groups=index, reindex=True), ) ) - groups_.append(cohort) + # This is done because pandas promotes to 64-bit types when an Index is created + # So we 
use the index to generate the return value for consistency with "map-reduce" + # This is important on windows + groups_.append(index.values) reduced = dask.array.concatenate(reduced_, axis=-1) groups = (np.concatenate(groups_),) @@ -1344,95 +1474,152 @@ def dask_groupby_agg( # find number of groups in each chunk, this is needed for output chunks # along the reduced axis slices = slices_from_chunks(tuple(array.chunks[ax] for ax in axis)) - if expected_groups is None: - groups_in_block = tuple(_unique(by_input[slc]) for slc in slices) - else: - # For cohorts, we could be indexing a block with groups that - # are not in the cohort (usually for nD `by`) - # Only keep the expected groups. - groups_in_block = tuple( - np.intersect1d(by_input[slc], expected_groups) for slc in slices - ) + groups_in_block = tuple(_unique(by_input[slc]) for slc in slices) groups = (np.concatenate(groups_in_block),) - ngroups_per_block = tuple(len(grp) for grp in groups_in_block) group_chunks = (ngroups_per_block,) - else: raise ValueError(f"Unknown method={method}.") - # extract results from the dict + out_inds = inds[: -len(axis)] + (inds[-1],) output_chunks = reduced.chunks[: -len(axis)] + group_chunks + if method == "blockwise" and len(axis) > 1: + # The final results are available but the blocks along axes + # need to be reshaped to axis=-1 + # I don't know that this is possible with blockwise + # All other code paths benefit from an unmaterialized Blockwise layer + reduced = _collapse_blocks_along_axes(reduced, axis, group_chunks) + + # Can't use map_blocks because it forces concatenate=True along drop_axes, + result = dask.array.blockwise( + _extract_result, + out_inds, + reduced, + inds, + adjust_chunks=dict(zip(out_inds, output_chunks)), + dtype=agg.dtype["final"], + key=agg.name, + name=f"{name}-{token}", + concatenate=False, + ) + + return (result, groups) + + +def _collapse_blocks_along_axes(reduced: DaskArray, axis: T_Axes, group_chunks) -> DaskArray: + import dask.array + from 
dask.highlevelgraph import HighLevelGraph + + nblocks = tuple(reduced.numblocks[ax] for ax in axis) + output_chunks = reduced.chunks[: -len(axis)] + ((1,) * (len(axis) - 1),) + group_chunks + + # extract results from the dict ochunks = tuple(range(len(chunks_v)) for chunks_v in output_chunks) layer2: dict[tuple, tuple] = {} - agg_name = f"{name}-{token}" - for ochunk in itertools.product(*ochunks): - if method == "blockwise": - if len(axis) == 1: - inchunk = ochunk - else: - nblocks = tuple(len(array.chunks[ax]) for ax in axis) - inchunk = ochunk[:-1] + np.unravel_index(ochunk[-1], nblocks) - else: - inchunk = ochunk[:-1] + (0,) * (len(axis) - 1) + (ochunk[-1],) + name = f"reshape-{reduced.name}" - layer2[(agg_name, *ochunk)] = (operator.getitem, (reduced.name, *inchunk), agg.name) + for ochunk in itertools.product(*ochunks): + inchunk = ochunk[: -len(axis)] + np.unravel_index(ochunk[-1], nblocks) + layer2[(name, *ochunk)] = (reduced.name, *inchunk) - result = dask.array.Array( - HighLevelGraph.from_collections(agg_name, layer2, dependencies=[reduced]), - agg_name, + return dask.array.Array( + HighLevelGraph.from_collections(name, layer2, dependencies=[reduced]), + name, chunks=output_chunks, - dtype=agg.dtype[agg.name], + dtype=reduced.dtype, ) - return (result, groups) + +def _extract_result(result_dict: FinalResultsDict, key) -> np.ndarray: + from dask.array.core import deepfirst + + # deepfirst should be not be needed here but sometimes we receive a list of dict? 
+ return deepfirst(result_dict)[key] -def _validate_reindex(reindex: bool | None, func, method: T_Method, expected_groups) -> bool | None: - if reindex is True: +def _validate_reindex( + reindex: bool | None, + func, + method: T_Method, + expected_groups, + any_by_dask: bool, + is_dask_array: bool, +) -> bool: + all_numpy = not is_dask_array and not any_by_dask + if reindex is True and not all_numpy: if _is_arg_reduction(func): raise NotImplementedError - if method == "blockwise": - raise NotImplementedError + if method in ["blockwise", "cohorts"]: + raise ValueError( + "reindex=True is not a valid choice for method='blockwise' or method='cohorts'." + ) + if func in ["first", "last"]: + raise ValueError("reindex must be None or False when func is 'first' or 'last.") - if method == "blockwise" or _is_arg_reduction(func): - reindex = False + if reindex is None: + if all_numpy: + return True - if reindex is None and expected_groups is not None: - reindex = True + if func in ["first", "last"]: + # have to do the grouped_combine since there's no good fill_value + reindex = False - if method in ["split-reduce", "cohorts"] and reindex is False: - raise NotImplementedError + if method == "blockwise" or _is_arg_reduction(func): + reindex = False + + elif method == "cohorts": + reindex = False - if method in ["split-reduce", "cohorts"] and reindex is None: - reindex = True + elif method == "map-reduce": + if expected_groups is None and any_by_dask: + reindex = False + else: + reindex = True + + assert isinstance(reindex, bool) - # TODO: Should reindex be a bool-only at this point? Would've been nice but - # None's are relied on after this function as well. 
return reindex -def _assert_by_is_aligned(shape, by): +def _assert_by_is_aligned(shape: tuple[int, ...], by: T_Bys) -> None: + assert all(b.ndim == by[0].ndim for b in by[1:]) for idx, b in enumerate(by): - if shape[-b.ndim :] != b.shape: + if not all(j in [i, 1] for i, j in zip(shape[-b.ndim :], b.shape)): raise ValueError( - "`array` and `by` arrays must be aligned " - "i.e. array.shape[-by.ndim :] == by.shape. " - "for every array in `by`." + "`array` and `by` arrays must be 'aligned' " + "so that such that by_ is broadcastable to array.shape[-by.ndim:] " + "for every array `by_` in `by`. " + "Either array.shape[-by_.ndim :] == by_.shape or the only differences " + "should be size-1 dimensions in by_." f"Received array of shape {shape} but " f"array {idx} in `by` has shape {b.shape}." ) +@overload +def _convert_expected_groups_to_index( + expected_groups: tuple[None, ...], isbin: Sequence[bool], sort: bool +) -> tuple[None, ...]: + ... + + +@overload +def _convert_expected_groups_to_index( + expected_groups: T_ExpectTuple, isbin: Sequence[bool], sort: bool +) -> T_ExpectIndexTuple: + ... 
+ + def _convert_expected_groups_to_index( - expected_groups: T_ExpectedGroups, isbin: Sequence[bool], sort: bool -) -> tuple[pd.Index | None, ...]: - out: list[pd.Index | None] = [] + expected_groups: T_ExpectOptTuple, isbin: Sequence[bool], sort: bool +) -> T_ExpectIndexOptTuple: + out: list[T_ExpectIndexOpt] = [] for ex, isbin_ in zip(expected_groups, isbin): - if isinstance(ex, pd.IntervalIndex) or (isinstance(ex, pd.Index) and not isbin): + if isinstance(ex, pd.IntervalIndex) or (isinstance(ex, pd.Index) and not isbin_): if sort: - ex = ex.sort_values() - out.append(ex) + out.append(ex.sort_values()) + else: + out.append(ex) elif ex is not None: if isbin_: out.append(pd.IntervalIndex.from_breaks(ex)) @@ -1446,47 +1633,119 @@ def _convert_expected_groups_to_index( return tuple(out) -def _lazy_factorize_wrapper(*by, **kwargs): +def _lazy_factorize_wrapper(*by: T_By, **kwargs) -> np.ndarray: group_idx, *rest = factorize_(by, **kwargs) return group_idx -def _factorize_multiple(by, expected_groups, by_is_dask, reindex): - kwargs = dict( - expected_groups=expected_groups, - axis=None, # always None, we offset later if necessary. 
- fastpath=True, - reindex=reindex, - ) - if by_is_dask: +def _factorize_multiple( + by: T_Bys, + expected_groups: T_ExpectIndexOptTuple, + any_by_dask: bool, + reindex: bool, + sort: bool = True, +) -> tuple[tuple[np.ndarray], tuple[np.ndarray, ...], tuple[int, ...]]: + if any_by_dask: import dask.array + # unifying chunks will make sure all arrays in `by` are dask arrays + # with compatible chunks, even if there was originally a numpy array + inds = tuple(range(by[0].ndim)) + chunks, by_ = dask.array.unify_chunks(*itertools.chain(*zip(by, (inds,) * len(by)))) + group_idx = dask.array.map_blocks( _lazy_factorize_wrapper, - *np.broadcast_arrays(*by), + *by_, + chunks=tuple(chunks.values()), meta=np.array((), dtype=np.int64), - **kwargs, - ) - found_groups = tuple( - None if is_duck_dask_array(b) else pd.unique(b.reshape(-1)) for b in by + axes=(), # always (), we offset later if necessary. + expected_groups=expected_groups, + fastpath=True, + reindex=reindex, + sort=sort, ) - grp_shape = tuple(len(e) for e in expected_groups) + + fg, gs = [], [] + for by_, expect in zip(by, expected_groups): + if expect is None: + if is_duck_dask_array(by_): + raise ValueError( + "Please provide expected_groups when grouping by a dask array." + ) + + found_group = pd.unique(by_.reshape(-1)) + else: + found_group = expect.to_numpy() + + fg.append(found_group) + gs.append(len(found_group)) + + found_groups = tuple(fg) + grp_shape = tuple(gs) else: - group_idx, found_groups, grp_shape = factorize_(by, **kwargs) + group_idx, found_groups, grp_shape, ngroups, size, props = factorize_( + by, + axes=(), # always (), we offset later if necessary. 
+ expected_groups=expected_groups, + fastpath=True, + reindex=reindex, + sort=sort, + ) - final_groups = tuple( - found if expect is None else expect.to_numpy() - for found, expect in zip(found_groups, expected_groups) - ) + return (group_idx,), found_groups, grp_shape + + +@overload +def _validate_expected_groups(nby: int, expected_groups: None) -> tuple[None, ...]: + ... + + +@overload +def _validate_expected_groups(nby: int, expected_groups: T_ExpectedGroups) -> T_ExpectTuple: + ... + + +def _validate_expected_groups(nby: int, expected_groups: T_ExpectedGroupsOpt) -> T_ExpectOptTuple: + if expected_groups is None: + return (None,) * nby + + if nby == 1 and not isinstance(expected_groups, tuple): + if isinstance(expected_groups, (pd.Index, np.ndarray)): + return (expected_groups,) + else: + array = np.asarray(expected_groups) + if np.issubdtype(array.dtype, np.integer): + # preserve default dtypes + # on pandas 1.5/2, on windows + # when a list is passed + array = array.astype(np.int64) + return (array,) + + if nby > 1 and not isinstance(expected_groups, tuple): # TODO: test for list + raise ValueError( + "When grouping by multiple variables, expected_groups must be a tuple " + "of either arrays or objects convertible to an array (like lists). " + "For example `expected_groups=(np.array([1, 2, 3]), ['a', 'b', 'c'])`." + f"Received a {type(expected_groups).__name__} instead. " + "When grouping by a single variable, you can pass an array or something " + "convertible to an array for convenience: `expected_groups=['a', 'b', 'c']`." + ) + + if TYPE_CHECKING: + assert isinstance(expected_groups, tuple) + + if len(expected_groups) != nby: + raise ValueError( + f"Must have same number of `expected_groups` (received {len(expected_groups)}) " + f" and variables to group by (received {nby})." 
+ ) - if any(grp is None for grp in final_groups): - raise ValueError("Please provide expected_groups when grouping by a dask array.") - return (group_idx,), final_groups, grp_shape + return expected_groups def groupby_reduce( array: np.ndarray | DaskArray, - *by: np.ndarray | DaskArray, + *by: T_By, func: T_Agg, expected_groups: T_ExpectedGroupsOpt = None, sort: bool = True, @@ -1498,8 +1757,8 @@ def groupby_reduce( method: T_Method = "map-reduce", engine: T_Engine = "numpy", reindex: bool | None = None, - finalize_kwargs: Mapping | None = None, -) -> tuple[DaskArray, np.ndarray | DaskArray]: + finalize_kwargs: dict[Any, Any] | None = None, +) -> tuple[DaskArray, Unpack[tuple[np.ndarray | DaskArray, ...]]]: # type: ignore[misc] # Unpack not in mypy yet """ GroupBy reductions using tree reductions for dask.array @@ -1507,7 +1766,7 @@ def groupby_reduce( ---------- array : ndarray or DaskArray Array to be reduced, possibly nD - by : ndarray or DaskArray + *by : ndarray or DaskArray Array of labels to group over. Must be aligned with ``array`` so that ``array.shape[-by.ndim :] == by.shape`` func : str or Aggregation @@ -1526,7 +1785,7 @@ def groupby_reduce( Negative integers are normalized using array.ndim fill_value : Any Value to assign when a label in ``expected_groups`` is not present. - dtype: data-type , optional + dtype : data-type , optional DType for the output. Can be anything that is accepted by ``np.dtype``. min_count : int, default: None The required number of valid values to perform the operation. If @@ -1572,11 +1831,11 @@ def groupby_reduce( * ``"numba"``: Use the implementations in ``numpy_groupies.aggregate_numba``. reindex : bool, optional - Whether to "reindex" the blockwise results to `expected_groups` (possibly automatically detected). + Whether to "reindex" the blockwise results to ``expected_groups`` (possibly automatically detected). 
If True, the intermediate result of the blockwise groupby-reduction has a value for all expected groups, and the final result is a simple reduction of those intermediates. In nearly all cases, this is a significant boost in computation speed. For cases like time grouping, this may result in large intermediates relative to the - original block size. Avoid that by using method="cohorts". By default, it is turned off for argreductions. + original block size. Avoid that by using ``method="cohorts"``. By default, it is turned off for argreductions. finalize_kwargs : dict, optional Kwargs passed to finalize the reduction such as ``ddof`` for var, std. @@ -1597,15 +1856,22 @@ def groupby_reduce( "argreductions not supported for engine='flox' yet." "Try engine='numpy' or engine='numba' instead." ) - reindex = _validate_reindex(reindex, func, method, expected_groups) - bys = tuple(np.asarray(b) if not is_duck_array(b) else b for b in by) + bys: T_Bys = tuple(np.asarray(b) if not is_duck_array(b) else b for b in by) nby = len(bys) - by_is_dask = any(is_duck_dask_array(b) for b in bys) + by_is_dask = tuple(is_duck_dask_array(b) for b in bys) + any_by_dask = any(by_is_dask) - if method in ["split-reduce", "cohorts"] and by_is_dask: + if method in ["split-reduce", "cohorts"] and any_by_dask: raise ValueError(f"method={method!r} can only be used when grouping by numpy arrays.") + if method == "split-reduce": + method = "cohorts" + + reindex = _validate_reindex( + reindex, func, method, expected_groups, any_by_dask, is_duck_dask_array(array) + ) + if not is_duck_array(array): array = np.asarray(array) is_bool_array = np.issubdtype(array.dtype, bool) @@ -1615,36 +1881,43 @@ def groupby_reduce( isbins = isbin else: isbins = (isbin,) * nby - if expected_groups is None: - expected_groups = (None,) * nby _assert_by_is_aligned(array.shape, bys) - if nby == 1 and not isinstance(expected_groups, tuple): - expected_groups = (np.asarray(expected_groups),) - elif len(expected_groups) != 
nby: - raise ValueError( - f"Must have same number of `expected_groups` (received {len(expected_groups)}) " - f" and variables to group by (received {nby})." - ) + expected_groups = _validate_expected_groups(nby, expected_groups) + + for idx, (expect, is_dask) in enumerate(zip(expected_groups, by_is_dask)): + if is_dask and (reindex or nby > 1) and expect is None: + raise ValueError( + f"`expected_groups` for array {idx} in `by` cannot be None since it is a dask.array." + ) # We convert to pd.Index since that lets us know if we are binning or not # (pd.IntervalIndex or not) expected_groups = _convert_expected_groups_to_index(expected_groups, isbins, sort) - # TODO: could restrict this to dask-only - factorize_early = (nby > 1) or ( - any(isbins) and method in ["split-reduce", "cohorts"] and is_duck_dask_array(array) + # Don't factorize "early only when + # grouping by dask arrays, and not having expected_groups + factorize_early = not ( + # can't do it if we are grouping by dask array but don't have expected_groups + any(is_dask and ex_ is None for is_dask, ex_ in zip(by_is_dask, expected_groups)) ) if factorize_early: bys, final_groups, grp_shape = _factorize_multiple( - bys, expected_groups, by_is_dask=by_is_dask, reindex=reindex + bys, + expected_groups, + any_by_dask=any_by_dask, + # This is the only way it makes sense I think. + # reindex controls what's actually allocated in chunk_reduce + # At this point, we care about an accurate conversion to codes. 
+ reindex=True, + sort=sort, ) expected_groups = (pd.RangeIndex(math.prod(grp_shape)),) assert len(bys) == 1 - by_ = bys[0] - expected_groups = expected_groups[0] + (by_,) = bys + (expected_groups,) = expected_groups if axis is None: axis_ = tuple(array.ndim + np.arange(-by_.ndim, 0)) @@ -1653,15 +1926,24 @@ def groupby_reduce( axis_ = np.core.numeric.normalize_axis_tuple(axis, array.ndim) # type: ignore nax = len(axis_) - if method in ["blockwise", "cohorts", "split-reduce"] and nax != by_.ndim: - raise NotImplementedError( - "Must reduce along all dimensions of `by` when method != 'map-reduce'." - f"Received method={method!r}" - ) + has_dask = is_duck_dask_array(array) or is_duck_dask_array(by_) + + if _is_first_last_reduction(func): + if has_dask and nax != 1: + raise ValueError( + "For dask arrays: first, last, nanfirst, nanlast reductions are " + "only supported along a single axis. Please reshape appropriately." + ) + + elif nax not in [1, by_.ndim]: + raise ValueError( + "first, last, nanfirst, nanlast reductions are only supported " + "along a single axis or when reducing across all dimensions of `by`." + ) # TODO: make sure expected_groups is unique if nax == 1 and by_.ndim > 1 and expected_groups is None: - if not by_is_dask: + if not any_by_dask: expected_groups = _get_expected_groups(by_, sort) else: # When we reduce along all axes, we are guaranteed to see all @@ -1682,8 +1964,6 @@ def groupby_reduce( axis_ = tuple(array.ndim + np.arange(-nax, 0)) nax = len(axis_) - has_dask = is_duck_dask_array(array) or is_duck_dask_array(by_) - # When axis is a subset of possible values; then npg will # apply it to groups that don't exist along a particular axis (for e.g.) # since these count as a group that is absent. thoo! 
@@ -1692,17 +1972,22 @@ def groupby_reduce( # Consider np.sum([np.nan]) = np.nan, np.nansum([np.nan]) = 0 if min_count is None: if nax < by_.ndim or fill_value is not None: - min_count = 1 + min_count_: int = 1 + else: + min_count_ = 0 + else: + min_count_ = min_count # TODO: set in xarray? - if min_count is not None and func in ["nansum", "nanprod"] and fill_value is None: + if min_count_ > 0 and func in ["nansum", "nanprod"] and fill_value is None: # nansum, nanprod have fill_value=0, 1 # overwrite than when min_count is set fill_value = np.nan kwargs = dict(axis=axis_, fill_value=fill_value, engine=engine) - agg = _initialize_aggregation(func, dtype, array.dtype, fill_value, min_count, finalize_kwargs) + agg = _initialize_aggregation(func, dtype, array.dtype, fill_value, min_count_, finalize_kwargs) + groups: tuple[np.ndarray | DaskArray, ...] if not has_dask: results = _reduce_blockwise( array, by_, agg, expected_groups=expected_groups, reindex=reindex, sort=sort, **kwargs @@ -1721,6 +2006,12 @@ def groupby_reduce( f"\n\n Received: {func}" ) + if method in ["blockwise", "cohorts"] and nax != by_.ndim: + raise NotImplementedError( + "Must reduce along all dimensions of `by` when method != 'map-reduce'." 
+ f"Received method={method!r}" + ) + # TODO: just do this in dask_groupby_agg # we always need some fill_value (see above) so choose the default if needed if kwargs["fill_value"] is None: @@ -1745,7 +2036,7 @@ def groupby_reduce( assert len(groups) == 1 sorted_idx = np.argsort(groups[0]) # This optimization helps specifically with resampling - if not (sorted_idx[1:] <= sorted_idx[:-1]).all(): + if not (sorted_idx[:-1] <= sorted_idx[1:]).all(): result = result[..., sorted_idx] groups = (groups[0][sorted_idx],) @@ -1758,6 +2049,6 @@ def groupby_reduce( ).reshape(result.shape[:-1] + grp_shape) groups = final_groups - if _is_minmax_reduction(func) and is_bool_array: + if is_bool_array and (_is_minmax_reduction(func) or _is_first_last_reduction(func)): result = result.astype(bool) - return (result, *groups) + return (result, *groups) # type: ignore[return-value] # Unpack not in mypy yet diff --git a/flox/visualize.py b/flox/visualize.py index fd712fd4b..7d44c7d91 100644 --- a/flox/visualize.py +++ b/flox/visualize.py @@ -6,7 +6,7 @@ import numpy as np import pandas as pd -from .core import find_group_cohorts +from .core import _unique, find_group_cohorts def draw_mesh( @@ -21,12 +21,12 @@ def draw_mesh( colors=None, randomize=True, x0=0, + y0=0, append=False, ): - dx = 2 xpts = x0 + np.arange(0, (ncol + nspaces) * dx, dx) - ypts = np.arange(0, nrow * dx, dx) + ypts = y0 + np.arange(0, nrow * dx, dx) if colors is None: colors = mpl.cm.Set2.colors[:4] @@ -39,6 +39,7 @@ def draw_mesh( ax.set_aspect(1) ax.set_axis_off() + # ncolors = len(colors) if not randomize: colors = iter(colors) @@ -55,7 +56,7 @@ def draw_mesh( counter[fcolor] += 1 ax.add_patch( mpl.patches.Rectangle( - (x, y - 0.5 * dx), + (x, y), dx, dx, edgecolor="w", @@ -66,14 +67,15 @@ def draw_mesh( if draw_line_at is not None and icolor > 0 and icolor % draw_line_at == 0: plt.plot([x, x], [y - 0.75 * dx, y + 0.75 * dx], color="k", lw=2) - ax.set_xlim((0, max(xpts) + dx)) - ax.set_ylim((-0.75 * dx, max(ypts) + 
0.75 * dx)) + # assert n + 1 == ncolors, (n, ncolors) + ax.set_xlim((0, max(xpts) + 2 * dx)) + ax.set_ylim((-0.75 * dx + min(ypts), max(ypts) + 0.75 * dx)) if not append: plt.gcf().set_size_inches((ncol * pxin, (nrow + 2) * pxin)) -def visualize_groups_1d(array, labels, axis=-1, colors=None, cmap=None): +def visualize_groups_1d(array, labels, axis=-1, colors=None, cmap=None, append=True, x0=0): """ Visualize group distribution for a 1D array of group labels. """ @@ -93,7 +95,8 @@ def visualize_groups_1d(array, labels, axis=-1, colors=None, cmap=None): if len(unique_labels) > len(colors): raise ValueError("Not enough unique colors") - plt.figure() + if not append: + fig = plt.figure() i0 = 0 for i in chunks: lab = labels[i0 : i0 + i] @@ -103,17 +106,17 @@ def visualize_groups_1d(array, labels, axis=-1, colors=None, cmap=None): len(lab) + 1, colors=col, randomize=False, - append=True, - x0=i0 * 2.3, # + (i0 - 1) * 0.025, + append=append, + x0=x0 + i0 * 2.3, # + (i0 - 1) * 0.025, ) i0 += i - pxin = 0.8 - plt.gcf().set_size_inches((len(labels) * pxin, 1 * pxin)) + if not append: + pxin = 0.8 + fig.set_size_inches((len(labels) * pxin, 1 * pxin)) def get_colormap(N): - cmap = mpl.cm.get_cmap("tab20_r").copy() ncolors = len(cmap.colors) q = N // ncolors @@ -124,21 +127,20 @@ def get_colormap(N): def factorize_cohorts(by, cohorts): - factorized = np.full(by.shape, -1) for idx, cohort in enumerate(cohorts): factorized[np.isin(by, cohort)] = idx return factorized -def visualize_cohorts_2d(by, array, method="cohorts"): +def visualize_cohorts_2d(by, array): assert by.ndim == 2 print("finding cohorts...") before_merged = find_group_cohorts( - by, [array.chunks[ax] for ax in range(-by.ndim, 0)], merge=False, method=method + by, [array.chunks[ax] for ax in range(-by.ndim, 0)], merge=False ).values() merged = find_group_cohorts( - by, [array.chunks[ax] for ax in range(-by.ndim, 0)], merge=True, method=method + by, [array.chunks[ax] for ax in range(-by.ndim, 0)], merge=True 
).values() print("finished cohorts...") @@ -149,16 +151,12 @@ def visualize_cohorts_2d(by, array, method="cohorts"): ax = ax.ravel() ax[1].set_visible(False) ax = ax[[0, 2, 3]] - flat = by.ravel() - ngroups = len(np.unique(flat[~np.isnan(flat)])) + ngroups = len(_unique(by)) h0 = ax[0].imshow(by, cmap=get_colormap(ngroups)) - h1 = ax[1].imshow( - factorize_cohorts(by, before_merged), - vmin=0, - cmap=get_colormap(len(before_merged)), - ) - h2 = ax[2].imshow(factorize_cohorts(by, merged), vmin=0, cmap=get_colormap(len(merged))) + h1 = _visualize_cohorts(by, before_merged, ax=ax[1]) + h2 = _visualize_cohorts(by, merged, ax=ax[2]) + for axx in ax: axx.grid(True, which="both") axx.set_xticks(xticks) @@ -170,3 +168,26 @@ def visualize_cohorts_2d(by, array, method="cohorts"): ax[1].set_title(f"{len(before_merged)} cohorts") ax[2].set_title(f"{len(merged)} merged cohorts") f.set_size_inches((6, 6)) + + +def _visualize_cohorts(by, cohorts, ax=None): + if ax is None: + _, ax = plt.subplots(1, 1) + + ax.imshow(factorize_cohorts(by, cohorts), vmin=0, cmap=get_colormap(len(cohorts))) + + +def visualize_groups_2d(labels, y0=0, **kwargs): + colors = mpl.cm.tab10_r + for i, chunk in enumerate(labels): + chunk = np.atleast_2d(chunk) + draw_mesh( + *chunk.shape, + colors=tuple(colors(label) for label in np.flipud(chunk).ravel()), + randomize=False, + append=True, + y0=y0, + **kwargs, + ) + y0 = y0 + 2 * chunk.shape[0] + 2 + plt.ylim([-1, y0]) diff --git a/flox/xarray.py b/flox/xarray.py index 55eefd812..487850ca0 100644 --- a/flox/xarray.py +++ b/flox/xarray.py @@ -1,6 +1,5 @@ from __future__ import annotations -import warnings from typing import TYPE_CHECKING, Any, Hashable, Iterable, Sequence, Union import numpy as np @@ -13,31 +12,19 @@ from .core import ( _convert_expected_groups_to_index, _get_expected_groups, + _validate_expected_groups, groupby_reduce, - rechunk_for_blockwise as rechunk_array_for_blockwise, - rechunk_for_cohorts as rechunk_array_for_cohorts, ) +from .core 
import rechunk_for_blockwise as rechunk_array_for_blockwise +from .core import rechunk_for_cohorts as rechunk_array_for_cohorts from .xrutils import _contains_cftime_datetimes, _to_pytimedelta, datetime_to_numeric if TYPE_CHECKING: - from xarray.core.resample import Resample from xarray.core.types import T_DataArray, T_Dataset - Dims = Union[str, Iterable[Hashable], None] - + from .core import T_ExpectedGroupsOpt, T_ExpectIndex, T_ExpectOpt -def _get_input_core_dims(group_names, dim, ds, grouper_dims): - input_core_dims = [[], []] - for g in group_names: - if g in dim: - continue - if g in ds.dims: - input_core_dims[0].extend([g]) - if g in grouper_dims: - input_core_dims[1].extend([g]) - input_core_dims[0].extend(dim) - input_core_dims[1].extend(dim) - return input_core_dims + Dims = Union[str, Iterable[Hashable], None] def _restore_dim_order(result, obj, by): @@ -54,11 +41,31 @@ def lookup_order(dimension): return result.transpose(*new_order) +def _broadcast_size_one_dims(*arrays, core_dims): + """Broadcast by adding size-1 dimensions in the right place. + + Workaround because apply_ufunc doesn't support this yet. + https://github.com/pydata/xarray/issues/3032#issuecomment-503337637 + + Specialized to the groupby problem. 
+ """ + array_dims = set(core_dims[0]) + broadcasted = [arrays[0]] + for dims, array in zip(core_dims[1:], arrays[1:]): + assert set(dims).issubset(array_dims) + order = [dims.index(d) for d in core_dims[0] if d in dims] + array = array.transpose(*order) + axis = [core_dims[0].index(d) for d in core_dims[0] if d not in dims] + broadcasted.append(np.expand_dims(array, axis)) + + return broadcasted + + def xarray_reduce( obj: T_Dataset | T_DataArray, *by: T_DataArray | Hashable, func: str | Aggregation, - expected_groups=None, + expected_groups: T_ExpectedGroupsOpt = None, isbin: bool | Sequence[bool] = False, sort: bool = True, dim: Dims | ellipsis = None, @@ -97,7 +104,7 @@ def xarray_reduce( fill_value Value used for missing groups in the output i.e. when one of the labels in ``expected_groups`` is not actually present in ``by``. - dtype: data-type, optional + dtype : data-type, optional DType for the output. Can be anything accepted by ``np.dtype``. method : {"map-reduce", "blockwise", "cohorts", "split-reduce"}, optional Strategy for reduction of dask arrays only: @@ -155,7 +162,7 @@ def xarray_reduce( and the final result is a simple reduction of those intermediates. In nearly all cases, this is a significant boost in computation speed. For cases like time grouping, this may result in large intermediates relative to the original block size. Avoid that by using method="cohorts". By default, it is turned off for arg reductions. - **finalize_kwargs : + **finalize_kwargs kwargs passed to the finalize function, like ``ddof`` for var, std. 
Returns @@ -210,19 +217,13 @@ def xarray_reduce( else: isbins = (isbin,) * nby - if expected_groups is None: - expected_groups = (None,) * nby - if isinstance(expected_groups, (np.ndarray, list)): # TODO: test for list - if nby == 1: - expected_groups = (expected_groups,) - else: - raise ValueError("Needs better message.") + expected_groups_valid = _validate_expected_groups(nby, expected_groups) if not sort: - raise NotImplementedError + raise NotImplementedError("sort must be True for xarray_reduce") # eventually drop the variables we are grouping by - maybe_drop = [b for b in by if isinstance(b, Hashable)] + maybe_drop = {b for b in by if isinstance(b, Hashable)} unindexed_dims = tuple( b for b, isbin_ in zip(by, isbins) @@ -242,7 +243,19 @@ def xarray_reduce( else: ds = obj._to_temp_dataset() - ds = ds.drop_vars([var for var in maybe_drop if var in ds.variables]) + try: + from xarray.indexes import PandasMultiIndex + except ImportError: + PandasMultiIndex = tuple() # type: ignore + + more_drop = set() + for var in maybe_drop: + maybe_midx = ds._indexes.get(var, None) + if isinstance(maybe_midx, PandasMultiIndex): + idx_coord_names = set(maybe_midx.index.names + [maybe_midx.dim]) + idx_other_names = idx_coord_names - set(maybe_drop) + more_drop.update(idx_other_names) + maybe_drop.update(more_drop) if dim is Ellipsis: if nby > 1: @@ -255,24 +268,29 @@ def xarray_reduce( elif dim is not None: dim_tuple = _atleast_1d(dim) else: - dim_tuple = tuple() + dim_tuple = tuple(grouper_dims) - # broadcast all variables against each other along all dimensions in `by` variables - # don't exclude `dim` because it need not be a dimension in any of the `by` variables! - # in the case where dim is Ellipsis, and by.ndim < obj.ndim - # then we also broadcast `by` to all `obj.dims` - # TODO: avoid this broadcasting + # broadcast to make sure grouper dimensions are present in the array. 
exclude_dims = tuple(d for d in ds.dims if d not in grouper_dims and d not in dim_tuple) - ds_broad, *by_broad = xr.broadcast(ds, *by_da, exclude=exclude_dims) - - # all members of by_broad have the same dimensions - # so we just pull by_broad[0].dims if dim is None - if not dim_tuple: - dim_tuple = tuple(by_broad[0].dims) if any(d not in grouper_dims and d not in obj.dims for d in dim_tuple): raise ValueError(f"Cannot reduce over absent dimensions {dim}.") + try: + xr.align(ds, *by_da, join="exact", copy=False) + except ValueError as e: + raise ValueError( + "Object being grouped must be exactly aligned with every array in `by`." + ) from e + + needs_broadcast = any( + not set(grouper_dims).issubset(set(variable.dims)) for variable in ds.data_vars.values() + ) + if needs_broadcast: + ds_broad = xr.broadcast(ds, *by_da, exclude=exclude_dims)[0] + else: + ds_broad = ds + dims_not_in_groupers = tuple(d for d in dim_tuple if d not in grouper_dims) if dims_not_in_groupers == tuple(dim_tuple) and not any(isbins): # reducing along a dimension along which groups do not vary @@ -291,44 +309,52 @@ def xarray_reduce( else: return result + ds = ds.drop_vars([var for var in maybe_drop if var in ds.variables]) + axis = tuple(range(-len(dim_tuple), 0)) # Set expected_groups and convert to index since we need coords, sizes # for output xarray objects - expected_groups = list(expected_groups) + expected_groups_valid_list: list[T_ExpectIndex] = [] group_names: tuple[Any, ...] 
= () group_sizes: dict[Any, int] = {} - for idx, (b_, expect, isbin_) in enumerate(zip(by_broad, expected_groups, isbins)): - group_name = b_.name if not isbin_ else f"{b_.name}_bins" + for idx, (b_, expect, isbin_) in enumerate(zip(by_da, expected_groups_valid, isbins)): + group_name = ( + f"{b_.name}_bins" if isbin_ or isinstance(expect, pd.IntervalIndex) else b_.name + ) group_names += (group_name,) if isbin_ and isinstance(expect, int): raise NotImplementedError( "flox does not support binning into an integer number of bins yet." ) + + expect1: T_ExpectOpt if expect is None: if isbin_: raise ValueError( f"Please provided bin edges for group variable {idx} " f"named {group_name} in expected_groups." ) - expect_ = _get_expected_groups(b_.data, sort=sort) + expect1 = _get_expected_groups(b_.data, sort=sort) else: - expect_ = expect - expect_index = _convert_expected_groups_to_index((expect_,), (isbin_,), sort=sort)[0] + expect1 = expect + expect_index = _convert_expected_groups_to_index((expect1,), (isbin_,), sort=sort)[0] # The if-check is for type hinting mainly, it narrows down the return # type of _convert_expected_groups_to_index to pure pd.Index: if expect_index is not None: - expected_groups[idx] = expect_index + expected_groups_valid_list.append(expect_index) group_sizes[group_name] = len(expect_index) else: # This will never be reached raise ValueError("expect_index cannot be None") - def wrapper(array, *by, func, skipna, **kwargs): + def wrapper(array, *by, func, skipna, core_dims, **kwargs): + array, *by = _broadcast_size_one_dims(array, *by, core_dims=core_dims) + # Handle skipna here because I need to know dtype to make a good default choice. 
- # We cannnot handle this easily for xarray Datasets in xarray_reduce + # We cannot handle this easily for xarray Datasets in xarray_reduce if skipna and func in ["all", "any", "count"]: raise ValueError(f"skipna cannot be truthy for {func} reductions.") @@ -348,8 +374,8 @@ def wrapper(array, *by, func, skipna, **kwargs): # xarray always uses np.datetime64[ns] for np.datetime64 data dtype = "timedelta64[ns]" array = datetime_to_numeric(array, offset) - elif _contains_cftime_datetimes(array): - offset = min(array) + elif is_cftime: + offset = array.min() array = datetime_to_numeric(array, offset, datetime_unit="us") result, *groups = groupby_reduce(array, *by, func=func, **kwargs) @@ -374,17 +400,21 @@ def wrapper(array, *by, func, skipna, **kwargs): if is_missing_dim: missing_dim[k] = v - input_core_dims = _get_input_core_dims(group_names, dim_tuple, ds_broad, grouper_dims) - input_core_dims += [input_core_dims[-1]] * (nby - 1) + # dim_tuple contains dimensions we are reducing over. These need to be the last + # core dimensions to be synchronized with axis. 
+ input_core_dims = [[d for d in grouper_dims if d not in dim_tuple] + list(dim_tuple)] + input_core_dims += [list(b.dims) for b in by_da] + output_core_dims = [d for d in input_core_dims[0] if d not in dim_tuple] + output_core_dims.extend(group_names) actual = xr.apply_ufunc( wrapper, ds_broad.drop_vars(tuple(missing_dim)).transpose(..., *grouper_dims), - *by_broad, + *by_da, input_core_dims=input_core_dims, # for xarray's test_groupby_duplicate_coordinate_labels exclude_dims=set(dim_tuple), - output_core_dims=[group_names], + output_core_dims=[output_core_dims], dask="allowed", dask_gufunc_kwargs=dict( output_sizes=group_sizes, output_dtypes=[dtype] if dtype is not None else None @@ -400,35 +430,48 @@ def wrapper(array, *by, func, skipna, **kwargs): "skipna": skipna, "engine": engine, "reindex": reindex, - "expected_groups": tuple(expected_groups), + "expected_groups": tuple(expected_groups_valid_list), "isbin": isbins, "finalize_kwargs": finalize_kwargs, "dtype": dtype, + "core_dims": input_core_dims, }, ) # restore non-dim coord variables without the core dimension # TODO: shouldn't apply_ufunc handle this? - for var in set(ds_broad.variables) - set(ds_broad.dims): + for var in set(ds_broad._coord_names) - set(ds_broad._indexes) - set(ds_broad.dims): if all(d not in ds_broad[var].dims for d in dim_tuple): actual[var] = ds_broad[var] - for name, expect, by_ in zip(group_names, expected_groups, by_broad): - # Can't remove this till xarray handles IntervalIndex - if isinstance(expect, pd.IntervalIndex): - expect = expect.to_numpy() + expect3: T_ExpectIndex | np.ndarray + for name, expect2, by_ in zip(group_names, expected_groups_valid_list, by_da): + # Can't remove this until xarray handles IntervalIndex: + if isinstance(expect2, pd.IntervalIndex): + # TODO: Only place where expect3 is an ndarray, remove the type if xarray + # starts supporting IntervalIndex. 
+ expect3 = expect2.to_numpy() + else: + expect3 = expect2 if isinstance(actual, xr.Dataset) and name in actual: actual = actual.drop_vars(name) # When grouping by MultiIndex, expect is an pd.Index wrapping # an object array of tuples - if name in ds_broad.indexes and isinstance(ds_broad.indexes[name], pd.MultiIndex): + if ( + name in ds_broad.indexes + and isinstance(ds_broad.indexes[name], pd.MultiIndex) + and not isinstance(expect3, pd.RangeIndex) + ): levelnames = ds_broad.indexes[name].names - expect = pd.MultiIndex.from_tuples(expect.values, names=levelnames) - actual[name] = expect + if isinstance(expect3, np.ndarray): + # TODO: workaoround for IntervalIndex issue. + raise NotImplementedError + expect3 = pd.MultiIndex.from_tuples(expect3.values, names=levelnames) + actual[name] = expect3 if Version(xr.__version__) > Version("2022.03.0"): actual = actual.set_coords(levelnames) else: - actual[name] = expect + actual[name] = expect3 if keep_attrs: actual[name].attrs = by_.attrs @@ -443,7 +486,7 @@ def wrapper(array, *by, func, skipna, **kwargs): template = obj if actual[var].ndim > 1: - actual[var] = _restore_dim_order(actual[var], template, by_broad[0]) + actual[var] = _restore_dim_order(actual[var], template, by_da[0]) if missing_dim: for k, v in missing_dim.items(): @@ -487,7 +530,7 @@ def rechunk_for_cohorts( Labels at which we always start a new chunk. For the example ``labels`` array, this would be `1`. chunksize : int, optional - nominal chunk size. Chunk size is exceded when the label + nominal chunk size. Chunk size is exceeded when the label in ``force_new_chunk_at`` is less than ``chunksize//2`` elements away. If None, uses median chunksize along ``dim``. @@ -511,7 +554,7 @@ def rechunk_for_cohorts( def rechunk_for_blockwise(obj: T_DataArray | T_Dataset, dim: str, labels: T_DataArray): """ Rechunks array so that group boundaries line up with chunk boundaries, allowing - embarassingly parallel group reductions. 
+ embarrassingly parallel group reductions. This only works when the groups are sequential (e.g. labels = ``[0,0,0,1,1,1,1,2,2]``). @@ -553,44 +596,3 @@ def _rechunk(func, obj, dim, labels, **kwargs): ) return obj - - -def resample_reduce( - resampler: Resample, - func: str | Aggregation, - keep_attrs: bool = True, - **kwargs, -): - - warnings.warn( - "flox.xarray.resample_reduce is now deprecated. Please use Xarray's resample method directly.", - DeprecationWarning, - ) - - obj = resampler._obj - dim = resampler._group_dim - - # this creates a label DataArray since resample doesn't do that somehow - tostack = [] - for idx, slicer in enumerate(resampler._group_indices): - if slicer.stop is None: - stop = resampler._obj.sizes[dim] - else: - stop = slicer.stop - tostack.append(idx * np.ones((stop - slicer.start,), dtype=np.int32)) - by = xr.DataArray(np.hstack(tostack), dims=(dim,), name="__resample_dim__") - - result = ( - xarray_reduce( - obj, - by, - func=func, - method="blockwise", - keep_attrs=keep_attrs, - **kwargs, - ) - .rename({"__resample_dim__": dim}) - .transpose(dim, ...) - ) - result[dim] = resampler._unique_coord.data - return result diff --git a/flox/xrdtypes.py b/flox/xrdtypes.py index b333580c4..99dd08eb7 100644 --- a/flox/xrdtypes.py +++ b/flox/xrdtypes.py @@ -31,17 +31,6 @@ def __eq__(self, other): NINF = AlwaysLessThan() -# Pairs of types that, if both found, should be promoted to object dtype -# instead of following NumPy's own type-promotion rules. These type promotion -# rules match pandas instead. 
For reference, see the NumPy type hierarchy: -# https://docs.scipy.org/doc/numpy-1.13.0/reference/arrays.scalars.html -PROMOTE_TO_OBJECT = [ - {np.number, np.character}, # numpy promotes to character - {np.bool_, np.character}, # numpy promotes to character - {np.bytes_, np.unicode_}, # numpy promotes to unicode -] - - def maybe_promote(dtype): """Simpler equivalent of pandas.core.common._maybe_promote @@ -152,28 +141,3 @@ def get_neg_infinity(dtype, min_for_int=False): def is_datetime_like(dtype): """Check if a dtype is a subclass of the numpy datetime types""" return np.issubdtype(dtype, np.datetime64) or np.issubdtype(dtype, np.timedelta64) - - -def result_type(*arrays_and_dtypes): - """Like np.result_type, but with type promotion rules matching pandas. - - Examples of changed behavior: - number + string -> object (not string) - bytes + unicode -> object (not unicode) - - Parameters - ---------- - *arrays_and_dtypes : list of arrays and dtypes - The dtype is extracted from both numpy and dask arrays. - - Returns - ------- - numpy.dtype for the result. 
- """ - types = {np.result_type(t).type for t in arrays_and_dtypes} - - for left, right in PROMOTE_TO_OBJECT: - if any(issubclass(t, left) for t in types) and any(issubclass(t, right) for t in types): - return np.dtype(object) - - return np.result_type(*arrays_and_dtypes) diff --git a/flox/xrutils.py b/flox/xrutils.py index 3e6edd89e..958bd3976 100644 --- a/flox/xrutils.py +++ b/flox/xrutils.py @@ -1,12 +1,12 @@ # The functions defined here were copied based on the source code # defined in xarray - import datetime from typing import Any, Iterable import numpy as np import pandas as pd +from numpy.core.multiarray import normalize_axis_index # type: ignore[attr-defined] try: import cftime @@ -157,7 +157,7 @@ def datetime_to_numeric(array, offset=None, datetime_unit=None, dtype=float): if array.dtype.kind in "Mm": offset = _datetime_nanmin(array) else: - offset = min(array) + offset = array.min() # Compute timedelta object. # For np.datetime64, this can silently yield garbage due to overflow. @@ -181,7 +181,6 @@ def datetime_to_numeric(array, offset=None, datetime_unit=None, dtype=float): # Convert np.NaT to np.nan elif array.dtype.kind in "mM": - # Convert to specified timedelta units. 
if datetime_unit: array = array / np.timedelta64(1, datetime_unit) @@ -284,3 +283,36 @@ def _contains_cftime_datetimes(array) -> bool: return isinstance(sample, cftime.datetime) else: return False + + +def _select_along_axis(values, idx, axis): + other_ind = np.ix_(*[np.arange(s) for s in idx.shape]) + sl = other_ind[:axis] + (idx,) + other_ind[axis:] + return values[sl] + + +def nanfirst(values, axis, keepdims=False): + if isinstance(axis, tuple): + (axis,) = axis + values = np.asarray(values) + axis = normalize_axis_index(axis, values.ndim) + idx_first = np.argmax(~pd.isnull(values), axis=axis) + result = _select_along_axis(values, idx_first, axis) + if keepdims: + return np.expand_dims(result, axis=axis) + else: + return result + + +def nanlast(values, axis, keepdims=False): + if isinstance(axis, tuple): + (axis,) = axis + values = np.asarray(values) + axis = normalize_axis_index(axis, values.ndim) + rev = (slice(None),) * axis + (slice(None, None, -1),) + idx_last = -1 - np.argmax(~pd.isnull(values)[rev], axis=axis) + result = _select_along_axis(values, idx_last, axis) + if keepdims: + return np.expand_dims(result, axis=axis) + else: + return result diff --git a/pyproject.toml b/pyproject.toml index 32e55d712..fb27ee761 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,26 +1,97 @@ +[project] +name = "flox" +description = "GroupBy operations for dask.array" +license = {file = "LICENSE"} +readme = "README.md" +requires-python = ">=3.8" +keywords = ["xarray", "dask", "groupby"] +classifiers = [ + "Development Status :: 4 - Beta", + "License :: OSI Approved :: Apache Software License", + "Natural Language :: English", + "Operating System :: OS Independent", + "Programming Language :: Python", + "Programming Language :: Python :: 3.8", + "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", +] +dependencies = [ + "pandas", + "numpy>=1.20", + "numpy_groupies>=0.9.19", + "toolz", +] 
+dynamic=["version"] + + +[project.urls] +homepage = "https://flox.readthedocs.io" +documentation = "https://flox.readthedocs.io" +repository = "https://github.com/xarray-contrib/flox.git" +changelog = "https://github.com/xarray-contrib/flox/releases" + +[project.optional-dependencies] +all = ["cachey", "dask", "numba", "xarray"] +test = ["netCDF4"] + [build-system] requires = [ - "setuptools>=42", + "pandas", + "numpy>=1.20", + "numpy_groupies>=0.9.19", + "toolz", + "setuptools>=61.0.0", "wheel", - "setuptools_scm[toml]>=3.4", - "setuptools_scm_git_archive", + "setuptools_scm[toml]>=7.0", ] build-backend = "setuptools.build_meta" +[tool.setuptools] +packages = ["flox"] + +[tool.setuptools.dynamic] +version = {attr = "flox.__version__"} + [tool.setuptools_scm] fallback_version = "999" +write_to = "flox/_version.py" +write_to_template= '__version__ = "{version}"' [tool.black] line-length = 100 target-version = ["py38"] -[tool.isort] -profile = "black" -skip_gitignore = true -float_to_top = true -combine_as_imports = true -known_first_party = "flox" -known_third_party = [ +[tool.ruff] +target-version = "py38" +builtins = ["ellipsis"] +exclude = [ + ".eggs", + "doc", +] +# E402: module level import not at top of file +# E501: line too long - let black worry about that +# E731: do not assign a lambda expression, use a def +ignore = [ + "E402", + "E501", + "E731", +] +select = [ + # Pyflakes + "F", + # Pycodestyle + "E", + "W", + # isort + "I", + # Pyupgrade + "UP", +] + +[tool.ruff.isort] +known-first-party = ["flox"] +known-third-party = [ "dask", "numpy", "numpy_groupies", @@ -33,9 +104,9 @@ known_third_party = [ [tool.mypy] allow_redefinition = true -exclude = "properties|asv_bench|doc|tests|flycheck" -files = "flox/*.py" +files = "**/*.py" show_error_codes = true +warn_unused_ignores = true [[tool.mypy.overrides]] module=[ @@ -43,7 +114,8 @@ module=[ "cftime", "dask.*", "importlib_metadata", - "numpy_groupies", + "numba", + "numpy_groupies.*", "matplotlib.*", 
"pandas", "setuptools", @@ -53,3 +125,8 @@ ignore_missing_imports = true [tool.pytest.ini_options] addopts = "--tb=short" + + +[tool.codespell] +ignore-words-list = "nd,nax" +skip = "*.html" diff --git a/setup.cfg b/setup.cfg deleted file mode 100644 index 3645e5bc7..000000000 --- a/setup.cfg +++ /dev/null @@ -1,61 +0,0 @@ -[metadata] -name = flox -author = flox Developers -author_email = deepak@cherian.net -license = Apache -description = GroupBy operations for dask.array -long_description = file: README.md -long_description_content_type=text/markdown - -url = https://github.com/xarray-contrib/flox -classifiers = - Development Status :: 4 - Beta - License :: OSI Approved :: Apache Software License - Operating System :: OS Independent - Intended Audience :: Science/Research - Programming Language :: Python - Programming Language :: Python :: 3 - Programming Language :: Python :: 3.8 - Programming Language :: Python :: 3.9 - Programming Language :: Python :: 3.10 - Topic :: Scientific/Engineering - -[options] -packages = find: -zip_safe = False # https://mypy.readthedocs.io/en/latest/installed_packages.html -include_package_data = True -python_requires = >=3.8 -install_requires = - pandas - numpy >= '1.20' - numpy_groupies >= '0.9.15' - toolz - -[options.extras_require] -all = - cachey - dask - xarray -test = - netCDF4 - -[flake8] -ignore = - # whitespace before ':' - doesn't work well with black - E203 - E402 - # line too long - let black worry about that - E501 - # do not assign a lambda expression, use a def - E731 - # line break before binary operator - W503 - # too complex - C901 -per-file-ignores = - tests/*.py:F401,F811 -exclude= - .eggs - doc -builtins = - ellipsis diff --git a/tests/__init__.py b/tests/__init__.py index 7cf379a35..4c04a0fc8 100644 --- a/tests/__init__.py +++ b/tests/__init__.py @@ -1,8 +1,8 @@ import importlib from contextlib import contextmanager -from distutils import version import numpy as np +import packaging.version import pandas as 
pd import pytest @@ -42,7 +42,7 @@ def LooseVersion(vstring): # Our development version is something like '0.10.9+aac7bfc' # This function just ignored the git commit id. vstring = vstring.split("+")[0] - return version.LooseVersion(vstring) + return packaging.version.Version(vstring) has_dask, requires_dask = _importorskip("dask") @@ -125,18 +125,3 @@ def assert_equal_tuple(a, b): np.testing.assert_array_equal(a_, b_) else: assert a_ == b_ - - -@pytest.fixture(scope="module", params=["numbagg"]) -def engine(request): - if request.param == "numba": - try: - import numba - except ImportError: - pytest.xfail() - if request.param == "numbagg": - try: - import numbagg - except ImportError: - pytest.xfail() - return request.param diff --git a/tests/conftest.py b/tests/conftest.py new file mode 100644 index 000000000..d1cc301d7 --- /dev/null +++ b/tests/conftest.py @@ -0,0 +1,17 @@ +import pytest + + +@pytest.fixture(scope="module", params=["numbagg"]) +def engine(request): + if request.param == "numba": + try: + import numba # noqa + except ImportError: + pytest.skip() + if request.param == "numbagg": + try: + import numbagg + except ImportError: + pytest.skip() + + return request.param diff --git a/tests/test_core.py b/tests/test_core.py index e31f11e56..83b823b07 100644 --- a/tests/test_core.py +++ b/tests/test_core.py @@ -1,18 +1,22 @@ from __future__ import annotations -from functools import reduce -from typing import TYPE_CHECKING +import itertools +import warnings +from functools import partial, reduce +from typing import TYPE_CHECKING, Callable import numpy as np import pandas as pd import pytest from numpy_groupies.aggregate_numpy import aggregate +from flox import xrutils from flox.aggregations import Aggregation from flox.core import ( _convert_expected_groups_to_index, _get_optimal_chunks_for_groups, _normalize_indexes, + _validate_reindex, factorize_, find_group_cohorts, groupby_reduce, @@ -24,7 +28,6 @@ from . 
import ( assert_equal, assert_equal_tuple, - engine, has_dask, raise_if_dask_computes, requires_dask, @@ -35,7 +38,6 @@ nan_labels[:5] = np.nan labels2d = np.array([labels[:5], np.flip(labels[:5])]) -# isort:off if has_dask: import dask import dask.array as da @@ -48,11 +50,12 @@ def dask_array_ones(*args): return None -# isort:on - ALL_FUNCS = ( "sum", "nansum", + "argmax", + "nanfirst", + "nanargmax", "prod", "nanprod", "mean", @@ -65,18 +68,32 @@ def dask_array_ones(*args): "nanmax", "min", "nanmin", - "argmax", - pytest.param("nanargmax", marks=(pytest.mark.skip,)), "argmin", - pytest.param("nanargmin", marks=(pytest.mark.skip,)), + "nanargmin", "any", "all", + "nanlast", pytest.param("median", marks=(pytest.mark.skip,)), pytest.param("nanmedian", marks=(pytest.mark.skip,)), ) if TYPE_CHECKING: - from flox.core import T_Engine, T_ExpectedGroupsOpt, T_Func2 + from flox.core import T_Agg, T_Engine, T_ExpectedGroupsOpt, T_Method + + +def _get_array_func(func: str) -> Callable: + if func == "count": + + def npfunc(x): + x = np.asarray(x) + return (~np.isnan(x)).sum() + + elif func in ["nanfirst", "nanlast"]: + npfunc = getattr(xrutils, func) + else: + npfunc = getattr(np, func) + + return npfunc def test_alignment_error(): @@ -89,7 +106,8 @@ def test_alignment_error(): @pytest.mark.parametrize("dtype", (float, int)) @pytest.mark.parametrize("chunk", [False, True]) -@pytest.mark.parametrize("expected_groups", [None, [0, 1, 2], np.array([0, 1, 2])]) +# TODO: make this intp when python 3.8 is dropped +@pytest.mark.parametrize("expected_groups", [None, [0, 1, 2], np.array([0, 1, 2], dtype=np.int64)]) @pytest.mark.parametrize( "func, array, by, expected", [ @@ -117,7 +135,7 @@ def test_alignment_error(): ) def test_groupby_reduce( engine: T_Engine, - func: T_Func2, + func: T_Agg, array: np.ndarray, by: np.ndarray, expected: list[float], @@ -133,13 +151,13 @@ def test_groupby_reduce( by = da.from_array(by, chunks=(3,) if by.ndim == 1 else (1, 3)) if func == "mean" or 
func == "nanmean": - expected_result = np.array(expected, dtype=float) + expected_result = np.array(expected, dtype=np.float64) elif func == "sum": expected_result = np.array(expected, dtype=dtype) elif func == "count": - expected_result = np.array(expected, dtype=int) + expected_result = np.array(expected, dtype=np.intp) - result, groups, = groupby_reduce( + (result, groups) = groupby_reduce( array, by, func=func, @@ -147,7 +165,14 @@ def test_groupby_reduce( fill_value=123, engine=engine, ) - g_dtype = by.dtype if expected_groups is None else np.asarray(expected_groups).dtype + # we use pd.Index(expected_groups).to_numpy() which is always int64 + # for the values in this tests + if expected_groups is None: + g_dtype = by.dtype + elif isinstance(expected_groups, np.ndarray): + g_dtype = expected_groups.dtype + else: + g_dtype = np.int64 assert_equal(groups, np.array([0, 1, 2], g_dtype)) assert_equal(expected_result, result) @@ -200,11 +225,27 @@ def test_groupby_reduce_all(nby, size, chunks, func, add_nan_by, engine): for kwargs in finalize_kwargs: flox_kwargs = dict(func=func, engine=engine, finalize_kwargs=kwargs, fill_value=fill_value) with np.errstate(invalid="ignore", divide="ignore"): - if "arg" in func and add_nan_by: - array[..., nanmask] = np.nan - expected = getattr(np, "nan" + func)(array, axis=-1, **kwargs) - else: - expected = getattr(np, func)(array[..., ~nanmask], axis=-1, **kwargs) + with warnings.catch_warnings(): + warnings.filterwarnings("ignore", r"All-NaN (slice|axis) encountered") + warnings.filterwarnings("ignore", r"Degrees of freedom <= 0 for slice") + warnings.filterwarnings("ignore", r"Mean of empty slice") + + # computing silences a bunch of dask warnings + array_ = array.compute() if chunks is not None else array + if "arg" in func and add_nan_by: + # NaNs are in by, but we can't call np.argmax([..., NaN, .. 
]) + # That would return index of the NaN + # This way, we insert NaNs where there are NaNs in by, and + # call np.nanargmax + func_ = f"nan{func}" if "nan" not in func else func + array_[..., nanmask] = np.nan + expected = getattr(np, func_)(array_, axis=-1, **kwargs) + # elif func in ["first", "last"]: + # expected = getattr(xrutils, f"nan{func}")(array_[..., ~nanmask], axis=-1, **kwargs) + elif func in ["nanfirst", "nanlast"]: + expected = getattr(xrutils, func)(array_[..., ~nanmask], axis=-1, **kwargs) + else: + expected = getattr(np, func)(array_[..., ~nanmask], axis=-1, **kwargs) for _ in range(nby): expected = np.expand_dims(expected, -1) @@ -218,12 +259,30 @@ def test_groupby_reduce_all(nby, size, chunks, func, add_nan_by, engine): assert actual.dtype.kind == "i" assert_equal(actual, expected, tolerance) - if not has_dask: + if not has_dask or chunks is None: continue - for method in ["map-reduce", "cohorts", "split-reduce"]: - if "arg" in func and method != "map-reduce": + + params = list(itertools.product(["map-reduce"], [True, False, None])) + params.extend(itertools.product(["cohorts"], [False, None])) + if chunks == -1: + params.extend([("blockwise", None)]) + + for method, reindex in params: + call = partial( + groupby_reduce, array, *by, method=method, reindex=reindex, **flox_kwargs + ) + if ("arg" in func or func in ["first", "last"]) and reindex is True: + # simple_combine with argreductions not supported right now + with pytest.raises(NotImplementedError): + call() continue - actual, *groups = groupby_reduce(array, *by, method=method, **flox_kwargs) + actual, *groups = call() + if method != "blockwise": + if "arg" not in func: + # make sure we use simple combine + assert any("simple-combine" in key for key in actual.dask.layers.keys()) + else: + assert any("grouped-combine" in key for key in actual.dask.layers.keys()) for actual_group, expect in zip(groups, expected_groups): assert_equal(actual_group, expect, tolerance) if "arg" in func: @@ -258,7 
+317,7 @@ def test_groupby_reduce_count(): array = np.array([0, 0, np.nan, np.nan, np.nan, 1, 1]) labels = np.array(["a", "b", "b", "b", "c", "c", "c"]) result, _ = groupby_reduce(array, labels, func="count") - assert_equal(result, [1, 1, 2]) + assert_equal(result, np.array([1, 1, 2], dtype=np.intp)) def test_func_is_aggregation(): @@ -367,53 +426,51 @@ def test_groupby_agg_dask(func, shape, array_chunks, group_chunks, add_nan, dtyp kwargs["expected_groups"] = [0, 2, 1] with raise_if_dask_computes(): actual, groups = groupby_reduce(array, by, engine=engine, **kwargs, sort=False) - assert_equal(groups, [0, 2, 1]) + assert_equal(groups, np.array([0, 2, 1], dtype=np.int64)) assert_equal(expected, actual[..., [0, 2, 1]]) - kwargs["expected_groups"] = [0, 2, 1] with raise_if_dask_computes(): actual, groups = groupby_reduce(array, by, engine=engine, **kwargs, sort=True) - assert_equal(groups, [0, 1, 2]) + assert_equal(groups, np.array([0, 1, 2], np.int64)) assert_equal(expected, actual) def test_numpy_reduce_axis_subset(engine): # TODO: add NaNs by = labels2d - array = np.ones_like(by) + array = np.ones_like(by, dtype=np.int64) kwargs = dict(func="count", engine=engine, fill_value=0) result, _ = groupby_reduce(array, by, **kwargs, axis=1) - assert_equal(result, [[2, 3], [2, 3]]) + assert_equal(result, np.array([[2, 3], [2, 3]], dtype=np.intp)) by = np.broadcast_to(labels2d, (3, *labels2d.shape)) array = np.ones_like(by) result, _ = groupby_reduce(array, by, **kwargs, axis=1) - subarr = np.array([[1, 1], [1, 1], [0, 2], [1, 1], [1, 1]]) + subarr = np.array([[1, 1], [1, 1], [0, 2], [1, 1], [1, 1]], dtype=np.intp) expected = np.tile(subarr, (3, 1, 1)) assert_equal(result, expected) result, _ = groupby_reduce(array, by, **kwargs, axis=2) - subarr = np.array([[2, 3], [2, 3]]) + subarr = np.array([[2, 3], [2, 3]], dtype=np.intp) expected = np.tile(subarr, (3, 1, 1)) assert_equal(result, expected) result, _ = groupby_reduce(array, by, **kwargs, axis=(1, 2)) - expected = 
np.array([[4, 6], [4, 6], [4, 6]]) + expected = np.array([[4, 6], [4, 6], [4, 6]], dtype=np.intp) assert_equal(result, expected) result, _ = groupby_reduce(array, by, **kwargs, axis=(2, 1)) assert_equal(result, expected) result, _ = groupby_reduce(array, by[0, ...], **kwargs, axis=(1, 2)) - expected = np.array([[4, 6], [4, 6], [4, 6]]) + expected = np.array([[4, 6], [4, 6], [4, 6]], dtype=np.intp) assert_equal(result, expected) @requires_dask def test_dask_reduce_axis_subset(): - by = labels2d - array = np.ones_like(by) + array = np.ones_like(by, dtype=np.int64) with raise_if_dask_computes(): result, _ = groupby_reduce( da.from_array(array, chunks=(2, 3)), @@ -422,11 +479,11 @@ def test_dask_reduce_axis_subset(): axis=1, expected_groups=[0, 2], ) - assert_equal(result, [[2, 3], [2, 3]]) + assert_equal(result, np.array([[2, 3], [2, 3]], dtype=np.intp)) by = np.broadcast_to(labels2d, (3, *labels2d.shape)) array = np.ones_like(by) - subarr = np.array([[1, 1], [1, 1], [123, 2], [1, 1], [1, 1]]) + subarr = np.array([[1, 1], [1, 1], [123, 2], [1, 1], [1, 1]], dtype=np.intp) expected = np.tile(subarr, (3, 1, 1)) with raise_if_dask_computes(): result, _ = groupby_reduce( @@ -439,7 +496,7 @@ def test_dask_reduce_axis_subset(): ) assert_equal(result, expected) - subarr = np.array([[2, 3], [2, 3]]) + subarr = np.array([[2, 3], [2, 3]], dtype=np.intp) expected = np.tile(subarr, (3, 1, 1)) with raise_if_dask_computes(): result, _ = groupby_reduce( @@ -460,6 +517,28 @@ def test_dask_reduce_axis_subset(): ) +@pytest.mark.parametrize("func", ["first", "last", "nanfirst", "nanlast"]) +@pytest.mark.parametrize("axis", [(0, 1)]) +def test_first_last_disallowed(axis, func): + with pytest.raises(ValueError): + groupby_reduce(np.empty((2, 3, 2)), np.ones((2, 3, 2)), func=func, axis=axis) + + +@requires_dask +@pytest.mark.parametrize("func", ["nanfirst", "nanlast"]) +@pytest.mark.parametrize("axis", [None, (0, 1, 2)]) +def test_nanfirst_nanlast_disallowed_dask(axis, func): + with 
pytest.raises(ValueError): + groupby_reduce(dask.array.empty((2, 3, 2)), np.ones((2, 3, 2)), func=func, axis=axis) + + +@requires_dask +@pytest.mark.parametrize("func", ["first", "last"]) +def test_first_last_disallowed_dask(func): + with pytest.raises(NotImplementedError): + groupby_reduce(dask.array.empty((2, 3, 2)), np.ones((2, 3, 2)), func=func, axis=-1) + + @requires_dask @pytest.mark.parametrize("func", ALL_FUNCS) @pytest.mark.parametrize( @@ -469,8 +548,12 @@ def test_groupby_reduce_axis_subset_against_numpy(func, axis, engine): if "arg" in func and engine == "flox": pytest.skip() - if not isinstance(axis, int) and "arg" in func and (axis is None or len(axis) > 1): - pytest.skip() + if not isinstance(axis, int): + if "arg" in func and (axis is None or len(axis) > 1): + pytest.skip() + if ("first" in func or "last" in func) and (axis is not None and len(axis) not in [1, 3]): + pytest.skip() + if func in ["all", "any"]: fill_value = False else: @@ -487,21 +570,49 @@ def test_groupby_reduce_axis_subset_against_numpy(func, axis, engine): kwargs = dict( func=func, axis=axis, expected_groups=[0, 2], fill_value=fill_value, engine=engine ) + expected, _ = groupby_reduce(array, by, **kwargs) + if engine == "flox": + kwargs.pop("engine") + expected_npg, _ = groupby_reduce(array, by, **kwargs, engine="numpy") + assert_equal(expected_npg, expected) + + if func in ["all", "any"]: + fill_value = False + else: + fill_value = 123 + + if "var" in func or "std" in func: + tolerance = {"rtol": 1e-14, "atol": 1e-16} + else: + tolerance = None + # tests against the numpy output to make sure dask compute matches + by = np.broadcast_to(labels2d, (3, *labels2d.shape)) + rng = np.random.default_rng(12345) + array = rng.random(by.shape) + kwargs = dict( + func=func, axis=axis, expected_groups=[0, 2], fill_value=fill_value, engine=engine + ) + expected, _ = groupby_reduce(array, by, **kwargs) + if engine == "flox": + kwargs.pop("engine") + expected_npg, _ = groupby_reduce(array, by, 
**kwargs, engine="numpy") + assert_equal(expected_npg, expected) + + if ("first" in func or "last" in func) and ( + axis is None or (not isinstance(axis, int) and len(axis) != 1) + ): + return + with raise_if_dask_computes(): actual, _ = groupby_reduce( da.from_array(array, chunks=(-1, 2, 3)), da.from_array(by, chunks=(-1, 2, 2)), **kwargs, ) - expected, _ = groupby_reduce(array, by, **kwargs) - if engine == "flox": - kwargs.pop("engine") - expected_npg, _ = groupby_reduce(array, by, **kwargs, engine="numpy") - assert_equal(expected_npg, expected) assert_equal(actual, expected, tolerance) -@pytest.mark.parametrize("chunks", [None, (2, 2, 3)]) +@pytest.mark.parametrize("reindex,chunks", [(None, None), (False, (2, 2, 3)), (True, (2, 2, 3))]) @pytest.mark.parametrize( "axis, groups, expected_shape", [ @@ -510,7 +621,7 @@ def test_groupby_reduce_axis_subset_against_numpy(func, axis, engine): (None, [0], (1,)), # global reduction; 0 shaped group axis; 1 group ], ) -def test_groupby_reduce_nans(chunks, axis, groups, expected_shape, engine): +def test_groupby_reduce_nans(reindex, chunks, axis, groups, expected_shape, engine): def _maybe_chunk(arr): if chunks: if not has_dask: @@ -533,6 +644,7 @@ def _maybe_chunk(arr): axis=axis, fill_value=0, engine=engine, + reindex=reindex, ) assert_equal(result, np.zeros(expected_shape, dtype=np.intp)) @@ -545,7 +657,10 @@ def _maybe_chunk(arr): @requires_dask -def test_groupby_all_nan_blocks(engine): +@pytest.mark.parametrize( + "expected_groups, reindex", [(None, None), (None, False), ([0, 1, 2], True), ([0, 1, 2], False)] +) +def test_groupby_all_nan_blocks_dask(expected_groups, reindex, engine): labels = np.array([0, 0, 2, 2, 2, 1, 1, 2, 2, 1, 1, 0]) nan_labels = labels.astype(float) # copy nan_labels[:5] = np.nan @@ -560,8 +675,10 @@ def test_groupby_all_nan_blocks(engine): da.from_array(array, chunks=(1, 3)), da.from_array(by, chunks=(1, 3)), func="sum", - expected_groups=None, + expected_groups=expected_groups, engine=engine, + 
reindex=reindex, + method="map-reduce", ) assert_equal(actual, expected) @@ -613,14 +730,21 @@ def test_npg_nanarg_bug(func): assert_equal(actual, expected) -@pytest.mark.parametrize("method", ["split-reduce", "cohorts", "map-reduce"]) +@pytest.mark.parametrize( + "kwargs", + ( + dict(expected_groups=np.array([1, 2, 4, 5]), isbin=True), + dict(expected_groups=pd.IntervalIndex.from_breaks([1, 2, 4, 5])), + ), +) +@pytest.mark.parametrize("method", ["cohorts", "map-reduce"]) @pytest.mark.parametrize("chunk_labels", [False, True]) @pytest.mark.parametrize("chunks", ((), (1,), (2,))) -def test_groupby_bins(chunk_labels, chunks, engine, method) -> None: +def test_groupby_bins(chunk_labels, kwargs, chunks, engine, method) -> None: array = [1, 1, 1, 1, 1, 1] labels = [0.2, 1.5, 1.9, 2, 3, 20] - if method in ["split-reduce", "cohorts"] and chunk_labels: + if method == "cohorts" and chunk_labels: pytest.xfail() if chunks: @@ -632,16 +756,9 @@ def test_groupby_bins(chunk_labels, chunks, engine, method) -> None: with raise_if_dask_computes(): actual, groups = groupby_reduce( - array, - labels, - func="count", - expected_groups=np.array([1, 2, 4, 5]), - isbin=True, - fill_value=0, - engine=engine, - method=method, + array, labels, func="count", fill_value=0, engine=engine, method=method, **kwargs ) - expected = np.array([3, 1, 0]) + expected = np.array([3, 1, 0], dtype=np.intp) for left, right in zip(groups, pd.IntervalIndex.from_arrays([1, 2, 4], [2, 4, 5]).to_numpy()): assert left == right assert_equal(actual, expected) @@ -719,15 +836,7 @@ def test_fill_value_behaviour(func, chunks, fill_value, engine): if chunks is not None and not has_dask: pytest.skip() - if func == "count": - - def npfunc(x): - x = np.asarray(x) - return (~np.isnan(x)).sum() - - else: - npfunc = getattr(np, func) - + npfunc = _get_array_func(func) by = np.array([1, 2, 3, 1, 2, 3]) array = np.array([np.nan, 1, 1, np.nan, 1, 1]) if chunks: @@ -735,7 +844,9 @@ def npfunc(x): actual, _ = groupby_reduce( 
array, by, func=func, engine=engine, fill_value=fill_value, expected_groups=[0, 1, 2, 3] ) - expected = np.array([fill_value, fill_value, npfunc([1.0, 1.0]), npfunc([1.0, 1.0])]) + expected = np.array( + [fill_value, fill_value, npfunc([1.0, 1.0], axis=0), npfunc([1.0, 1.0], axis=0)] + ) assert_equal(actual, expected) @@ -758,15 +869,21 @@ def test_dtype_preservation(dtype, func, engine): @requires_dask -@pytest.mark.parametrize("method", ["split-reduce", "map-reduce", "cohorts"]) -def test_cohorts(method): - repeats = [4, 4, 12, 2, 3, 4] - labels = np.repeat(np.arange(6), repeats) - array = dask.array.from_array(labels, chunks=(4, 8, 4, 9, 4)) +@pytest.mark.parametrize("dtype", [np.float32, np.float64, np.int32, np.int64]) +@pytest.mark.parametrize("labels_dtype", [np.float32, np.float64, np.int32, np.int64]) +@pytest.mark.parametrize("method", ["map-reduce", "cohorts"]) +def test_cohorts_map_reduce_consistent_dtypes(method, dtype, labels_dtype): + repeats = np.array([4, 4, 12, 2, 3, 4], dtype=np.int32) + labels = np.repeat(np.arange(6, dtype=labels_dtype), repeats) + array = dask.array.from_array(labels.astype(dtype), chunks=(4, 8, 4, 9, 4)) actual, actual_groups = groupby_reduce(array, labels, func="count", method=method) - assert_equal(actual_groups, np.arange(6)) - assert_equal(actual, repeats) + assert_equal(actual_groups, np.arange(6, dtype=labels.dtype)) + assert_equal(actual, repeats.astype(np.intp)) + + actual, actual_groups = groupby_reduce(array, labels, func="sum", method=method) + assert_equal(actual_groups, np.arange(6, dtype=labels.dtype)) + assert_equal(actual, np.array([0, 4, 24, 6, 12, 20], dtype)) @requires_dask @@ -778,7 +895,7 @@ def test_cohorts_nd_by(func, method, axis, engine): o2 = dask.array.ones((2, 3), chunks=-1) array = dask.array.block([[o, 2 * o], [3 * o2, 4 * o2]]) - by = array.compute().astype(int) + by = array.compute().astype(np.int64) by[0, 1] = 30 by[2, 1] = 40 by[0, 4] = 31 @@ -794,6 +911,8 @@ def test_cohorts_nd_by(func, 
method, axis, engine): if axis is not None and method != "map-reduce": pytest.xfail() + if axis is None and ("first" in func or "last" in func): + pytest.skip() kwargs = dict(func=func, engine=engine, method=method, axis=axis, fill_value=fill_value) actual, groups = groupby_reduce(array, by, **kwargs) @@ -802,10 +921,7 @@ def test_cohorts_nd_by(func, method, axis, engine): assert_equal(actual, expected) actual, groups = groupby_reduce(array, by, sort=False, **kwargs) - if method == "map-reduce": - assert_equal(groups, [1, 30, 2, 31, 3, 4, 40]) - else: - assert_equal(groups, [1, 30, 2, 31, 3, 40, 4]) + assert_equal(groups, np.array([1, 30, 2, 31, 3, 4, 40], dtype=np.int64)) reindexed = reindex_(actual, groups, pd.Index(sorted_groups)) assert_equal(reindexed, expected) @@ -848,9 +964,10 @@ def test_datetime_binning(): expected = pd.IntervalIndex.from_arrays(time_bins[:-1], time_bins[1:]) assert_equal(actual, expected) - ret = factorize_((by.to_numpy(),), axis=0, expected_groups=(actual,)) + ret = factorize_((by.to_numpy(),), axes=(0,), expected_groups=(actual,)) group_idx = ret[0] - expected = pd.cut(by, time_bins).codes.copy() + # Ignore pd.cut's dtype as it won't match np.digitize: + expected = pd.cut(by, time_bins).codes.copy().astype(group_idx.dtype) expected[0] = 14 # factorize doesn't return -1 for nans assert_equal(group_idx, expected) @@ -861,7 +978,8 @@ def test_bool_reductions(func, engine): pytest.skip() groups = np.array([1, 1, 1]) data = np.array([True, True, False]) - expected = np.expand_dims(getattr(np, func)(data), -1) + npfunc = _get_array_func(func) + expected = np.expand_dims(npfunc(data, axis=0), -1) actual, _ = groupby_reduce(data, groups, func=func, engine=engine) assert_equal(expected, actual) @@ -874,14 +992,14 @@ def test_map_reduce_blockwise_mixed() -> None: dask.array.from_array(data.values, chunks=365), t.dt.month, func="mean", - method="split-reduce", + method="map-reduce", ) expected, _ = groupby_reduce(data, t.dt.month, func="mean") 
assert_equal(expected, actual) @requires_dask -@pytest.mark.parametrize("method", ["split-reduce", "blockwise", "map-reduce", "cohorts"]) +@pytest.mark.parametrize("method", ["blockwise", "map-reduce", "cohorts"]) def test_group_by_datetime(engine, method): kwargs = dict( func="mean", @@ -916,10 +1034,10 @@ def test_group_by_datetime(engine, method): def test_factorize_values_outside_bins(): - + # pd.factorize returns intp vals = factorize_( (np.arange(10).reshape(5, 2), np.arange(10).reshape(5, 2)), - axis=(0, 1), + axes=(0, 1), expected_groups=( pd.IntervalIndex.from_breaks(np.arange(2, 8, 1)), pd.IntervalIndex.from_breaks(np.arange(2, 8, 1)), @@ -928,67 +1046,140 @@ def test_factorize_values_outside_bins(): fastpath=True, ) actual = vals[0] - expected = np.array([[-1, -1], [-1, 0], [6, 12], [18, 24], [-1, -1]]) + expected = np.array([[-1, -1], [-1, 0], [6, 12], [18, 24], [-1, -1]], np.intp) assert_equal(expected, actual) -def test_multiple_groupers() -> None: +@pytest.mark.parametrize("chunk", [True, False]) +def test_multiple_groupers_bins(chunk) -> None: + if chunk and not has_dask: + pytest.skip() + + xp = dask.array if chunk else np + array_kwargs = {"chunks": 2} if chunk else {} + array = xp.ones((5, 2), **array_kwargs, dtype=np.int64) + actual, *_ = groupby_reduce( - np.ones((5, 2)), - np.arange(10).reshape(5, 2), + array, np.arange(10).reshape(5, 2), + xp.arange(10).reshape(5, 2), axis=(0, 1), expected_groups=( pd.IntervalIndex.from_breaks(np.arange(2, 8, 1)), pd.IntervalIndex.from_breaks(np.arange(2, 8, 1)), ), - reindex=True, func="count", ) - expected = np.eye(5, 5, dtype=int) + # output from `count` is intp + expected = np.eye(5, 5, dtype=np.intp) + assert_equal(expected, actual) + + +@pytest.mark.parametrize("expected_groups", [None, (np.arange(5), [2, 3]), (None, [2, 3])]) +@pytest.mark.parametrize( + "by1", [np.arange(5)[:, None], np.broadcast_to(np.arange(5)[:, None], (5, 2))] +) +@pytest.mark.parametrize( + "by2", + [ + np.arange(2, 4).reshape(1, 
2), + np.broadcast_to(np.arange(2, 4).reshape(1, 2), (5, 2)), + np.arange(2, 4).reshape(1, 2), + ], +) +@pytest.mark.parametrize("chunk", [True, False]) +def test_multiple_groupers(chunk, by1, by2, expected_groups) -> None: + if chunk and (not has_dask or expected_groups is None): + pytest.skip() + + xp = dask.array if chunk else np + array_kwargs = {"chunks": 2} if chunk else {} + array = xp.ones((5, 2), **array_kwargs, dtype=np.int64) + + if chunk: + by2 = dask.array.from_array(by2) + + # output from `count` is intp + expected = np.ones((5, 2), dtype=np.intp) + actual, *_ = groupby_reduce( + array, by1, by2, axis=(0, 1), func="count", expected_groups=expected_groups + ) assert_equal(expected, actual) +@pytest.mark.parametrize( + "expected_groups", + ( + [None, None, None], + (None,), + ), +) +def test_validate_expected_groups(expected_groups): + with pytest.raises(ValueError): + groupby_reduce( + np.ones((10,)), + np.ones((10,)), + np.ones((10,)), + expected_groups=expected_groups, + func="mean", + ) + + +@requires_dask +def test_validate_expected_groups_not_none_dask() -> None: + with pytest.raises(ValueError): + groupby_reduce( + dask.array.ones((5, 2)), + np.arange(10).reshape(5, 2), + dask.array.arange(10).reshape(5, 2), + axis=(0, 1), + expected_groups=None, + func="count", + ) + + def test_factorize_reindex_sorting_strings(): + # pd.factorize seems to return intp so int32 on 32bit arch kwargs = dict( by=(np.array(["El-Nino", "La-Nina", "boo", "Neutral"]),), - axis=-1, + axes=(-1,), expected_groups=(np.array(["El-Nino", "Neutral", "foo", "La-Nina"]),), ) expected = factorize_(**kwargs, reindex=True, sort=True)[0] - assert_equal(expected, [0, 1, 4, 2]) + assert_equal(expected, np.array([0, 1, 4, 2], dtype=np.intp)) expected = factorize_(**kwargs, reindex=True, sort=False)[0] - assert_equal(expected, [0, 3, 4, 1]) + assert_equal(expected, np.array([0, 3, 4, 1], dtype=np.intp)) expected = factorize_(**kwargs, reindex=False, sort=False)[0] - 
assert_equal(expected, [0, 1, 2, 3]) + assert_equal(expected, np.array([0, 1, 2, 3], dtype=np.intp)) expected = factorize_(**kwargs, reindex=False, sort=True)[0] - assert_equal(expected, [0, 1, 3, 2]) + assert_equal(expected, np.array([0, 1, 3, 2], dtype=np.intp)) def test_factorize_reindex_sorting_ints(): + # pd.factorize seems to return intp so int32 on 32bit arch kwargs = dict( by=(np.array([-10, 1, 10, 2, 3, 5]),), - axis=-1, - expected_groups=(np.array([0, 1, 2, 3, 4, 5]),), + axes=(-1,), + expected_groups=(np.array([0, 1, 2, 3, 4, 5], np.int64),), ) expected = factorize_(**kwargs, reindex=True, sort=True)[0] - assert_equal(expected, [6, 1, 6, 2, 3, 5]) + assert_equal(expected, np.array([6, 1, 6, 2, 3, 5], dtype=np.intp)) expected = factorize_(**kwargs, reindex=True, sort=False)[0] - assert_equal(expected, [6, 1, 6, 2, 3, 5]) + assert_equal(expected, np.array([6, 1, 6, 2, 3, 5], dtype=np.intp)) kwargs["expected_groups"] = (np.arange(5, -1, -1),) expected = factorize_(**kwargs, reindex=True, sort=True)[0] - assert_equal(expected, [6, 1, 6, 2, 3, 5]) + assert_equal(expected, np.array([6, 1, 6, 2, 3, 5], dtype=np.intp)) expected = factorize_(**kwargs, reindex=True, sort=False)[0] - assert_equal(expected, [6, 4, 6, 3, 2, 0]) + assert_equal(expected, np.array([6, 4, 6, 3, 2, 0], dtype=np.intp)) @requires_dask @@ -1125,3 +1316,152 @@ def test_subset_block_2d(flatblocks, expectidx): subset = subset_to_blocks(array, flatblocks) assert len(subset.dask.layers) == 2 assert_equal(subset, array.compute()[expectidx]) + + +@pytest.mark.parametrize( + "dask_expected, reindex, func, expected_groups, any_by_dask", + [ + # argmax only False + [False, None, "argmax", None, False], + # True when by is numpy but expected is None + [True, None, "sum", None, False], + # False when by is dask but expected is None + [False, None, "sum", None, True], + # if expected_groups then always True + [True, None, "sum", [1, 2, 3], False], + [True, None, "sum", ([1], [2]), False], + [True, None, 
"sum", ([1], [2]), True], + [True, None, "sum", ([1], None), False], + [True, None, "sum", ([1], None), True], + ], +) +def test_validate_reindex_map_reduce( + dask_expected, reindex, func, expected_groups, any_by_dask +) -> None: + actual = _validate_reindex( + reindex, func, "map-reduce", expected_groups, any_by_dask, is_dask_array=True + ) + assert actual is dask_expected + + # always reindex with all numpy inputs + actual = _validate_reindex( + reindex, func, "map-reduce", expected_groups, any_by_dask=False, is_dask_array=False + ) + assert actual + + actual = _validate_reindex( + True, func, "map-reduce", expected_groups, any_by_dask=False, is_dask_array=False + ) + assert actual + + +def test_validate_reindex() -> None: + methods: list[T_Method] = ["map-reduce", "cohorts"] + for method in methods: + with pytest.raises(NotImplementedError): + _validate_reindex( + True, "argmax", method, expected_groups=None, any_by_dask=False, is_dask_array=True + ) + + methods: list[T_Method] = ["blockwise", "cohorts"] + for method in methods: + with pytest.raises(ValueError): + _validate_reindex( + True, "sum", method, expected_groups=None, any_by_dask=False, is_dask_array=True + ) + + for func in ["sum", "argmax"]: + actual = _validate_reindex( + None, func, method, expected_groups=None, any_by_dask=False, is_dask_array=True + ) + assert actual is False + + +@requires_dask +def test_1d_blockwise_sort_optimization(): + # Make sure for resampling problems sorting isn't done. 
+ time = pd.Series(pd.date_range("2020-09-01", "2020-12-31 23:59", freq="3H")) + array = dask.array.ones((len(time),), chunks=(224,)) + + actual, _ = groupby_reduce(array, time.dt.dayofyear.values, method="blockwise", func="count") + assert all("getitem" not in k for k in actual.dask) + + actual, _ = groupby_reduce( + array, time.dt.dayofyear.values[::-1], sort=True, method="blockwise", func="count" + ) + assert any("getitem" in k for k in actual.dask.layers) + + actual, _ = groupby_reduce( + array, time.dt.dayofyear.values[::-1], sort=False, method="blockwise", func="count" + ) + assert all("getitem" not in k for k in actual.dask.layers) + + +@requires_dask +def test_negative_index_factorize_race_condition(): + # shape = (10, 2000) + # chunks = ((shape[0]-1,1), 10) + shape = (101, 174000) + chunks = ((101,), 8760) + eps = dask.array.random.random_sample(shape, chunks=chunks) + N2 = dask.array.random.random_sample(shape, chunks=chunks) + S2 = dask.array.random.random_sample(shape, chunks=chunks) + + bins = np.arange(-5, -2.05, 0.1) + func = ["mean", "count", "sum"] + + out = [ + groupby_reduce( + eps, + N2, + S2, + func=f, + expected_groups=(bins, bins), + isbin=(True, True), + ) + for f in func + ] + [dask.compute(out, scheduler="threads") for _ in range(5)] + + +@pytest.mark.parametrize("sort", [True, False]) +def test_expected_index_conversion_passthrough_range_index(sort): + index = pd.RangeIndex(100) + actual = _convert_expected_groups_to_index( + expected_groups=(index,), isbin=(False,), sort=(sort,) + ) + assert actual[0] is index + + +def test_method_check_numpy(): + bins = [-2, -1, 0, 1, 2] + field = np.ones((5, 3)) + by = np.array([[-1.5, -1.5, 0.5, 1.5, 1.5] * 3]).reshape(5, 3) + actual, _ = groupby_reduce( + field, + by, + expected_groups=pd.IntervalIndex.from_breaks(bins), + func="count", + method="cohorts", + fill_value=np.nan, + ) + expected = np.array([6, np.nan, 3, 6]) + assert_equal(actual, expected) + + actual, _ = groupby_reduce( + field, + by, 
+ expected_groups=pd.IntervalIndex.from_breaks(bins), + func="count", + fill_value=np.nan, + method="cohorts", + axis=0, + ) + expected = np.array( + [ + [2.0, np.nan, 1.0, 2.0], + [2.0, np.nan, 1.0, 2.0], + [2.0, np.nan, 1.0, 2.0], + ] + ) + assert_equal(actual, expected) diff --git a/tests/test_xarray.py b/tests/test_xarray.py index 0bee41c15..7a343d962 100644 --- a/tests/test_xarray.py +++ b/tests/test_xarray.py @@ -6,16 +6,14 @@ xr = pytest.importorskip("xarray") # isort: on -from flox.xarray import rechunk_for_blockwise, resample_reduce, xarray_reduce +from flox.xarray import rechunk_for_blockwise, xarray_reduce -from . import assert_equal, engine, has_dask, raise_if_dask_computes, requires_dask +from . import assert_equal, has_dask, raise_if_dask_computes, requires_dask -# isort: off if has_dask: import dask dask.config.set(scheduler="sync") -# isort: on try: # Should test against legacy xarray implementation @@ -168,17 +166,26 @@ def test_xarray_reduce_multiple_groupers_2(pass_expected_groups, chunk, engine): @requires_dask -def test_dask_groupers_error(): +@pytest.mark.parametrize( + "expected_groups", + (None, (None, None), [[1, 2], [1, 2]]), +) +def test_validate_expected_groups(expected_groups): da = xr.DataArray( [1.0, 2.0], dims="x", coords={"labels": ("x", [1, 2]), "labels2": ("x", [1, 2])} ) with pytest.raises(ValueError): - xarray_reduce(da.chunk({"x": 2, "z": 1}), "labels", "labels2", func="count") + xarray_reduce( + da.chunk({"x": 1}), + "labels", + "labels2", + func="count", + expected_groups=expected_groups, + ) @requires_dask def test_xarray_reduce_single_grouper(engine): - # DataArray ds = xr.tutorial.open_dataset("rasm", chunks={"time": 9}) actual = xarray_reduce(ds.Tair, ds.time.dt.month, func="mean", engine=engine) @@ -223,7 +230,6 @@ def test_xarray_reduce_single_grouper(engine): def test_xarray_reduce_errors(): - da = xr.DataArray(np.ones((12,)), dims="x") by = xr.DataArray(np.ones((12,)), dims="x") @@ -239,47 +245,6 @@ def 
test_xarray_reduce_errors(): xarray_reduce(da, by.chunk(), func="mean") -@pytest.mark.parametrize("isdask", [True, False]) -@pytest.mark.parametrize("dataarray", [True, False]) -@pytest.mark.parametrize("chunklen", [27, 4 * 31 + 1, 4 * 31 + 20]) -def test_xarray_resample(chunklen, isdask, dataarray, engine): - if isdask: - if not has_dask: - pytest.skip() - ds = xr.tutorial.open_dataset("air_temperature", chunks={"time": chunklen}) - else: - ds = xr.tutorial.open_dataset("air_temperature") - - if dataarray: - ds = ds.air - - resampler = ds.resample(time="M") - with pytest.warns(DeprecationWarning): - actual = resample_reduce(resampler, "mean", engine=engine) - expected = resampler.mean() - xr.testing.assert_allclose(actual, expected) - - with xr.set_options(use_flox=True): - actual = resampler.mean() - xr.testing.assert_allclose(actual, expected) - - -@requires_dask -def test_xarray_resample_dataset_multiple_arrays(engine): - # regression test for #35 - times = pd.date_range("2000", periods=5) - foo = xr.DataArray(range(5), dims=["time"], coords=[times], name="foo") - bar = xr.DataArray(range(1, 6), dims=["time"], coords=[times], name="bar") - ds = xr.merge([foo, bar]).chunk({"time": 4}) - - resampler = ds.resample(time="4D") - # The separate computes are necessary here to force xarray - # to compute all variables in result at the same time. 
- expected = resampler.mean().compute() - result = resample_reduce(resampler, "mean", engine=engine).compute() - xr.testing.assert_allclose(expected, result) - - @requires_dask @pytest.mark.parametrize( "inchunks, expected", @@ -336,6 +301,8 @@ def test_multi_index_groupby_sum(engine): expected = ds.sum("z") stacked = ds.stack(space=["x", "y"]) actual = xarray_reduce(stacked, "space", dim="z", func="sum", engine=engine) + expected_xarray = stacked.groupby("space").sum("z") + assert_equal(expected_xarray, actual) assert_equal(expected, actual.unstack("space")) actual = xarray_reduce(stacked.foo, "space", dim="z", func="sum", engine=engine) @@ -430,22 +397,9 @@ def test_cache(): assert len(cache.data) == 2 -@pytest.mark.parametrize("use_cftime", [True, False]) -@pytest.mark.parametrize("func", ["count", "mean"]) -def test_datetime_array_reduce(use_cftime, func, engine): - - time = xr.DataArray( - xr.date_range("2009-01-01", "2012-12-31", use_cftime=use_cftime), - dims=("time",), - name="time", - ) - expected = getattr(time.resample(time="YS"), func)() - actual = resample_reduce(time.resample(time="YS"), func=func, engine=engine) - assert_equal(expected, actual) - - @requires_dask -def test_groupby_bins_indexed_coordinate(): +@pytest.mark.parametrize("method", ["cohorts", "map-reduce"]) +def test_groupby_bins_indexed_coordinate(method): ds = ( xr.tutorial.open_dataset("air_temperature") .isel(time=slice(100)) @@ -460,7 +414,17 @@ def test_groupby_bins_indexed_coordinate(): expected_groups=([40, 50, 60, 70],), isbin=(True,), func="mean", - method="split-reduce", + method=method, + ) + xr.testing.assert_allclose(expected, actual) + + actual = xarray_reduce( + ds, + ds.lat, + dim=ds.air.dims, + expected_groups=pd.IntervalIndex.from_breaks([40, 50, 60, 70]), + func="mean", + method=method, ) xr.testing.assert_allclose(expected, actual) @@ -499,6 +463,12 @@ def test_mixed_grouping(chunk): assert (r.sel(v1=[3, 4, 5]) == 0).all().data +def test_alignment_error(): + da = 
xr.DataArray(np.arange(10), dims="x", coords={"x": np.arange(10)}) + with pytest.raises(ValueError): + xarray_reduce(da, da.x.sel(x=slice(5)), func="count") + + @pytest.mark.parametrize("add_nan", [True, False]) @pytest.mark.parametrize("dtype_out", [np.float64, "float64", np.dtype("float64")]) @pytest.mark.parametrize("dtype", [np.float32, np.float64]) @@ -570,3 +540,34 @@ def test_dtype_accumulation(use_flox, chunk): assert np.issubdtype(actual.dtype, np.float64) assert np.issubdtype(actual.compute().dtype, np.float64) xr.testing.assert_allclose(expected, actual, **tolerance64) + + +def test_preserve_multiindex(): + """Regression test for GH issue #215""" + + vort = xr.DataArray( + name="vort", + data=np.random.uniform(size=(4, 2)), + dims=["i", "face"], + coords={"i": ("i", np.arange(4)), "face": ("face", np.arange(2))}, + ) + + vort = ( + vort.coarsen(i=2) + .construct(i=("i_region_coarse", "i_region")) + .stack(region=["face", "i_region_coarse"]) + ) + + bins = [np.linspace(0, 1, 10)] + bin_intervals = tuple(pd.IntervalIndex.from_breaks(b) for b in bins) + + hist = xarray_reduce( + xr.DataArray(1), # weights + vort, # variables we want to bin + func="count", # count occurrences falling in bins + expected_groups=bin_intervals, # bins for each variable + dim=["i_region"], # broadcast dimensions + fill_value=0, # fill empty bins with 0 counts + ) + + assert "region" in hist.coords