Skip to content
This repository has been archived by the owner on Oct 7, 2024. It is now read-only.

Commit

Permalink
Sortby (pydata#1389)
Browse files Browse the repository at this point in the history
* First commit of .sort_index() for both dataarray.py and dataset.py.

* Committing changes prior to switching to sortby().

* Finished and passed tests for sortby().

* Adding to doc.

* Revising whats-new.rst

* Fixed some coordinate labeling in tests for clarification.

* Addressed some review comments, and moved doc to reshape.rst.

* Adding lexsort support in test.

* Fixed erroneous code, and the erroneous test that failed to catch the erroneous code. Also addressed some reviewer comments.

* Adding test for pandas.MultiIndex.  Addressed some review comments.

* Align input args before sort.  Also added a test on pd.MultiIndex.

* Minor addition to docstring.

* Simplified test_dataarray::test_sortby a bit.

* Putting dax back.

* NotImplementedError for < numpy 1.11.0

* Move LooseVersion check into the loop.

* LooseVersion in tests.

* Fix indentation, docstring for dataset.py

* dataarray.py docstring fixup

* Adding to api.rst
  • Loading branch information
chunweiyuan authored and shoyer committed May 12, 2017
1 parent ba6351b commit 80ddad9
Show file tree
Hide file tree
Showing 7 changed files with 262 additions and 2 deletions.
2 changes: 2 additions & 0 deletions doc/api.rst
Original file line number Diff line number Diff line change
Expand Up @@ -181,6 +181,7 @@ Reshaping and reorganizing
Dataset.unstack
Dataset.shift
Dataset.roll
Dataset.sortby

DataArray
=========
Expand Down Expand Up @@ -334,6 +335,7 @@ Reshaping and reorganizing
DataArray.unstack
DataArray.shift
DataArray.roll
DataArray.sortby

.. _api.ufuncs:

Expand Down
25 changes: 25 additions & 0 deletions doc/reshaping.rst
Original file line number Diff line number Diff line change
Expand Up @@ -190,6 +190,8 @@ labels for one or several dimensions:
array.set_index(x='c', inplace=True)
array.reset_index('x', drop=True)
.. _reshape.shift_and_roll:

Shift and roll
--------------

Expand All @@ -201,3 +203,26 @@ To adjust coordinate labels, you can use the :py:meth:`~xarray.Dataset.shift` an
array = xr.DataArray([1, 2, 3, 4], dims='x')
array.shift(x=2)
array.roll(x=2)
.. _reshape.sort:

Sort
----

One may sort a DataArray or Dataset via :py:meth:`~xarray.DataArray.sortby` and
:py:meth:`~xarray.Dataset.sortby`. The input can be either existing
dimensions, or 1-D dataarrays that share dimensions (and have the correct
dimension lengths) with the calling object.

.. ipython:: python
ds = xr.Dataset({'A': xr.DataArray([[1, 2], [3, 4]],
                                   [('x', ['b', 'a']),
                                    ('y', [1, 0])]),
                 'B': xr.DataArray([[5, 6], [7, 8]], dims=['x', 'y'])})
# Sort along a single dimension by its coordinate labels.
ds.sortby('x')
# Sort along several dimensions at once.
ds.sortby(['y', 'x'])
ds.sortby(['y', 'x'], ascending=False)
# Sort by the values of 1-D DataArrays instead of by dimension labels.
dax = xr.DataArray([100, 99], [('x', [0, 1])])
day = xr.DataArray([90, 80], [('y', [0, 1])])
ds.sortby([day, dax])
5 changes: 5 additions & 0 deletions doc/whats-new.rst
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,11 @@ By `Daniel Rothenberg <https://github.com/darothen>`_.

Enhancements
~~~~~~~~~~~~
- New :py:meth:`~xarray.Dataset.sortby` method for ``Dataset`` and ``DataArray``
that enables sorting along dimensions (:issue:`967`).
See :ref:`reshape.sort` for examples.
By `Chun-Wei Yuan <https://github.com/chunweiyuan>`_ and
`Kyle Heuton <https://github.com/kheuton>`_.

Bug fixes
~~~~~~~~~
Expand Down
35 changes: 35 additions & 0 deletions xarray/core/dataarray.py
Original file line number Diff line number Diff line change
Expand Up @@ -1791,6 +1791,41 @@ def dot(self, other):

return type(self)(new_data, new_coords, new_dims)

def sortby(self, variables, ascending=True):
    """
    Sort object by labels or values (along an axis).

    Sorts the dataarray, either along specified dimensions,
    or according to values of 1-D dataarrays that share dimension
    with calling object.

    If the input variables are dataarrays, then the dataarrays are aligned
    (via left-join) to the calling object prior to sorting by cell values.
    NaNs are sorted to the end, following Numpy convention.

    If multiple sort keys along the same dimension are given, numpy's
    lexsort is performed along that dimension:
    https://docs.scipy.org/doc/numpy/reference/generated/numpy.lexsort.html
    and the FIRST key in the sequence is used as the primary sort key,
    followed by the 2nd key, etc.

    Parameters
    ----------
    variables: str, DataArray, or list of either
        1D DataArray objects or name(s) of 1D variable(s) in
        coords whose values are used to sort this array.
    ascending: boolean, optional
        Whether to sort by ascending or descending order.

    Returns
    -------
    sorted: DataArray
        A new dataarray where all the specified dims are sorted by dim
        labels.
    """
    # The sorting itself is implemented on Dataset: round-trip through a
    # temporary dataset and convert the sorted result back.
    temp_dataset = self._to_temp_dataset()
    sorted_dataset = temp_dataset.sortby(variables, ascending=ascending)
    return self._from_temp_dataset(sorted_dataset)

def quantile(self, q, dim=None, interpolation='linear', keep_attrs=False):
"""Compute the qth quantile of the data along the specified dimension.
Expand Down
62 changes: 62 additions & 0 deletions xarray/core/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
from __future__ import print_function
import functools
from collections import Mapping, defaultdict
from distutils.version import LooseVersion
from numbers import Number

import sys
Expand Down Expand Up @@ -2742,6 +2743,67 @@ def roll(self, **shifts):

return self._replace_vars_and_dims(variables)

def sortby(self, variables, ascending=True):
    """
    Sort object by labels or values (along an axis).

    Sorts the dataset, either along specified dimensions,
    or according to values of 1-D dataarrays that share dimension
    with calling object.

    If the input variables are dataarrays, then the dataarrays are aligned
    (via left-join) to the calling object prior to sorting by cell values.
    NaNs are sorted to the end, following Numpy convention.

    If multiple sort keys along the same dimension are given, numpy's
    lexsort is performed along that dimension:
    https://docs.scipy.org/doc/numpy/reference/generated/numpy.lexsort.html
    and the FIRST key in the sequence is used as the primary sort key,
    followed by the 2nd key, etc.

    Parameters
    ----------
    variables: str, DataArray, or list of either
        1D DataArray objects or name(s) of 1D variable(s) in
        coords/data_vars whose values are used to sort the dataset.
    ascending: boolean, optional
        Whether to sort by ascending or descending order. Descending
        order is obtained by reversing the ascending order.

    Returns
    -------
    sorted: Dataset
        A new dataset, reordered along every dimension that one of the
        sort keys lies on.

    Raises
    ------
    ValueError
        If a sort key is not 1-D.
    NotImplementedError
        If a sort key has object dtype and numpy is older than 1.11.0.
    """
    from .dataarray import DataArray

    if not isinstance(variables, list):
        variables = [variables]
    # Anything that is not already a DataArray is looked up on this dataset.
    variables = [v if isinstance(v, DataArray) else self[v]
                 for v in variables]
    aligned_vars = align(self, *variables, join='left')
    aligned_self = aligned_vars[0]
    aligned_other_vars = aligned_vars[1:]

    # Group the sort keys by the (single) dimension they lie on.
    vars_by_dim = defaultdict(list)
    for data_array in aligned_other_vars:
        if data_array.ndim != 1:
            raise ValueError("Input DataArray is not 1-D.")
        if (data_array.dtype == object and
                LooseVersion(np.__version__) < LooseVersion('1.11.0')):
            raise NotImplementedError(
                'sortby uses np.lexsort under the hood, which requires '
                'numpy 1.11.0 or later to support object data-type.')
        (key,) = data_array.dims
        vars_by_dim[key].append(data_array)

    indices = {}
    for key, arrays in vars_by_dim.items():
        # np.lexsort treats the LAST key as the primary one, so reverse the
        # list to make the first user-supplied key the primary sort key.
        order = np.lexsort(tuple(reversed(arrays)))
        indices[key] = order if ascending else order[::-1]
    return aligned_self.isel(**indices)

def quantile(self, q, dim=None, interpolation='linear',
numeric_only=False, keep_attrs=False):
"""Compute the qth quantile of the data along the specified dimension.
Expand Down
42 changes: 42 additions & 0 deletions xarray/tests/test_dataarray.py
Original file line number Diff line number Diff line change
Expand Up @@ -2519,6 +2519,48 @@ def test_combine_first(self):
[('x', ['a', 'b', 'd']), ('y', [-1, 0])])
self.assertDataArrayEqual(actual, expected)

def test_sortby(self):
    # Check DataArray.sortby with DataArray keys and with dimension names.
    da = DataArray([[1, 2], [3, 4], [5, 6]],
                   [('x', ['c', 'b', 'a']), ('y', [1, 0])])

    sorted1d = DataArray([[5, 6], [3, 4], [1, 2]],
                         [('x', ['a', 'b', 'c']), ('y', [1, 0])])

    sorted2d = DataArray([[6, 5], [4, 3], [2, 1]],
                         [('x', ['a', 'b', 'c']), ('y', [0, 1])])

    expected = sorted1d
    dax = DataArray([100, 99, 98], [('x', ['c', 'b', 'a'])])
    actual = da.sortby(dax)
    self.assertDataArrayEqual(actual, expected)

    # test descending order sort
    actual = da.sortby(dax, ascending=False)
    self.assertDataArrayEqual(actual, da)

    # test alignment (fills in nan for 'c')
    dax_short = DataArray([98, 97], [('x', ['b', 'a'])])
    actual = da.sortby(dax_short)
    self.assertDataArrayEqual(actual, expected)

    # test multi-dim sort by 1D dataarray values
    expected = sorted2d
    dax = DataArray([100, 99, 98], [('x', ['c', 'b', 'a'])])
    day = DataArray([90, 80], [('y', [1, 0])])
    actual = da.sortby([day, dax])
    self.assertDataArrayEqual(actual, expected)

    # The remaining checks sort by the string labels of 'x'; they are
    # skipped on numpy versions older than 1.11.0.
    if LooseVersion(np.__version__) < LooseVersion('1.11.0'):
        pytest.skip('numpy 1.11.0 or later to support object data-type.')

    expected = sorted1d
    actual = da.sortby('x')
    self.assertDataArrayEqual(actual, expected)

    expected = sorted2d
    actual = da.sortby(['x', 'y'])
    self.assertDataArrayEqual(actual, expected)


@pytest.fixture(params=[1])
def da(request):
Expand Down
93 changes: 91 additions & 2 deletions xarray/tests/test_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -3315,12 +3315,101 @@ def test_combine_first(self):

# works just like xr.merge([self, other])
dsy2 = DataArray([2, 2, 2],
[('x', ['b', 'c', 'd'])]).to_dataset(name='dsy2')
[('x', ['b', 'c', 'd'])]).to_dataset(name='dsy2')
actual = dsx0.combine_first(dsy2)
expected = xr.merge([dsy2, dsx0])
self.assertDatasetEqual(actual, expected)

### Py.test tests
def test_sortby(self):
    # Check Dataset.sortby with DataArray keys, dimension names, lexsort
    # on a shared dimension, error cases and a pandas.MultiIndex.
    ds = Dataset({'A': DataArray([[1, 2], [3, 4], [5, 6]],
                                 [('x', ['c', 'b', 'a']),
                                  ('y', [1, 0])]),
                  'B': DataArray([[5, 6], [7, 8], [9, 10]],
                                 dims=['x', 'y'])})

    sorted1d = Dataset({'A': DataArray([[5, 6], [3, 4], [1, 2]],
                                       [('x', ['a', 'b', 'c']),
                                        ('y', [1, 0])]),
                        'B': DataArray([[9, 10], [7, 8], [5, 6]],
                                       dims=['x', 'y'])})

    sorted2d = Dataset({'A': DataArray([[6, 5], [4, 3], [2, 1]],
                                       [('x', ['a', 'b', 'c']),
                                        ('y', [0, 1])]),
                        'B': DataArray([[10, 9], [8, 7], [6, 5]],
                                       dims=['x', 'y'])})

    expected = sorted1d
    dax = DataArray([100, 99, 98], [('x', ['c', 'b', 'a'])])
    actual = ds.sortby(dax)
    self.assertDatasetEqual(actual, expected)

    # test descending order sort
    actual = ds.sortby(dax, ascending=False)
    self.assertDatasetEqual(actual, ds)

    # test alignment (fills in nan for 'c')
    dax_short = DataArray([98, 97], [('x', ['b', 'a'])])
    actual = ds.sortby(dax_short)
    self.assertDatasetEqual(actual, expected)

    # test 1-D lexsort
    # dax0 is sorted first to give indices of [1, 2, 0]
    # and then dax1 would be used to move index 2 ahead of 1
    dax0 = DataArray([100, 95, 95], [('x', ['c', 'b', 'a'])])
    dax1 = DataArray([0, 1, 0], [('x', ['c', 'b', 'a'])])
    actual = ds.sortby([dax0, dax1])  # lexsort underneath gives [2, 1, 0]
    self.assertDatasetEqual(actual, expected)

    expected = sorted2d
    # test multi-dim sort by 1D dataarray values
    day = DataArray([90, 80], [('y', [1, 0])])
    actual = ds.sortby([day, dax])
    self.assertDatasetEqual(actual, expected)

    # test exception-raising
    with pytest.raises(KeyError):
        ds.sortby('z')

    with pytest.raises(ValueError) as excinfo:
        ds.sortby(ds['A'])
    assert "DataArray is not 1-D" in str(excinfo.value)

    # The remaining checks sort by string / MultiIndex labels; they are
    # skipped on numpy versions older than 1.11.0.
    if LooseVersion(np.__version__) < LooseVersion('1.11.0'):
        pytest.skip('numpy 1.11.0 or later to support object data-type.')

    expected = sorted1d
    actual = ds.sortby('x')
    self.assertDatasetEqual(actual, expected)

    # test pandas.MultiIndex
    indices = (('b', 1), ('b', 0), ('a', 1), ('a', 0))
    midx = pd.MultiIndex.from_tuples(indices, names=['one', 'two'])
    ds_midx = Dataset({'A': DataArray([[1, 2], [3, 4], [5, 6], [7, 8]],
                                      [('x', midx), ('y', [1, 0])]),
                       'B': DataArray([[5, 6], [7, 8], [9, 10], [11, 12]],
                                      dims=['x', 'y'])})
    actual = ds_midx.sortby('x')
    midx_reversed = pd.MultiIndex.from_tuples(tuple(reversed(indices)),
                                              names=['one', 'two'])
    expected = Dataset({'A': DataArray([[7, 8], [5, 6], [3, 4], [1, 2]],
                                       [('x', midx_reversed),
                                        ('y', [1, 0])]),
                        'B': DataArray([[11, 12], [9, 10], [7, 8], [5, 6]],
                                       dims=['x', 'y'])})
    self.assertDatasetEqual(actual, expected)

    # multi-dim sort by coordinate objects
    expected = sorted2d
    actual = ds.sortby(['x', 'y'])
    self.assertDatasetEqual(actual, expected)

    # test descending order sort
    actual = ds.sortby(['x', 'y'], ascending=False)
    self.assertDatasetEqual(actual, ds)


# Py.test tests


@pytest.fixture()
Expand Down

0 comments on commit 80ddad9

Please sign in to comment.