Skip to content

Commit

Permalink
Implement cudf.MultiIndex.from_arrays (#14740)
Browse files Browse the repository at this point in the history
Implements `cudf.MultiIndex.from_arrays`

Authors:
  - Matthew Roeschke (https://github.com/mroeschke)

Approvers:
  - Michael Wang (https://github.com/isVoid)

URL: #14740
  • Loading branch information
mroeschke authored Jan 18, 2024
1 parent 9acddc0 commit 734ca75
Show file tree
Hide file tree
Showing 3 changed files with 94 additions and 8 deletions.
1 change: 1 addition & 0 deletions docs/cudf/source/user_guide/api_docs/index_objects.rst
Original file line number Diff line number Diff line change
Expand Up @@ -228,6 +228,7 @@ MultiIndex constructors
.. autosummary::
:toctree: api/

MultiIndex.from_arrays
MultiIndex.from_tuples
MultiIndex.from_product
MultiIndex.from_frame
Expand Down
63 changes: 63 additions & 0 deletions python/cudf/cudf/core/multiindex.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@
from cudf.core._compat import PANDAS_GE_150
from cudf.core.frame import Frame
from cudf.core.index import BaseIndex, _lexsorted_equal_range, as_index
from cudf.utils.dtypes import is_column_like
from cudf.utils.nvtx_annotation import _cudf_nvtx_annotate
from cudf.utils.utils import NotIterable, _external_only_api, _is_same_name

Expand Down Expand Up @@ -1226,6 +1227,7 @@ def from_tuples(cls, tuples, names=None):
See Also
--------
MultiIndex.from_arrays : Convert list of arrays to MultiIndex.
MultiIndex.from_product : Make a MultiIndex from cartesian product
of iterables.
MultiIndex.from_frame : Make a MultiIndex from a DataFrame.
Expand Down Expand Up @@ -1335,6 +1337,7 @@ def from_frame(cls, df, names=None):
See Also
--------
MultiIndex.from_arrays : Convert list of arrays to MultiIndex.
MultiIndex.from_tuples : Convert list of tuples to MultiIndex.
MultiIndex.from_product : Make a MultiIndex from cartesian product
of iterables.
Expand Down Expand Up @@ -1429,6 +1432,66 @@ def from_product(cls, arrays, names=None):
pdi = pd.MultiIndex.from_product(arrays, names=names)
return cls.from_pandas(pdi)

@classmethod
@_cudf_nvtx_annotate
def from_arrays(
    cls,
    arrays,
    sortorder=None,
    names=None,
) -> MultiIndex:
    """
    Convert arrays to MultiIndex.

    Parameters
    ----------
    arrays : list / sequence of array-likes
        Each array-like gives one level's value for each data point.
        len(arrays) is the number of levels.
    sortorder : optional int
        Not yet supported
    names : list / sequence of str, optional
        Names for the levels in the index. When omitted, the ``name``
        attribute of each input array (if any) is used instead.

    Returns
    -------
    MultiIndex

    Raises
    ------
    TypeError
        If ``arrays`` is not list-like, or if any of its elements is
        neither list-like nor column-like.

    See Also
    --------
    MultiIndex.from_tuples : Convert list of tuples to MultiIndex.
    MultiIndex.from_product : Make a MultiIndex from cartesian product
                              of iterables.
    MultiIndex.from_frame : Make a MultiIndex from a DataFrame.

    Examples
    --------
    >>> arrays = [[1, 1, 2, 2], ['red', 'blue', 'red', 'blue']]
    >>> cudf.MultiIndex.from_arrays(arrays, names=('number', 'color'))
    MultiIndex([(1,  'red'),
                (1, 'blue'),
                (2,  'red'),
                (2, 'blue')],
               names=['number', 'color'])
    """
    # Imported here due to circular import
    from cudf.core.algorithms import factorize

    error_msg = "Input must be a list / sequence of array-likes."
    if not is_list_like(arrays):
        raise TypeError(error_msg)

    codes = []
    levels = []
    inferred_names = []
    for array in arrays:
        if not (is_list_like(array) or is_column_like(array)):
            raise TypeError(error_msg)
        # Each input is split into its sorted unique values (the level)
        # and the integer positions into that level (the codes).
        code, level = factorize(array, sort=True)
        codes.append(code)
        levels.append(level)
        inferred_names.append(getattr(array, "name", None))

    # Mirror pandas: named inputs (Series / Index) supply the level names
    # when the caller did not pass ``names``. Inputs without any name are
    # left untouched so ``names`` stays None exactly as before.
    if names is None and any(name is not None for name in inferred_names):
        names = inferred_names

    return cls(
        codes=codes, levels=levels, sortorder=sortorder, names=names
    )

@_cudf_nvtx_annotate
def _poplevels(self, level):
"""
Expand Down
38 changes: 30 additions & 8 deletions python/cudf/cudf/tests/test_multiindex.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright (c) 2019-2023, NVIDIA CORPORATION.
# Copyright (c) 2019-2024, NVIDIA CORPORATION.

"""
Test related to MultiIndex
Expand Down Expand Up @@ -2085,12 +2085,7 @@ def test_multiindex_eq_other_multiindex():
params=[
"from_product",
"from_tuples",
pytest.param(
"from_arrays",
marks=pytest.mark.xfail(
reason="TODO: from_arrays is not implemented"
),
),
"from_arrays",
"init",
]
)
Expand All @@ -2100,7 +2095,7 @@ def midx(request):
elif request.param == "from_tuples":
return cudf.MultiIndex.from_tuples([(0, 1), (0, 0), (1, 1), (1, 0)])
elif request.param == "from_arrays":
return cudf.MultiIndex.from_arrays([0, 0, 1, 1], [1, 0, 1, 0])
return cudf.MultiIndex.from_arrays([[0, 0, 1, 1], [1, 0, 1, 0]])
elif request.param == "init":
return cudf.MultiIndex(
levels=[[0, 1], [0, 1]], codes=[[0, 0, 1, 1], [1, 0, 1, 0]]
Expand All @@ -2112,3 +2107,30 @@ def midx(request):
def test_multindex_constructor_levels_always_indexes(midx):
    """Check that both levels of ``midx`` compare equal to Index([0, 1])."""
    for position in (0, 1):
        assert_eq(midx.levels[position], cudf.Index([0, 1]))


@pytest.mark.parametrize(
    "array",
    [
        list,
        tuple,
        np.array,
        cp.array,
        pd.Index,
        cudf.Index,
        pd.Series,
        cudf.Series,
    ],
)
def test_multiindex_from_arrays(array):
    """Compare cudf's ``from_arrays`` against pandas for each container.

    ``array`` is the constructor used to wrap every inner list before it
    is handed to ``cudf.MultiIndex.from_arrays``.
    """
    pd_data = [[0, 0, 1, 1], [1, 0, 1, 0]]
    cudf_data = [array(lst) for lst in pd_data]
    # pandas is the reference implementation; cudf is the code under test.
    expected = pd.MultiIndex.from_arrays(pd_data)
    result = cudf.MultiIndex.from_arrays(cudf_data)
    assert_eq(expected, result)


@pytest.mark.parametrize("arg", ["foo", ["foo"]])
def test_multiindex_from_arrays_wrong_arg(arg):
    """A bare string, or a list holding one, must raise TypeError."""
    build_index = cudf.MultiIndex.from_arrays
    with pytest.raises(TypeError):
        build_index(arg)

0 comments on commit 734ca75

Please sign in to comment.