From 734ca757bf43f76922f266ea3bb2cb67372374ca Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Thu, 18 Jan 2024 07:19:59 -1000 Subject: [PATCH] Implement `cudf.MultiIndex.from_arrays` (#14740) Implements `cudf.MultiIndex.from_arrays` Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - Michael Wang (https://github.com/isVoid) URL: https://github.com/rapidsai/cudf/pull/14740 --- .../user_guide/api_docs/index_objects.rst | 1 + python/cudf/cudf/core/multiindex.py | 63 +++++++++++++++++++ python/cudf/cudf/tests/test_multiindex.py | 38 ++++++++--- 3 files changed, 94 insertions(+), 8 deletions(-) diff --git a/docs/cudf/source/user_guide/api_docs/index_objects.rst b/docs/cudf/source/user_guide/api_docs/index_objects.rst index 013eaf29a56..b6da9af9b3e 100644 --- a/docs/cudf/source/user_guide/api_docs/index_objects.rst +++ b/docs/cudf/source/user_guide/api_docs/index_objects.rst @@ -228,6 +228,7 @@ MultiIndex constructors .. autosummary:: :toctree: api/ + MultiIndex.from_arrays MultiIndex.from_tuples MultiIndex.from_product MultiIndex.from_frame diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py index 0f323dd5540..8ba47795437 100644 --- a/python/cudf/cudf/core/multiindex.py +++ b/python/cudf/cudf/core/multiindex.py @@ -27,6 +27,7 @@ from cudf.core._compat import PANDAS_GE_150 from cudf.core.frame import Frame from cudf.core.index import BaseIndex, _lexsorted_equal_range, as_index +from cudf.utils.dtypes import is_column_like from cudf.utils.nvtx_annotation import _cudf_nvtx_annotate from cudf.utils.utils import NotIterable, _external_only_api, _is_same_name @@ -1226,6 +1227,7 @@ def from_tuples(cls, tuples, names=None): See Also -------- + MultiIndex.from_arrays : Convert list of arrays to MultiIndex. MultiIndex.from_product : Make a MultiIndex from cartesian product of iterables. MultiIndex.from_frame : Make a MultiIndex from a DataFrame. 
@classmethod
@_cudf_nvtx_annotate
def from_arrays(
    cls,
    arrays,
    sortorder=None,
    names=None,
) -> MultiIndex:
    """
    Convert arrays to MultiIndex.

    Each input array is factorized with ``sort=True``; the resulting
    codes and levels are handed to the ``MultiIndex`` constructor.

    Parameters
    ----------
    arrays : list / sequence of array-likes
        Each array-like gives one level's value for each data point.
        len(arrays) is the number of levels.
    sortorder : optional int
        Not yet supported
    names : list / sequence of str, optional
        Names for the levels in the index. When omitted, the ``name``
        attribute of each input array is used instead, provided at
        least one input carries a non-None name.

    Returns
    -------
    MultiIndex

    Raises
    ------
    TypeError
        If ``arrays`` is not list-like, or if any element of ``arrays``
        is neither list-like nor column-like.

    See Also
    --------
    MultiIndex.from_tuples : Convert list of tuples to MultiIndex.
    MultiIndex.from_product : Make a MultiIndex from cartesian product
                              of iterables.
    MultiIndex.from_frame : Make a MultiIndex from a DataFrame.

    Examples
    --------
    >>> arrays = [[1, 1, 2, 2], ['red', 'blue', 'red', 'blue']]
    >>> cudf.MultiIndex.from_arrays(arrays, names=('number', 'color'))
    MultiIndex([(1,  'red'),
                (1, 'blue'),
                (2,  'red'),
                (2, 'blue')],
               names=['number', 'color'])
    """
    # Imported here due to circular import
    from cudf.core.algorithms import factorize

    error_msg = "Input must be a list / sequence of array-likes."
    if not is_list_like(arrays):
        raise TypeError(error_msg)
    codes = []
    levels = []
    inferred_names = []
    for array in arrays:
        if not (is_list_like(array) or is_column_like(array)):
            raise TypeError(error_msg)
        code, level = factorize(array, sort=True)
        codes.append(code)
        levels.append(level)
        # Plain lists / ndarrays have no ``name``; Series / Index do.
        inferred_names.append(getattr(array, "name", None))
    # Only fall back to the inputs' own names when the caller gave none
    # and there is something to infer; otherwise ``names`` is passed
    # through untouched, exactly as before.
    if names is None and any(name is not None for name in inferred_names):
        names = inferred_names
    return cls(
        codes=codes, levels=levels, sortorder=sortorder, names=names
    )


@pytest.mark.parametrize(
    "array",
    [
        list,
        tuple,
        np.array,
        cp.array,
        pd.Index,
        cudf.Index,
        pd.Series,
        cudf.Series,
    ],
)
def test_multiindex_from_arrays(array):
    """from_arrays accepts each supported container and matches pandas."""
    pd_data = [[0, 0, 1, 1], [1, 0, 1, 0]]
    cudf_data = [array(lst) for lst in pd_data]
    # pandas is the reference implementation; cudf is the code under test.
    expected = pd.MultiIndex.from_arrays(pd_data)
    result = cudf.MultiIndex.from_arrays(cudf_data)
    assert_eq(expected, result)


def test_multiindex_from_arrays_infers_names():
    """Level names come from the inputs' ``name`` when ``names`` is omitted."""
    arrays = [
        cudf.Series([0, 0, 1, 1], name="a"),
        cudf.Series([1, 0, 1, 0], name="b"),
    ]
    result = cudf.MultiIndex.from_arrays(arrays)
    assert list(result.names) == ["a", "b"]


@pytest.mark.parametrize("arg", ["foo", ["foo"]])
def test_multiindex_from_arrays_wrong_arg(arg):
    """A bare string, or a list holding a bare string, is rejected."""
    with pytest.raises(TypeError):
        cudf.MultiIndex.from_arrays(arg)