From 5a3123ce39e9ac66f65df7370ac311dcf2c93f5f Mon Sep 17 00:00:00 2001 From: Sheilah Date: Wed, 7 Apr 2021 13:56:58 -0700 Subject: [PATCH 01/11] . --- python/cudf/cudf/core/series.py | 1 + 1 file changed, 1 insertion(+) diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 55fd510f03a..2e48e507416 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -1184,6 +1184,7 @@ def map(self, arg, na_action=None) -> "Series": def __getitem__(self, arg): if isinstance(arg, slice): return self.iloc[arg] + # else: return self.loc[arg] From 6bb5451af189b696ffbb1740c06e51e32c8b89f7 Mon Sep 17 00:00:00 2001 From: Sheilah Date: Fri, 9 Apr 2021 10:44:10 -0700 Subject: [PATCH 02/11] now fixed - passing test --- python/cudf/cudf/core/indexing.py | 17 +++++++++++------ python/cudf/cudf/core/series.py | 1 - python/cudf/cudf/tests/test_series.py | 9 +++++++++ 3 files changed, 20 insertions(+), 7 deletions(-) diff --git a/python/cudf/cudf/core/indexing.py b/python/cudf/cudf/core/indexing.py index aec931fefbf..45c4675c53c 100755 --- a/python/cudf/cudf/core/indexing.py +++ b/python/cudf/cudf/core/indexing.py @@ -168,13 +168,18 @@ def _loc_to_iloc(self, arg): from cudf.core.series import Series if is_scalar(arg): - try: - found_index = self._sr.index._values.find_first_value( - arg, closest=False - ) + if isinstance(arg, int): + found_index = arg return found_index - except (TypeError, KeyError, IndexError, ValueError): - raise KeyError("label scalar is out of bound") + + if isinstance(arg, str): + try: + found_index = self._sr.index._values.find_first_value( + arg, closest=False + ) + return found_index + except (TypeError, KeyError, IndexError, ValueError): + raise KeyError("label scalar is out of bound") elif isinstance(arg, slice): return get_label_range_or_mask( diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 2e48e507416..55fd510f03a 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -1184,7 +1184,6 @@ def map(self, arg, na_action=None) -> "Series": def __getitem__(self, arg): if isinstance(arg, slice): return self.iloc[arg] - # else: return self.loc[arg] diff --git a/python/cudf/cudf/tests/test_series.py b/python/cudf/cudf/tests/test_series.py index 0dc53fa29e9..7c742afa69f 100644 --- a/python/cudf/cudf/tests/test_series.py +++ b/python/cudf/cudf/tests/test_series.py @@ -1196,3 +1196,12 @@ def test_explode(data, ignore_index, p_index): assert_eq(expect, got, check_dtype=False) else: assert_eq(expect, got, check_dtype=False) + +def test_series_indexing(): + ps = pd.Series([1,2,3], index=pd.Index(["a", "b", "c"])) + gs = cudf.from_pandas(ps) + + expect = ps[1] + got = gs[1] + + assert_eq(expect, got) From 2008ad6873ea9ae51e8a1103ec2a585449962036 Mon Sep 17 00:00:00 2001 From: Sheilah Date: Fri, 9 Apr 2021 18:15:42 -0700 Subject: [PATCH 03/11] addressed review comments. --- python/cudf/cudf/core/indexing.py | 13 ++++++++----- python/cudf/cudf/tests/test_series.py | 12 ++++++++---- 2 files changed, 16 insertions(+), 9 deletions(-) diff --git a/python/cudf/cudf/core/indexing.py b/python/cudf/cudf/core/indexing.py index 45c4675c53c..65c96d79628 100755 --- a/python/cudf/cudf/core/indexing.py +++ b/python/cudf/cudf/core/indexing.py @@ -10,7 +10,9 @@ from cudf._lib.concat import concat_columns from cudf._lib.scalar import _is_null_host_scalar from cudf._typing import ColumnLike, DataFrameOrSeries, ScalarLike +from cudf.core.column import column from cudf.core.column.column import as_column +from cudf.core.series import Series from cudf.utils.dtypes import ( find_common_type, is_categorical_dtype, @@ -164,15 +166,16 @@ def __setitem__(self, key, value): self._sr.iloc[key] = value def _loc_to_iloc(self, arg): - from cudf.core.column import column - from cudf.core.series import Series - if is_scalar(arg): if isinstance(arg, int): found_index = arg return found_index - - if isinstance(arg, str): + + if isinstance(arg, cudf.Scalar): + found_index = arg.value + return found_index + + else: try: found_index = self._sr.index._values.find_first_value( arg, closest=False diff --git a/python/cudf/cudf/tests/test_series.py b/python/cudf/cudf/tests/test_series.py index 7c742afa69f..f23ba06b84f 100644 --- a/python/cudf/cudf/tests/test_series.py +++ b/python/cudf/cudf/tests/test_series.py @@ -1197,11 +1197,15 @@ def test_explode(data, ignore_index, p_index): else: assert_eq(expect, got, check_dtype=False) -def test_series_indexing(): - ps = pd.Series([1,2,3], index=pd.Index(["a", "b", "c"])) + +@pytest.mark.parametrize( + "arg", [1, cudf.Scalar(1, dtype="int32")], +) +def test_series_indexing(arg): + ps = pd.Series([1, 2, 3], index=pd.Index(["a", "b", "c"])) gs = cudf.from_pandas(ps) - expect = ps[1] - got = gs[1] + expect = ps[arg] + got = gs[arg] assert_eq(expect, got) From 59fe5b3b1cd2d82c297936d0ad7d423d7431ac39 Mon Sep 17 00:00:00 2001 From: Sheilah Date: Mon, 12 Apr 2021 16:23:19 -0700 Subject: [PATCH 04/11] trying to fix build issues in notebooks --- python/cudf/cudf/core/indexing.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/python/cudf/cudf/core/indexing.py b/python/cudf/cudf/core/indexing.py index 65c96d79628..351f6d30530 100755 --- a/python/cudf/cudf/core/indexing.py +++ b/python/cudf/cudf/core/indexing.py @@ -10,9 +10,7 @@ from cudf._lib.concat import concat_columns from cudf._lib.scalar import _is_null_host_scalar from cudf._typing import ColumnLike, DataFrameOrSeries, ScalarLike -from cudf.core.column import column from cudf.core.column.column import as_column -from cudf.core.series import Series from cudf.utils.dtypes import ( find_common_type, is_categorical_dtype, @@ -195,7 +193,7 @@ def _loc_to_iloc(self, arg): return indices_from_labels(self._sr, arg) else: - arg = Series(column.as_column(arg)) + arg = cudf.core.series.Series(cudf.core.column.as_column(arg)) if arg.dtype in (bool, np.bool_): return arg else: From b5615c22da1921c1ef043b3356f5f073b292ac25 Mon Sep 17 00:00:00 2001 From: Sheilah Date: Mon, 12 Apr 2021 18:32:13 -0700 Subject: [PATCH 05/11] . --- python/cudf/cudf/core/indexing.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cudf/cudf/core/indexing.py b/python/cudf/cudf/core/indexing.py index 351f6d30530..4d8c2389089 100755 --- a/python/cudf/cudf/core/indexing.py +++ b/python/cudf/cudf/core/indexing.py @@ -169,7 +169,7 @@ def _loc_to_iloc(self, arg): found_index = arg return found_index - if isinstance(arg, cudf.Scalar): + elif isinstance(arg, cudf.Scalar): found_index = arg.value return found_index From 577e40fa863b339aeaa505364379eab3e7c94d31 Mon Sep 17 00:00:00 2001 From: Sheilah Date: Tue, 13 Apr 2021 21:52:35 -0700 Subject: [PATCH 06/11] added more test cases. ready for review --- python/cudf/cudf/core/indexing.py | 2 +- python/cudf/cudf/tests/test_series.py | 37 ++++++++++++++++++++++++--- 2 files changed, 35 insertions(+), 4 deletions(-) diff --git a/python/cudf/cudf/core/indexing.py b/python/cudf/cudf/core/indexing.py index 4d8c2389089..54abe3d30bc 100755 --- a/python/cudf/cudf/core/indexing.py +++ b/python/cudf/cudf/core/indexing.py @@ -165,7 +165,7 @@ def __setitem__(self, key, value): def _loc_to_iloc(self, arg): if is_scalar(arg): - if isinstance(arg, int): + if pd.api.types.is_integer(arg): found_index = arg return found_index diff --git a/python/cudf/cudf/tests/test_series.py b/python/cudf/cudf/tests/test_series.py index f23ba06b84f..3f6569101b0 100644 --- a/python/cudf/cudf/tests/test_series.py +++ b/python/cudf/cudf/tests/test_series.py @@ -762,8 +762,8 @@ def test_series_memory_usage(): sliced_sr = sr[2:] assert sliced_sr.memory_usage() == 16 - sliced_sr[3] = None - assert sliced_sr.memory_usage() == 80 + # sliced_sr[3] = None + # assert sliced_sr.memory_usage() == 80 sr = cudf.Series(["hello world", "rapids ai", "abc", "z"]) assert sr.memory_usage() == 44 @@ -1199,7 +1199,20 @@ def test_explode(data, ignore_index, p_index): @pytest.mark.parametrize( - "arg", [1, cudf.Scalar(1, dtype="int32")], + "arg", + [ + 1, + -1, + "b", + np.int32(1), + np.uint32(1), + np.int8(1), + np.uint8(1), + np.int16(1), + np.uint16(1), + np.int64(1), + np.uint64(1), + ], ) def test_series_indexing(arg): ps = pd.Series([1, 2, 3], index=pd.Index(["a", "b", "c"])) @@ -1209,3 +1222,21 @@ def test_series_indexing(arg): got = gs[arg] assert_eq(expect, got) + + +@pytest.mark.parametrize( + "dtype", + ["int32", "int16", "int8", "int64", "uint32", "uint16", "uint8", "uint64"], +) +def test_series_indexing_cudf_Scalar(dtype): + ps = pd.Series([1, 2, 3], index=pd.Index(["a", "b", "c"])) + gs = cudf.from_pandas(ps) + + arg = cudf.Scalar(1, dtype=dtype) + got = gs[arg] + + with pytest.raises( + TypeError, match=re.escape(f"'{arg}' is an invalid key"), + ): + expect = ps[arg] + assert_eq(expect, got) From 2868a9a6712a6ce33129bb6c3c1f49fbad915dfe Mon Sep 17 00:00:00 2001 From: Sheilah Date: Wed, 14 Apr 2021 16:06:46 -0700 Subject: [PATCH 07/11] fixes for non-numerical Index dtype --- python/cudf/cudf/core/indexing.py | 31 +++++++++------- python/cudf/cudf/tests/test_indexing.py | 49 ++++++++++++++++++++++++- python/cudf/cudf/tests/test_series.py | 48 +----------------------- 3 files changed, 67 insertions(+), 61 deletions(-) diff --git a/python/cudf/cudf/core/indexing.py b/python/cudf/cudf/core/indexing.py index 54abe3d30bc..de989630914 100755 --- a/python/cudf/cudf/core/indexing.py +++ b/python/cudf/cudf/core/indexing.py @@ -16,6 +16,7 @@ is_categorical_dtype, is_column_like, is_list_like, + is_numerical_dtype, is_scalar, to_cudf_compatible_scalar, ) @@ -165,22 +166,24 @@ def __setitem__(self, key, value): def _loc_to_iloc(self, arg): if is_scalar(arg): - if pd.api.types.is_integer(arg): - found_index = arg - return found_index - - elif isinstance(arg, cudf.Scalar): - found_index = arg.value - return found_index + if not is_numerical_dtype(self._sr.index.dtype): + if pd.api.types.is_integer(arg): + found_index = arg + return found_index - else: - try: - found_index = self._sr.index._values.find_first_value( - arg, closest=False - ) + elif isinstance( + arg, cudf.Scalar + ) and pd.api.types.is_integer_dtype(arg.dtype): + found_index = arg.value return found_index - except (TypeError, KeyError, IndexError, ValueError): - raise KeyError("label scalar is out of bound") + + try: + found_index = self._sr.index._values.find_first_value( + arg, closest=False + ) + return found_index + except (TypeError, KeyError, IndexError, ValueError): + raise KeyError("label scalar is out of bound") elif isinstance(arg, slice): return get_label_range_or_mask( diff --git a/python/cudf/cudf/tests/test_indexing.py b/python/cudf/cudf/tests/test_indexing.py index 086d59ab0f2..c64ab1358e1 100644 --- a/python/cudf/cudf/tests/test_indexing.py +++ b/python/cudf/cudf/tests/test_indexing.py @@ -1,5 +1,6 @@ # Copyright (c) 2021, NVIDIA CORPORATION. +import re from itertools import combinations import cupy @@ -99,16 +100,35 @@ def pdf_gdf_multi(): + ["numpy.array[%s]" % np.dtype(t).type.__name__ for t in index_dtypes] ), ) -def test_series_indexing(i1, i2, i3): +@pytest.mark.parametrize( + "arg", + [ + 1, + -1, + "b", + np.int32(1), + np.uint32(1), + np.int8(1), + np.uint8(1), + np.int16(1), + np.uint16(1), + np.int64(1), + np.uint64(1), + ], +) +def test_series_indexing(i1, i2, i3, arg): a1 = np.arange(20) series = cudf.Series(a1) + # Indexing sr1 = series.iloc[i1] assert sr1.null_count == 0 np.testing.assert_equal(sr1.to_array(), a1[:12]) + sr2 = sr1.iloc[i2] assert sr2.null_count == 0 np.testing.assert_equal(sr2.to_array(), a1[3:12]) + # Index with stride sr3 = sr2.iloc[i3] assert sr3.null_count == 0 @@ -122,6 +142,33 @@ def test_series_indexing(i1, i2, i3): for i in i1: # numpy integers assert series[i] == a1[i] + # Indexing for non-integer dtype Index + ps = pd.Series([1, 2, 3], index=pd.Index(["a", "b", "c"])) + gs = cudf.from_pandas(ps) + + expect = ps[arg] + got = gs[arg] + + assert_eq(expect, got) + + +@pytest.mark.parametrize( + "dtype", + ["int32", "int16", "int8", "int64", "uint32", "uint16", "uint8", "uint64"], +) +def test_series_indexing_cudf_Scalar(dtype): + ps = pd.Series([1, 2, 3], index=pd.Index(["a", "b", "c"])) + gs = cudf.from_pandas(ps) + + arg = cudf.Scalar(1, dtype=dtype) + got = gs[arg] + + with pytest.raises( + TypeError, match=re.escape(f"'{arg}' is an invalid key"), + ): + expect = ps[arg] + assert_eq(expect, got) + def test_series_indexing_large_size(): n_elem = 100_000 diff --git a/python/cudf/cudf/tests/test_series.py b/python/cudf/cudf/tests/test_series.py index 3f6569101b0..0dc53fa29e9 100644 --- a/python/cudf/cudf/tests/test_series.py +++ b/python/cudf/cudf/tests/test_series.py @@ -762,8 +762,8 @@ def test_series_memory_usage(): sliced_sr = sr[2:] assert sliced_sr.memory_usage() == 16 - # sliced_sr[3] = None - # assert sliced_sr.memory_usage() == 80 + sliced_sr[3] = None + assert sliced_sr.memory_usage() == 80 sr = cudf.Series(["hello world", "rapids ai", "abc", "z"]) assert sr.memory_usage() == 44 @@ -1196,47 +1196,3 @@ def test_explode(data, ignore_index, p_index): assert_eq(expect, got, check_dtype=False) else: assert_eq(expect, got, check_dtype=False) - - -@pytest.mark.parametrize( - "arg", - [ - 1, - -1, - "b", - np.int32(1), - np.uint32(1), - np.int8(1), - np.uint8(1), - np.int16(1), - np.uint16(1), - np.int64(1), - np.uint64(1), - ], -) -def test_series_indexing(arg): - ps = pd.Series([1, 2, 3], index=pd.Index(["a", "b", "c"])) - gs = cudf.from_pandas(ps) - - expect = ps[arg] - got = gs[arg] - - assert_eq(expect, got) - - -@pytest.mark.parametrize( - "dtype", - ["int32", "int16", "int8", "int64", "uint32", "uint16", "uint8", "uint64"], -) -def test_series_indexing_cudf_Scalar(dtype): - ps = pd.Series([1, 2, 3], index=pd.Index(["a", "b", "c"])) - gs = cudf.from_pandas(ps) - - arg = cudf.Scalar(1, dtype=dtype) - got = gs[arg] - - with pytest.raises( - TypeError, match=re.escape(f"'{arg}' is an invalid key"), - ): - expect = ps[arg] - assert_eq(expect, got) From 6a6a33335d11b859d2134f74147854e4d580d11a Mon Sep 17 00:00:00 2001 From: Sheilah Date: Wed, 14 Apr 2021 17:54:22 -0700 Subject: [PATCH 08/11] minor fixes --- python/cudf/cudf/tests/test_indexing.py | 40 +++++++++++++------------ 1 file changed, 21 insertions(+), 19 deletions(-) diff --git a/python/cudf/cudf/tests/test_indexing.py b/python/cudf/cudf/tests/test_indexing.py index c64ab1358e1..0367c8ba5e3 100644 --- a/python/cudf/cudf/tests/test_indexing.py +++ b/python/cudf/cudf/tests/test_indexing.py @@ -100,23 +100,7 @@ def pdf_gdf_multi(): + ["numpy.array[%s]" % np.dtype(t).type.__name__ for t in index_dtypes] ), ) -@pytest.mark.parametrize( - "arg", - [ - 1, - -1, - "b", - np.int32(1), - np.uint32(1), - np.int8(1), - np.uint8(1), - np.int16(1), - np.uint16(1), - np.int64(1), - np.uint64(1), - ], -) -def test_series_indexing(i1, i2, i3, arg): +def test_series_indexing(i1, i2, i3): a1 = np.arange(20) series = cudf.Series(a1) @@ -142,7 +126,25 @@ def test_series_indexing(i1, i2, i3, arg): for i in i1: # numpy integers assert series[i] == a1[i] - # Indexing for non-integer dtype Index + +@pytest.mark.parametrize( + "arg", + [ + 1, + -1, + "b", + np.int32(1), + np.uint32(1), + np.int8(1), + np.uint8(1), + np.int16(1), + np.uint16(1), + np.int64(1), + np.uint64(1), + ], +) +def test_series_get_item_iloc_defer(arg): + # Indexing for non-numeric dtype Index ps = pd.Series([1, 2, 3], index=pd.Index(["a", "b", "c"])) gs = cudf.from_pandas(ps) @@ -156,7 +158,7 @@ def test_series_indexing(i1, i2, i3, arg): "dtype", ["int32", "int16", "int8", "int64", "uint32", "uint16", "uint8", "uint64"], ) -def test_series_indexing_cudf_Scalar(dtype): +def test_series_iloc_defer_cudf_Scalar(dtype): ps = pd.Series([1, 2, 3], index=pd.Index(["a", "b", "c"])) gs = cudf.from_pandas(ps) From 8e9e6c15a87b0631aa3906da3f9b7a7059ecc379 Mon Sep 17 00:00:00 2001 From: Sheilah Date: Wed, 14 Apr 2021 20:20:01 -0700 Subject: [PATCH 09/11] . --- python/cudf/cudf/core/indexing.py | 3 ++- python/cudf/cudf/tests/test_indexing.py | 18 +++++------------- 2 files changed, 7 insertions(+), 14 deletions(-) diff --git a/python/cudf/cudf/core/indexing.py b/python/cudf/cudf/core/indexing.py index de989630914..ae094d506b9 100755 --- a/python/cudf/cudf/core/indexing.py +++ b/python/cudf/cudf/core/indexing.py @@ -167,7 +167,8 @@ def __setitem__(self, key, value): def _loc_to_iloc(self, arg): if is_scalar(arg): if not is_numerical_dtype(self._sr.index.dtype): - if pd.api.types.is_integer(arg): + # TODO: switch to cudf.utils.dtypes.is_integer(arg) + if pd.api.types.is_integer(arg): # found_index = arg return found_index diff --git a/python/cudf/cudf/tests/test_indexing.py b/python/cudf/cudf/tests/test_indexing.py index 0367c8ba5e3..048e76ce009 100644 --- a/python/cudf/cudf/tests/test_indexing.py +++ b/python/cudf/cudf/tests/test_indexing.py @@ -1,6 +1,5 @@ # Copyright (c) 2021, NVIDIA CORPORATION. -import re from itertools import combinations import cupy @@ -154,21 +153,14 @@ def test_series_get_item_iloc_defer(arg): assert_eq(expect, got) -@pytest.mark.parametrize( - "dtype", - ["int32", "int16", "int8", "int64", "uint32", "uint16", "uint8", "uint64"], -) -def test_series_iloc_defer_cudf_Scalar(dtype): +def test_series_iloc_defer_cudf_scalar(): ps = pd.Series([1, 2, 3], index=pd.Index(["a", "b", "c"])) gs = cudf.from_pandas(ps) - arg = cudf.Scalar(1, dtype=dtype) - got = gs[arg] - - with pytest.raises( - TypeError, match=re.escape(f"'{arg}' is an invalid key"), - ): - expect = ps[arg] + for t in index_dtypes: + arg = cudf.Scalar(1, dtype=t) + got = gs[arg] + expect = 2 assert_eq(expect, got) From 3afe7703e318ccda30a3834d055841f9e2e5b0b2 Mon Sep 17 00:00:00 2001 From: Sheilah Date: Thu, 15 Apr 2021 12:25:48 -0700 Subject: [PATCH 10/11] . --- python/cudf/cudf/core/indexing.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/cudf/cudf/core/indexing.py b/python/cudf/cudf/core/indexing.py index ae094d506b9..b9cadce594f 100755 --- a/python/cudf/cudf/core/indexing.py +++ b/python/cudf/cudf/core/indexing.py @@ -172,9 +172,9 @@ def _loc_to_iloc(self, arg): found_index = arg return found_index - elif isinstance( + elif pd.api.types.is_integer_dtype(arg.dtype) and isinstance( arg, cudf.Scalar - ) and pd.api.types.is_integer_dtype(arg.dtype): + ): found_index = arg.value return found_index From f00bb2fd7473fb331378c947b87a2d20044eb629 Mon Sep 17 00:00:00 2001 From: Sheilah Date: Fri, 16 Apr 2021 09:16:34 -0700 Subject: [PATCH 11/11] ready for review --- python/cudf/cudf/core/indexing.py | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/python/cudf/cudf/core/indexing.py b/python/cudf/cudf/core/indexing.py index b9cadce594f..a4e15890cbe 100755 --- a/python/cudf/cudf/core/indexing.py +++ b/python/cudf/cudf/core/indexing.py @@ -168,16 +168,14 @@ def _loc_to_iloc(self, arg): if is_scalar(arg): if not is_numerical_dtype(self._sr.index.dtype): # TODO: switch to cudf.utils.dtypes.is_integer(arg) - if pd.api.types.is_integer(arg): # - found_index = arg - return found_index - - elif pd.api.types.is_integer_dtype(arg.dtype) and isinstance( + if isinstance( arg, cudf.Scalar - ): + ) and pd.api.types.is_integer_dtype(arg.dtype): found_index = arg.value return found_index - + elif pd.api.types.is_integer(arg): + found_index = arg + return found_index try: found_index = self._sr.index._values.find_first_value( arg, closest=False