From 7b59669e378877db2cf3e9f9f914b97d4c5bf21f Mon Sep 17 00:00:00 2001 From: Brandon Miller Date: Mon, 12 Aug 2019 09:33:55 -0700 Subject: [PATCH 01/64] add basic tolist() implementation --- python/cudf/cudf/dataframe/series.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/python/cudf/cudf/dataframe/series.py b/python/cudf/cudf/dataframe/series.py index 7f10bfce3a8..8985880ca68 100644 --- a/python/cudf/cudf/dataframe/series.py +++ b/python/cudf/cudf/dataframe/series.py @@ -444,6 +444,22 @@ def values_to_string(self, nrows=None): out = ["" if v is None else str(v) for v in values] return out + def tolist(self): + """ + Return a list type from series data. + + Returns + ------- + list + + Difference from pandas: + * Returns numpy.datetime64 types vs pandas datetype + """ + if isinstance(self.values, list): + return self.values + else: + return self.values.tolist() + def head(self, n=5): return self.iloc[:n] From 360bea95eae796579ac3275a85dad9afffd49ba4 Mon Sep 17 00:00:00 2001 From: Brandon Miller Date: Mon, 12 Aug 2019 09:34:35 -0700 Subject: [PATCH 02/64] add basic tolist() test --- python/cudf/cudf/tests/test_dataframe.py | 25 ++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index 301cc8d488a..6d40c257942 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -3894,3 +3894,28 @@ def test_isin_index(data, values): expected = psr.index.isin(values) assert_eq(got.data.mem.copy_to_host(), expected) + + +@pytest.mark.parametrize( + "data", + [ + [1, 2, 3, 4, 5], + [1.0, 2.0, 3.0, 4.0, 5.0], + ["a", "b", "c", "d", "e"], + pytest.param( + np.array( + ["1991-11-20 05:15:00", "2004-12-04 10:00:00"], + dtype=np.datetime64, + ), + marks=pytest.mark.xfail, + ), + ], +) +def test_tolist(data): + psr = pd.Series(data) + gsr = Series.from_pandas(psr) + + got = gsr.tolist() + expected = psr.tolist() + + assert got == expected From 073802a582f88d048cee14cf53d1bb4028b567b7 Mon Sep 17 00:00:00 2001 From: Brandon Miller Date: Tue, 13 Aug 2019 09:36:50 -0700 Subject: [PATCH 03/64] resolve merge conflict --- python/cudf/cudf/tests/test_dataframe.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index f84b74bfddc..8eeed509753 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -3291,7 +3291,6 @@ def test_isin_index(data, values): assert_eq(got.data.mem.copy_to_host(), expected) -<<<<<<< HEAD def test_constructor_properties(): df = DataFrame() key1 = "a" @@ -3320,7 +3319,8 @@ def test_constructor_properties(): # Inorrect use of _constructor_expanddim (Raises for DataFrame) with pytest.raises(NotImplementedError): df._constructor_expanddim -======= + + @pytest.mark.parametrize( "data", [ @@ -3344,4 +3344,3 @@ def test_tolist(data): expected = psr.tolist() assert got == expected ->>>>>>> add-tolist From fe004114e4e2334397fc84763dbe56515d72c8bd Mon Sep 17 00:00:00 2001 From: Brandon Miller Date: Tue, 13 Aug 2019 09:42:21 -0700 Subject: [PATCH 04/64] tolist handles np.datetime64 consistently --- python/cudf/cudf/dataframe/series.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/python/cudf/cudf/dataframe/series.py b/python/cudf/cudf/dataframe/series.py index 038edd673d7..dfc189bfcae 100644 --- a/python/cudf/cudf/dataframe/series.py +++ b/python/cudf/cudf/dataframe/series.py @@ -449,8 +449,11 @@ def tolist(self): if isinstance(self.values, list): return self.values else: - return self.values.tolist() - + if np.issubdtype(self.values.dtype, np.datetime64): + return list(self.values.astype(np.datetime64)) + else: + return self.values.tolist() + def head(self, n=5): return self.iloc[:n] From ddb3943aea99dccb57d82d20c9e849bad27c3c26 Mon Sep 17 00:00:00 2001 From: Brandon Miller Date: Tue, 13 Aug 2019 09:43:03 -0700 Subject: [PATCH 05/64] tolist tests np.datetime64 consistently --- python/cudf/cudf/tests/test_dataframe.py | 43 +++++++++++++++++++----- 1 file changed, 35 insertions(+), 8 deletions(-) diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index 8eeed509753..e8079465e5e 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -3325,15 +3325,11 @@ def test_constructor_properties(): "data", [ [1, 2, 3, 4, 5], + [1, 2, None, 4, 5], [1.0, 2.0, 3.0, 4.0, 5.0], + [1.0, 2.0, None, 4.0, 5.0], ["a", "b", "c", "d", "e"], - pytest.param( - np.array( - ["1991-11-20 05:15:00", "2004-12-04 10:00:00"], - dtype=np.datetime64, - ), - marks=pytest.mark.xfail, - ), + ["a", "b", None, "d", "e"], ], ) def test_tolist(data): @@ -3343,4 +3339,35 @@ def test_tolist(data): got = gsr.tolist() expected = psr.tolist() - assert got == expected + np.testing.assert_array_equal(got, expected) + + +@pytest.mark.parametrize( + "data", + [ + np.array( + ["1991-11-20", "2004-12-04"], + dtype=np.datetime64, + ), + np.array( + ["1991-11-20", None], + dtype=np.datetime64, + ), + np.array( + ["1991-11-20 05:15:00", "2004-12-04 10:00:00"], + dtype=np.datetime64, + ), + np.array( + ["1991-11-20 05:15:00", None], + dtype=np.datetime64, + ), + ], +) +def test_tolist_datetime(data): + psr = pd.Series(data) + gsr = Series.from_pandas(psr) + + got = gsr.tolist() + expected = [d.to_datetime64() for d in psr.tolist()] + + np.testing.assert_array_equal(got, expected) From 3ba6987d6a3b65fb5b08f1b128723415b929ae2e Mon Sep 17 00:00:00 2001 From: Brandon Miller Date: Tue, 13 Aug 2019 09:54:27 -0700 Subject: [PATCH 06/64] restructure if/else slightly --- python/cudf/cudf/dataframe/series.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/python/cudf/cudf/dataframe/series.py b/python/cudf/cudf/dataframe/series.py index dfc189bfcae..e79414ac5d6 100644 --- a/python/cudf/cudf/dataframe/series.py +++ b/python/cudf/cudf/dataframe/series.py @@ -444,15 +444,14 @@ def tolist(self): list Difference from pandas: - * Returns numpy.datetime64 types vs pandas datetype + * Returns numpy.datetime64 instead of pandas internal type """ if isinstance(self.values, list): return self.values - else: - if np.issubdtype(self.values.dtype, np.datetime64): - return list(self.values.astype(np.datetime64)) - else: - return self.values.tolist() + elif np.issubdtype(self.values.dtype, np.datetime64): + return list(self.values.astype(np.datetime64)) + else: + return self.values.tolist() def head(self, n=5): return self.iloc[:n] From 86a74994cfecab319a0b0f0f2e769127eb45cdc4 Mon Sep 17 00:00:00 2001 From: Brandon Miller Date: Tue, 13 Aug 2019 09:56:36 -0700 Subject: [PATCH 07/64] style changes --- python/cudf/cudf/dataframe/series.py | 4 ++-- python/cudf/cudf/tests/test_dataframe.py | 20 +++++--------------- 2 files changed, 7 insertions(+), 17 deletions(-) diff --git a/python/cudf/cudf/dataframe/series.py b/python/cudf/cudf/dataframe/series.py index e79414ac5d6..83b3fd39ba4 100644 --- a/python/cudf/cudf/dataframe/series.py +++ b/python/cudf/cudf/dataframe/series.py @@ -450,9 +450,9 @@ def tolist(self): return self.values elif np.issubdtype(self.values.dtype, np.datetime64): return list(self.values.astype(np.datetime64)) - else: + else: return self.values.tolist() - + def head(self, n=5): return self.iloc[:n] diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index e8079465e5e..4d37bc62536 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -3320,7 +3320,7 @@ def test_constructor_properties(): with pytest.raises(NotImplementedError): df._constructor_expanddim - + @pytest.mark.parametrize( "data", [ @@ -3345,22 +3345,12 @@ def test_tolist(data): @pytest.mark.parametrize( "data", [ + np.array(["1991-11-20", "2004-12-04"], dtype=np.datetime64), + np.array(["1991-11-20", None], dtype=np.datetime64), np.array( - ["1991-11-20", "2004-12-04"], - dtype=np.datetime64, - ), - np.array( - ["1991-11-20", None], - dtype=np.datetime64, - ), - np.array( - ["1991-11-20 05:15:00", "2004-12-04 10:00:00"], - dtype=np.datetime64, - ), - np.array( - ["1991-11-20 05:15:00", None], - dtype=np.datetime64, + ["1991-11-20 05:15:00", "2004-12-04 10:00:00"], dtype=np.datetime64 ), + np.array(["1991-11-20 05:15:00", None], dtype=np.datetime64), ], ) def test_tolist_datetime(data): From 9bf508f494a4aaeacf89f891acdc6d774f22134b Mon Sep 17 00:00:00 2001 From: Brandon Miller Date: Tue, 13 Aug 2019 12:44:55 -0700 Subject: [PATCH 08/64] tolist reads data cast type from .values --- python/cudf/cudf/dataframe/series.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cudf/cudf/dataframe/series.py b/python/cudf/cudf/dataframe/series.py index 83b3fd39ba4..91cdb8f0bc1 100644 --- a/python/cudf/cudf/dataframe/series.py +++ b/python/cudf/cudf/dataframe/series.py @@ -449,7 +449,7 @@ def tolist(self): if isinstance(self.values, list): return self.values elif np.issubdtype(self.values.dtype, np.datetime64): - return list(self.values.astype(np.datetime64)) + return list(self.values.astype(self.values.dtype)) else: return self.values.tolist() From 382cf0ca1a61dab366ec475e70d5780782a4bc3d Mon Sep 17 00:00:00 2001 From: Brandon Miller Date: Tue, 13 Aug 2019 13:00:07 -0700 Subject: [PATCH 09/64] update changelog --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 0822bde0c46..02857ee08e2 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,6 +3,7 @@ ## New Features - PR #2522 Add Java bindings for NVStrings backed upper and lower case mutators +- PR #2559 Add Series.tolist() ## Improvements From e5e0a67c460e95fb088e00de0ecd0ab8e521db65 Mon Sep 17 00:00:00 2001 From: Brandon Miller Date: Wed, 14 Aug 2019 14:41:23 -0700 Subject: [PATCH 10/64] pull out values first to not recompute --- python/cudf/cudf/dataframe/series.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/python/cudf/cudf/dataframe/series.py b/python/cudf/cudf/dataframe/series.py index 91cdb8f0bc1..b4018391f9e 100644 --- a/python/cudf/cudf/dataframe/series.py +++ b/python/cudf/cudf/dataframe/series.py @@ -446,12 +446,13 @@ def tolist(self): Difference from pandas: * Returns numpy.datetime64 instead of pandas internal type """ - if isinstance(self.values, list): - return self.values - elif np.issubdtype(self.values.dtype, np.datetime64): - return list(self.values.astype(self.values.dtype)) + vals = self.values + if isinstance(vals, list): + return vals + elif np.issubdtype(vals.dtype, np.datetime64): + return list(vals.astype(vals.dtype)) else: - return self.values.tolist() + return vals.tolist() def head(self, n=5): return self.iloc[:n] From 237a52b8f9f4be5d68b02ee282a6105ad7f10150 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Mon, 19 Aug 2019 08:52:22 -0700 Subject: [PATCH 11/64] added the test case for all nulls --- python/cudf/cudf/tests/test_dataframe.py | 1 + 1 file changed, 1 insertion(+) diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index 4d37bc62536..71ee78715a7 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -3330,6 +3330,7 @@ def test_constructor_properties(): [1.0, 2.0, None, 4.0, 5.0], ["a", "b", "c", "d", "e"], ["a", "b", None, "d", "e"], + [None, None, None, None, None] ], ) def test_tolist(data): From 2dc06223234d7139ed8c5df207a791e91505dfb0 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Mon, 19 Aug 2019 10:54:58 -0700 Subject: [PATCH 12/64] remove as_datetime64, consolidate tests b.c. pyarrow --- python/cudf/cudf/tests/test_dataframe.py | 23 ++++------------------- 1 file changed, 4 insertions(+), 19 deletions(-) diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index 71ee78715a7..ba32a42c280 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -3330,22 +3330,7 @@ def test_constructor_properties(): [1.0, 2.0, None, 4.0, 5.0], ["a", "b", "c", "d", "e"], ["a", "b", None, "d", "e"], - [None, None, None, None, None] - ], -) -def test_tolist(data): - psr = pd.Series(data) - gsr = Series.from_pandas(psr) - - got = gsr.tolist() - expected = psr.tolist() - - np.testing.assert_array_equal(got, expected) - - -@pytest.mark.parametrize( - "data", - [ + [None, None, None, None, None], np.array(["1991-11-20", "2004-12-04"], dtype=np.datetime64), np.array(["1991-11-20", None], dtype=np.datetime64), np.array( @@ -3354,11 +3339,11 @@ def test_tolist(data): np.array(["1991-11-20 05:15:00", None], dtype=np.datetime64), ], ) -def test_tolist_datetime(data): +def test_tolist(data): psr = pd.Series(data) gsr = Series.from_pandas(psr) got = gsr.tolist() - expected = [d.to_datetime64() for d in psr.tolist()] + expected = psr.tolist() - np.testing.assert_array_equal(got, expected) + np.testing.assert_array_equal(got, expected) \ No newline at end of file From bab88df32bd48901a396779eb5aff6234fb9f8ae Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Tue, 20 Aug 2019 05:52:11 -0700 Subject: [PATCH 13/64] adjust null checking in test re:pyarrow --- python/cudf/cudf/tests/test_dataframe.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index ba32a42c280..9186b280b08 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -3344,6 +3344,6 @@ def test_tolist(data): gsr = Series.from_pandas(psr) got = gsr.tolist() - expected = psr.tolist() + expected = [x if not pd.isnull(x) else None for x in psr.tolist()] np.testing.assert_array_equal(got, expected) \ No newline at end of file From f3b642df79f650b083401e4399383336707904aa Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Tue, 20 Aug 2019 05:52:55 -0700 Subject: [PATCH 14/64] move to list through pyarrow instead --- python/cudf/cudf/dataframe/series.py | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) diff --git a/python/cudf/cudf/dataframe/series.py b/python/cudf/cudf/dataframe/series.py index b4018391f9e..9416bb7d7a3 100644 --- a/python/cudf/cudf/dataframe/series.py +++ b/python/cudf/cudf/dataframe/series.py @@ -442,17 +442,11 @@ def tolist(self): Returns ------- list - - Difference from pandas: - * Returns numpy.datetime64 instead of pandas internal type """ - vals = self.values - if isinstance(vals, list): - return vals - elif np.issubdtype(vals.dtype, np.datetime64): - return list(vals.astype(vals.dtype)) - else: - return vals.tolist() + return self.to_arrow().to_pylist() + + + def head(self, n=5): return self.iloc[:n] From a6f2835ad799217ab4f9c31cb7eb2383bf7061c6 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Tue, 20 Aug 2019 05:54:41 -0700 Subject: [PATCH 15/64] style changes --- python/cudf/cudf/dataframe/series.py | 3 --- python/cudf/cudf/tests/test_dataframe.py | 2 +- 2 files changed, 1 insertion(+), 4 deletions(-) diff --git a/python/cudf/cudf/dataframe/series.py b/python/cudf/cudf/dataframe/series.py index 9416bb7d7a3..de3b7127e8d 100644 --- a/python/cudf/cudf/dataframe/series.py +++ b/python/cudf/cudf/dataframe/series.py @@ -444,9 +444,6 @@ def tolist(self): list """ return self.to_arrow().to_pylist() - - - def head(self, n=5): return self.iloc[:n] diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index 9186b280b08..785f06b6679 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -3346,4 +3346,4 @@ def test_tolist(data): got = gsr.tolist() expected = [x if not pd.isnull(x) else None for x in psr.tolist()] - np.testing.assert_array_equal(got, expected) \ No newline at end of file + np.testing.assert_array_equal(got, expected) From 443682f058dcb0e4dc57194ec2dfde104f9c1c93 Mon Sep 17 00:00:00 2001 From: rgsl888prabhu Date: Thu, 22 Aug 2019 12:57:52 -0500 Subject: [PATCH 16/64] contains for Numerical --- python/cudf/cudf/dataframe/numerical.py | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/python/cudf/cudf/dataframe/numerical.py b/python/cudf/cudf/dataframe/numerical.py index 6ac84e47965..c22254ffb20 100644 --- a/python/cudf/cudf/dataframe/numerical.py +++ b/python/cudf/cudf/dataframe/numerical.py @@ -49,11 +49,8 @@ def __contains__(self, item): Returns True if column contains item, else False. """ item_found = False - try: - if self.find_first_value(item): - item_found = True - except ValueError: - """This means value not found""" + if cudautils.find_first(self.data.mem, item) != -1: + item_found = True return item_found @@ -90,10 +87,10 @@ def binary_operator(self, binop, rhs, reflect=False): if isinstance(rhs, NumericalColumn) or np.isscalar(rhs): out_dtype = np.result_type(self.dtype, rhs.dtype) if binop in ["mod", "floordiv"]: - if ( + if (tmp.dtype in int_dtypes) and ( (np.isscalar(tmp) and (0 == tmp)) or ((isinstance(tmp, NumericalColumn)) and (0.0 in tmp)) - ) and (tmp.dtype in int_dtypes): + ): out_dtype = np.dtype("float_") return numeric_column_binop( lhs=self, From 5a1963042b3443420accf01bd2d10840865de954 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Thu, 22 Aug 2019 12:51:21 -0700 Subject: [PATCH 17/64] added mixed NoneType/NaN/NaT test --- python/cudf/cudf/tests/test_dataframe.py | 25 ++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index 785f06b6679..629dcc4dc66 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -3347,3 +3347,28 @@ def test_tolist(data): expected = [x if not pd.isnull(x) else None for x in psr.tolist()] np.testing.assert_array_equal(got, expected) + + +def test_tolist_mixed_nulls(): + num_data = pa.array([1.0, None, np.float64("nan")]) + num_data_expect = [1.0, None, np.float64("nan")] + + time_data = pa.array( + [1, None, -9223372036854775808], type=pa.timestamp("ns") + ) + time_data_expect = [ + pd.Timestamp("1970-01-01T00:00:00.000000001"), + None, + pd.NaT, + ] + + df = DataFrame() + df["num_data"] = num_data + df["time_data"] = time_data + + num_data_got = df["num_data"].tolist() + time_data_got = df["time_data"].tolist() + + np.testing.assert_equal(num_data_got, num_data_expect) + for got, exp in zip(time_data_got, time_data_expect): # deal with NaT + assert (got == exp) or (pd.isnull(got) and pd.isnull(exp)) From 8ce9fa1dab4e32cc8d1d890bea84d42fb23c6e62 Mon Sep 17 00:00:00 2001 From: rgsl888prabhu Date: Thu, 22 Aug 2019 17:20:35 -0500 Subject: [PATCH 18/64] Changes done, test cases left --- python/cudf/cudf/dataframe/categorical.py | 4 ++++ python/cudf/cudf/dataframe/column.py | 6 ++++++ python/cudf/cudf/dataframe/datetime.py | 9 ++++++++ python/cudf/cudf/dataframe/index.py | 25 +++++++++++++++++++++++ python/cudf/cudf/dataframe/numerical.py | 8 ++++++-- python/cudf/cudf/dataframe/series.py | 5 +++++ python/cudf/cudf/dataframe/string.py | 15 ++++++++++++++ python/cudf/cudf/tests/test_contains.py | 10 +++++++++ 8 files changed, 80 insertions(+), 2 deletions(-) create mode 100644 python/cudf/cudf/tests/test_contains.py diff --git a/python/cudf/cudf/dataframe/categorical.py b/python/cudf/cudf/dataframe/categorical.py index 31d974e932a..73ba34d0fa2 100644 --- a/python/cudf/cudf/dataframe/categorical.py +++ b/python/cudf/cudf/dataframe/categorical.py @@ -239,6 +239,10 @@ def __init__(self, **kwargs): self._categories = categories self._ordered = ordered + def __contains__(self, item): + print ("In categorical") + return (self._encode(item) in self.as_numerical) + def serialize(self): header, frames = super(CategoricalColumn, self).serialize() header["ordered"] = self._ordered diff --git a/python/cudf/cudf/dataframe/column.py b/python/cudf/cudf/dataframe/column.py index 34de98036d2..627df311eb9 100644 --- a/python/cudf/cudf/dataframe/column.py +++ b/python/cudf/cudf/dataframe/column.py @@ -19,6 +19,7 @@ from cudf.dataframe.buffer import Buffer from cudf.utils import cudautils, ioutils, utils from cudf.utils.dtypes import is_categorical_dtype +from cudf.dataframe import columnops class Column(object): @@ -198,6 +199,11 @@ def __init__(self, data, mask=None, null_count=None, name=None): self._update_null_count(null_count) + def __contains__(self, item): + print ("In columns") + print ("Type of data", type(self._data)) + return item in columnops.as_column(self._data) + def equals(self, other): if self is other: return True diff --git a/python/cudf/cudf/dataframe/datetime.py b/python/cudf/cudf/dataframe/datetime.py index 042290bb7dc..bea35ca1ca3 100644 --- a/python/cudf/cudf/dataframe/datetime.py +++ b/python/cudf/cudf/dataframe/datetime.py @@ -49,6 +49,15 @@ def __init__(self, **kwargs): assert self.dtype.type is np.datetime64 self._time_unit, _ = np.datetime_data(self.dtype) + def __contains__(self, item): + print ("RGSL : In date and time") + try: + item = pd.to_datetime(item) + item = columnops.as_column(item).as_numerical[0] + return (item in self.as_numerical) + except: + return False + def serialize(self): header, frames = super(DatetimeColumn, self).serialize() header["type"] = pickle.dumps(type(self)) diff --git a/python/cudf/cudf/dataframe/index.py b/python/cudf/cudf/dataframe/index.py index acd81be4f1b..5dbd59f340f 100644 --- a/python/cudf/cudf/dataframe/index.py +++ b/python/cudf/cudf/dataframe/index.py @@ -43,6 +43,11 @@ def serialize(self): header["frame_count"] = len(frames) return header, frames + def __contains__(self, item): + print ("In index") + print (type(self._values)) + return (item in self._values) + @classmethod def deserialize(cls, header, frames): """ @@ -381,6 +386,10 @@ def __init__(self, start, stop=None, name=None): self.name = name self._cached_values = None + def __contains__(self, item): + print ("In range index") + return (item in self._values) + def copy(self, deep=True): if deep: result = deepcopy(self) @@ -604,6 +613,10 @@ def __init__(self, values, **kwargs): assert isinstance(values, columnops.TypedColumnBase), type(values) + def __contains__(self, item): + print ("In genindex") + return (item in self._values) + def copy(self, deep=True): if deep: result = deepcopy(self) @@ -750,6 +763,10 @@ def __init__(self, values, **kwargs): super(DatetimeIndex, self).__init__(values, **kwargs) assert self._values.null_count == 0 + def __contains__(self, item): + print ("DatetimeIndex") + return item in self._values + @property def year(self): return self.get_dt_field("year") @@ -828,6 +845,10 @@ def __init__(self, values, **kwargs): super(CategoricalIndex, self).__init__(values, **kwargs) assert self._values.null_count == 0 + def __contains__(self, item): + print ("Categorical index") + return item in self._values + @property def names(self): return [self._values.name] @@ -863,6 +884,10 @@ def __init__(self, values, **kwargs): super(StringIndex, self).__init__(values, **kwargs) assert self._values.null_count == 0 + def __contains__(self, item): + print ("String index") + return item in self._values + def to_pandas(self): return pd.Index(self.values, name=self.name, dtype="object") diff --git a/python/cudf/cudf/dataframe/numerical.py b/python/cudf/cudf/dataframe/numerical.py index c22254ffb20..297da7f9e29 100644 --- a/python/cudf/cudf/dataframe/numerical.py +++ b/python/cudf/cudf/dataframe/numerical.py @@ -48,9 +48,13 @@ def __contains__(self, item): """ Returns True if column contains item, else False. """ + print ("In numerical") item_found = False - if cudautils.find_first(self.data.mem, item) != -1: - item_found = True + try: + if cudautils.find_first(self.astype('float_').data.mem, float(item)) != -1: + item_found = True + except: + "Nothing to be done" return item_found diff --git a/python/cudf/cudf/dataframe/series.py b/python/cudf/cudf/dataframe/series.py index 35d9ad3d6fd..613b97ee9f9 100644 --- a/python/cudf/cudf/dataframe/series.py +++ b/python/cudf/cudf/dataframe/series.py @@ -128,6 +128,11 @@ def __init__( self._index = RangeIndex(len(data)) if index is None else index self._name = name + def __contains__ (self, item): + print ("In series") + print ("Type of column", type(self._column)) + return item in self._column + @classmethod def from_pandas(cls, s, nan_as_null=True): return cls(s, nan_as_null=nan_as_null) diff --git a/python/cudf/cudf/dataframe/string.py b/python/cudf/cudf/dataframe/string.py index e91ddf1ebbf..0f50626d1bf 100644 --- a/python/cudf/cudf/dataframe/string.py +++ b/python/cudf/cudf/dataframe/string.py @@ -463,6 +463,21 @@ def __init__(self, data, null_count=None, name=None, **kwargs): self._nvcategory = None self._indices = None + def __contains__(self, item): + print ("In string contains") + found = False + try: + if (True in self.str().contains(f"^{item}$")._column): + print (self.str().contains(f"^{item}$")._column) + print ("found") + found = True + except: + "column doesn't have the item" + print ("didn't find") + + return found + + def __reduce__(self): cpumem = self.to_arrow() return columnops.as_column, (cpumem, False, np.dtype("object")) diff --git a/python/cudf/cudf/tests/test_contains.py b/python/cudf/cudf/tests/test_contains.py new file mode 100644 index 00000000000..fb613124602 --- /dev/null +++ b/python/cudf/cudf/tests/test_contains.py @@ -0,0 +1,10 @@ +import pytest + +import pandas as pd + +from import cudf.dataframe import columnops +from import cudf.dataframe.index import as_index +from import cudf.dataframe.series import Series +from import cudf.dataframe.column import Column + + From d978725103edd224325981467180c8b95b88cd8f Mon Sep 17 00:00:00 2001 From: Markku Luukkainen Date: Wed, 21 Aug 2019 11:35:31 -0700 Subject: [PATCH 19/64] Remove nvidia driver installation --- CHANGELOG.md | 1 + ci/cpu/build.sh | 15 --------------- 2 files changed, 1 insertion(+), 15 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index de6005f5d36..af8e286bc49 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -33,6 +33,7 @@ - PR #2660 fix column string category and timeunit concat in the java API - PR #2664 ORC reader: fix `skip_rows` larger than first stripe - PR #2654 Allow Java gdfOrderBy to work with string categories +- PR #2651 Remove nvidia driver installation from ci/cpu/build.sh # cuDF 0.9.0 (Date TBD) diff --git a/ci/cpu/build.sh b/ci/cpu/build.sh index 41f6b83ff8d..a7e420249c7 100755 --- a/ci/cpu/build.sh +++ b/ci/cpu/build.sh @@ -43,21 +43,6 @@ conda list # FIX Added to deal with Anancoda SSL verification issues during conda builds conda config --set ssl_verify False -################################################################################ -# INSTALL - Install NVIDIA driver -################################################################################ - -logger "Install NVIDIA driver for CUDA $CUDA..." -apt-get update -q -DRIVER_VER="396.44-1" -LIBCUDA_VER="396" -if [ "$CUDA" == "10.0" ]; then - DRIVER_VER="410.72-1" - LIBCUDA_VER="410" -fi -DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \ - cuda-drivers=${DRIVER_VER} libcuda1-${LIBCUDA_VER} - ################################################################################ # BUILD - Conda package builds (conda deps: libcudf <- libcudf_cffi <- cudf) ################################################################################ From d015df18404a4af07b1bb9d0563cbe87c7f96a55 Mon Sep 17 00:00:00 2001 From: Nick Becker Date: Fri, 23 Aug 2019 12:51:32 -0400 Subject: [PATCH 20/64] update round to handle masked arrays and abstract from issues with integers --- python/cudf/cudf/core/column/numerical.py | 7 ++- python/cudf/cudf/core/series.py | 1 + python/cudf/cudf/tests/test_dataframe.py | 16 +++++-- python/cudf/cudf/utils/cudautils.py | 56 +++++++++++++++++++---- 4 files changed, 66 insertions(+), 14 deletions(-) diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py index 54bba3d5a83..fdb61153c78 100644 --- a/python/cudf/cudf/core/column/numerical.py +++ b/python/cudf/cudf/core/column/numerical.py @@ -254,7 +254,12 @@ def sum_of_squares(self, dtype=None): return libcudf.reduce.reduce("sum_of_squares", self, dtype=dtype) def round(self, decimals=0): - data = Buffer(cudautils.apply_round(self.data.mem, decimals)) + mask_dary = None + if self.has_null_mask: + mask_dary = self.nullmask.mem + data = Buffer( + cudautils.apply_round(self.astype('float').data.mem, mask_dary, + decimals)) return self.replace(data=data) def applymap(self, udf, out_dtype=None): diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index f2d306aedb4..cbc2f78c8d7 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -1815,6 +1815,7 @@ def round(self, decimals=0): self._column.round(decimals=decimals), name=self.name, index=self.index, + dtype=self.dtype, ) def isin(self, test): diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index 09ef66c9caf..0cba974bfdd 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -2539,9 +2539,16 @@ def test_ndim(): assert s.ndim == gs.ndim -@pytest.mark.parametrize("decimal", range(-8, 8)) -def test_round(decimal): - arr = np.random.normal(0, 100, 10000) +@pytest.mark.parametrize( + "arr", [ + np.random.normal(0, 100, 1000), + np.random.randint(-50, 50, 1000), + np.zeros(1000), + np.repeat(np.nan, 1000), + ] +) +@pytest.mark.parametrize("decimal", range(0, 10)) +def test_round(arr, decimal): pser = pd.Series(arr) ser = Series(arr) result = ser.round(decimal) @@ -2551,7 +2558,8 @@ def test_round(decimal): ) # with nulls, maintaining existing null mask - mask = np.random.randint(0, 2, 10000) + arr = arr.astype('float64') # for pandas nulls + mask = np.random.randint(0, 2, 1000) arr[mask == 1] = np.nan pser = pd.Series(arr) diff --git a/python/cudf/cudf/utils/cudautils.py b/python/cudf/cudf/utils/cudautils.py index c9ad209bf11..128bf5a55d2 100755 --- a/python/cudf/cudf/utils/cudautils.py +++ b/python/cudf/cudf/utils/cudautils.py @@ -551,21 +551,55 @@ def gpu_round(in_col, out_col, decimal): round_val = 10 ** (-1.0 * decimal) if i < in_col.size: - if not in_col[i]: + current = in_col[i] + + newval = current // round_val * round_val + remainder = fmod(current, round_val) + + if remainder != 0 and remainder > (0.5 * round_val) and current > 0: + newval = newval + round_val + out_col[i] = newval + + elif ( + remainder != 0 + and abs(remainder) < (0.5 * round_val) + and current < 0 + ): + newval = newval + round_val + out_col[i] = newval + + else: + out_col[i] = newval + + +@cuda.jit +def gpu_round_masked(in_col, out_col, mask, decimal): + i = cuda.grid(1) + round_val = 10 ** (-1.0 * decimal) + + if i < in_col.size: + valid = mask_get(mask, i) + current = in_col[i] + + if not valid: out_col[i] = np.nan return - newval = in_col[i] // round_val * round_val - remainder = fmod(in_col[i], round_val) + if current == 0: + out_col[i] = 0 + return + + newval = current // round_val * round_val + remainder = fmod(current, round_val) - if remainder != 0 and remainder > (0.5 * round_val) and in_col[i] > 0: + if remainder != 0 and remainder > (0.5 * round_val) and current > 0: newval = newval + round_val out_col[i] = newval elif ( - remainder != 0 - and abs(remainder) < (0.5 * round_val) - and in_col[i] < 0 + remainder != 0 + and abs(remainder) < (0.5 * round_val) + and current < 0 ): newval = newval + round_val out_col[i] = newval @@ -574,10 +608,14 @@ def gpu_round(in_col, out_col, decimal): out_col[i] = newval -def apply_round(data, decimal): +def apply_round(data, mask, decimal): output_dary = rmm.device_array_like(data) if output_dary.size > 0: - gpu_round.forall(output_dary.size)(data, output_dary, decimal) + if mask is not None: + gpu_round_masked.forall(output_dary.size)(data, output_dary, mask, + decimal) + else: + gpu_round.forall(output_dary.size)(data, output_dary, decimal) return output_dary From 516630be1abf5b764827c0a56dbf1c7079b5ccce Mon Sep 17 00:00:00 2001 From: Nick Becker Date: Fri, 23 Aug 2019 12:54:49 -0400 Subject: [PATCH 21/64] change range on a test to -100, 100 --- python/cudf/cudf/tests/test_dataframe.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index 0cba974bfdd..43b7e7a1266 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -2541,7 +2541,7 @@ def test_ndim(): @pytest.mark.parametrize( "arr", [ - np.random.normal(0, 100, 1000), + np.random.normal(-100, 100, 1000), np.random.randint(-50, 50, 1000), np.zeros(1000), np.repeat(np.nan, 1000), From dfd650124ae9b495b8b201f6df4c1c93a13d29e5 Mon Sep 17 00:00:00 2001 From: Nick Becker Date: Fri, 23 Aug 2019 13:02:26 -0400 Subject: [PATCH 22/64] enforce decimals >= 0 --- python/cudf/cudf/core/column/numerical.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py index fdb61153c78..eccfc8b528f 100644 --- a/python/cudf/cudf/core/column/numerical.py +++ b/python/cudf/cudf/core/column/numerical.py @@ -254,6 +254,10 @@ def sum_of_squares(self, dtype=None): return libcudf.reduce.reduce("sum_of_squares", self, dtype=dtype) def round(self, decimals=0): + if decimals < 0: + msg = "Decimal values < 0 are not yet supported." + raise NotImplementedError(msg) + mask_dary = None if self.has_null_mask: mask_dary = self.nullmask.mem From c4f232f16bf4a2e0c3b987b64d4b36b8a1f35667 Mon Sep 17 00:00:00 2001 From: Nick Becker Date: Fri, 23 Aug 2019 13:05:04 -0400 Subject: [PATCH 23/64] black formatting --- python/cudf/cudf/core/column/numerical.py | 8 +++++--- python/cudf/cudf/tests/test_dataframe.py | 7 ++++--- python/cudf/cudf/utils/cudautils.py | 17 +++++++++-------- 3 files changed, 18 insertions(+), 14 deletions(-) diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py index eccfc8b528f..a479537d48b 100644 --- a/python/cudf/cudf/core/column/numerical.py +++ b/python/cudf/cudf/core/column/numerical.py @@ -257,13 +257,15 @@ def round(self, decimals=0): if decimals < 0: msg = "Decimal values < 0 are not yet supported." raise NotImplementedError(msg) - + mask_dary = None if self.has_null_mask: mask_dary = self.nullmask.mem data = Buffer( - cudautils.apply_round(self.astype('float').data.mem, mask_dary, - decimals)) + cudautils.apply_round( + self.astype("float").data.mem, mask_dary, decimals + ) + ) return self.replace(data=data) def applymap(self, udf, out_dtype=None): diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index 43b7e7a1266..efeefbad8e5 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -2540,12 +2540,13 @@ def test_ndim(): @pytest.mark.parametrize( - "arr", [ + "arr", + [ np.random.normal(-100, 100, 1000), np.random.randint(-50, 50, 1000), np.zeros(1000), np.repeat(np.nan, 1000), - ] + ], ) @pytest.mark.parametrize("decimal", range(0, 10)) def test_round(arr, decimal): @@ -2558,7 +2559,7 @@ def test_round(arr, decimal): ) # with nulls, maintaining existing null mask - arr = arr.astype('float64') # for pandas nulls + arr = arr.astype("float64") # for pandas nulls mask = np.random.randint(0, 2, 1000) arr[mask == 1] = np.nan diff --git a/python/cudf/cudf/utils/cudautils.py b/python/cudf/cudf/utils/cudautils.py index 128bf5a55d2..0dea9306f49 100755 --- a/python/cudf/cudf/utils/cudautils.py +++ b/python/cudf/cudf/utils/cudautils.py @@ -561,9 +561,9 @@ def gpu_round(in_col, out_col, decimal): out_col[i] = newval elif ( - remainder != 0 - and abs(remainder) < (0.5 * round_val) - and current < 0 + remainder != 0 + and abs(remainder) < (0.5 * round_val) + and current < 0 ): newval = newval + round_val out_col[i] = newval @@ -597,9 +597,9 @@ def gpu_round_masked(in_col, out_col, mask, decimal): out_col[i] = newval elif ( - remainder != 0 - and abs(remainder) < (0.5 * round_val) - and current < 0 + remainder != 0 + and abs(remainder) < (0.5 * round_val) + and current < 0 ): newval = newval + round_val out_col[i] = newval @@ -612,8 +612,9 @@ def apply_round(data, mask, decimal): output_dary = rmm.device_array_like(data) if output_dary.size > 0: if mask is not None: - gpu_round_masked.forall(output_dary.size)(data, output_dary, mask, - decimal) + gpu_round_masked.forall(output_dary.size)( + data, output_dary, mask, decimal + ) else: gpu_round.forall(output_dary.size)(data, output_dary, decimal) return output_dary From b1c164051a9d114d249ccc388dc6eac0c680b66a Mon Sep 17 00:00:00 2001 From: Nick Becker Date: Fri, 23 Aug 2019 13:06:49 -0400 Subject: [PATCH 24/64] revert to old in_col style --- python/cudf/cudf/utils/cudautils.py | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/python/cudf/cudf/utils/cudautils.py b/python/cudf/cudf/utils/cudautils.py index 0dea9306f49..d1773b06dbb 100755 --- a/python/cudf/cudf/utils/cudautils.py +++ b/python/cudf/cudf/utils/cudautils.py @@ -551,19 +551,19 @@ def gpu_round(in_col, out_col, decimal): round_val = 10 ** (-1.0 * decimal) if i < in_col.size: - current = in_col[i] + # current = in_col[i] - newval = current // round_val * round_val - remainder = fmod(current, round_val) + newval = in_col[i] // round_val * round_val + remainder = fmod(in_col[i], round_val) - if remainder != 0 and remainder > (0.5 * round_val) and current > 0: + if remainder != 0 and remainder > (0.5 * round_val) and in_col[i] > 0: newval = newval + round_val out_col[i] = newval elif ( remainder != 0 and abs(remainder) < (0.5 * round_val) - and current < 0 + and in_col[i] < 0 ): newval = newval + round_val out_col[i] = newval @@ -579,27 +579,27 @@ def gpu_round_masked(in_col, out_col, mask, decimal): if i < in_col.size: valid = mask_get(mask, i) - current = in_col[i] + # current = in_col[i] if not valid: out_col[i] = np.nan return - if current == 0: + if in_col[i] == 0: out_col[i] = 0 return - newval = current // round_val * round_val - remainder = fmod(current, round_val) + newval = in_col[i] // round_val * round_val + remainder = fmod(in_col[i], round_val) - if remainder != 0 and remainder > (0.5 * round_val) and current > 0: + if remainder != 0 and remainder > (0.5 * round_val) and in_col[i] > 0: newval = newval + round_val out_col[i] = newval elif ( remainder != 0 and abs(remainder) < (0.5 * round_val) - and current < 0 + and in_col[i] < 0 ): newval = newval + round_val out_col[i] = newval From c8eb6e3c307aadfc1d40b167bbf05b65af28ee26 Mon Sep 17 00:00:00 2001 From: Nick Becker Date: Fri, 23 Aug 2019 13:08:10 -0400 Subject: [PATCH 25/64] revert back to current reference --- python/cudf/cudf/utils/cudautils.py | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/python/cudf/cudf/utils/cudautils.py b/python/cudf/cudf/utils/cudautils.py index d1773b06dbb..0dea9306f49 100755 --- a/python/cudf/cudf/utils/cudautils.py +++ b/python/cudf/cudf/utils/cudautils.py @@ -551,19 +551,19 @@ def gpu_round(in_col, out_col, decimal): round_val = 10 ** (-1.0 * decimal) if i < in_col.size: - # current = in_col[i] + current = in_col[i] - newval = in_col[i] // round_val * round_val - remainder = fmod(in_col[i], round_val) + newval = current // round_val * round_val + remainder = fmod(current, round_val) - if remainder != 0 and remainder > (0.5 * round_val) and in_col[i] > 0: + if remainder != 0 and remainder > (0.5 * round_val) and current > 0: newval = newval + round_val out_col[i] = newval elif ( remainder != 0 and abs(remainder) < (0.5 * round_val) - and in_col[i] < 0 + and current < 0 ): newval = newval + round_val out_col[i] = newval @@ -579,27 +579,27 @@ def gpu_round_masked(in_col, out_col, mask, decimal): if i < in_col.size: valid = mask_get(mask, i) - # current = in_col[i] + current = in_col[i] if not valid: out_col[i] = np.nan return - if in_col[i] == 0: + if current == 0: out_col[i] = 0 return - newval = in_col[i] // round_val * round_val - remainder = fmod(in_col[i], round_val) + newval = current // round_val * round_val + remainder = fmod(current, round_val) - if remainder != 0 and remainder > (0.5 * round_val) and in_col[i] > 0: + if remainder != 0 and remainder > (0.5 * round_val) and current > 0: newval = newval + round_val out_col[i] = newval elif ( remainder != 0 and abs(remainder) < (0.5 * round_val) - and in_col[i] < 0 + and current < 0 ): newval = newval + round_val out_col[i] = newval From 5c2a627264ccc0ba356bc31c7d2cbe3ef5f95faa Mon Sep 17 00:00:00 2001 From: Nick Becker Date: Fri, 23 Aug 2019 13:11:45 -0400 Subject: [PATCH 26/64] changelog --- CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index f903f1b1c87..3e1f9af1ca0 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -32,7 +32,7 @@ - PR #2658 Fix astype() for null categorical columns - PR #2660 fix column string category and timeunit concat in the java API - PR #2664 ORC reader: fix `skip_rows` larger than first stripe - +- PR #2672 Fix null and integer handling in round # cuDF 0.9.0 (Date TBD) From f82c2c4acbfa5a80713de914581e76396e772f8a Mon Sep 17 00:00:00 2001 From: Nick Becker Date: Fri, 23 Aug 2019 13:26:22 -0400 Subject: [PATCH 27/64] changelog (remove extra PR line) --- CHANGELOG.md | 1 - 1 file changed, 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index ef92a3493ff..b604448b2e1 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -32,7 +32,6 @@ - PR #2658 Fix astype() for null categorical columns - PR #2660 fix column string category and timeunit concat in the java API - PR #2664 ORC reader: fix `skip_rows` larger than first stripe -- PR #2672 Fix null and integer handling in round - PR #2654 Allow Java gdfOrderBy to work with string categories - PR #2672 Fix null and integer handling in round From 3c9d9f0637ceee3c717248077b1b07102db6ed22 Mon Sep 17 00:00:00 2001 From: Ayush Dattagupta Date: Fri, 23 Aug 2019 10:28:36 -0700 Subject: [PATCH 28/64] Map np.longlong to pa.int64 --- python/cudf/cudf/utils/dtypes.py | 1 + 1 file changed, 1 insertion(+) diff --git a/python/cudf/cudf/utils/dtypes.py b/python/cudf/cudf/utils/dtypes.py index a516b3128b9..8914cdec28e 100644 --- a/python/cudf/cudf/utils/dtypes.py +++ b/python/cudf/cudf/utils/dtypes.py @@ -11,6 +11,7 @@ np.float64: pa.float64(), np.float32: pa.float32(), np.int64: pa.int64(), + np.longlong: pa.int64(), np.int32: pa.int32(), np.int16: pa.int16(), np.int8: pa.int8(), From 9387f1ac3d45430bdc7c30baf5b35a0cf20d2b48 Mon Sep 17 00:00:00 2001 From: Nick Becker Date: Fri, 23 Aug 2019 13:36:52 -0400 Subject: [PATCH 29/64] update tests and formatting --- python/cudf/cudf/tests/test_dataframe.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index efeefbad8e5..8d6a3f22844 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -2540,13 +2540,13 @@ def test_ndim(): @pytest.mark.parametrize( - "arr", - [ + "arr", [ np.random.normal(-100, 100, 1000), np.random.randint(-50, 50, 1000), - np.zeros(1000), - np.repeat(np.nan, 1000), - ], + np.zeros(100), + np.repeat(np.nan, 100), + np.array([1.123, 2.343, np.nan, 0.0]) + ] ) @pytest.mark.parametrize("decimal", range(0, 10)) def test_round(arr, decimal): @@ -2559,8 +2559,8 @@ def test_round(arr, decimal): ) # with nulls, maintaining existing null mask - arr = arr.astype("float64") # for pandas nulls - mask = np.random.randint(0, 2, 1000) + arr = arr.astype('float64') # for pandas nulls + mask = np.random.randint(0, 2, arr.shape[0]) arr[mask == 1] = np.nan pser = pd.Series(arr) From c39906ce578299e1c3e82fbdeb4db551eeeb4f43 Mon Sep 17 00:00:00 2001 From: Ayush Dattagupta Date: Fri, 23 Aug 2019 10:48:54 -0700 Subject: [PATCH 30/64] Add mapping of np.longlong to GDF_INT64 --- python/cudf/cudf/_lib/cudf.pyx | 1 + 1 file changed, 1 insertion(+) diff --git a/python/cudf/cudf/_lib/cudf.pyx b/python/cudf/cudf/_lib/cudf.pyx index e899a706edd..4e0e0e5a516 100644 --- a/python/cudf/cudf/_lib/cudf.pyx +++ b/python/cudf/cudf/_lib/cudf.pyx @@ -27,6 +27,7 @@ dtypes = { np.float64: GDF_FLOAT64, np.float32: GDF_FLOAT32, np.int64: GDF_INT64, + np.longlong: GDF_INT64, np.int32: GDF_INT32, np.int16: GDF_INT16, np.int8: GDF_INT8, From f291ac9661513752daafb552e24a47e8d32a5fbc Mon Sep 17 00:00:00 2001 From: Nick Becker Date: Fri, 23 Aug 2019 13:53:58 -0400 Subject: [PATCH 31/64] style --- python/cudf/cudf/tests/test_dataframe.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index 8d6a3f22844..df0302541c6 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -2540,13 +2540,14 @@ def test_ndim(): @pytest.mark.parametrize( - "arr", [ + "arr", + [ np.random.normal(-100, 100, 1000), np.random.randint(-50, 50, 1000), np.zeros(100), np.repeat(np.nan, 100), - np.array([1.123, 2.343, np.nan, 0.0]) - ] + np.array([1.123, 2.343, np.nan, 0.0]), + ], ) @pytest.mark.parametrize("decimal", range(0, 10)) def test_round(arr, decimal): @@ -2559,7 +2560,7 @@ def test_round(arr, decimal): ) # with nulls, maintaining existing null mask - arr = arr.astype('float64') # for pandas nulls + arr = arr.astype("float64") # for pandas nulls mask = np.random.randint(0, 2, arr.shape[0]) arr[mask == 1] = np.nan From ede2b0c82ccb306b92b70c4e28e2e9eb03630849 Mon Sep 17 00:00:00 2001 From: rgsl888prabhu Date: Fri, 23 Aug 2019 12:55:12 -0500 Subject: [PATCH 32/64] Test cases --- python/cudf/cudf/dataframe/categorical.py | 3 +- python/cudf/cudf/dataframe/column.py | 4 +- python/cudf/cudf/dataframe/datetime.py | 3 +- python/cudf/cudf/dataframe/index.py | 14 +--- python/cudf/cudf/dataframe/numerical.py | 8 ++- python/cudf/cudf/dataframe/series.py | 4 +- python/cudf/cudf/dataframe/string.py | 7 +- python/cudf/cudf/tests/test_contains.py | 82 +++++++++++++++++++++-- 8 files changed, 92 insertions(+), 33 deletions(-) diff --git a/python/cudf/cudf/dataframe/categorical.py b/python/cudf/cudf/dataframe/categorical.py index 73ba34d0fa2..f252e0a4898 100644 --- a/python/cudf/cudf/dataframe/categorical.py +++ b/python/cudf/cudf/dataframe/categorical.py @@ -240,8 +240,7 @@ def __init__(self, **kwargs): self._ordered = ordered def __contains__(self, item): - print ("In categorical") - return (self._encode(item) in self.as_numerical) + return self._encode(item) in self.as_numerical def serialize(self): header, frames = super(CategoricalColumn, self).serialize() diff --git a/python/cudf/cudf/dataframe/column.py b/python/cudf/cudf/dataframe/column.py index 627df311eb9..9d81d68e4bb 100644 --- a/python/cudf/cudf/dataframe/column.py +++ b/python/cudf/cudf/dataframe/column.py @@ -200,9 +200,7 @@ def __init__(self, data, mask=None, null_count=None, name=None): self._update_null_count(null_count) def __contains__(self, item): - print ("In columns") - print ("Type of data", type(self._data)) - return item in columnops.as_column(self._data) + return item in columnops.as_column(self._data) def equals(self, other): if self is other: diff --git a/python/cudf/cudf/dataframe/datetime.py b/python/cudf/cudf/dataframe/datetime.py index bea35ca1ca3..7adad50e5a6 100644 --- a/python/cudf/cudf/dataframe/datetime.py +++ b/python/cudf/cudf/dataframe/datetime.py @@ -50,11 +50,10 @@ def __init__(self, **kwargs): self._time_unit, _ = np.datetime_data(self.dtype) def __contains__(self, item): - print ("RGSL : In date and time") try: item = pd.to_datetime(item) item = columnops.as_column(item).as_numerical[0] - return (item in self.as_numerical) + return item in self.as_numerical except: return False diff --git a/python/cudf/cudf/dataframe/index.py b/python/cudf/cudf/dataframe/index.py index 5dbd59f340f..8defb370d20 100644 --- a/python/cudf/cudf/dataframe/index.py +++ b/python/cudf/cudf/dataframe/index.py @@ -1,6 +1,5 @@ # Copyright (c) 2018, NVIDIA CORPORATION. -from __future__ import division, print_function import pickle from copy import copy, deepcopy @@ -44,9 +43,7 @@ def serialize(self): return header, frames def __contains__(self, item): - print ("In index") - print (type(self._values)) - return (item in self._values) + return item in self._values @classmethod def deserialize(cls, header, frames): @@ -387,8 +384,7 @@ def __init__(self, start, stop=None, name=None): self._cached_values = None def __contains__(self, item): - print ("In range index") - return (item in self._values) + return item in self._values def copy(self, deep=True): if deep: @@ -614,8 +610,7 @@ def __init__(self, values, **kwargs): assert isinstance(values, columnops.TypedColumnBase), type(values) def __contains__(self, item): - print ("In genindex") - return (item in self._values) + return item in self._values def copy(self, deep=True): if deep: @@ -764,7 +759,6 @@ def __init__(self, values, **kwargs): assert self._values.null_count == 0 def __contains__(self, item): - print ("DatetimeIndex") return item in self._values @property @@ -846,7 +840,6 @@ def __init__(self, values, **kwargs): assert self._values.null_count == 0 def __contains__(self, item): - print ("Categorical index") return item in self._values @property @@ -885,7 +878,6 @@ def __init__(self, values, **kwargs): assert self._values.null_count == 0 def __contains__(self, item): - print ("String index") return item in self._values def to_pandas(self): diff --git a/python/cudf/cudf/dataframe/numerical.py b/python/cudf/cudf/dataframe/numerical.py index 297da7f9e29..c4a8b39b7e0 100644 --- a/python/cudf/cudf/dataframe/numerical.py +++ b/python/cudf/cudf/dataframe/numerical.py @@ -48,10 +48,14 @@ def __contains__(self, item): """ Returns True if column contains item, else False. """ - print ("In numerical") item_found = False try: - if cudautils.find_first(self.astype('float_').data.mem, float(item)) != -1: + if ( + cudautils.find_first( + self.astype("float_").data.mem, float(item) + ) + != -1 + ): item_found = True except: "Nothing to be done" diff --git a/python/cudf/cudf/dataframe/series.py b/python/cudf/cudf/dataframe/series.py index 613b97ee9f9..20fa4fc0f04 100644 --- a/python/cudf/cudf/dataframe/series.py +++ b/python/cudf/cudf/dataframe/series.py @@ -128,9 +128,7 @@ def __init__( self._index = RangeIndex(len(data)) if index is None else index self._name = name - def __contains__ (self, item): - print ("In series") - print ("Type of column", type(self._column)) + def __contains__(self, item): return item in self._column @classmethod diff --git a/python/cudf/cudf/dataframe/string.py b/python/cudf/cudf/dataframe/string.py index 0f50626d1bf..113b2d41ac6 100644 --- a/python/cudf/cudf/dataframe/string.py +++ b/python/cudf/cudf/dataframe/string.py @@ -464,19 +464,14 @@ def __init__(self, data, null_count=None, name=None, **kwargs): self._indices = None def __contains__(self, item): - print ("In string contains") found = False try: - if (True in self.str().contains(f"^{item}$")._column): - print (self.str().contains(f"^{item}$")._column) - print ("found") + if True in self.str().contains(f"^{item}$")._column: found = True except: "column doesn't have the item" - print ("didn't find") return found - def __reduce__(self): cpumem = self.to_arrow() diff --git a/python/cudf/cudf/tests/test_contains.py b/python/cudf/cudf/tests/test_contains.py index fb613124602..d2ecb298b23 100644 --- a/python/cudf/cudf/tests/test_contains.py +++ b/python/cudf/cudf/tests/test_contains.py @@ -1,10 +1,84 @@ import pytest +from datetime import datetime as dt import pandas as pd -from import cudf.dataframe import columnops -from import cudf.dataframe.index import as_index -from import cudf.dataframe.series import Series -from import cudf.dataframe.column import Column +from cudf.dataframe import columnops +from cudf.dataframe.index import as_index, RangeIndex +from cudf.dataframe.series import Series +from cudf.dataframe.column import Column +from cudf.tests.utils import assert_eq + +def cudf_date_series(start, stop, freq): + return Series(pd.date_range(start, stop, freq=freq, name="times")) + + +def cudf_num_series(start, stop, step=1): + return Series(range(start, stop, step)) + + +def get_categorical_series(): + return Series( + pd.Categorical(["a", "a", "b", "c", "a"], categories=["a", "b", "c"]) + ) + + +def get_string_series(): + return Series(["a", "a", "b", "c", "a"]) + + +## If the type being searched is different from type of series, exceptions +## are thrown well within the python code, and needs to be handled. +## Some of the test cases check this scenario. Example : String Vs Numerical +testdata_all = [ + ( + cudf_date_series("20010101", "20020215", freq="400h"), + dt.strptime("2001-01-01", "%Y-%m-%d"), + True, + ), + ( + cudf_date_series("20010101", "20020215", freq="400h"), + dt.strptime("2000-01-01", "%Y-%m-%d"), + False, + ), + (cudf_date_series("20010101", "20020215", freq="400h"), 20000101, False), + (get_categorical_series(), "c", True), + (get_categorical_series(), "d", False), + (get_categorical_series(), 1, False), + (get_string_series(), "c", True), + (get_string_series(), "d", False), + (get_string_series(), 97, False), + (cudf_num_series(0, 100, 5), 60, True), + (cudf_num_series(0, 100, 5), 71, False), + (cudf_num_series(0, 100, 5), "a", False), +] + +testdata_num = [ + (cudf_num_series(0, 100, 5), 60, True), + (cudf_num_series(0, 100, 5), 71, False), + (cudf_num_series(0, 100, 5), "a", False), +] + + +@pytest.mark.parametrize("values, item, expected", testdata_all) +def test_series_contains(values, item, expected): + assert_eq(expected, item in values) + + +@pytest.mark.parametrize("values, item, expected", testdata_all) +def test_index_contains(values, item, expected): + index = as_index(values) + assert_eq(expected, item in index) + + +@pytest.mark.parametrize("values, item, expected", testdata_num) +def test_column_contains(values, item, expected): + col = Column(values.data) + assert_eq(expected, item in col) + + +def test_rangeindex_contains(): + assert_eq(True, 9 in RangeIndex(start=0, stop=10, name="Index")) + assert_eq(False, 10 in RangeIndex(start=0, stop=10, name="Index")) From 12ee21e4220ec3a20dfd9a85ae13f73991dce701 Mon Sep 17 00:00:00 2001 From: rgsl888prabhu Date: Fri, 23 Aug 2019 13:09:41 -0500 Subject: [PATCH 33/64] change log --- CHANGELOG.md | 1 + python/cudf/cudf/dataframe/index.py | 1 + 2 files changed, 2 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 51c346d9de2..4b46b1a2d09 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,7 @@ - PR #2522 Add Java bindings for NVStrings backed upper and lower case mutators - PR #2607 Add Java bindings for parsing JSON - PR #2629 Add dropna= parameter to groupby +- PR #2674 Add __contains__ for Index/Series/Column ## Improvements diff --git a/python/cudf/cudf/dataframe/index.py b/python/cudf/cudf/dataframe/index.py index 8defb370d20..c9e357162f8 100644 --- a/python/cudf/cudf/dataframe/index.py +++ b/python/cudf/cudf/dataframe/index.py @@ -1,5 +1,6 @@ # Copyright (c) 2018, NVIDIA CORPORATION. +from __future__ import division, print_function import pickle from copy import copy, deepcopy From 1b0f5443e16a2336ea0f92d421ae4da9d1fb5a7b Mon Sep 17 00:00:00 2001 From: Nick Becker Date: Fri, 23 Aug 2019 14:31:31 -0400 Subject: [PATCH 34/64] add xfailing test for better coverage --- python/cudf/cudf/tests/test_dataframe.py | 23 ++++++++++++++++++++++- 1 file changed, 22 insertions(+), 1 deletion(-) diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index df0302541c6..05cdc5d4302 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -2549,7 +2549,28 @@ def test_ndim(): np.array([1.123, 2.343, np.nan, 0.0]), ], ) -@pytest.mark.parametrize("decimal", range(0, 10)) +@pytest.mark.parametrize( + "decimal", + [ + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + pytest.param( + -1, + marks=[ + pytest.mark.xfail(reason="NotImplementedError: decimals < 0") + ], + ), + ], +) def test_round(arr, decimal): pser = pd.Series(arr) ser = Series(arr) From 470eda1763560e8798b575e9554f4dcb0247972a Mon Sep 17 00:00:00 2001 From: rgsl888prabhu Date: Fri, 23 Aug 2019 13:37:47 -0500 Subject: [PATCH 35/64] style changes --- python/cudf/cudf/dataframe/column.py | 2 +- python/cudf/cudf/dataframe/datetime.py | 2 +- python/cudf/cudf/dataframe/numerical.py | 2 +- python/cudf/cudf/dataframe/string.py | 2 +- python/cudf/cudf/tests/test_contains.py | 16 +++++++--------- 5 files changed, 11 insertions(+), 13 deletions(-) diff --git a/python/cudf/cudf/dataframe/column.py b/python/cudf/cudf/dataframe/column.py index 9d81d68e4bb..9bdd5e2b002 100644 --- a/python/cudf/cudf/dataframe/column.py +++ b/python/cudf/cudf/dataframe/column.py @@ -16,10 +16,10 @@ import cudf.bindings.quantile as cpp_quantile from cudf.bindings.concat import _column_concat from cudf.bindings.cudf_cpp import column_view_pointer, count_nonzero_mask +from cudf.dataframe import columnops from cudf.dataframe.buffer import Buffer from cudf.utils import cudautils, ioutils, utils from cudf.utils.dtypes import is_categorical_dtype -from cudf.dataframe import columnops class Column(object): diff --git a/python/cudf/cudf/dataframe/datetime.py b/python/cudf/cudf/dataframe/datetime.py index 7adad50e5a6..16fdc2a9d69 100644 --- a/python/cudf/cudf/dataframe/datetime.py +++ b/python/cudf/cudf/dataframe/datetime.py @@ -54,7 +54,7 @@ def __contains__(self, item): item = pd.to_datetime(item) item = columnops.as_column(item).as_numerical[0] return item in self.as_numerical - except: + except Exception: return False def serialize(self): diff --git a/python/cudf/cudf/dataframe/numerical.py b/python/cudf/cudf/dataframe/numerical.py index c4a8b39b7e0..4832c1085f4 100644 --- a/python/cudf/cudf/dataframe/numerical.py +++ b/python/cudf/cudf/dataframe/numerical.py @@ -57,7 +57,7 @@ def __contains__(self, item): != -1 ): item_found = True - except: + except Exception: "Nothing to be done" return item_found diff --git a/python/cudf/cudf/dataframe/string.py b/python/cudf/cudf/dataframe/string.py index 113b2d41ac6..b2d04610df5 100644 --- a/python/cudf/cudf/dataframe/string.py +++ b/python/cudf/cudf/dataframe/string.py @@ -468,7 +468,7 @@ def __contains__(self, item): try: if True in self.str().contains(f"^{item}$")._column: found = True - except: + except Exception: "column doesn't have the item" return found diff --git a/python/cudf/cudf/tests/test_contains.py b/python/cudf/cudf/tests/test_contains.py index d2ecb298b23..5c0b115f26b 100644 --- a/python/cudf/cudf/tests/test_contains.py +++ b/python/cudf/cudf/tests/test_contains.py @@ -1,13 +1,11 @@ -import pytest - from datetime import datetime as dt + import pandas as pd +import pytest -from cudf.dataframe import columnops -from cudf.dataframe.index import as_index, RangeIndex -from cudf.dataframe.series import Series from cudf.dataframe.column import Column - +from cudf.dataframe.index import RangeIndex, as_index +from cudf.dataframe.series import Series from cudf.tests.utils import assert_eq @@ -29,9 +27,9 @@ def get_string_series(): return Series(["a", "a", "b", "c", "a"]) -## If the type being searched is different from type of series, exceptions -## are thrown well within the python code, and needs to be handled. -## Some of the test cases check this scenario. Example : String Vs Numerical +# If the type being searched is different from type of series, exceptions +# are thrown well within the python code, and needs to be handled. +# Some of the test cases check this scenario. Example : String Vs Numerical testdata_all = [ ( cudf_date_series("20010101", "20020215", freq="400h"), From 938ba1b50ae1d34e6d724faf09d59955542a647e Mon Sep 17 00:00:00 2001 From: rgsl888prabhu Date: Fri, 23 Aug 2019 14:00:15 -0500 Subject: [PATCH 36/64] additional test cases --- python/cudf/cudf/tests/test_contains.py | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/python/cudf/cudf/tests/test_contains.py b/python/cudf/cudf/tests/test_contains.py index 5c0b115f26b..a3e7d96627a 100644 --- a/python/cudf/cudf/tests/test_contains.py +++ b/python/cudf/cudf/tests/test_contains.py @@ -19,12 +19,14 @@ def cudf_num_series(start, stop, step=1): def get_categorical_series(): return Series( - pd.Categorical(["a", "a", "b", "c", "a"], categories=["a", "b", "c"]) + pd.Categorical( + ["ab", "ac", "cd", "ab", "cd"], categories=["ab", "ac", "cd"] + ) ) def get_string_series(): - return Series(["a", "a", "b", "c", "a"]) + return Series(["ab", "ac", "ba", "cc", "ad"]) # If the type being searched is different from type of series, exceptions @@ -42,11 +44,14 @@ def get_string_series(): False, ), (cudf_date_series("20010101", "20020215", freq="400h"), 20000101, False), - (get_categorical_series(), "c", True), - (get_categorical_series(), "d", False), + (get_categorical_series(), "cd", True), + (get_categorical_series(), "dc", False), + (get_categorical_series(), "c", False), + (get_categorical_series(), "c", False), (get_categorical_series(), 1, False), - (get_string_series(), "c", True), - (get_string_series(), "d", False), + (get_string_series(), "ac", True), + (get_string_series(), "ca", False), + (get_string_series(), "c", False), (get_string_series(), 97, False), (cudf_num_series(0, 100, 5), 60, True), (cudf_num_series(0, 100, 5), 71, False), From 30c410a8856b9cbeb059ac2b734a2ac891b0725d Mon Sep 17 00:00:00 2001 From: Ayush Dattagupta Date: Fri, 23 Aug 2019 12:06:08 -0700 Subject: [PATCH 37/64] Update some tests to handle np.longlong --- python/cudf/cudf/tests/test_binops.py | 3 ++- python/cudf/cudf/tests/test_dataframe.py | 4 +++- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/python/cudf/cudf/tests/test_binops.py b/python/cudf/cudf/tests/test_binops.py index cd2c19923b3..0c0f543b056 100644 --- a/python/cudf/cudf/tests/test_binops.py +++ b/python/cudf/cudf/tests/test_binops.py @@ -88,7 +88,7 @@ def test_series_binop_scalar(nelem, binop, obj_class): _bitwise_binops = [operator.and_, operator.or_, operator.xor] -_int_types = ["int8", "int16", "int32", "int64"] +_int_types = ["int8", "int16", "int32", "int64", "longlong"] @pytest.mark.parametrize("obj_class", ["Series", "Index"]) @@ -195,6 +195,7 @@ def test_series_compare(cmpop, obj_class, dtype): "float32", "float64", "datetime64[ms]", + "longlong", ], ) def test_series_compare_scalar(nelem, cmpop, obj_class, dtype): diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index 09ef66c9caf..307d5deecdf 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -1100,6 +1100,7 @@ def test_to_arrow_missing_categorical(): "int16", "int32", "int64", + "longlong", "float32", "float64", "datetime64[ms]", @@ -1133,7 +1134,8 @@ def test_from_scalar_typing(data_type): @pytest.mark.parametrize( - "data_type", ["int8", "int16", "int32", "int64", "float32", "float64"] + "data_type", + ["int8", "int16", "int32", "int64", "float32", "float64", "longlong"], ) def test_from_python_array(data_type): np_arr = np.random.randint(0, 100, 10).astype(data_type) From 9269ffdad1c5dc0efad4c92013c6a6fc28e36e51 Mon Sep 17 00:00:00 2001 From: Ayush Dattagupta Date: Fri, 23 Aug 2019 12:06:52 -0700 Subject: [PATCH 38/64] Add scalar conversion for np.longlong --- python/cudf/cudf/_lib/cudf.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cudf/cudf/_lib/cudf.pyx b/python/cudf/cudf/_lib/cudf.pyx index 4e0e0e5a516..ee30c8331bd 100644 --- a/python/cudf/cudf/_lib/cudf.pyx +++ b/python/cudf/cudf/_lib/cudf.pyx @@ -222,7 +222,7 @@ cdef set_scalar_value(gdf_scalar *scalar, val): scalar.data.fp64 = val elif val.dtype.type == np.float32: scalar.data.fp32 = val - elif val.dtype.type == np.int64: + elif val.dtype.type == np.int64 or val.dtype.type == np.longlong: scalar.data.si64 = val elif val.dtype.type == np.int32: scalar.data.si32 = val From c1fb496987be257598135d0a0094a156941ea54c Mon Sep 17 00:00:00 2001 From: Ayush Dattagupta Date: Fri, 23 Aug 2019 12:09:28 -0700 Subject: [PATCH 39/64] Doc update changelog --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index de6005f5d36..949aa2e6da4 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -15,6 +15,7 @@ - PR #2648 Cython/Python reorg - PR #2588 Update Series.append documentation - PR #2632 Replace dask-cudf set_index code with upstream +- PR #2673 Add support for np.longlong type ## Bug Fixes From ee5aaf99010b9853b3dfd6a7fd4c1f228c49cd92 Mon Sep 17 00:00:00 2001 From: rgsl888prabhu Date: Fri, 23 Aug 2019 17:49:41 -0500 Subject: [PATCH 40/64] Review changes --- python/cudf/cudf/dataframe/column.py | 3 +-- python/cudf/cudf/dataframe/datetime.py | 6 +++--- python/cudf/cudf/dataframe/index.py | 17 ++++------------- python/cudf/cudf/dataframe/numerical.py | 18 ++++++++---------- python/cudf/cudf/dataframe/series.py | 2 +- python/cudf/cudf/dataframe/string.py | 9 +-------- python/cudf/cudf/tests/test_contains.py | 15 +-------------- 7 files changed, 19 insertions(+), 51 deletions(-) diff --git a/python/cudf/cudf/dataframe/column.py b/python/cudf/cudf/dataframe/column.py index 9bdd5e2b002..d1bbcebc4e3 100644 --- a/python/cudf/cudf/dataframe/column.py +++ b/python/cudf/cudf/dataframe/column.py @@ -16,7 +16,6 @@ import cudf.bindings.quantile as cpp_quantile from cudf.bindings.concat import _column_concat from cudf.bindings.cudf_cpp import column_view_pointer, count_nonzero_mask -from cudf.dataframe import columnops from cudf.dataframe.buffer import Buffer from cudf.utils import cudautils, ioutils, utils from cudf.utils.dtypes import is_categorical_dtype @@ -200,7 +199,7 @@ def __init__(self, data, mask=None, null_count=None, name=None): self._update_null_count(null_count) def __contains__(self, item): - return item in columnops.as_column(self._data) + raise (NotImplementedError) def equals(self, other): if self is other: diff --git a/python/cudf/cudf/dataframe/datetime.py b/python/cudf/cudf/dataframe/datetime.py index 16fdc2a9d69..aab192c279c 100644 --- a/python/cudf/cudf/dataframe/datetime.py +++ b/python/cudf/cudf/dataframe/datetime.py @@ -50,12 +50,12 @@ def __init__(self, **kwargs): self._time_unit, _ = np.datetime_data(self.dtype) def __contains__(self, item): + # Handles improper item types try: - item = pd.to_datetime(item) - item = columnops.as_column(item).as_numerical[0] - return item in self.as_numerical + item = pd.to_datetime(item).to_datetime64() except Exception: return False + return item in self.as_numerical def serialize(self): header, frames = super(DatetimeColumn, self).serialize() diff --git a/python/cudf/cudf/dataframe/index.py b/python/cudf/cudf/dataframe/index.py index c9e357162f8..951b0483549 100644 --- a/python/cudf/cudf/dataframe/index.py +++ b/python/cudf/cudf/dataframe/index.py @@ -385,7 +385,10 @@ def __init__(self, start, stop=None, name=None): self._cached_values = None def __contains__(self, item): - return item in self._values + if self._start <= item < self._stop: + return True + else: + return False def copy(self, deep=True): if deep: @@ -610,9 +613,6 @@ def __init__(self, values, **kwargs): assert isinstance(values, columnops.TypedColumnBase), type(values) - def __contains__(self, item): - return item in self._values - def copy(self, deep=True): if deep: result = deepcopy(self) @@ -759,9 +759,6 @@ def __init__(self, values, **kwargs): super(DatetimeIndex, self).__init__(values, **kwargs) assert self._values.null_count == 0 - def __contains__(self, item): - return item in self._values - @property def year(self): return self.get_dt_field("year") @@ -840,9 +837,6 @@ def __init__(self, values, **kwargs): super(CategoricalIndex, self).__init__(values, **kwargs) assert self._values.null_count == 0 - def __contains__(self, item): - return item in self._values - @property def names(self): return [self._values.name] @@ -878,9 +872,6 @@ def __init__(self, values, **kwargs): super(StringIndex, self).__init__(values, **kwargs) assert self._values.null_count == 0 - def __contains__(self, item): - return item in self._values - def to_pandas(self): return pd.Index(self.values, name=self.name, dtype="object") diff --git a/python/cudf/cudf/dataframe/numerical.py b/python/cudf/cudf/dataframe/numerical.py index 4832c1085f4..2f46e8ab347 100644 --- a/python/cudf/cudf/dataframe/numerical.py +++ b/python/cudf/cudf/dataframe/numerical.py @@ -48,19 +48,17 @@ def __contains__(self, item): """ Returns True if column contains item, else False. """ - item_found = False + # Handles improper item types try: - if ( - cudautils.find_first( - self.astype("float_").data.mem, float(item) - ) - != -1 - ): - item_found = True + item = float(item) except Exception: - "Nothing to be done" + """It means that the item was not a numerical type""" + return False - return item_found + return ( + cudautils.find_first(self.astype("float_").data.mem, float(item)) + != -1 + ) def replace(self, **kwargs): if "data" in kwargs and "dtype" not in kwargs: diff --git a/python/cudf/cudf/dataframe/series.py b/python/cudf/cudf/dataframe/series.py index 20fa4fc0f04..56face9b735 100644 --- a/python/cudf/cudf/dataframe/series.py +++ b/python/cudf/cudf/dataframe/series.py @@ -129,7 +129,7 @@ def __init__( self._name = name def __contains__(self, item): - return item in self._column + return item in self._index @classmethod def from_pandas(cls, s, nan_as_null=True): diff --git a/python/cudf/cudf/dataframe/string.py b/python/cudf/cudf/dataframe/string.py index b2d04610df5..23907d5d112 100644 --- a/python/cudf/cudf/dataframe/string.py +++ b/python/cudf/cudf/dataframe/string.py @@ -464,14 +464,7 @@ def __init__(self, data, null_count=None, name=None, **kwargs): self._indices = None def __contains__(self, item): - found = False - try: - if True in self.str().contains(f"^{item}$")._column: - found = True - except Exception: - "column doesn't have the item" - - return found + return True in self.str().contains(f"^{item}$")._column def __reduce__(self): cpumem = self.to_arrow() diff --git a/python/cudf/cudf/tests/test_contains.py b/python/cudf/cudf/tests/test_contains.py index a3e7d96627a..4989e800a12 100644 --- a/python/cudf/cudf/tests/test_contains.py +++ b/python/cudf/cudf/tests/test_contains.py @@ -3,7 +3,6 @@ import pandas as pd import pytest -from cudf.dataframe.column import Column from cudf.dataframe.index import RangeIndex, as_index from cudf.dataframe.series import Series from cudf.tests.utils import assert_eq @@ -58,16 +57,10 @@ def get_string_series(): (cudf_num_series(0, 100, 5), "a", False), ] -testdata_num = [ - (cudf_num_series(0, 100, 5), 60, True), - (cudf_num_series(0, 100, 5), 71, False), - (cudf_num_series(0, 100, 5), "a", False), -] - @pytest.mark.parametrize("values, item, expected", testdata_all) def test_series_contains(values, item, expected): - assert_eq(expected, item in values) + assert_eq(expected, item in Series(index=values)) @pytest.mark.parametrize("values, item, expected", testdata_all) @@ -76,12 +69,6 @@ def test_index_contains(values, item, expected): assert_eq(expected, item in index) -@pytest.mark.parametrize("values, item, expected", testdata_num) -def test_column_contains(values, item, expected): - col = Column(values.data) - assert_eq(expected, item in col) - - def test_rangeindex_contains(): assert_eq(True, 9 in RangeIndex(start=0, stop=10, name="Index")) assert_eq(False, 10 in RangeIndex(start=0, stop=10, name="Index")) From 2b8bb7ecd1fc6b9f0e5d453a2c92a8474141ab10 Mon Sep 17 00:00:00 2001 From: rgsl888prabhu Date: Mon, 26 Aug 2019 15:55:26 -0500 Subject: [PATCH 41/64] multi-index issue and review changes --- python/cudf/cudf/dataframe/datetime.py | 4 ++-- python/cudf/cudf/dataframe/multiindex.py | 2 +- python/cudf/cudf/dataframe/numerical.py | 22 +++++++++++++--------- python/cudf/cudf/dataframe/series.py | 4 ++++ 4 files changed, 20 insertions(+), 12 deletions(-) diff --git a/python/cudf/cudf/dataframe/datetime.py b/python/cudf/cudf/dataframe/datetime.py index aab192c279c..cdc12daf218 100644 --- a/python/cudf/cudf/dataframe/datetime.py +++ b/python/cudf/cudf/dataframe/datetime.py @@ -52,10 +52,10 @@ def __init__(self, **kwargs): def __contains__(self, item): # Handles improper item types try: - item = pd.to_datetime(item).to_datetime64() + item = np.datetime64(item, self._time_unit) except Exception: return False - return item in self.as_numerical + return item.astype("int_") in self.as_numerical def serialize(self): header, frames = super(DatetimeColumn, self).serialize() diff --git a/python/cudf/cudf/dataframe/multiindex.py b/python/cudf/cudf/dataframe/multiindex.py index 15c26a686bd..eaa59288fa5 100644 --- a/python/cudf/cudf/dataframe/multiindex.py +++ b/python/cudf/cudf/dataframe/multiindex.py @@ -226,7 +226,7 @@ def _compute_validity_mask(self, index, row_tuple, max_length): for idx, row in enumerate(row_tuple): if row == slice(None): continue - if row not in index.levels[idx]: + if row not in index.levels[idx].column: raise KeyError(row) return result diff --git a/python/cudf/cudf/dataframe/numerical.py b/python/cudf/cudf/dataframe/numerical.py index 2f46e8ab347..5d7f29ea523 100644 --- a/python/cudf/cudf/dataframe/numerical.py +++ b/python/cudf/cudf/dataframe/numerical.py @@ -49,16 +49,20 @@ def __contains__(self, item): Returns True if column contains item, else False. """ # Handles improper item types - try: - item = float(item) - except Exception: - """It means that the item was not a numerical type""" + if np.can_cast(item, self.data.mem.dtype): + item = self.data.mem.dtype.type(item) + else: return False - - return ( - cudautils.find_first(self.astype("float_").data.mem, float(item)) - != -1 - ) + # Issue with cudautils with bool araray, always returns True. + if self.data.mem.dtype == np.bool: + return ( + cudautils.find_first( + self.astype("int_").data.mem, np.int_(item) + ) + != -1 + ) + else: + return cudautils.find_first(self.data.mem, item) != -1 def replace(self, **kwargs): if "data" in kwargs and "dtype" not in kwargs: diff --git a/python/cudf/cudf/dataframe/series.py b/python/cudf/cudf/dataframe/series.py index 56face9b735..2a3526112aa 100644 --- a/python/cudf/cudf/dataframe/series.py +++ b/python/cudf/cudf/dataframe/series.py @@ -144,6 +144,10 @@ def values(self): else: return self.data.mem.copy_to_host() + @property + def column(self): + return self._column + @classmethod def from_arrow(cls, s): return cls(s) From 058d465f3b0942ac6362bdd301cdce02157ea83f Mon Sep 17 00:00:00 2001 From: rgsl888prabhu Date: Tue, 27 Aug 2019 09:55:05 -0500 Subject: [PATCH 42/64] dask_cudf failuers --- python/cudf/cudf/dataframe/numerical.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/python/cudf/cudf/dataframe/numerical.py b/python/cudf/cudf/dataframe/numerical.py index 5d7f29ea523..47fa4bfdc8e 100644 --- a/python/cudf/cudf/dataframe/numerical.py +++ b/python/cudf/cudf/dataframe/numerical.py @@ -49,9 +49,13 @@ def __contains__(self, item): Returns True if column contains item, else False. """ # Handles improper item types - if np.can_cast(item, self.data.mem.dtype): - item = self.data.mem.dtype.type(item) - else: + # Fails if item is of type None, so the handler. + try: + if np.can_cast(item, self.data.mem.dtype): + item = self.data.mem.dtype.type(item) + else: + return False + except Exception: return False # Issue with cudautils with bool araray, always returns True. if self.data.mem.dtype == np.bool: From 6980a0fe60f1a12fc13b371a05eea2256964ac95 Mon Sep 17 00:00:00 2001 From: Nick Becker Date: Tue, 27 Aug 2019 12:27:24 -0400 Subject: [PATCH 43/64] simplify null handling logic and add tests --- python/cudf/cudf/tests/test_dataframe.py | 21 +++++++++++ python/cudf/cudf/utils/cudautils.py | 48 +++++++++++------------- 2 files changed, 43 insertions(+), 26 deletions(-) diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index 05cdc5d4302..9f5b9802a8d 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -2595,6 +2595,27 @@ def test_round(arr, decimal): np.array_equal(ser.nullmask.to_array(), result.nullmask.to_array()) +@pytest.mark.parametrize( + "series", + [ + Series([1.0, None, np.nan, 4.0], nan_as_null=False), + Series([1.24430, None, np.nan, 4.423530], nan_as_null=False), + Series([1.24430, np.nan, 4.423530], nan_as_null=False), + Series([-1.24430, np.nan, -4.423530], nan_as_null=False), + Series(np.repeat(np.nan, 100)), + ], +) +@pytest.mark.parametrize("decimal", [0, 1, 2, 3]) +def test_round_nan_as_null_false(series, decimal): + pser = series.to_pandas() + ser = Series(series) + result = ser.round(decimal) + expected = pser.round(decimal) + np.testing.assert_array_almost_equal( + result.to_pandas(), expected, decimal=10 + ) + + @pytest.mark.parametrize( "data", [ diff --git a/python/cudf/cudf/utils/cudautils.py b/python/cudf/cudf/utils/cudautils.py index 0dea9306f49..ec9d006db32 100755 --- a/python/cudf/cudf/utils/cudautils.py +++ b/python/cudf/cudf/utils/cudautils.py @@ -552,7 +552,6 @@ def gpu_round(in_col, out_col, decimal): if i < in_col.size: current = in_col[i] - newval = current // round_val * round_val remainder = fmod(current, round_val) @@ -579,33 +578,30 @@ def gpu_round_masked(in_col, out_col, mask, decimal): if i < in_col.size: valid = mask_get(mask, i) - current = in_col[i] - - if not valid: - out_col[i] = np.nan - return - - if current == 0: - out_col[i] = 0 - return - - newval = current // round_val * round_val - remainder = fmod(current, round_val) - - if remainder != 0 and remainder > (0.5 * round_val) and current > 0: - newval = newval + round_val - out_col[i] = newval - elif ( - remainder != 0 - and abs(remainder) < (0.5 * round_val) - and current < 0 - ): - newval = newval + round_val - out_col[i] = newval + if valid: + current = in_col[i] + newval = current // round_val * round_val + remainder = fmod(current, round_val) + + if ( + remainder != 0 + and remainder > (0.5 * round_val) + and current > 0 + ): + newval = newval + round_val + out_col[i] = newval + + elif ( + remainder != 0 + and abs(remainder) < (0.5 * round_val) + and current < 0 + ): + newval = newval + round_val + out_col[i] = newval - else: - out_col[i] = newval + else: + out_col[i] = newval def apply_round(data, mask, decimal): From 96db60b36cab7d152b6271dfd46b7074efdf7d62 Mon Sep 17 00:00:00 2001 From: ptaylor Date: Wed, 28 Aug 2019 11:18:35 -0700 Subject: [PATCH 44/64] ensure csv_reader sets gdf_dtype_extra_info time_unit to ms for datetime cols --- cpp/src/io/csv/csv_reader_impl.cu | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/cpp/src/io/csv/csv_reader_impl.cu b/cpp/src/io/csv/csv_reader_impl.cu index dfa70716974..08a3a7e4b48 100644 --- a/cpp/src/io/csv/csv_reader_impl.cu +++ b/cpp/src/io/csv/csv_reader_impl.cu @@ -630,8 +630,9 @@ table reader::Impl::read() std::vector columns; for (int col = 0, active_col = 0; col < num_actual_cols; ++col) { if (h_column_flags[col] & column_parse::enabled) { + auto time_unit = dtypes[active_col] == GDF_DATE64 ? TIME_UNIT_ms : TIME_UNIT_NONE; columns.emplace_back(num_records, dtypes[active_col], - gdf_dtype_extra_info{TIME_UNIT_NONE}, + gdf_dtype_extra_info{time_unit}, col_names[col]); CUDF_EXPECTS(columns.back().allocate() == GDF_SUCCESS, "Cannot allocate columns"); active_col++; From ebd3948c6671026ff585d53734489007ea2bc9c4 Mon Sep 17 00:00:00 2001 From: ptaylor Date: Wed, 28 Aug 2019 11:18:58 -0700 Subject: [PATCH 45/64] add csv_reader test to ensure time units are preserved and can be cast --- cpp/tests/io/csv/csv_test.cu | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/cpp/tests/io/csv/csv_test.cu b/cpp/tests/io/csv/csv_test.cu index acc2af52ea5..16f4bc7b881 100644 --- a/cpp/tests/io/csv/csv_test.cu +++ b/cpp/tests/io/csv/csv_test.cu @@ -306,6 +306,37 @@ TEST(gdf_csv_test, Dates) } } +TEST(gdf_csv_test, Timestamps) +{ + const std::string fname = temp_env->get_temp_dir()+"CsvTimestamps.csv"; + + std::ofstream outfile(fname, std::ofstream::out); + outfile << "true,334.0,2014-02-01T12:30:23.000-06:00\n"; + outfile.close(); + ASSERT_TRUE( checkFile(fname) ); + + { + cudf::csv_read_arg args(cudf::source_info{fname}); + args.names = { "A" }; + args.dtype = { "timestamp" }; + args.dayfirst = true; + args.header = -1; + const auto df = cudf::read_csv(args); + + EXPECT_EQ( df.num_columns(), static_cast(args.names.size()) ); + ASSERT_EQ( df.get_column(0)->dtype, GDF_TIMESTAMP ); + ASSERT_EQ( df.get_column(0)->gdf_dtype_extra_info.time_unit, TIME_UNIT_ms ); + auto ACol = gdf_host_column(df.get_column(0)); + std::cerr << "Time Unit= " << df.get_column(0)->dtype_info.time_unit; + + gdf_column output; + gdf_dtype_extra_info info{}; + info.time_unit = TIME_UNIT_us; + output = cudf::cast(*df.get_column(0), GDF_TIMESTAMP, info); + + } +} + TEST(gdf_csv_test, FloatingPoint) { const std::string fname = temp_env->get_temp_dir()+"CsvFloatingPoint.csv"; From 3c5c3e9a9f1dec1adef9aa541b8ae360f7b5abcd Mon Sep 17 00:00:00 2001 From: ptaylor Date: Wed, 28 Aug 2019 11:25:27 -0700 Subject: [PATCH 46/64] changelog --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index c3464dee01f..668e159f782 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -37,6 +37,7 @@ - PR #2669 AVRO reader: fix non-deterministic output - PR #2668 Update Java bindings to specify timestamp units for ORC and Parquet readers - PR #2679 AVRO reader: fix cuda errors when decoding compressed streams +- PR #2697 Ensure csv reader sets datetime column time units # cuDF 0.9.0 (Date TBD) From 0619a6cd7913f04477e20a1c97b60fb61275ebd2 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Wed, 28 Aug 2019 11:59:56 -0700 Subject: [PATCH 47/64] rangeindex slices return rangeindex if step is 1 --- python/cudf/cudf/core/index.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index 1160b7e54fc..114b3b2838b 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -418,6 +418,8 @@ def __getitem__(self, index): stop += self._start if sln == 0: return RangeIndex(0) + elif step == 1: + return RangeIndex(start, stop) else: return index_from_range(start, stop, step) From 39c5cd192ade60e237a46f72e91482a26837c539 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Wed, 28 Aug 2019 12:13:05 -0700 Subject: [PATCH 48/64] changelog --- CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index f903f1b1c87..1a36ffaacd3 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -32,7 +32,7 @@ - PR #2658 Fix astype() for null categorical columns - PR #2660 fix column string category and timeunit concat in the java API - PR #2664 ORC reader: fix `skip_rows` larger than first stripe - +- PR #2698 Return RangeIndex from contiguous slice of RangeIndex # cuDF 0.9.0 (Date TBD) From b9673692651bac801015a75e83b544a8471fe11b Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Wed, 28 Aug 2019 12:16:16 -0700 Subject: [PATCH 49/64] small style error --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 1a36ffaacd3..96b37449078 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -34,6 +34,7 @@ - PR #2664 ORC reader: fix `skip_rows` larger than first stripe - PR #2698 Return RangeIndex from contiguous slice of RangeIndex + # cuDF 0.9.0 (Date TBD) ## New Features From 9d3718534339034eebbec3f5b6bc767c157c1f3d Mon Sep 17 00:00:00 2001 From: ptaylor Date: Wed, 28 Aug 2019 14:44:31 -0700 Subject: [PATCH 50/64] fix new csv timestamp test --- cpp/src/io/csv/csv_reader_impl.cu | 5 ++++- cpp/tests/io/csv/csv_test.cu | 14 ++++++++------ 2 files changed, 12 insertions(+), 7 deletions(-) diff --git a/cpp/src/io/csv/csv_reader_impl.cu b/cpp/src/io/csv/csv_reader_impl.cu index 08a3a7e4b48..72a43daa9cc 100644 --- a/cpp/src/io/csv/csv_reader_impl.cu +++ b/cpp/src/io/csv/csv_reader_impl.cu @@ -630,7 +630,10 @@ table reader::Impl::read() std::vector columns; for (int col = 0, active_col = 0; col < num_actual_cols; ++col) { if (h_column_flags[col] & column_parse::enabled) { - auto time_unit = dtypes[active_col] == GDF_DATE64 ? TIME_UNIT_ms : TIME_UNIT_NONE; + auto time_unit = TIME_UNIT_NONE; + if (dtypes[active_col] == GDF_DATE64 || dtypes[active_col] == GDF_TIMESTAMP) { + time_unit = TIME_UNIT_ms; + } columns.emplace_back(num_records, dtypes[active_col], gdf_dtype_extra_info{time_unit}, col_names[col]); diff --git a/cpp/tests/io/csv/csv_test.cu b/cpp/tests/io/csv/csv_test.cu index 16f4bc7b881..97875e4d8cb 100644 --- a/cpp/tests/io/csv/csv_test.cu +++ b/cpp/tests/io/csv/csv_test.cu @@ -15,6 +15,7 @@ */ #include +#include #include #include @@ -325,15 +326,16 @@ TEST(gdf_csv_test, Timestamps) EXPECT_EQ( df.num_columns(), static_cast(args.names.size()) ); ASSERT_EQ( df.get_column(0)->dtype, GDF_TIMESTAMP ); - ASSERT_EQ( df.get_column(0)->gdf_dtype_extra_info.time_unit, TIME_UNIT_ms ); + ASSERT_EQ( df.get_column(0)->dtype_info.time_unit, TIME_UNIT_ms ); auto ACol = gdf_host_column(df.get_column(0)); std::cerr << "Time Unit= " << df.get_column(0)->dtype_info.time_unit; - gdf_column output; - gdf_dtype_extra_info info{}; - info.time_unit = TIME_UNIT_us; - output = cudf::cast(*df.get_column(0), GDF_TIMESTAMP, info); - + gdf_column output; + gdf_dtype_extra_info info{}; + info.time_unit = TIME_UNIT_us; + output = cudf::cast(*df.get_column(0), GDF_TIMESTAMP, info); + ASSERT_EQ( output.dtype, GDF_TIMESTAMP ); + ASSERT_EQ( output.dtype_info.time_unit, TIME_UNIT_us ); } } From 38de8a78c9f40fe3a704534e55348b8f488bd96f Mon Sep 17 00:00:00 2001 From: Benjamin Zaitlen Date: Wed, 28 Aug 2019 18:52:39 -0700 Subject: [PATCH 51/64] add dask serializition dispatching --- python/cudf/cudf/comm/serialize.py | 44 +++++++++++++++++------------- 1 file changed, 25 insertions(+), 19 deletions(-) diff --git a/python/cudf/cudf/comm/serialize.py b/python/cudf/cudf/comm/serialize.py index ac23b0fb702..2deb2afb67c 100644 --- a/python/cudf/cudf/comm/serialize.py +++ b/python/cudf/cudf/comm/serialize.py @@ -1,23 +1,29 @@ -import functools +import pickle +from distributed.protocol.cuda import cuda_deserialize, cuda_serialize +from distributed.utils import log_errors -def register_distributed_serializer(cls): - try: - from distributed.protocol.cuda import cuda_serialize, cuda_deserialize - from distributed.protocol import serialize, deserialize +import cudf +import cudf.core.groupby.groupby - serialize_part = functools.partial( - serialize, serializers=["cuda", "dask", "pickle"] - ) - deserialize_part = functools.partial( - deserialize, deserializers=["cuda", "dask", "pickle"] - ) - cuda_serialize.register(cls)( - functools.partial(cls.serialize, serialize=serialize_part) - ) - cuda_deserialize.register(cls)( - functools.partial(cls.deserialize, deserialize_part) - ) - except ImportError: - pass +# all (de-)serializtion code lives in the cudf codebase +# here we ammend the returned headers with `is_gpu` for +# UCX buffer consumption +@cuda_serialize.register( + (cudf.DataFrame, cudf.Series, cudf.core.groupby.groupby._Groupby) +) +def serialize_cudf_dataframe(x): + with log_errors(): + header, frames = x.serialize() + return header, frames + + +@cuda_deserialize.register( + (cudf.DataFrame, cudf.Series, cudf.core.groupby.groupby._Groupby) +) +def deserialize_cudf_dataframe(header, frames): + with log_errors(): + cudf_typ = pickle.loads(header["type"]) + cudf_obj = cudf_typ.deserialize(header, frames) + return cudf_obj From 3bc3d289beaa30d3b03012fb28b2166744c0b770 Mon Sep 17 00:00:00 2001 From: Benjamin Zaitlen Date: Wed, 28 Aug 2019 19:51:06 -0700 Subject: [PATCH 52/64] only load distributed if it is installed --- python/cudf/cudf/comm/serialize.py | 45 ++++++++++++++++-------------- 1 file changed, 24 insertions(+), 21 deletions(-) diff --git a/python/cudf/cudf/comm/serialize.py b/python/cudf/cudf/comm/serialize.py index 2deb2afb67c..8d951c0867a 100644 --- a/python/cudf/cudf/comm/serialize.py +++ b/python/cudf/cudf/comm/serialize.py @@ -1,29 +1,32 @@ import pickle -from distributed.protocol.cuda import cuda_deserialize, cuda_serialize -from distributed.utils import log_errors - import cudf import cudf.core.groupby.groupby +try: + from distributed.protocol.cuda import cuda_deserialize, cuda_serialize + from distributed.utils import log_errors + + # all (de-)serializtion are attached to cudf Objects: + # Series/DataFrame/Index/Column/Buffer/etc + @cuda_serialize.register( + (cudf.DataFrame, cudf.Series, cudf.core.groupby.groupby._Groupby) + ) + def serialize_cudf_dataframe(x): + with log_errors(): + header, frames = x.serialize() + return header, frames -# all (de-)serializtion code lives in the cudf codebase -# here we ammend the returned headers with `is_gpu` for -# UCX buffer consumption -@cuda_serialize.register( - (cudf.DataFrame, cudf.Series, cudf.core.groupby.groupby._Groupby) -) -def serialize_cudf_dataframe(x): - with log_errors(): - header, frames = x.serialize() - return header, frames + @cuda_deserialize.register( + (cudf.DataFrame, cudf.Series, cudf.core.groupby.groupby._Groupby) + ) + def deserialize_cudf_dataframe(header, frames): + with log_errors(): + cudf_typ = pickle.loads(header["type"]) + cudf_obj = cudf_typ.deserialize(header, frames) + return cudf_obj -@cuda_deserialize.register( - (cudf.DataFrame, cudf.Series, cudf.core.groupby.groupby._Groupby) -) -def deserialize_cudf_dataframe(header, frames): - with log_errors(): - cudf_typ = pickle.loads(header["type"]) - cudf_obj = cudf_typ.deserialize(header, frames) - return cudf_obj +except ImportError: + # distributed is probably not installed on the system + pass From 04a6a45cf19927b712edfcd3caee0ed74db9e48d Mon Sep 17 00:00:00 2001 From: Benjamin Zaitlen Date: Thu, 29 Aug 2019 09:30:30 -0700 Subject: [PATCH 53/64] update changelog --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index b02be77b678..0bcf638d2d7 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -16,6 +16,7 @@ - PR #2648 Cython/Python reorg - PR #2588 Update Series.append documentation - PR #2632 Replace dask-cudf set_index code with upstream +- PR #2703 move dask serialization dispatch into cudf ## Bug Fixes From 7cc5345bcbe80f6713bd3693e25c899893065ee1 Mon Sep 17 00:00:00 2001 From: rgsl888prabhu Date: Thu, 29 Aug 2019 13:11:50 -0500 Subject: [PATCH 54/64] changes with _column --- python/cudf/cudf/dataframe/multiindex.py | 2 +- python/cudf/cudf/dataframe/series.py | 4 ---- 2 files changed, 1 insertion(+), 5 deletions(-) diff --git a/python/cudf/cudf/dataframe/multiindex.py b/python/cudf/cudf/dataframe/multiindex.py index eaa59288fa5..d6b2f7a9db0 100644 --- a/python/cudf/cudf/dataframe/multiindex.py +++ b/python/cudf/cudf/dataframe/multiindex.py @@ -226,7 +226,7 @@ def _compute_validity_mask(self, index, row_tuple, max_length): for idx, row in enumerate(row_tuple): if row == slice(None): continue - if row not in index.levels[idx].column: + if row not in index.levels[idx]._column: raise KeyError(row) return result diff --git a/python/cudf/cudf/dataframe/series.py b/python/cudf/cudf/dataframe/series.py index 2a3526112aa..56face9b735 100644 --- a/python/cudf/cudf/dataframe/series.py +++ b/python/cudf/cudf/dataframe/series.py @@ -144,10 +144,6 @@ def values(self): else: return self.data.mem.copy_to_host() - @property - def column(self): - return self._column - @classmethod def from_arrow(cls, s): return cls(s) From ae18b8845549d29ef4139aec4e3346707095bf59 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Thu, 29 Aug 2019 12:56:20 -0700 Subject: [PATCH 55/64] test_rangeindex_get_slice_bound test for RangeIndex --- python/cudf/cudf/tests/test_monotonic.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/python/cudf/cudf/tests/test_monotonic.py b/python/cudf/cudf/tests/test_monotonic.py index 0a9bd5edbd9..fd530d72840 100644 --- a/python/cudf/cudf/tests/test_monotonic.py +++ b/python/cudf/cudf/tests/test_monotonic.py @@ -236,6 +236,23 @@ def test_get_slice_bound(testlist, side, kind): ) == index_pd.get_slice_bound(label, side, kind) +@pytest.mark.parametrize("bounds", [(0, 10), (0, 1), (3, 4), (0, 0), (3, 3)]) +@pytest.mark.parametrize( + "indices", + [[-1, 0, 5, 10, 11], [-1, 0, 1, 2], [2, 3, 4, 5], [-1, 0, 1], [2, 3, 4]], +) +@pytest.mark.parametrize("side", ["left", "right"]) +@pytest.mark.parametrize("kind", ["getitem", "loc", "ix"]) +def test_rangeindex_get_slice_bound(bounds, indices, side, kind): + start, stop = bounds + pd_index = pd.RangeIndex(start, stop) + cudf_index = RangeIndex(start, stop) + for idx in indices: + expect = pd_index.get_slice_bound(idx, side, kind) + got = cudf_index.get_slice_bound(idx, side, kind) + assert expect == got + + @pytest.mark.parametrize("label", [1, 3, 5, 7, 9, 11]) @pytest.mark.parametrize("side", ["left", "right"]) @pytest.mark.parametrize("kind", ["ix", "loc", "getitem", None]) From b5884eeb32fbe31545992695fdf501d39270be4d Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Thu, 29 Aug 2019 12:56:38 -0700 Subject: [PATCH 56/64] implement get_slice_bound for rangeindex --- python/cudf/cudf/core/index.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index 114b3b2838b..36aa82663e2 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -552,8 +552,15 @@ def is_monotonic_decreasing(self): return self._start >= self._stop def get_slice_bound(self, label, side, kind): - # TODO: Range-specific implementation here - raise (NotImplementedError) + if label < self._start: + return 0 + elif label >= self._stop: + return len(self) + else: + if side == "left": + return label - self._start + elif side == "right": + return (label - self._start) + 1 @property def __cuda_array_interface__(self): From 9ea151ef08c4ae11f437cc6ce9cbe50606c2ff36 Mon Sep 17 00:00:00 2001 From: rgsl888prabhu Date: Thu, 29 Aug 2019 15:23:54 -0500 Subject: [PATCH 57/64] review changes --- python/cudf/cudf/dataframe/numerical.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cudf/cudf/dataframe/numerical.py b/python/cudf/cudf/dataframe/numerical.py index 47fa4bfdc8e..fa815eb3550 100644 --- a/python/cudf/cudf/dataframe/numerical.py +++ b/python/cudf/cudf/dataframe/numerical.py @@ -61,7 +61,7 @@ def __contains__(self, item): if self.data.mem.dtype == np.bool: return ( cudautils.find_first( - self.astype("int_").data.mem, np.int_(item) + self.data.mem.view('int8'), item.view('int8') ) != -1 ) From d8409fb10d4e16085d77ca8403da4380b108edda Mon Sep 17 00:00:00 2001 From: Benjamin Zaitlen Date: Fri, 30 Aug 2019 11:29:33 -0700 Subject: [PATCH 58/64] add columns to serialize register -- useful for testing purposes --- python/cudf/cudf/comm/serialize.py | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/python/cudf/cudf/comm/serialize.py b/python/cudf/cudf/comm/serialize.py index 8d951c0867a..127172378eb 100644 --- a/python/cudf/cudf/comm/serialize.py +++ b/python/cudf/cudf/comm/serialize.py @@ -10,7 +10,13 @@ # all (de-)serializtion are attached to cudf Objects: # Series/DataFrame/Index/Column/Buffer/etc @cuda_serialize.register( - (cudf.DataFrame, cudf.Series, cudf.core.groupby.groupby._Groupby) + ( + cudf.DataFrame, + cudf.Series, + cudf.core.series.Series, + cudf.core.groupby.groupby._Groupby, + cudf.core.column.column.Column, + ) ) def serialize_cudf_dataframe(x): with log_errors(): @@ -18,7 +24,13 @@ def serialize_cudf_dataframe(x): return header, frames @cuda_deserialize.register( - (cudf.DataFrame, cudf.Series, cudf.core.groupby.groupby._Groupby) + ( + cudf.DataFrame, + cudf.Series, + cudf.core.series.Series, + cudf.core.groupby.groupby._Groupby, + cudf.core.column.column.Column, + ) ) def deserialize_cudf_dataframe(header, frames): with log_errors(): From ca0889ce7b6282c9c476b203c17cfe264884f404 Mon Sep 17 00:00:00 2001 From: Benjamin Zaitlen Date: Fri, 30 Aug 2019 11:29:53 -0700 Subject: [PATCH 59/64] remove serialize register in legacy groupby --- python/cudf/cudf/core/groupby/legacy_groupby.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/python/cudf/cudf/core/groupby/legacy_groupby.py b/python/cudf/cudf/core/groupby/legacy_groupby.py index b0bdd5ab69e..341a558ba8e 100644 --- a/python/cudf/cudf/core/groupby/legacy_groupby.py +++ b/python/cudf/cudf/core/groupby/legacy_groupby.py @@ -10,7 +10,6 @@ import cudf import cudf._lib as libcudf -from cudf.comm.serialize import register_distributed_serializer from cudf.core.series import Series @@ -530,6 +529,3 @@ def rolling_avg(val, avg): df, segs = self.as_df() kwargs.update({"chunks": segs}) return df.apply_chunks(function, **kwargs) - - -register_distributed_serializer(Groupby) From edee6d5f8cd1ade22d5d896d5a822f0647bb8b90 Mon Sep 17 00:00:00 2001 From: Nick Becker Date: Tue, 3 Sep 2019 11:18:34 -0400 Subject: [PATCH 60/64] remove masked-specific codepath for kernel --- python/cudf/cudf/core/column/numerical.py | 7 +--- python/cudf/cudf/utils/cudautils.py | 42 ++--------------------- 2 files changed, 3 insertions(+), 46 deletions(-) diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py index a479537d48b..cc620d76c98 100644 --- a/python/cudf/cudf/core/column/numerical.py +++ b/python/cudf/cudf/core/column/numerical.py @@ -258,13 +258,8 @@ def round(self, decimals=0): msg = "Decimal values < 0 are not yet supported." raise NotImplementedError(msg) - mask_dary = None - if self.has_null_mask: - mask_dary = self.nullmask.mem data = Buffer( - cudautils.apply_round( - self.astype("float").data.mem, mask_dary, decimals - ) + cudautils.apply_round(self.astype("float").data.mem, decimals) ) return self.replace(data=data) diff --git a/python/cudf/cudf/utils/cudautils.py b/python/cudf/cudf/utils/cudautils.py index ec9d006db32..b966849af83 100755 --- a/python/cudf/cudf/utils/cudautils.py +++ b/python/cudf/cudf/utils/cudautils.py @@ -571,48 +571,10 @@ def gpu_round(in_col, out_col, decimal): out_col[i] = newval -@cuda.jit -def gpu_round_masked(in_col, out_col, mask, decimal): - i = cuda.grid(1) - round_val = 10 ** (-1.0 * decimal) - - if i < in_col.size: - valid = mask_get(mask, i) - - if valid: - current = in_col[i] - newval = current // round_val * round_val - remainder = fmod(current, round_val) - - if ( - remainder != 0 - and remainder > (0.5 * round_val) - and current > 0 - ): - newval = newval + round_val - out_col[i] = newval - - elif ( - remainder != 0 - and abs(remainder) < (0.5 * round_val) - and current < 0 - ): - newval = newval + round_val - out_col[i] = newval - - else: - out_col[i] = newval - - -def apply_round(data, mask, decimal): +def apply_round(data, decimal): output_dary = rmm.device_array_like(data) if output_dary.size > 0: - if mask is not None: - gpu_round_masked.forall(output_dary.size)( - data, output_dary, mask, decimal - ) - else: - gpu_round.forall(output_dary.size)(data, output_dary, decimal) + gpu_round.forall(output_dary.size)(data, output_dary, decimal) return output_dary From f79c2cd01286b9ade27af786305706a166149f7a Mon Sep 17 00:00:00 2001 From: rgsl888prabhu Date: Tue, 3 Sep 2019 10:44:31 -0500 Subject: [PATCH 61/64] style changes --- python/cudf/cudf/dataframe/numerical.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cudf/cudf/dataframe/numerical.py b/python/cudf/cudf/dataframe/numerical.py index fa815eb3550..65aa40311cd 100644 --- a/python/cudf/cudf/dataframe/numerical.py +++ b/python/cudf/cudf/dataframe/numerical.py @@ -61,7 +61,7 @@ def __contains__(self, item): if self.data.mem.dtype == np.bool: return ( cudautils.find_first( - self.data.mem.view('int8'), item.view('int8') + self.data.mem.view("int8"), item.view("int8") ) != -1 ) From 83e2ce635abc8041cdaeced4ee2c5d9c2e39ef53 Mon Sep 17 00:00:00 2001 From: rgsl888prabhu Date: Tue, 3 Sep 2019 13:19:27 -0500 Subject: [PATCH 62/64] merge changes --- CHANGELOG.md | 1 + python/cudf/cudf/tests/test_contains.py | 4 ++-- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 3f9d45e24b9..e75b371b580 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,7 @@ - PR #2629 Add dropna= parameter to groupby - PR #2585 ORC & Parquet Readers: Remove millisecond timestamp restriction - PR #2653 Add Java bindings for rolling window operations +- PR #2674 Add __contains__ for Index/Series/Column ## Improvements diff --git a/python/cudf/cudf/tests/test_contains.py b/python/cudf/cudf/tests/test_contains.py index 4989e800a12..6ee680a1601 100644 --- a/python/cudf/cudf/tests/test_contains.py +++ b/python/cudf/cudf/tests/test_contains.py @@ -3,8 +3,8 @@ import pandas as pd import pytest -from cudf.dataframe.index import RangeIndex, as_index -from cudf.dataframe.series import Series +from cudf.core.index import RangeIndex, as_index +from cudf import Series from cudf.tests.utils import assert_eq From 8b6bfac85dd67001f6f9381645aa938d8e3ab818 Mon Sep 17 00:00:00 2001 From: rgsl888prabhu Date: Tue, 3 Sep 2019 13:22:21 -0500 Subject: [PATCH 63/64] style changes --- python/cudf/cudf/tests/test_contains.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cudf/cudf/tests/test_contains.py b/python/cudf/cudf/tests/test_contains.py index 6ee680a1601..4737faf65a4 100644 --- a/python/cudf/cudf/tests/test_contains.py +++ b/python/cudf/cudf/tests/test_contains.py @@ -3,8 +3,8 @@ import pandas as pd import pytest -from cudf.core.index import RangeIndex, as_index from cudf import Series +from cudf.core.index import RangeIndex, as_index from cudf.tests.utils import assert_eq From 4563ac896de9c38d4f9200ca8e055857cf1ccece Mon Sep 17 00:00:00 2001 From: Nick Becker Date: Tue, 3 Sep 2019 16:32:49 -0400 Subject: [PATCH 64/64] short circuit integer handling --- python/cudf/cudf/core/column/numerical.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py index 2ce9a21dfa6..03c404d588a 100644 --- a/python/cudf/cudf/core/column/numerical.py +++ b/python/cudf/cudf/core/column/numerical.py @@ -269,9 +269,10 @@ def round(self, decimals=0): msg = "Decimal values < 0 are not yet supported." raise NotImplementedError(msg) - data = Buffer( - cudautils.apply_round(self.astype("float").data.mem, decimals) - ) + if np.issubdtype(self.dtype, np.integer): + return self + + data = Buffer(cudautils.apply_round(self.data.mem, decimals)) return self.replace(data=data) def applymap(self, udf, out_dtype=None):