From 17d7ab39191af9988ab9d99e1cd72579d5c79639 Mon Sep 17 00:00:00 2001 From: Shane Ding Date: Thu, 20 May 2021 19:55:40 +0000 Subject: [PATCH 1/5] added _is_homogeneous property --- python/cudf/cudf/core/dataframe.py | 9 +++++++++ python/cudf/cudf/tests/test_dataframe.py | 16 ++++++++++++++++ 2 files changed, 25 insertions(+) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index f2be0e3bd6e..61a86a2ba4d 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -602,6 +602,15 @@ def deserialize(cls, header, frames): return cls(dict(zip(column_names, columns)), index=index) + @property + def _is_homogeneous(self): + # make sure that the dataframe has columns + if not self._data.columns: + return True + + first_type = self._data.columns[0].dtype + return all(x.dtype == first_type for x in self._data.columns) + @property def dtypes(self): """ diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index e5e36ba7e21..082c5a32554 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -8579,3 +8579,19 @@ def test_dataframe_init_from_series(data, columns, index): actual, check_index_type=False if len(expected) == 0 else True, ) + + +@pytest.mark.parametrize( + "data,expected", + [ + ({"a": [1, 2, 3, 4], "b": [5, 6, 7, 8], "c": [1.2, 1, 2, 3]}, False), + ({"a": [1, 2, 3], "b": [4, 5, 6], "c": [7, 8, 9]}, True), + ({"a": ["a", "b", "c"], "b": [4, 5, 6], "c": [7, 8, 9]}, False), + ({"a": [True, False, False], "b": [False, False, True]}, True), + ({}, True), + ], +) +def test_is_homogeneous(data, expected): + actual = cudf.DataFrame(data)._is_homogeneous + + assert actual == expected From 15ced45c89e867b46fe30534f85949d3c522156d Mon Sep 17 00:00:00 2001 From: Shane Ding Date: Thu, 20 May 2021 20:39:21 +0000 Subject: [PATCH 2/5] moved _is_homogeneous to Frame --- python/cudf/cudf/core/dataframe.py | 9 --------- python/cudf/cudf/core/frame.py | 9 +++++++++ 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 61a86a2ba4d..f2be0e3bd6e 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -602,15 +602,6 @@ def deserialize(cls, header, frames): return cls(dict(zip(column_names, columns)), index=index) - @property - def _is_homogeneous(self): - # make sure that the dataframe has columns - if not self._data.columns: - return True - - first_type = self._data.columns[0].dtype - return all(x.dtype == first_type for x in self._data.columns) - @property def dtypes(self): """ diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index f59954aaf08..b177b96cffc 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -156,6 +156,15 @@ def size(self): """ return self._num_columns * self._num_rows + @property + def _is_homogeneous(self): + # make sure that the dataframe has columns + if not self._data.columns: + return True + + first_type = self._data.columns[0].dtype + return all(x.dtype == first_type for x in self._data.columns) + @property def empty(self): """ From c06be48ccc01c23e9d933a9cac6dc31ed4918f19 Mon Sep 17 00:00:00 2001 From: Shane Ding Date: Fri, 21 May 2021 14:59:27 +0000 Subject: [PATCH 3/5] Added more testcases and changed how equality is handled --- python/cudf/cudf/core/frame.py | 4 +- python/cudf/cudf/tests/test_dataframe.py | 56 +++++++++++++++++++++++- 2 files changed, 57 insertions(+), 3 deletions(-) diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index b177b96cffc..012e2f3788a 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -162,8 +162,8 @@ def _is_homogeneous(self): if not self._data.columns: return True - first_type = self._data.columns[0].dtype - return all(x.dtype == first_type for x in self._data.columns) + first_type = self._data.columns[0].dtype.name + return all(x.dtype.name == first_type for x in self._data.columns) @property def empty(self): diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index 082c5a32554..e2d173cc6c3 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -8582,12 +8582,16 @@ def test_dataframe_init_from_series(data, columns, index): @pytest.mark.parametrize( - "data,expected", + "data, expected", [ ({"a": [1, 2, 3, 4], "b": [5, 6, 7, 8], "c": [1.2, 1, 2, 3]}, False), ({"a": [1, 2, 3], "b": [4, 5, 6], "c": [7, 8, 9]}, True), ({"a": ["a", "b", "c"], "b": [4, 5, 6], "c": [7, 8, 9]}, False), ({"a": [True, False, False], "b": [False, False, True]}, True), + ({"a": [True, False, False]}, True), + ({"a": [[1, 2], [3, 4]]}, True), + ({"a": [[1, 2], [3, 4]], "b": ["a", "b"]}, False), + ({"a": [{"c": 5}, {"e": 5}], "b": [{"c": 5}, {"g": 7}]}, True), ({}, True), ], ) @@ -8595,3 +8599,53 @@ def test_is_homogeneous(data, expected): actual = cudf.DataFrame(data)._is_homogeneous assert actual == expected + + +@pytest.mark.parametrize( + "data, indexes, expected", + [ + ( + {"a": [1, 2, 3, 4], "b": [5, 6, 7, 8], "c": [1.2, 1, 2, 3]}, + ["a", "b"], + True, + ), + ( + { + "a": [1, 2, 3, 4], + "b": [5, 6, 7, 8], + "c": [1.2, 1, 2, 3], + "d": ["hello", "world", "cudf", "rapids"], + }, + ["a", "b"], + False, + ), + ( + { + "a": ["a", "b", "c"], + "b": [4, 5, 6], + "c": [7, 8, 9], + "d": [1, 2, 3], + }, + ["a", "b"], + True, + ), + ], +) +def test_is_homogeneous_multiindex(data, indexes, expected): + test_dataframe = cudf.DataFrame(data).set_index(indexes) + actual = cudf.DataFrame(test_dataframe)._is_homogeneous + + assert actual == expected + + +""" +({"a": [1, 2, 3], "b": [4, 5, 6], "c": [7, 8, 9]}, True), + ({"a": ["a", "b", "c"], "b": [4, 5, 6], "c": [7, 8, 9]}, False), + ({"a": [True, False, False], "b": [False, False, True]}, True), + ({"a": [True, False, False]}, True), + ({"a": [[1,2],[3,4]]}, True), + ({'a': [[1,2], [3,4]], 'b': ["a", "b"]}, False), + ({'a': [{'c':5} , {'e': 5}], 'b': [{'c':5} , {'g': 7}]}, True), + ({}, True), + +""" From 4861e9a6496bc8a62668c10bf33d490a1c97bc21 Mon Sep 17 00:00:00 2001 From: Shane Ding Date: Fri, 21 May 2021 15:01:05 +0000 Subject: [PATCH 4/5] removing unused testcases --- python/cudf/cudf/tests/test_dataframe.py | 13 ------------- 1 file changed, 13 deletions(-) diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index e2d173cc6c3..93c68387bed 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -8636,16 +8636,3 @@ def test_is_homogeneous_multiindex(data, indexes, expected): actual = cudf.DataFrame(test_dataframe)._is_homogeneous assert actual == expected - - -""" -({"a": [1, 2, 3], "b": [4, 5, 6], "c": [7, 8, 9]}, True), - ({"a": ["a", "b", "c"], "b": [4, 5, 6], "c": [7, 8, 9]}, False), - ({"a": [True, False, False], "b": [False, False, True]}, True), - ({"a": [True, False, False]}, True), - ({"a": [[1,2],[3,4]]}, True), - ({'a': [[1,2], [3,4]], 'b': ["a", "b"]}, False), - ({'a': [{'c':5} , {'e': 5}], 'b': [{'c':5} , {'g': 7}]}, True), - ({}, True), - -""" From e9311bf8e8d36334b65e674c0cda874e2054499c Mon Sep 17 00:00:00 2001 From: Shane Ding Date: Fri, 21 May 2021 18:11:20 +0000 Subject: [PATCH 5/5] Added more test cases for multiIndex, series and Index --- python/cudf/cudf/tests/test_dataframe.py | 44 ++++++++++++++++++++++-- 1 file changed, 42 insertions(+), 2 deletions(-) diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index 93c68387bed..0b73f32e94d 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -8595,7 +8595,7 @@ def test_dataframe_init_from_series(data, columns, index): ({}, True), ], ) -def test_is_homogeneous(data, expected): +def test_is_homogeneous_dataframe(data, expected): actual = cudf.DataFrame(data)._is_homogeneous assert actual == expected @@ -8631,8 +8631,48 @@ def test_is_homogeneous(data, expected): ), ], ) -def test_is_homogeneous_multiindex(data, indexes, expected): +def test_is_homogeneous_multiIndex_dataframe(data, indexes, expected): test_dataframe = cudf.DataFrame(data).set_index(indexes) actual = cudf.DataFrame(test_dataframe)._is_homogeneous assert actual == expected + + +@pytest.mark.parametrize( + "data, expected", [([1, 2, 3, 4], True), ([True, False], True)] +) +def test_is_homogeneous_series(data, expected): + actual = cudf.Series(data)._is_homogeneous + + assert actual == expected + + +@pytest.mark.parametrize( + "levels, codes, expected", + [ + ( + [["lama", "cow", "falcon"], ["speed", "weight", "length"]], + [[0, 0, 0, 1, 1, 1, 2, 2, 2], [0, 1, 2, 0, 1, 2, 0, 1, 2]], + True, + ), + ( + [[1, 2, 3], [True, False, True]], + [[0, 0, 0, 1, 1, 1, 2, 2, 2], [0, 1, 2, 0, 1, 2, 0, 1, 2]], + False, + ), + ], +) +def test_is_homogeneous_multiIndex(levels, codes, expected): + actual = cudf.MultiIndex(levels=levels, codes=codes)._is_homogeneous + + assert actual == expected + + +@pytest.mark.parametrize( + "data, expected", + [([1, 2, 3], True), (["Hello", "World"], True), ([True, False], True)], +) +def test_is_homogeneous_index(data, expected): + actual = cudf.Index(data)._is_homogeneous + + assert actual == expected