From 02d1de2122930c379e9ba88dbf13bf8cdc5b3acb Mon Sep 17 00:00:00 2001
From: Bradley Dice
Date: Fri, 25 Feb 2022 15:35:25 -0600
Subject: [PATCH 1/3] Fix warning related to integer overflow in pandas.

---
 python/cudf/cudf/tests/test_csv.py | 14 +++++++++++++-
 1 file changed, 13 insertions(+), 1 deletion(-)

diff --git a/python/cudf/cudf/tests/test_csv.py b/python/cudf/cudf/tests/test_csv.py
index f3d69e1745e..b0fcf6be98f 100644
--- a/python/cudf/cudf/tests/test_csv.py
+++ b/python/cudf/cudf/tests/test_csv.py
@@ -4,6 +4,7 @@
 import os
 import re
 import shutil
+import warnings
 from collections import OrderedDict
 from io import BytesIO, StringIO
 from pathlib import Path
@@ -1322,7 +1323,18 @@ def test_csv_reader_hexadecimals(pdf_dtype, gdf_dtype):
 
     if gdf_dtype is not None:
         # require explicit `hex` dtype to parse hexadecimals
-        pdf = pd.DataFrame(data=values, dtype=pdf_dtype, columns=["hex_int"])
+        with warnings.catch_warnings():
+            warnings.filterwarnings(
+                "ignore",
+                (
+                    "Values are too large to be losslessly cast to int32. In "
+                    "a future version this will raise OverflowError."
+                ),
+                category=FutureWarning,
+            )
+            pdf = pd.DataFrame(
+                data=values, dtype=pdf_dtype, columns=["hex_int"]
+            )
         gdf = read_csv(StringIO(buffer), dtype=[gdf_dtype], names=["hex_int"])
         np.testing.assert_array_equal(
             pdf["hex_int"], gdf["hex_int"].to_numpy()

From e015b2313ae231b8315a082a7801d9805d30cd97 Mon Sep 17 00:00:00 2001
From: Bradley Dice
Date: Mon, 28 Feb 2022 14:44:56 -0600
Subject: [PATCH 2/3] Refactor tests of overflowing hexadecimal values.

---
 python/cudf/cudf/tests/test_csv.py | 46 +++++++++++++++++++++---------
 1 file changed, 32 insertions(+), 14 deletions(-)

diff --git a/python/cudf/cudf/tests/test_csv.py b/python/cudf/cudf/tests/test_csv.py
index b0fcf6be98f..7dd735997d6 100644
--- a/python/cudf/cudf/tests/test_csv.py
+++ b/python/cudf/cudf/tests/test_csv.py
@@ -4,7 +4,6 @@
 import os
 import re
 import shutil
-import warnings
 from collections import OrderedDict
 from io import BytesIO, StringIO
 from pathlib import Path
@@ -1316,25 +1315,14 @@ def test_csv_reader_aligned_byte_range(tmpdir):
     [(None, None), ("int", "hex"), ("int32", "hex32"), ("int64", "hex64")],
 )
 def test_csv_reader_hexadecimals(pdf_dtype, gdf_dtype):
-    lines = ["0x0", "-0x1000", "0xfedcba", "0xABCDEF", "0xaBcDeF", "9512c20b"]
+    lines = ["0x0", "-0x1000", "0xfedcba", "0xABCDEF", "0xaBcDeF"]
     values = [int(hex_int, 16) for hex_int in lines]
 
     buffer = "\n".join(lines)
 
     if gdf_dtype is not None:
         # require explicit `hex` dtype to parse hexadecimals
-        with warnings.catch_warnings():
-            warnings.filterwarnings(
-                "ignore",
-                (
-                    "Values are too large to be losslessly cast to int32. In "
-                    "a future version this will raise OverflowError."
-                ),
-                category=FutureWarning,
-            )
-            pdf = pd.DataFrame(
-                data=values, dtype=pdf_dtype, columns=["hex_int"]
-            )
+        pdf = pd.DataFrame(data=values, dtype=pdf_dtype, columns=["hex_int"])
         gdf = read_csv(StringIO(buffer), dtype=[gdf_dtype], names=["hex_int"])
         np.testing.assert_array_equal(
             pdf["hex_int"], gdf["hex_int"].to_numpy()
@@ -1346,6 +1334,36 @@ def test_csv_reader_hexadecimals(pdf_dtype, gdf_dtype):
         assert_eq(pdf, gdf)
 
 
+@pytest.mark.parametrize(
+    "np_dtype, gdf_dtype",
+    [("int_", "hex"), ("int32", "hex32"), ("int64", "hex64")],
+)
+def test_csv_reader_hexadecimal_overflow(np_dtype, gdf_dtype):
+    # This tests values which cause an overflow warning that will become an
+    # error in pandas. NumPy wraps the overflow silently up to the bounds of a
+    # signed int64.
+    lines = [
+        "0x0",
+        "-0x1000",
+        "0xfedcba",
+        "0xABCDEF",
+        "0xaBcDeF",
+        "0x9512c20b",
+        "0x7fffffff",
+        "0x7fffffffffffffff",
+        "-0x8000000000000000",
+    ]
+    values = [int(hex_int, 16) for hex_int in lines]
+    buffer = "\n".join(lines)
+
+    gdf = read_csv(StringIO(buffer), dtype=[gdf_dtype], names=["hex_int"])
+
+    np_dtype = getattr(np, np_dtype)
+    expected = np.array(values, dtype=np_dtype)
+    actual = gdf["hex_int"].to_numpy()
+    np.testing.assert_array_equal(expected, actual)
+
+
 @pytest.mark.parametrize("quoting", [0, 1, 2, 3])
 def test_csv_reader_pd_consistent_quotes(quoting):
     names = ["text"]

From d7045b1fbe0f1c571eebda5473ba895ef2b9ccef Mon Sep 17 00:00:00 2001
From: Bradley Dice
Date: Tue, 1 Mar 2022 09:44:56 -0600
Subject: [PATCH 3/3] Use string as dtype.

---
 python/cudf/cudf/tests/test_csv.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/python/cudf/cudf/tests/test_csv.py b/python/cudf/cudf/tests/test_csv.py
index 7dd735997d6..6176184b670 100644
--- a/python/cudf/cudf/tests/test_csv.py
+++ b/python/cudf/cudf/tests/test_csv.py
@@ -1336,7 +1336,7 @@ def test_csv_reader_hexadecimals(pdf_dtype, gdf_dtype):
 
 @pytest.mark.parametrize(
     "np_dtype, gdf_dtype",
-    [("int_", "hex"), ("int32", "hex32"), ("int64", "hex64")],
+    [("int", "hex"), ("int32", "hex32"), ("int64", "hex64")],
 )
 def test_csv_reader_hexadecimal_overflow(np_dtype, gdf_dtype):
     # This tests values which cause an overflow warning that will become an
@@ -1358,7 +1358,6 @@ def test_csv_reader_hexadecimal_overflow(np_dtype, gdf_dtype):
 
     gdf = read_csv(StringIO(buffer), dtype=[gdf_dtype], names=["hex_int"])
 
-    np_dtype = getattr(np, np_dtype)
     expected = np.array(values, dtype=np_dtype)
     actual = gdf["hex_int"].to_numpy()
     np.testing.assert_array_equal(expected, actual)