From 2c5a2ad842f1d412c2b8afc86eb49be8c1e0c681 Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Fri, 13 Aug 2021 03:08:35 -0500 Subject: [PATCH] Upgrade `arrow` & `pyarrow` to `5.0.0` (#8908) This PR upgrades arrow to `5.0.0`. - [x] Upgrade & test arrow 5.0.0. - [x] Fix pytest failures related to decimal arrays. Authors: - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - Robert Maynard (https://github.com/robertmaynard) - Ashwin Srinath (https://github.com/shwina) - Mark Sadang (https://github.com/msadang) - Dillon Cullinan (https://github.com/dillon-cullinan) URL: https://github.com/rapidsai/cudf/pull/8908 --- conda/environments/cudf_dev_cuda11.0.yml | 4 +-- conda/environments/cudf_dev_cuda11.2.yml | 4 +-- conda/recipes/cudf/meta.yaml | 2 +- conda/recipes/libcudf/meta.yaml | 2 +- cpp/cmake/thirdparty/CUDF_GetArrow.cmake | 2 +- python/cudf/cudf/core/column/column.py | 44 +++++++++++++----------- python/cudf/cudf/tests/test_binops.py | 40 ++++++++++----------- python/cudf/cudf/tests/test_decimal.py | 10 +++++- 8 files changed, 60 insertions(+), 48 deletions(-) diff --git a/conda/environments/cudf_dev_cuda11.0.yml b/conda/environments/cudf_dev_cuda11.0.yml index 692ebe71794..2c0984569db 100644 --- a/conda/environments/cudf_dev_cuda11.0.yml +++ b/conda/environments/cudf_dev_cuda11.0.yml @@ -17,7 +17,7 @@ dependencies: - numba>=0.53.1 - numpy - pandas>=1.0,<1.3.0dev0 - - pyarrow=4.0.1=*cuda + - pyarrow=5.0.0=*cuda - fastavro>=0.22.9 - notebook>=0.5.0 - cython>=0.29,<0.30 @@ -42,7 +42,7 @@ dependencies: - dask>=2021.6.0 - distributed>=2021.6.0 - streamz - - arrow-cpp=4.0.1 + - arrow-cpp=5.0.0 - dlpack>=0.5,<0.6.0a0 - arrow-cpp-proc * cuda - double-conversion diff --git a/conda/environments/cudf_dev_cuda11.2.yml b/conda/environments/cudf_dev_cuda11.2.yml index ce82b870e16..766d85e957b 100644 --- a/conda/environments/cudf_dev_cuda11.2.yml +++ b/conda/environments/cudf_dev_cuda11.2.yml @@ -17,7 +17,7 @@ dependencies: - numba>=0.53.1 - numpy - pandas>=1.0,<1.3.0dev0 - - pyarrow=4.0.1=*cuda + - pyarrow=5.0.0=*cuda - fastavro>=0.22.9 - notebook>=0.5.0 - cython>=0.29,<0.30 @@ -42,7 +42,7 @@ dependencies: - dask>=2021.6.0 - distributed>=2021.6.0 - streamz - - arrow-cpp=4.0.1 + - arrow-cpp=5.0.0 - dlpack>=0.5,<0.6.0a0 - arrow-cpp-proc * cuda - double-conversion diff --git a/conda/recipes/cudf/meta.yaml b/conda/recipes/cudf/meta.yaml index 9023e89c2f5..ca36acccfbb 100644 --- a/conda/recipes/cudf/meta.yaml +++ b/conda/recipes/cudf/meta.yaml @@ -30,7 +30,7 @@ requirements: - setuptools - numba >=0.53.1 - dlpack>=0.5,<0.6.0a0 - - pyarrow 4.0.1 *cuda + - pyarrow 5.0.0 *cuda - libcudf {{ version }} - rmm {{ minor_version }} - cudatoolkit {{ cuda_version }} diff --git a/conda/recipes/libcudf/meta.yaml b/conda/recipes/libcudf/meta.yaml index 88065ef49e0..c1ba2b495eb 100644 --- a/conda/recipes/libcudf/meta.yaml +++ b/conda/recipes/libcudf/meta.yaml @@ -37,7 +37,7 @@ requirements: host: - librmm {{ minor_version }}.* - cudatoolkit {{ cuda_version }}.* - - arrow-cpp 4.0.1 *cuda + - arrow-cpp 5.0.0 *cuda - arrow-cpp-proc * cuda - dlpack>=0.5,<0.6.0a0 run: diff --git a/cpp/cmake/thirdparty/CUDF_GetArrow.cmake b/cpp/cmake/thirdparty/CUDF_GetArrow.cmake index 5f6ff9651a2..38a5d8da44a 100644 --- a/cpp/cmake/thirdparty/CUDF_GetArrow.cmake +++ b/cpp/cmake/thirdparty/CUDF_GetArrow.cmake @@ -177,7 +177,7 @@ function(find_and_configure_arrow VERSION BUILD_STATIC ENABLE_S3 ENABLE_ORC ENAB endfunction() -set(CUDF_VERSION_Arrow 4.0.1) +set(CUDF_VERSION_Arrow 5.0.0) find_and_configure_arrow( ${CUDF_VERSION_Arrow} diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 8aeaf08273f..b95a4495a69 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -2016,6 +2016,29 @@ def as_column( memoryview(arbitrary), dtype=dtype, nan_as_null=nan_as_null ) except TypeError: + if dtype is not None: + # Arrow throws a type error if the input is of + # mixed-precision and cannot fit into the provided + # decimal type properly, see: + # https://github.com/apache/arrow/pull/9948 + # Hence we should let the exception propagate to + # the user. + if isinstance(dtype, cudf.core.dtypes.Decimal64Dtype): + data = pa.array( + arbitrary, + type=pa.decimal128( + precision=dtype.precision, scale=dtype.scale + ), + ) + return cudf.core.column.Decimal64Column.from_arrow(data) + if isinstance(dtype, cudf.core.dtypes.Decimal32Dtype): + data = pa.array( + arbitrary, + type=pa.decimal128( + precision=dtype.precision, scale=dtype.scale + ), + ) + return cudf.core.column.Decimal32Column.from_arrow(data) pa_type = None np_type = None try: @@ -2034,26 +2057,7 @@ def as_column( ) and not isinstance(dtype, cudf.IntervalDtype): data = pa.array(arbitrary, type=dtype.to_arrow()) return as_column(data, nan_as_null=nan_as_null) - if isinstance(dtype, cudf.core.dtypes.Decimal64Dtype): - data = pa.array( - arbitrary, - type=pa.decimal128( - precision=dtype.precision, scale=dtype.scale - ), - ) - return cudf.core.column.Decimal64Column.from_arrow( - data - ) - if isinstance(dtype, cudf.core.dtypes.Decimal32Dtype): - data = pa.array( - arbitrary, - type=pa.decimal128( - precision=dtype.precision, scale=dtype.scale - ), - ) - return cudf.core.column.Decimal32Column.from_arrow( - data - ) + dtype = pd.api.types.pandas_dtype(dtype) np_type = np.dtype(dtype).type if np_type == np.bool_: diff --git a/python/cudf/cudf/tests/test_binops.py b/python/cudf/cudf/tests/test_binops.py index 8277b8e7b32..f8fd2502a7d 100644 --- a/python/cudf/cudf/tests/test_binops.py +++ b/python/cudf/cudf/tests/test_binops.py @@ -1758,16 +1758,16 @@ def test_binops_with_NA_consistent(dtype, op): ( operator.add, ["1.5", "2.0"], - cudf.Decimal64Dtype(scale=2, precision=2), + cudf.Decimal64Dtype(scale=2, precision=3), ["1.5", "2.0"], - cudf.Decimal64Dtype(scale=2, precision=2), - ["3.0", "4.0"], cudf.Decimal64Dtype(scale=2, precision=3), + ["3.0", "4.0"], + cudf.Decimal64Dtype(scale=2, precision=4), ), ( operator.add, ["1.5", "2.0"], - cudf.Decimal64Dtype(scale=2, precision=2), + cudf.Decimal64Dtype(scale=2, precision=3), ["2.25", "1.005"], cudf.Decimal64Dtype(scale=3, precision=4), ["3.75", "3.005"], @@ -1785,7 +1785,7 @@ def test_binops_with_NA_consistent(dtype, op): ( operator.sub, ["1.5", "2.0"], - cudf.Decimal64Dtype(scale=2, precision=2), + cudf.Decimal64Dtype(scale=1, precision=2), ["2.25", "1.005"], cudf.Decimal64Dtype(scale=3, precision=4), ["-0.75", "0.995"], @@ -1794,7 +1794,7 @@ def test_binops_with_NA_consistent(dtype, op): ( operator.sub, ["1.5", "2.0"], - cudf.Decimal64Dtype(scale=2, precision=2), + cudf.Decimal64Dtype(scale=1, precision=2), ["2.25", "1.005"], cudf.Decimal64Dtype(scale=3, precision=4), ["-0.75", "0.995"], @@ -1812,11 +1812,11 @@ def test_binops_with_NA_consistent(dtype, op): ( operator.mul, ["1.5", "2.0"], - cudf.Decimal64Dtype(scale=2, precision=2), + cudf.Decimal64Dtype(scale=2, precision=3), ["1.5", "3.0"], cudf.Decimal64Dtype(scale=3, precision=4), ["2.25", "6.0"], - cudf.Decimal64Dtype(scale=5, precision=7), + cudf.Decimal64Dtype(scale=5, precision=8), ), ( operator.mul, @@ -1866,16 +1866,16 @@ def test_binops_with_NA_consistent(dtype, op): ( operator.add, ["1.5", None, "2.0"], - cudf.Decimal64Dtype(scale=2, precision=2), + cudf.Decimal64Dtype(scale=1, precision=2), ["1.5", None, "2.0"], - cudf.Decimal64Dtype(scale=2, precision=2), + cudf.Decimal64Dtype(scale=1, precision=2), ["3.0", None, "4.0"], - cudf.Decimal64Dtype(scale=2, precision=3), + cudf.Decimal64Dtype(scale=1, precision=3), ), ( operator.add, ["1.5", None], - cudf.Decimal64Dtype(scale=2, precision=2), + cudf.Decimal64Dtype(scale=2, precision=3), ["2.25", "1.005"], cudf.Decimal64Dtype(scale=3, precision=4), ["3.75", None], @@ -1884,7 +1884,7 @@ def test_binops_with_NA_consistent(dtype, op): ( operator.sub, ["1.5", None], - cudf.Decimal64Dtype(scale=2, precision=2), + cudf.Decimal64Dtype(scale=2, precision=3), ["2.25", None], cudf.Decimal64Dtype(scale=3, precision=4), ["-0.75", None], @@ -1893,7 +1893,7 @@ def test_binops_with_NA_consistent(dtype, op): ( operator.sub, ["1.5", "2.0"], - cudf.Decimal64Dtype(scale=2, precision=2), + cudf.Decimal64Dtype(scale=2, precision=3), ["2.25", None], cudf.Decimal64Dtype(scale=3, precision=4), ["-0.75", None], @@ -1902,11 +1902,11 @@ def test_binops_with_NA_consistent(dtype, op): ( operator.mul, ["1.5", None], - cudf.Decimal64Dtype(scale=2, precision=2), + cudf.Decimal64Dtype(scale=2, precision=3), ["1.5", None], cudf.Decimal64Dtype(scale=3, precision=4), ["2.25", None], - cudf.Decimal64Dtype(scale=5, precision=7), + cudf.Decimal64Dtype(scale=5, precision=8), ), ( operator.mul, @@ -2432,10 +2432,10 @@ def test_binops_decimal_comp_mixed_integer(args, integer_dtype, reflected): ( operator.truediv, ["100", "200"], - cudf.Decimal64Dtype(scale=2, precision=4), + cudf.Decimal64Dtype(scale=2, precision=5), decimal.Decimal(2), ["50", "100"], - cudf.Decimal64Dtype(scale=2, precision=6), + cudf.Decimal64Dtype(scale=2, precision=7), False, ), ( @@ -2459,10 +2459,10 @@ def test_binops_decimal_comp_mixed_integer(args, integer_dtype, reflected): ( operator.truediv, ["100", "200"], - cudf.Decimal64Dtype(scale=2, precision=3), + cudf.Decimal64Dtype(scale=2, precision=5), 1, ["0", "0"], - cudf.Decimal64Dtype(scale=-2, precision=5), + cudf.Decimal64Dtype(scale=-2, precision=7), True, ), ( diff --git a/python/cudf/cudf/tests/test_decimal.py b/python/cudf/cudf/tests/test_decimal.py index d2de44b0c8f..9d93898dcd9 100644 --- a/python/cudf/cudf/tests/test_decimal.py +++ b/python/cudf/cudf/tests/test_decimal.py @@ -24,7 +24,7 @@ [1], [-1], [1, 2, 3, 4], - [42, 1729, 4104], + [42, 17, 41], [1, 2, None, 4], [None, None, None], [], @@ -347,3 +347,11 @@ def test_serialize_decimal_columns(data): df = cudf.DataFrame(data) recreated = df.__class__.deserialize(*df.serialize()) assert_eq(recreated, df) + + +def test_decimal_invalid_precision(): + with pytest.raises(pa.ArrowInvalid): + _ = cudf.Series([10, 20, 30], dtype=cudf.Decimal64Dtype(2, 2)) + + with pytest.raises(pa.ArrowInvalid): + _ = cudf.Series([Decimal("300")], dtype=cudf.Decimal64Dtype(2, 1))