From 2c5a2ad842f1d412c2b8afc86eb49be8c1e0c681 Mon Sep 17 00:00:00 2001
From: GALI PREM SAGAR <sagarprem75@gmail.com>
Date: Fri, 13 Aug 2021 03:08:35 -0500
Subject: [PATCH] Upgrade `arrow` & `pyarrow` to `5.0.0` (#8908)

This PR upgrades `arrow-cpp` and `pyarrow` from `4.0.1` to `5.0.0`.

- [x] Upgrade & test arrow 5.0.0.
- [x] Fix pytest failures related to decimal arrays.

Authors:
  - GALI PREM SAGAR (https://github.com/galipremsagar)

Approvers:
  - Robert Maynard (https://github.com/robertmaynard)
  - Ashwin Srinath (https://github.com/shwina)
  - Mark Sadang (https://github.com/msadang)
  - Dillon Cullinan (https://github.com/dillon-cullinan)

URL: https://github.com/rapidsai/cudf/pull/8908
---
 conda/environments/cudf_dev_cuda11.0.yml |  4 +--
 conda/environments/cudf_dev_cuda11.2.yml |  4 +--
 conda/recipes/cudf/meta.yaml             |  2 +-
 conda/recipes/libcudf/meta.yaml          |  2 +-
 cpp/cmake/thirdparty/CUDF_GetArrow.cmake |  2 +-
 python/cudf/cudf/core/column/column.py   | 44 +++++++++++++-----------
 python/cudf/cudf/tests/test_binops.py    | 40 ++++++++++-----------
 python/cudf/cudf/tests/test_decimal.py   | 10 +++++-
 8 files changed, 60 insertions(+), 48 deletions(-)

diff --git a/conda/environments/cudf_dev_cuda11.0.yml b/conda/environments/cudf_dev_cuda11.0.yml
index 692ebe71794..2c0984569db 100644
--- a/conda/environments/cudf_dev_cuda11.0.yml
+++ b/conda/environments/cudf_dev_cuda11.0.yml
@@ -17,7 +17,7 @@ dependencies:
   - numba>=0.53.1
   - numpy
   - pandas>=1.0,<1.3.0dev0
-  - pyarrow=4.0.1=*cuda
+  - pyarrow=5.0.0=*cuda
   - fastavro>=0.22.9
   - notebook>=0.5.0
   - cython>=0.29,<0.30
@@ -42,7 +42,7 @@ dependencies:
   - dask>=2021.6.0
   - distributed>=2021.6.0
   - streamz
-  - arrow-cpp=4.0.1
+  - arrow-cpp=5.0.0
   - dlpack>=0.5,<0.6.0a0
   - arrow-cpp-proc * cuda
   - double-conversion
diff --git a/conda/environments/cudf_dev_cuda11.2.yml b/conda/environments/cudf_dev_cuda11.2.yml
index ce82b870e16..766d85e957b 100644
--- a/conda/environments/cudf_dev_cuda11.2.yml
+++ b/conda/environments/cudf_dev_cuda11.2.yml
@@ -17,7 +17,7 @@ dependencies:
   - numba>=0.53.1
   - numpy
   - pandas>=1.0,<1.3.0dev0
-  - pyarrow=4.0.1=*cuda
+  - pyarrow=5.0.0=*cuda
   - fastavro>=0.22.9
   - notebook>=0.5.0
   - cython>=0.29,<0.30
@@ -42,7 +42,7 @@ dependencies:
   - dask>=2021.6.0
   - distributed>=2021.6.0
   - streamz
-  - arrow-cpp=4.0.1
+  - arrow-cpp=5.0.0
   - dlpack>=0.5,<0.6.0a0
   - arrow-cpp-proc * cuda
   - double-conversion
diff --git a/conda/recipes/cudf/meta.yaml b/conda/recipes/cudf/meta.yaml
index 9023e89c2f5..ca36acccfbb 100644
--- a/conda/recipes/cudf/meta.yaml
+++ b/conda/recipes/cudf/meta.yaml
@@ -30,7 +30,7 @@ requirements:
     - setuptools
     - numba >=0.53.1
     - dlpack>=0.5,<0.6.0a0
-    - pyarrow 4.0.1 *cuda
+    - pyarrow 5.0.0 *cuda
     - libcudf {{ version }}
     - rmm {{ minor_version }}
     - cudatoolkit {{ cuda_version }}
diff --git a/conda/recipes/libcudf/meta.yaml b/conda/recipes/libcudf/meta.yaml
index 88065ef49e0..c1ba2b495eb 100644
--- a/conda/recipes/libcudf/meta.yaml
+++ b/conda/recipes/libcudf/meta.yaml
@@ -37,7 +37,7 @@ requirements:
   host:
     - librmm {{ minor_version }}.*
     - cudatoolkit {{ cuda_version }}.*
-    - arrow-cpp 4.0.1 *cuda
+    - arrow-cpp 5.0.0 *cuda
     - arrow-cpp-proc * cuda
     - dlpack>=0.5,<0.6.0a0
   run:
diff --git a/cpp/cmake/thirdparty/CUDF_GetArrow.cmake b/cpp/cmake/thirdparty/CUDF_GetArrow.cmake
index 5f6ff9651a2..38a5d8da44a 100644
--- a/cpp/cmake/thirdparty/CUDF_GetArrow.cmake
+++ b/cpp/cmake/thirdparty/CUDF_GetArrow.cmake
@@ -177,7 +177,7 @@ function(find_and_configure_arrow VERSION BUILD_STATIC ENABLE_S3 ENABLE_ORC ENAB
 
 endfunction()
 
-set(CUDF_VERSION_Arrow 4.0.1)
+set(CUDF_VERSION_Arrow 5.0.0)
 
 find_and_configure_arrow(
     ${CUDF_VERSION_Arrow}
diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py
index 8aeaf08273f..b95a4495a69 100644
--- a/python/cudf/cudf/core/column/column.py
+++ b/python/cudf/cudf/core/column/column.py
@@ -2016,6 +2016,29 @@ def as_column(
                 memoryview(arbitrary), dtype=dtype, nan_as_null=nan_as_null
             )
         except TypeError:
+            if dtype is not None:
+                # Arrow raises a type error if the input is of mixed
+                # precision and cannot fit into the requested decimal
+                # type, see:
+                # https://github.com/apache/arrow/pull/9948
+                # Decimal dtypes are converted here, ahead of the `try`
+                # below, to let that exception propagate to the user.
+                if isinstance(dtype, cudf.core.dtypes.Decimal64Dtype):
+                    data = pa.array(
+                        arbitrary,
+                        type=pa.decimal128(
+                            precision=dtype.precision, scale=dtype.scale
+                        ),
+                    )
+                    return cudf.core.column.Decimal64Column.from_arrow(data)
+                if isinstance(dtype, cudf.core.dtypes.Decimal32Dtype):
+                    data = pa.array(
+                        arbitrary,
+                        type=pa.decimal128(
+                            precision=dtype.precision, scale=dtype.scale
+                        ),
+                    )
+                    return cudf.core.column.Decimal32Column.from_arrow(data)
             pa_type = None
             np_type = None
             try:
@@ -2034,26 +2057,7 @@ def as_column(
                     ) and not isinstance(dtype, cudf.IntervalDtype):
                         data = pa.array(arbitrary, type=dtype.to_arrow())
                         return as_column(data, nan_as_null=nan_as_null)
-                    if isinstance(dtype, cudf.core.dtypes.Decimal64Dtype):
-                        data = pa.array(
-                            arbitrary,
-                            type=pa.decimal128(
-                                precision=dtype.precision, scale=dtype.scale
-                            ),
-                        )
-                        return cudf.core.column.Decimal64Column.from_arrow(
-                            data
-                        )
-                    if isinstance(dtype, cudf.core.dtypes.Decimal32Dtype):
-                        data = pa.array(
-                            arbitrary,
-                            type=pa.decimal128(
-                                precision=dtype.precision, scale=dtype.scale
-                            ),
-                        )
-                        return cudf.core.column.Decimal32Column.from_arrow(
-                            data
-                        )
+
                     dtype = pd.api.types.pandas_dtype(dtype)
                     np_type = np.dtype(dtype).type
                     if np_type == np.bool_:
diff --git a/python/cudf/cudf/tests/test_binops.py b/python/cudf/cudf/tests/test_binops.py
index 8277b8e7b32..f8fd2502a7d 100644
--- a/python/cudf/cudf/tests/test_binops.py
+++ b/python/cudf/cudf/tests/test_binops.py
@@ -1758,16 +1758,16 @@ def test_binops_with_NA_consistent(dtype, op):
         (
             operator.add,
             ["1.5", "2.0"],
-            cudf.Decimal64Dtype(scale=2, precision=2),
+            cudf.Decimal64Dtype(scale=2, precision=3),
             ["1.5", "2.0"],
-            cudf.Decimal64Dtype(scale=2, precision=2),
-            ["3.0", "4.0"],
             cudf.Decimal64Dtype(scale=2, precision=3),
+            ["3.0", "4.0"],
+            cudf.Decimal64Dtype(scale=2, precision=4),
         ),
         (
             operator.add,
             ["1.5", "2.0"],
-            cudf.Decimal64Dtype(scale=2, precision=2),
+            cudf.Decimal64Dtype(scale=2, precision=3),
             ["2.25", "1.005"],
             cudf.Decimal64Dtype(scale=3, precision=4),
             ["3.75", "3.005"],
@@ -1785,7 +1785,7 @@ def test_binops_with_NA_consistent(dtype, op):
         (
             operator.sub,
             ["1.5", "2.0"],
-            cudf.Decimal64Dtype(scale=2, precision=2),
+            cudf.Decimal64Dtype(scale=1, precision=2),
             ["2.25", "1.005"],
             cudf.Decimal64Dtype(scale=3, precision=4),
             ["-0.75", "0.995"],
@@ -1794,7 +1794,7 @@ def test_binops_with_NA_consistent(dtype, op):
         (
             operator.sub,
             ["1.5", "2.0"],
-            cudf.Decimal64Dtype(scale=2, precision=2),
+            cudf.Decimal64Dtype(scale=1, precision=2),
             ["2.25", "1.005"],
             cudf.Decimal64Dtype(scale=3, precision=4),
             ["-0.75", "0.995"],
@@ -1812,11 +1812,11 @@ def test_binops_with_NA_consistent(dtype, op):
         (
             operator.mul,
             ["1.5", "2.0"],
-            cudf.Decimal64Dtype(scale=2, precision=2),
+            cudf.Decimal64Dtype(scale=2, precision=3),
             ["1.5", "3.0"],
             cudf.Decimal64Dtype(scale=3, precision=4),
             ["2.25", "6.0"],
-            cudf.Decimal64Dtype(scale=5, precision=7),
+            cudf.Decimal64Dtype(scale=5, precision=8),
         ),
         (
             operator.mul,
@@ -1866,16 +1866,16 @@ def test_binops_with_NA_consistent(dtype, op):
         (
             operator.add,
             ["1.5", None, "2.0"],
-            cudf.Decimal64Dtype(scale=2, precision=2),
+            cudf.Decimal64Dtype(scale=1, precision=2),
             ["1.5", None, "2.0"],
-            cudf.Decimal64Dtype(scale=2, precision=2),
+            cudf.Decimal64Dtype(scale=1, precision=2),
             ["3.0", None, "4.0"],
-            cudf.Decimal64Dtype(scale=2, precision=3),
+            cudf.Decimal64Dtype(scale=1, precision=3),
         ),
         (
             operator.add,
             ["1.5", None],
-            cudf.Decimal64Dtype(scale=2, precision=2),
+            cudf.Decimal64Dtype(scale=2, precision=3),
             ["2.25", "1.005"],
             cudf.Decimal64Dtype(scale=3, precision=4),
             ["3.75", None],
@@ -1884,7 +1884,7 @@ def test_binops_with_NA_consistent(dtype, op):
         (
             operator.sub,
             ["1.5", None],
-            cudf.Decimal64Dtype(scale=2, precision=2),
+            cudf.Decimal64Dtype(scale=2, precision=3),
             ["2.25", None],
             cudf.Decimal64Dtype(scale=3, precision=4),
             ["-0.75", None],
@@ -1893,7 +1893,7 @@ def test_binops_with_NA_consistent(dtype, op):
         (
             operator.sub,
             ["1.5", "2.0"],
-            cudf.Decimal64Dtype(scale=2, precision=2),
+            cudf.Decimal64Dtype(scale=2, precision=3),
             ["2.25", None],
             cudf.Decimal64Dtype(scale=3, precision=4),
             ["-0.75", None],
@@ -1902,11 +1902,11 @@ def test_binops_with_NA_consistent(dtype, op):
         (
             operator.mul,
             ["1.5", None],
-            cudf.Decimal64Dtype(scale=2, precision=2),
+            cudf.Decimal64Dtype(scale=2, precision=3),
             ["1.5", None],
             cudf.Decimal64Dtype(scale=3, precision=4),
             ["2.25", None],
-            cudf.Decimal64Dtype(scale=5, precision=7),
+            cudf.Decimal64Dtype(scale=5, precision=8),
         ),
         (
             operator.mul,
@@ -2432,10 +2432,10 @@ def test_binops_decimal_comp_mixed_integer(args, integer_dtype, reflected):
         (
             operator.truediv,
             ["100", "200"],
-            cudf.Decimal64Dtype(scale=2, precision=4),
+            cudf.Decimal64Dtype(scale=2, precision=5),
             decimal.Decimal(2),
             ["50", "100"],
-            cudf.Decimal64Dtype(scale=2, precision=6),
+            cudf.Decimal64Dtype(scale=2, precision=7),
             False,
         ),
         (
@@ -2459,10 +2459,10 @@ def test_binops_decimal_comp_mixed_integer(args, integer_dtype, reflected):
         (
             operator.truediv,
             ["100", "200"],
-            cudf.Decimal64Dtype(scale=2, precision=3),
+            cudf.Decimal64Dtype(scale=2, precision=5),
             1,
             ["0", "0"],
-            cudf.Decimal64Dtype(scale=-2, precision=5),
+            cudf.Decimal64Dtype(scale=-2, precision=7),
             True,
         ),
         (
diff --git a/python/cudf/cudf/tests/test_decimal.py b/python/cudf/cudf/tests/test_decimal.py
index d2de44b0c8f..9d93898dcd9 100644
--- a/python/cudf/cudf/tests/test_decimal.py
+++ b/python/cudf/cudf/tests/test_decimal.py
@@ -24,7 +24,7 @@
     [1],
     [-1],
     [1, 2, 3, 4],
-    [42, 1729, 4104],
+    [42, 17, 41],
     [1, 2, None, 4],
     [None, None, None],
     [],
@@ -347,3 +347,11 @@ def test_serialize_decimal_columns(data):
     df = cudf.DataFrame(data)
     recreated = df.__class__.deserialize(*df.serialize())
     assert_eq(recreated, df)
+
+
+def test_decimal_invalid_precision():
+    with pytest.raises(pa.ArrowInvalid):
+        _ = cudf.Series([10, 20, 30], dtype=cudf.Decimal64Dtype(2, 2))
+
+    with pytest.raises(pa.ArrowInvalid):
+        _ = cudf.Series([Decimal("300")], dtype=cudf.Decimal64Dtype(2, 1))