Skip to content

Commit

Permalink
Upgrade arrow to 4.0.1 (#7495)
Browse files: browse the repository at this point in the history
Fixes: #7224

This PR:

- [x] Adds support for Arrow 4.0.1 in cudf.
- [x] Moves testing-related utilities to the `cudf.testing` module.
- [x] Fixes miscellaneous errors related to the Arrow upgrade.

Authors:
  - GALI PREM SAGAR (https://github.com/galipremsagar)
  - Paul Taylor (https://github.com/trxcllnt)

Approvers:
  - Robert Maynard (https://github.com/robertmaynard)
  - Ram (Ramakrishna Prabhu) (https://github.com/rgsl888prabhu)
  - Jeremy Dyer (https://github.com/jdye64)
  - Paul Taylor (https://github.com/trxcllnt)
  - Dillon Cullinan (https://github.com/dillon-cullinan)
  - Devavret Makkar (https://github.com/devavret)
  - Keith Kraus (https://github.com/kkraus14)
  - Michael Wang (https://github.com/isVoid)
  - Dante Gama Dessavre (https://github.com/dantegd)

URL: #7495
  • Loading branch information
galipremsagar authored Jun 29, 2021
1 parent 0206fc9 commit 1e53776
Show file tree
Hide file tree
Showing 102 changed files with 359 additions and 338 deletions.
2 changes: 1 addition & 1 deletion ci/gpu/build.sh
Original file line number Diff line number Diff line change
Expand Up @@ -217,7 +217,7 @@ fi

cd "$WORKSPACE/python/cudf"
gpuci_logger "Python py.test for cuDF"
py.test -n 6 --cache-clear --basetemp="$WORKSPACE/cudf-cuda-tmp" --junitxml="$WORKSPACE/junit-cudf.xml" -v --cov-config=.coveragerc --cov=cudf --cov-report=xml:"$WORKSPACE/python/cudf/cudf-coverage.xml" --cov-report term
py.test -n 6 --cache-clear --basetemp="$WORKSPACE/cudf-cuda-tmp" --ignore="$WORKSPACE/python/cudf/cudf/benchmarks" --junitxml="$WORKSPACE/junit-cudf.xml" -v --cov-config=.coveragerc --cov=cudf --cov-report=xml:"$WORKSPACE/python/cudf/cudf-coverage.xml" --cov-report term

cd "$WORKSPACE/python/dask_cudf"
gpuci_logger "Python py.test for dask-cudf"
Expand Down
4 changes: 2 additions & 2 deletions conda/environments/cudf_dev_cuda11.0.yml
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ dependencies:
- numba>=0.53.1
- numpy
- pandas>=1.0,<1.3.0dev0
- pyarrow=1.0.1
- pyarrow=4.0.1
- fastavro>=0.22.9
- notebook>=0.5.0
- cython>=0.29,<0.30
Expand All @@ -44,8 +44,8 @@ dependencies:
- dask>=2021.6.0
- distributed>=2021.6.0
- streamz
- arrow-cpp=4.0.1
- dlpack>=0.5,<0.6.0a0
- arrow-cpp=1.0.1
- arrow-cpp-proc * cuda
- double-conversion
- rapidjson
Expand Down
4 changes: 2 additions & 2 deletions conda/environments/cudf_dev_cuda11.2.yml
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ dependencies:
- numba>=0.53.1
- numpy
- pandas>=1.0,<1.3.0dev0
- pyarrow=1.0.1
- pyarrow=4.0.1
- fastavro>=0.22.9
- notebook>=0.5.0
- cython>=0.29,<0.30
Expand All @@ -44,8 +44,8 @@ dependencies:
- dask>=2021.6.0
- distributed>=2021.6.0
- streamz
- arrow-cpp=4.0.1
- dlpack>=0.5,<0.6.0a0
- arrow-cpp=1.0.1
- arrow-cpp-proc * cuda
- double-conversion
- rapidjson
Expand Down
2 changes: 1 addition & 1 deletion conda/recipes/cudf/meta.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ requirements:
- setuptools
- numba >=0.53.1
- dlpack>=0.5,<0.6.0a0
- pyarrow 1.0.1
- pyarrow 4.0.1
- libcudf {{ version }}
- rmm {{ minor_version }}
- cudatoolkit {{ cuda_version }}
Expand Down
2 changes: 1 addition & 1 deletion conda/recipes/libcudf/meta.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ requirements:
host:
- librmm {{ minor_version }}.*
- cudatoolkit {{ cuda_version }}.*
- arrow-cpp 1.0.1
- arrow-cpp 4.0.1
- arrow-cpp-proc * cuda
- dlpack>=0.5,<0.6.0a0
run:
Expand Down
4 changes: 2 additions & 2 deletions conda/recipes/libcudf_kafka/meta.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -25,8 +25,8 @@ requirements:
build:
- cmake >=3.20.1
host:
- libcudf {{ version }}
- librdkafka >=1.5.0,<1.5.3
- libcudf {{version}}
- librdkafka >=1.6.0,<1.7.0a0
run:
- {{ pin_compatible('librdkafka', max_pin='x.x') }} #TODO: librdkafka should be automatically included here by run_exports but is not

Expand Down
2 changes: 1 addition & 1 deletion cpp/cmake/thirdparty/CUDF_GetArrow.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -127,6 +127,6 @@ function(find_and_configure_arrow VERSION BUILD_STATIC ENABLE_S3)

endfunction()

set(CUDF_VERSION_Arrow 1.0.1)
set(CUDF_VERSION_Arrow 4.0.1)

find_and_configure_arrow(${CUDF_VERSION_Arrow} ${CUDF_USE_ARROW_STATIC} ${CUDF_ENABLE_ARROW_S3})
2 changes: 1 addition & 1 deletion python/cudf/cudf/_fuzz_testing/avro.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
pandas_to_avro,
pyarrow_to_pandas,
)
from cudf.tests import dataset_generator as dg
from cudf.testing import dataset_generator as dg

logging.basicConfig(
format="%(asctime)s %(levelname)-8s %(message)s",
Expand Down
2 changes: 1 addition & 1 deletion python/cudf/cudf/_fuzz_testing/csv.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
_generate_rand_meta,
pyarrow_to_pandas,
)
from cudf.tests import dataset_generator as dg
from cudf.testing import dataset_generator as dg
from cudf.utils.dtypes import pandas_dtypes_to_cudf_dtypes

logging.basicConfig(
Expand Down
2 changes: 1 addition & 1 deletion python/cudf/cudf/_fuzz_testing/json.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
_generate_rand_meta,
pyarrow_to_pandas,
)
from cudf.tests import dataset_generator as dg
from cudf.testing import dataset_generator as dg
from cudf.utils.dtypes import pandas_dtypes_to_cudf_dtypes

logging.basicConfig(
Expand Down
2 changes: 1 addition & 1 deletion python/cudf/cudf/_fuzz_testing/orc.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
pandas_to_orc,
pyarrow_to_pandas,
)
from cudf.tests import dataset_generator as dg
from cudf.testing import dataset_generator as dg

logging.basicConfig(
format="%(asctime)s %(levelname)-8s %(message)s",
Expand Down
2 changes: 1 addition & 1 deletion python/cudf/cudf/_fuzz_testing/parquet.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
_generate_rand_meta,
pyarrow_to_pandas,
)
from cudf.tests import dataset_generator as dg
from cudf.testing import dataset_generator as dg

logging.basicConfig(
format="%(asctime)s %(levelname)-8s %(message)s",
Expand Down
2 changes: 1 addition & 1 deletion python/cudf/cudf/_fuzz_testing/tests/fuzz_test_csv.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
compare_content,
run_test,
)
from cudf.tests.utils import assert_eq
from cudf.testing._utils import assert_eq


@pythonfuzz(data_handle=CSVReader)
Expand Down
2 changes: 1 addition & 1 deletion python/cudf/cudf/_fuzz_testing/tests/fuzz_test_json.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
from cudf._fuzz_testing.json import JSONReader, JSONWriter
from cudf._fuzz_testing.main import pythonfuzz
from cudf._fuzz_testing.utils import ALL_POSSIBLE_VALUES, run_test
from cudf.tests.utils import assert_eq
from cudf.testing._utils import assert_eq


@pythonfuzz(data_handle=JSONReader)
Expand Down
2 changes: 1 addition & 1 deletion python/cudf/cudf/_fuzz_testing/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
import pyorc

import cudf
from cudf.tests.utils import assert_eq
from cudf.testing._utils import assert_eq
from cudf.utils.dtypes import (
pandas_dtypes_to_cudf_dtypes,
pyarrow_dtypes_to_pandas_dtypes,
Expand Down
4 changes: 2 additions & 2 deletions python/cudf/cudf/_lib/gpuarrow.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -15,15 +15,15 @@ from pyarrow.includes.libarrow cimport (
CRecordBatchStreamReader
)
from pyarrow.lib cimport (
_CRecordBatchReader,
RecordBatchReader,
Buffer,
Schema,
pyarrow_wrap_schema
)
import pyarrow as pa


cdef class CudaRecordBatchStreamReader(_CRecordBatchReader):
cdef class CudaRecordBatchStreamReader(RecordBatchReader):
cdef:
CIpcReadOptions options

Expand Down
16 changes: 10 additions & 6 deletions python/cudf/cudf/_lib/utils.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -138,12 +138,16 @@ cpdef generate_pandas_metadata(Table table, index):
index_descriptors.append(descr)

metadata = pa.pandas_compat.construct_metadata(
table,
col_names,
index_levels,
index_descriptors,
index,
types,
columns_to_convert=[
col
for col in table._columns
],
df=table,
column_names=col_names,
index_levels=index_levels,
index_descriptors=index_descriptors,
preserve_index=index,
types=types,
)

md_dict = json.loads(metadata[b"pandas"])
Expand Down
11 changes: 6 additions & 5 deletions python/cudf/cudf/core/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -5652,11 +5652,12 @@ def to_arrow(self, preserve_index=True):

out = super(DataFrame, data).to_arrow()
metadata = pa.pandas_compat.construct_metadata(
self,
out.schema.names,
[self.index],
index_descr,
preserve_index,
columns_to_convert=[self[col] for col in self._data.names],
df=self,
column_names=out.schema.names,
index_levels=[self.index],
index_descriptors=index_descr,
preserve_index=preserve_index,
types=out.schema.types,
)

Expand Down
File renamed without changes.
File renamed without changes.
Empty file removed python/cudf/cudf/tests/__init__.py
Empty file.
2 changes: 1 addition & 1 deletion python/cudf/cudf/tests/test_apply_rows.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

import cudf
from cudf.core.column import column
from cudf.tests.utils import assert_eq, gen_rand_series
from cudf.testing._utils import assert_eq, gen_rand_series


def _kernel_multiply(a, b, out):
Expand Down
2 changes: 1 addition & 1 deletion python/cudf/cudf/tests/test_applymap.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
import pytest

from cudf import Series
from cudf.tests import utils
from cudf.testing import _utils as utils


@pytest.mark.parametrize(
Expand Down
2 changes: 1 addition & 1 deletion python/cudf/cudf/tests/test_array_function.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
import pytest

import cudf
from cudf.tests.utils import assert_eq
from cudf.testing._utils import assert_eq
from cudf.utils.utils import IS_NEP18_ACTIVE

missing_arrfunc_cond = not IS_NEP18_ACTIVE
Expand Down
7 changes: 4 additions & 3 deletions python/cudf/cudf/tests/test_array_ufunc.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,10 @@
import cudf
import numpy as np
import cupy as cp
import numpy as np
import pandas as pd
import pytest
from cudf.tests.utils import assert_eq

import cudf
from cudf.testing._utils import assert_eq


@pytest.fixture
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
import pytest

import cudf
from cudf.tests.utils import assert_eq
from cudf.testing._utils import assert_eq


def cudf_from_avro_util(schema, records):
Expand Down
20 changes: 7 additions & 13 deletions python/cudf/cudf/tests/test_binops.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
import cudf
from cudf.core import Series
from cudf.core.index import as_index
from cudf.tests import utils
from cudf.testing import _utils as utils
from cudf.utils.dtypes import (
BOOL_TYPES,
DATETIME_TYPES,
Expand Down Expand Up @@ -1742,12 +1742,6 @@ def test_binops_with_NA_consistent(dtype, op):
assert result._column.null_count == len(data)


def _decimal_series(input, dtype):
return cudf.Series(
[x if x is None else decimal.Decimal(x) for x in input], dtype=dtype,
)


@pytest.mark.parametrize(
"args",
[
Expand Down Expand Up @@ -2080,10 +2074,10 @@ def _decimal_series(input, dtype):
def test_binops_decimal(args):
op, lhs, l_dtype, rhs, r_dtype, expect, expect_dtype = args

a = _decimal_series(lhs, l_dtype)
b = _decimal_series(rhs, r_dtype)
a = utils._decimal_series(lhs, l_dtype)
b = utils._decimal_series(rhs, r_dtype)
expect = (
_decimal_series(expect, expect_dtype)
utils._decimal_series(expect, expect_dtype)
if isinstance(expect_dtype, cudf.Decimal64Dtype)
else cudf.Series(expect, dtype=expect_dtype)
)
Expand Down Expand Up @@ -2242,7 +2236,7 @@ def test_binops_decimal(args):
),
],
)
@pytest.mark.parametrize("integer_dtype", cudf.tests.utils.INTEGER_TYPES)
@pytest.mark.parametrize("integer_dtype", utils.INTEGER_TYPES)
@pytest.mark.parametrize("reflected", [True, False])
def test_binops_decimal_comp_mixed_integer(args, integer_dtype, reflected):
"""
Expand All @@ -2258,7 +2252,7 @@ def test_binops_decimal_comp_mixed_integer(args, integer_dtype, reflected):
else:
op, ldata, ldtype, rdata, _, expected = args

lhs = _decimal_series(ldata, ldtype)
lhs = utils._decimal_series(ldata, ldtype)
rhs = cudf.Series(rdata, dtype=integer_dtype)

if reflected:
Expand Down Expand Up @@ -2746,7 +2740,7 @@ def test_binops_decimal_scalar_compare(args, reflected):
else:
op, ldata, ldtype, rdata, _, expected = args

lhs = _decimal_series(ldata, ldtype)
lhs = utils._decimal_series(ldata, ldtype)
rhs = rdata

if reflected:
Expand Down
6 changes: 5 additions & 1 deletion python/cudf/cudf/tests/test_categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,11 @@

import cudf
from cudf.core._compat import PANDAS_GE_110
from cudf.tests.utils import NUMERIC_TYPES, assert_eq, assert_exceptions_equal
from cudf.testing._utils import (
NUMERIC_TYPES,
assert_eq,
assert_exceptions_equal,
)


@pytest.fixture
Expand Down
8 changes: 4 additions & 4 deletions python/cudf/cudf/tests/test_column.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
import cudf
from cudf._lib.transform import mask_to_bools
from cudf.core.column.column import as_column
from cudf.tests.utils import assert_eq, assert_exceptions_equal
from cudf.testing._utils import assert_eq, assert_exceptions_equal
from cudf.utils import dtypes as dtypeutils

dtypes = sorted(
Expand Down Expand Up @@ -140,8 +140,8 @@ def test_column_series_multi_dim(data):
@pytest.mark.parametrize(
("data", "error"),
[
([1, "1.0", "2", -3], TypeError),
([np.nan, 0, "null", cp.nan], TypeError),
([1, "1.0", "2", -3], pa.lib.ArrowInvalid),
([np.nan, 0, "null", cp.nan], pa.lib.ArrowInvalid),
(
[np.int32(4), np.float64(1.5), np.float32(1.290994), np.int8(0)],
None,
Expand All @@ -152,7 +152,7 @@ def test_column_mixed_dtype(data, error):
if error is None:
cudf.Series(data)
else:
with pytest.raises(TypeError):
with pytest.raises(error):
cudf.Series(data)


Expand Down
2 changes: 1 addition & 1 deletion python/cudf/cudf/tests/test_column_accessor.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@

import cudf
from cudf.core.column_accessor import ColumnAccessor
from cudf.tests.utils import assert_eq
from cudf.testing._utils import assert_eq

simple_test_data = [
{},
Expand Down
6 changes: 3 additions & 3 deletions python/cudf/cudf/tests/test_concat.py
Original file line number Diff line number Diff line change
@@ -1,16 +1,16 @@
# Copyright (c) 2018-2021, NVIDIA CORPORATION.

import re
from decimal import Decimal

import numpy as np
import pandas as pd
import pytest
from decimal import Decimal

import cudf as gd
from cudf.tests.utils import assert_eq, assert_exceptions_equal
from cudf.utils.dtypes import is_categorical_dtype
from cudf.core.dtypes import Decimal64Dtype
from cudf.testing._utils import assert_eq, assert_exceptions_equal
from cudf.utils.dtypes import is_categorical_dtype


def make_frames(index=None, nulls="none"):
Expand Down
Loading

0 comments on commit 1e53776

Please sign in to comment.