Upgrade arrow to 16 #15703

Merged: 10 commits merged on May 9, 2024
12 changes: 6 additions & 6 deletions conda/environments/all_cuda-118_arch-x86_64.yaml
@@ -36,15 +36,15 @@ dependencies:
- hypothesis
- identify>=2.5.20
- ipython
- libarrow-acero==14.0.2.*
- libarrow-dataset==14.0.2.*
- libarrow==14.0.2.*
- libarrow-acero==16.0.0.*
- libarrow-dataset==16.0.0.*
- libarrow==16.0.0.*
- libcufile-dev=1.4.0.31
- libcufile=1.4.0.31
- libcurand-dev=10.3.0.86
- libcurand=10.3.0.86
- libkvikio==24.6.*
- libparquet==14.0.2.*
- libparquet==16.0.0.*
- librdkafka>=1.9.0,<1.10.0a0
- librmm==24.6.*
- make
@@ -66,7 +66,7 @@ dependencies:
- pip
- pre-commit
- ptxcompiler
- pyarrow==14.0.2.*
- pyarrow==16.0.0.*
- pydata-sphinx-theme!=0.14.2
- pytest-benchmark
- pytest-cases>=3.8.2
@@ -92,7 +92,7 @@ dependencies:
- streamz
- sysroot_linux-64==2.17
- tokenizers==0.15.2
- transformers==4.38.1
- transformers==4.39.3
- typing_extensions>=4.0.0
- zlib>=1.2.13
- pip:
12 changes: 6 additions & 6 deletions conda/environments/all_cuda-122_arch-x86_64.yaml
@@ -37,13 +37,13 @@ dependencies:
- hypothesis
- identify>=2.5.20
- ipython
- libarrow-acero==14.0.2.*
- libarrow-dataset==14.0.2.*
- libarrow==14.0.2.*
- libarrow-acero==16.0.0.*
- libarrow-dataset==16.0.0.*
- libarrow==16.0.0.*
- libcufile-dev
- libcurand-dev
- libkvikio==24.6.*
- libparquet==14.0.2.*
- libparquet==16.0.0.*
- librdkafka>=1.9.0,<1.10.0a0
- librmm==24.6.*
- make
@@ -63,7 +63,7 @@ dependencies:
- pandoc
- pip
- pre-commit
- pyarrow==14.0.2.*
- pyarrow==16.0.0.*
- pydata-sphinx-theme!=0.14.2
- pynvjitlink
- pytest-benchmark
@@ -90,7 +90,7 @@ dependencies:
- streamz
- sysroot_linux-64==2.17
- tokenizers==0.15.2
- transformers==4.38.1
- transformers==4.39.3
- typing_extensions>=4.0.0
- zlib>=1.2.13
- pip:
2 changes: 1 addition & 1 deletion conda/recipes/cudf/meta.yaml
@@ -65,7 +65,7 @@ requirements:
- setuptools
- dlpack >=0.8,<1.0
- numpy 1.23
- pyarrow ==14.0.2.*
- pyarrow ==16.0.0.*
- libcudf ={{ version }}
- rmm ={{ minor_version }}
{% if cuda_major == "11" %}
2 changes: 1 addition & 1 deletion conda/recipes/libcudf/conda_build_config.yaml
@@ -20,7 +20,7 @@ cmake_version:
- ">=3.26.4"

libarrow_version:
- "==14.0.2"
- "==16.0.0"

dlpack_version:
- ">=0.8,<1.0"
2 changes: 1 addition & 1 deletion cpp/cmake/thirdparty/get_arrow.cmake
@@ -410,7 +410,7 @@ if(NOT DEFINED CUDF_VERSION_Arrow)
set(CUDF_VERSION_Arrow
# This version must be kept in sync with the libarrow version pinned for builds in
# dependencies.yaml.
14.0.2
16.0.0
CACHE STRING "The version of Arrow to find (or build)"
)
endif()
24 changes: 11 additions & 13 deletions dependencies.yaml
@@ -266,7 +266,7 @@ dependencies:
- cython>=3.0.3
# Hard pin the patch version used during the build. This must be kept
# in sync with the version pinned in get_arrow.cmake.
- pyarrow==14.0.2.*
- pyarrow==16.0.0.*
- output_types: conda
packages:
- scikit-build-core>=0.7.0
@@ -312,27 +312,25 @@ dependencies:
packages:
# Hard pin the Arrow patch version used during the build. This must
# be kept in sync with the version pinned in get_arrow.cmake.
- libarrow-acero==14.0.2.*
- libarrow-dataset==14.0.2.*
- libarrow==14.0.2.*
- libparquet==14.0.2.*
- libarrow-acero==16.0.0.*
- libarrow-dataset==16.0.0.*
- libarrow==16.0.0.*
- libparquet==16.0.0.*
libarrow_run:
common:
- output_types: conda
packages:
# Allow runtime version to float up to minor version
# Disallow libarrow 14.0.0 due to a CVE
- libarrow-acero>=14.0.1,<15.0.0a0
- libarrow-dataset>=14.0.1,<15.0.0a0
- libarrow>=14.0.1,<15.0.0a0
- libparquet>=14.0.1,<15.0.0a0
- libarrow-acero>=16.0.0,<17.0.0a0
- libarrow-dataset>=16.0.0,<17.0.0a0
- libarrow>=16.0.0,<17.0.0a0
- libparquet>=16.0.0,<17.0.0a0
pyarrow_run:
common:
- output_types: [conda, requirements, pyproject]
packages:
# Allow runtime version to float up to minor version
# Disallow pyarrow 14.0.0 due to a CVE
- pyarrow>=14.0.1,<15.0.0a0
- pyarrow>=16.0.0,<17.0.0a0
cuda_version:
specific:
- output_types: conda
@@ -631,7 +629,7 @@ dependencies:
packages:
- msgpack
- &tokenizers tokenizers==0.15.2
- &transformers transformers==4.38.1
- &transformers transformers==4.39.3
- tzdata
specific:
- output_types: conda
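A side note on the arrow pinning scheme earlier in this file: the build-time requirement hard-pins the exact patch series (`pyarrow==16.0.0.*`) while the run-time requirement floats within the 16.x minor series (`pyarrow>=16.0.0,<17.0.0a0`). A minimal sketch with the `packaging` library, using illustrative version numbers only, shows the difference:

```python
from packaging.specifiers import SpecifierSet

build_spec = SpecifierSet("==16.0.0.*")        # build: exact patch series only
run_spec = SpecifierSet(">=16.0.0,<17.0.0a0")  # run: any 16.x release

# A hypothetical 16.1.0 satisfies the run pin but not the build pin.
print("16.1.0" in run_spec)    # True
print("16.1.0" in build_spec)  # False
```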
5 changes: 0 additions & 5 deletions python/cudf/cudf/io/parquet.py
@@ -993,15 +993,10 @@ def to_parquet(
if index is None:
index = True

# Convert partition_file_name to a call back
if partition_file_name:
partition_file_name = lambda x: partition_file_name # noqa: E731

pa_table = df.to_arrow(preserve_index=index)
return pq.write_to_dataset(
pa_table,
root_path=path,
partition_filename_cb=partition_file_name,
Member:

Do we need to warn or raise if the user specifies partition_file_name and the engine is not "cudf"?

Contributor Author:

That might get very noisy because we have many such parameters, so I documented the change instead.

partition_cols=partition_cols,
*args,
**kwargs,
Binary file modified python/cudf/cudf/tests/data/parquet/usec_timestamp.parquet
14 changes: 1 addition & 13 deletions python/cudf/cudf/tests/test_dataframe.py
@@ -2824,13 +2824,7 @@ def test_from_arrow_chunked_arrays(nelem, nchunks, data_type):
]
pa_chunk_array = pa.chunked_array(np_list_data)

expect = pd.Series(pa_chunk_array.to_pandas())
if cudf.api.types.is_datetime64_dtype(
data_type
) or cudf.api.types.is_timedelta64_dtype(data_type):
# Workaround for an Arrow Bug:
# https://github.com/apache/arrow/issues/34462
expect = expect.astype(data_type)
expect = pa_chunk_array.to_pandas()
got = cudf.Series(pa_chunk_array)

assert_eq(expect, got)
@@ -2845,12 +2839,6 @@ def test_from_arrow_chunked_arrays(nelem, nchunks, data_type):
)

expect = pa_table.to_pandas()
if cudf.api.types.is_datetime64_dtype(
data_type
) or cudf.api.types.is_timedelta64_dtype(data_type):
# Workaround for an Arrow Bug:
# https://github.com/apache/arrow/issues/34462
expect = expect.astype(data_type)
got = cudf.DataFrame.from_arrow(pa_table)

assert_eq(expect, got)
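The removed casts were a workaround for apache/arrow#34462; with Arrow 16 the conversion is expected to preserve non-nanosecond resolutions on its own. A minimal sketch of the behavior these tests now rely on (assuming pyarrow 16 with pandas 2.x; values are illustrative):

```python
import numpy as np
import pyarrow as pa

chunks = [
    np.array([0, 1], dtype="datetime64[ms]"),
    np.array([2, 3], dtype="datetime64[ms]"),
]
pa_chunk_array = pa.chunked_array(chunks)

# With Arrow 16 the millisecond resolution should survive the round trip,
# so the explicit astype(data_type) workaround is no longer needed.
print(pa_chunk_array.to_pandas().dtype)  # expected: datetime64[ms]
```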
8 changes: 1 addition & 7 deletions python/cudf/cudf/tests/test_index.py
@@ -1523,13 +1523,7 @@ def test_index_from_arrow(data):
arrow_array = pa.Array.from_pandas(pdi)
expected_index = pd.Index(arrow_array.to_pandas())
gdi = cudf.Index.from_arrow(arrow_array)
if gdi.dtype == cudf.dtype("datetime64[s]"):
# Arrow bug:
# https://github.com/apache/arrow/issues/33321
# arrow cannot convert non-nanosecond
# resolution to appropriate type in pandas.
# Hence need to type-cast.
expected_index = expected_index.astype(gdi.dtype)

assert_eq(expected_index, gdi)


4 changes: 1 addition & 3 deletions python/cudf/cudf/tests/test_parquet.py
@@ -472,9 +472,7 @@ def test_parquet_read_filtered(tmpdir, rdg_seed):
# Because of this, we aren't using PyArrow as a reference for testing our
# row-group selection method since the only way to only select row groups
# with PyArrow is with the method we use and intend to test.
tbl_filtered = pq.read_table(
fname, filters=[("1", ">", 60)], use_legacy_dataset=False
)
tbl_filtered = pq.read_table(fname, filters=[("1", ">", 60)])

assert_eq(cudf.io.read_parquet_metadata(fname)[1], 2048 / 64)
print(len(df_filtered))
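For reference, `pq.read_table` is now called without `use_legacy_dataset`, which is deprecated in recent pyarrow releases and no longer needed once the dataset-based reader is the code path in use. A minimal, self-contained sketch (hypothetical file name, mirroring the test's string column name):

```python
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq

# Two 64-row row groups, similar in shape to what the test filters on.
df = pd.DataFrame({"1": range(128)})
pq.write_table(pa.Table.from_pandas(df), "data.parquet", row_group_size=64)

# The filters argument works the same way; the deprecated
# use_legacy_dataset flag is simply dropped from the call.
tbl_filtered = pq.read_table("data.parquet", filters=[("1", ">", 60)])
print(tbl_filtered.num_rows)
```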
3 changes: 2 additions & 1 deletion python/cudf/cudf/utils/ioutils.py
@@ -247,7 +247,8 @@
File name to use for partitioned datasets. Different partitions
will be written to different directories, but all files will
have this name. If nothing is specified, a random uuid4 hex string
will be used for each file.
will be used for each file. This parameter is only supported by the 'cudf'
engine and will be ignored by other engines.
partition_offsets : list, optional, default None
Offsets to partition the dataframe by. Should be used when path is list
of str. Should be a list of integers of size ``len(path) + 1``
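To illustrate the partition_file_name behavior documented above, a minimal sketch (hypothetical data and output paths, assuming the standard cudf `to_parquet` signature):

```python
import cudf

df = cudf.DataFrame({"key": [0, 0, 1], "val": [10, 20, 30]})

# With the cudf engine, every partition directory gets this file name.
df.to_parquet(
    "dataset_cudf",
    engine="cudf",
    partition_cols=["key"],
    partition_file_name="part.parquet",
)

# With the pyarrow engine the parameter is ignored and pyarrow chooses
# the file names inside each partition directory.
df.to_parquet(
    "dataset_pyarrow",
    engine="pyarrow",
    partition_cols=["key"],
    partition_file_name="part.parquet",
)
```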
6 changes: 3 additions & 3 deletions python/cudf/pyproject.toml
@@ -7,7 +7,7 @@ requires = [
"cython>=3.0.3",
"ninja",
"numpy==1.23.*",
"pyarrow==14.0.2.*",
"pyarrow==16.0.0.*",
"rmm==24.6.*",
"scikit-build-core[pyproject]>=0.7.0",
] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`.
@@ -34,7 +34,7 @@ dependencies = [
"packaging",
"pandas>=2.0,<2.2.3dev0",
"ptxcompiler",
"pyarrow>=14.0.1,<15.0.0a0",
"pyarrow>=16.0.0,<17.0.0a0",
"rich",
"rmm==24.6.*",
"typing_extensions>=4.0.0",
@@ -63,7 +63,7 @@ test = [
"pytest<8",
"scipy",
"tokenizers==0.15.2",
"transformers==4.38.1",
"transformers==4.39.3",
"tzdata",
] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`.
pandas-tests = [
2 changes: 1 addition & 1 deletion python/cudf_kafka/pyproject.toml
@@ -7,7 +7,7 @@ requires = [
"cython>=3.0.3",
"ninja",
"numpy==1.23.*",
"pyarrow==14.0.2.*",
"pyarrow==16.0.0.*",
"scikit-build-core[pyproject]>=0.7.0",
] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`.

4 changes: 1 addition & 3 deletions python/dask_cudf/dask_cudf/io/tests/test_parquet.py
@@ -166,9 +166,7 @@ def test_dask_timeseries_from_pandas(tmpdir):
pdf = ddf2.compute()
pdf.to_parquet(fn, engine="pyarrow")
read_df = dask_cudf.read_parquet(fn)
# Workaround until following issue is fixed:
# https://github.com/apache/arrow/issues/33321
dd.assert_eq(ddf2, read_df.compute(), check_index_type=False)
dd.assert_eq(ddf2, read_df.compute())


@pytest.mark.parametrize("index", [False, None])