diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml index 7a5fef9f25e..48699b81eed 100644 --- a/conda/environments/all_cuda-118_arch-x86_64.yaml +++ b/conda/environments/all_cuda-118_arch-x86_64.yaml @@ -36,15 +36,15 @@ dependencies: - hypothesis - identify>=2.5.20 - ipython -- libarrow-acero==14.0.2.* -- libarrow-dataset==14.0.2.* -- libarrow==14.0.2.* +- libarrow-acero==16.0.0.* +- libarrow-dataset==16.0.0.* +- libarrow==16.0.0.* - libcufile-dev=1.4.0.31 - libcufile=1.4.0.31 - libcurand-dev=10.3.0.86 - libcurand=10.3.0.86 - libkvikio==24.6.* -- libparquet==14.0.2.* +- libparquet==16.0.0.* - librdkafka>=1.9.0,<1.10.0a0 - librmm==24.6.* - make @@ -66,7 +66,7 @@ dependencies: - pip - pre-commit - ptxcompiler -- pyarrow==14.0.2.* +- pyarrow==16.0.0.* - pydata-sphinx-theme!=0.14.2 - pytest-benchmark - pytest-cases>=3.8.2 @@ -92,7 +92,7 @@ dependencies: - streamz - sysroot_linux-64==2.17 - tokenizers==0.15.2 -- transformers==4.38.1 +- transformers==4.39.3 - typing_extensions>=4.0.0 - zlib>=1.2.13 - pip: diff --git a/conda/environments/all_cuda-122_arch-x86_64.yaml b/conda/environments/all_cuda-122_arch-x86_64.yaml index 48453e18bb0..d06a727f331 100644 --- a/conda/environments/all_cuda-122_arch-x86_64.yaml +++ b/conda/environments/all_cuda-122_arch-x86_64.yaml @@ -37,13 +37,13 @@ dependencies: - hypothesis - identify>=2.5.20 - ipython -- libarrow-acero==14.0.2.* -- libarrow-dataset==14.0.2.* -- libarrow==14.0.2.* +- libarrow-acero==16.0.0.* +- libarrow-dataset==16.0.0.* +- libarrow==16.0.0.* - libcufile-dev - libcurand-dev - libkvikio==24.6.* -- libparquet==14.0.2.* +- libparquet==16.0.0.* - librdkafka>=1.9.0,<1.10.0a0 - librmm==24.6.* - make @@ -63,7 +63,7 @@ dependencies: - pandoc - pip - pre-commit -- pyarrow==14.0.2.* +- pyarrow==16.0.0.* - pydata-sphinx-theme!=0.14.2 - pynvjitlink - pytest-benchmark @@ -90,7 +90,7 @@ dependencies: - streamz - sysroot_linux-64==2.17 - tokenizers==0.15.2 -- transformers==4.38.1 +- transformers==4.39.3 - typing_extensions>=4.0.0 - zlib>=1.2.13 - pip: diff --git a/conda/recipes/cudf/meta.yaml b/conda/recipes/cudf/meta.yaml index ddcadfd1570..24210830ada 100644 --- a/conda/recipes/cudf/meta.yaml +++ b/conda/recipes/cudf/meta.yaml @@ -65,7 +65,7 @@ requirements: - setuptools - dlpack >=0.8,<1.0 - numpy 1.23 - - pyarrow ==14.0.2.* + - pyarrow ==16.0.0.* - libcudf ={{ version }} - rmm ={{ minor_version }} {% if cuda_major == "11" %} diff --git a/conda/recipes/libcudf/conda_build_config.yaml b/conda/recipes/libcudf/conda_build_config.yaml index ba5e96fb6cf..61ffcf3c3de 100644 --- a/conda/recipes/libcudf/conda_build_config.yaml +++ b/conda/recipes/libcudf/conda_build_config.yaml @@ -20,7 +20,7 @@ cmake_version: - ">=3.26.4" libarrow_version: - - "==14.0.2" + - "==16.0.0" dlpack_version: - ">=0.8,<1.0" diff --git a/cpp/cmake/thirdparty/get_arrow.cmake b/cpp/cmake/thirdparty/get_arrow.cmake index 892056959c8..70283efbd79 100644 --- a/cpp/cmake/thirdparty/get_arrow.cmake +++ b/cpp/cmake/thirdparty/get_arrow.cmake @@ -410,7 +410,7 @@ if(NOT DEFINED CUDF_VERSION_Arrow) set(CUDF_VERSION_Arrow # This version must be kept in sync with the libarrow version pinned for builds in # dependencies.yaml. - 14.0.2 + 16.0.0 CACHE STRING "The version of Arrow to find (or build)" ) endif() diff --git a/dependencies.yaml b/dependencies.yaml index 1508656471d..7fe67817f73 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -266,7 +266,7 @@ dependencies: - cython>=3.0.3 # Hard pin the patch version used during the build. 
This must be kept # in sync with the version pinned in get_arrow.cmake. - - pyarrow==14.0.2.* + - pyarrow==16.0.0.* - output_types: conda packages: - scikit-build-core>=0.7.0 @@ -312,27 +312,25 @@ dependencies: packages: # Hard pin the Arrow patch version used during the build. This must # be kept in sync with the version pinned in get_arrow.cmake. - - libarrow-acero==14.0.2.* - - libarrow-dataset==14.0.2.* - - libarrow==14.0.2.* - - libparquet==14.0.2.* + - libarrow-acero==16.0.0.* + - libarrow-dataset==16.0.0.* + - libarrow==16.0.0.* + - libparquet==16.0.0.* libarrow_run: common: - output_types: conda packages: # Allow runtime version to float up to minor version - # Disallow libarrow 14.0.0 due to a CVE - - libarrow-acero>=14.0.1,<15.0.0a0 - - libarrow-dataset>=14.0.1,<15.0.0a0 - - libarrow>=14.0.1,<15.0.0a0 - - libparquet>=14.0.1,<15.0.0a0 + - libarrow-acero>=16.0.0,<17.0.0a0 + - libarrow-dataset>=16.0.0,<17.0.0a0 + - libarrow>=16.0.0,<17.0.0a0 + - libparquet>=16.0.0,<17.0.0a0 pyarrow_run: common: - output_types: [conda, requirements, pyproject] packages: # Allow runtime version to float up to minor version - # Disallow pyarrow 14.0.0 due to a CVE - - pyarrow>=14.0.1,<15.0.0a0 + - pyarrow>=16.0.0,<17.0.0a0 cuda_version: specific: - output_types: conda @@ -631,7 +629,7 @@ dependencies: packages: - msgpack - &tokenizers tokenizers==0.15.2 - - &transformers transformers==4.38.1 + - &transformers transformers==4.39.3 - tzdata specific: - output_types: conda diff --git a/python/cudf/cudf/io/parquet.py b/python/cudf/cudf/io/parquet.py index e7f1ad0751f..dd1e59acaaa 100644 --- a/python/cudf/cudf/io/parquet.py +++ b/python/cudf/cudf/io/parquet.py @@ -993,15 +993,10 @@ def to_parquet( if index is None: index = True - # Convert partition_file_name to a call back - if partition_file_name: - partition_file_name = lambda x: partition_file_name # noqa: E731 - pa_table = df.to_arrow(preserve_index=index) return pq.write_to_dataset( pa_table, root_path=path, - partition_filename_cb=partition_file_name, partition_cols=partition_cols, *args, **kwargs, diff --git a/python/cudf/cudf/tests/data/parquet/usec_timestamp.parquet b/python/cudf/cudf/tests/data/parquet/usec_timestamp.parquet index 20ef3cc5578..efde6ff11bf 100644 Binary files a/python/cudf/cudf/tests/data/parquet/usec_timestamp.parquet and b/python/cudf/cudf/tests/data/parquet/usec_timestamp.parquet differ diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index 20e9f41de63..8550bc91253 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -2824,13 +2824,7 @@ def test_from_arrow_chunked_arrays(nelem, nchunks, data_type): ] pa_chunk_array = pa.chunked_array(np_list_data) - expect = pd.Series(pa_chunk_array.to_pandas()) - if cudf.api.types.is_datetime64_dtype( - data_type - ) or cudf.api.types.is_timedelta64_dtype(data_type): - # Workaround for an Arrow Bug: - # https://github.com/apache/arrow/issues/34462 - expect = expect.astype(data_type) + expect = pa_chunk_array.to_pandas() got = cudf.Series(pa_chunk_array) assert_eq(expect, got) @@ -2845,12 +2839,6 @@ def test_from_arrow_chunked_arrays(nelem, nchunks, data_type): ) expect = pa_table.to_pandas() - if cudf.api.types.is_datetime64_dtype( - data_type - ) or cudf.api.types.is_timedelta64_dtype(data_type): - # Workaround for an Arrow Bug: - # https://github.com/apache/arrow/issues/34462 - expect = expect.astype(data_type) got = cudf.DataFrame.from_arrow(pa_table) assert_eq(expect, got) 
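Context for the `python/cudf/cudf/io/parquet.py` hunk above: pyarrow has removed the legacy `write_to_dataset` code path and, with it, the `partition_filename_cb` keyword, so cudf can no longer translate `partition_file_name` into a callback when delegating to pyarrow; the shim is deleted rather than ported. Below is a minimal sketch (not part of this PR) of the call shape that remains, assuming pyarrow >= 16; the root path and column names are made up, and `basename_template` is, as far as we can tell, the closest new-API stand-in for the removed callback.

import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq

# "key" and "val" are illustrative columns for a small partitioned write.
df = pd.DataFrame({"key": ["a", "a", "b"], "val": [1, 2, 3]})
pa_table = pa.Table.from_pandas(df, preserve_index=True)

# pyarrow <= 14 accepted partition_filename_cb=lambda keys: "name.parquet".
# pyarrow >= 16 instead offers basename_template, where "{i}" is replaced
# with a per-partition file counter.
pq.write_to_dataset(
    pa_table,
    root_path="dataset_root",  # illustrative output directory
    partition_cols=["key"],
    basename_template="part-{i}.parquet",
)

This is also why the `ioutils.py` docstring change below scopes `partition_file_name` to the 'cudf' engine.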
diff --git a/python/cudf/cudf/tests/test_index.py b/python/cudf/cudf/tests/test_index.py index 0b252cec4b8..3cc6bfdbdc2 100644 --- a/python/cudf/cudf/tests/test_index.py +++ b/python/cudf/cudf/tests/test_index.py @@ -1523,13 +1523,7 @@ def test_index_from_arrow(data): arrow_array = pa.Array.from_pandas(pdi) expected_index = pd.Index(arrow_array.to_pandas()) gdi = cudf.Index.from_arrow(arrow_array) - if gdi.dtype == cudf.dtype("datetime64[s]"): - # Arrow bug: - # https://github.com/apache/arrow/issues/33321 - # arrow cannot convert non-nanosecond - # resolution to appropriate type in pandas. - # Hence need to type-cast. - expected_index = expected_index.astype(gdi.dtype) + assert_eq(expected_index, gdi) diff --git a/python/cudf/cudf/tests/test_parquet.py b/python/cudf/cudf/tests/test_parquet.py index 1e175f5ff0d..cf3c0e7f7a0 100644 --- a/python/cudf/cudf/tests/test_parquet.py +++ b/python/cudf/cudf/tests/test_parquet.py @@ -472,9 +472,7 @@ def test_parquet_read_filtered(tmpdir, rdg_seed): # Because of this, we aren't using PyArrow as a reference for testing our # row-group selection method since the only way to only select row groups # with PyArrow is with the method we use and intend to test. - tbl_filtered = pq.read_table( - fname, filters=[("1", ">", 60)], use_legacy_dataset=False - ) + tbl_filtered = pq.read_table(fname, filters=[("1", ">", 60)]) assert_eq(cudf.io.read_parquet_metadata(fname)[1], 2048 / 64) print(len(df_filtered)) diff --git a/python/cudf/cudf/utils/ioutils.py b/python/cudf/cudf/utils/ioutils.py index 6bd7558d322..9c7c687a6ed 100644 --- a/python/cudf/cudf/utils/ioutils.py +++ b/python/cudf/cudf/utils/ioutils.py @@ -247,7 +247,8 @@ File name to use for partitioned datasets. Different partitions will be written to different directories, but all files will have this name. If nothing is specified, a random uuid4 hex string - will be used for each file. + will be used for each file. This parameter is only supported by the 'cudf' + engine and is ignored by other engines. partition_offsets : list, optional, default None Offsets to partition the dataframe by. Should be used when path is list of str. Should be a list of integers of size ``len(path) + 1`` diff --git a/python/cudf/pyproject.toml b/python/cudf/pyproject.toml index fc3a243572f..4b57bcd018a 100644 --- a/python/cudf/pyproject.toml +++ b/python/cudf/pyproject.toml @@ -7,7 +7,7 @@ requires = [ "cython>=3.0.3", "ninja", "numpy==1.23.*", - "pyarrow==14.0.2.*", + "pyarrow==16.0.0.*", "rmm==24.6.*", "scikit-build-core[pyproject]>=0.7.0", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. @@ -34,7 +34,7 @@ dependencies = [ "packaging", "pandas>=2.0,<2.2.3dev0", "ptxcompiler", - "pyarrow>=14.0.1,<15.0.0a0", + "pyarrow>=16.0.0,<17.0.0a0", "rich", "rmm==24.6.*", "typing_extensions>=4.0.0", @@ -63,7 +63,7 @@ test = [ "pytest<8", "scipy", "tokenizers==0.15.2", - "transformers==4.38.1", + "transformers==4.39.3", "tzdata", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. 
pandas-tests = [ diff --git a/python/cudf_kafka/pyproject.toml b/python/cudf_kafka/pyproject.toml index eb48852202a..787dd8a97d7 100644 --- a/python/cudf_kafka/pyproject.toml +++ b/python/cudf_kafka/pyproject.toml @@ -7,7 +7,7 @@ requires = [ "cython>=3.0.3", "ninja", "numpy==1.23.*", - "pyarrow==14.0.2.*", + "pyarrow==16.0.0.*", "scikit-build-core[pyproject]>=0.7.0", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. diff --git a/python/dask_cudf/dask_cudf/io/tests/test_parquet.py b/python/dask_cudf/dask_cudf/io/tests/test_parquet.py index 2c44f192612..39800145585 100644 --- a/python/dask_cudf/dask_cudf/io/tests/test_parquet.py +++ b/python/dask_cudf/dask_cudf/io/tests/test_parquet.py @@ -166,9 +166,7 @@ def test_dask_timeseries_from_pandas(tmpdir): pdf = ddf2.compute() pdf.to_parquet(fn, engine="pyarrow") read_df = dask_cudf.read_parquet(fn) - # Workaround until following issue is fixed: - # https://github.com/apache/arrow/issues/33321 - dd.assert_eq(ddf2, read_df.compute(), check_index_type=False) + dd.assert_eq(ddf2, read_df.compute()) @pytest.mark.parametrize("index", [False, None])
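The test simplifications throughout this diff follow from behavior that ships in pyarrow 16. With pandas >= 2.0, Arrow's `to_pandas()` preserves non-nanosecond timestamp resolution instead of coercing everything to nanoseconds, which is why the `astype` workarounds for apache/arrow#33321 and apache/arrow#34462 (including the `check_index_type=False` escape hatch in the dask_cudf round trip above) could be dropped, and why `pq.read_table` no longer needs `use_legacy_dataset=False`: the dataset-based reader is the only code path left. A minimal sanity check of the resolution behavior, assuming pyarrow >= 16 and pandas >= 2.0:

import pyarrow as pa

# Seconds-resolution timestamps now round-trip to pandas unchanged, so the
# expect = expect.astype(data_type) shims removed above are unnecessary.
arr = pa.array([0, 1, 2], type=pa.timestamp("s"))
series = arr.to_pandas()
print(series.dtype)  # datetime64[s], not datetime64[ns]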