From 64b38101a7b095f5ff7d8af691c91d759163f424 Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Thu, 9 May 2024 02:13:14 +0000 Subject: [PATCH 1/6] Upgrade arrow to 16 --- conda/environments/all_cuda-118_arch-x86_64.yaml | 12 ++++++------ conda/environments/all_cuda-122_arch-x86_64.yaml | 12 ++++++------ conda/recipes/cudf/meta.yaml | 2 +- conda/recipes/libcudf/conda_build_config.yaml | 2 +- cpp/cmake/thirdparty/get_arrow.cmake | 2 +- dependencies.yaml | 12 ++++++------ python/cudf/pyproject.toml | 4 ++-- python/cudf_kafka/pyproject.toml | 2 +- 8 files changed, 24 insertions(+), 24 deletions(-) diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml index 7a5fef9f25e..48699b81eed 100644 --- a/conda/environments/all_cuda-118_arch-x86_64.yaml +++ b/conda/environments/all_cuda-118_arch-x86_64.yaml @@ -36,15 +36,15 @@ dependencies: - hypothesis - identify>=2.5.20 - ipython -- libarrow-acero==14.0.2.* -- libarrow-dataset==14.0.2.* -- libarrow==14.0.2.* +- libarrow-acero==16.0.0.* +- libarrow-dataset==16.0.0.* +- libarrow==16.0.0.* - libcufile-dev=1.4.0.31 - libcufile=1.4.0.31 - libcurand-dev=10.3.0.86 - libcurand=10.3.0.86 - libkvikio==24.6.* -- libparquet==14.0.2.* +- libparquet==16.0.0.* - librdkafka>=1.9.0,<1.10.0a0 - librmm==24.6.* - make @@ -66,7 +66,7 @@ dependencies: - pip - pre-commit - ptxcompiler -- pyarrow==14.0.2.* +- pyarrow==16.0.0.* - pydata-sphinx-theme!=0.14.2 - pytest-benchmark - pytest-cases>=3.8.2 @@ -92,7 +92,7 @@ dependencies: - streamz - sysroot_linux-64==2.17 - tokenizers==0.15.2 -- transformers==4.38.1 +- transformers==4.39.3 - typing_extensions>=4.0.0 - zlib>=1.2.13 - pip: diff --git a/conda/environments/all_cuda-122_arch-x86_64.yaml b/conda/environments/all_cuda-122_arch-x86_64.yaml index 48453e18bb0..d06a727f331 100644 --- a/conda/environments/all_cuda-122_arch-x86_64.yaml +++ b/conda/environments/all_cuda-122_arch-x86_64.yaml @@ -37,13 +37,13 @@ dependencies: - hypothesis - 
identify>=2.5.20 - ipython -- libarrow-acero==14.0.2.* -- libarrow-dataset==14.0.2.* -- libarrow==14.0.2.* +- libarrow-acero==16.0.0.* +- libarrow-dataset==16.0.0.* +- libarrow==16.0.0.* - libcufile-dev - libcurand-dev - libkvikio==24.6.* -- libparquet==14.0.2.* +- libparquet==16.0.0.* - librdkafka>=1.9.0,<1.10.0a0 - librmm==24.6.* - make @@ -63,7 +63,7 @@ dependencies: - pandoc - pip - pre-commit -- pyarrow==14.0.2.* +- pyarrow==16.0.0.* - pydata-sphinx-theme!=0.14.2 - pynvjitlink - pytest-benchmark @@ -90,7 +90,7 @@ dependencies: - streamz - sysroot_linux-64==2.17 - tokenizers==0.15.2 -- transformers==4.38.1 +- transformers==4.39.3 - typing_extensions>=4.0.0 - zlib>=1.2.13 - pip: diff --git a/conda/recipes/cudf/meta.yaml b/conda/recipes/cudf/meta.yaml index ddcadfd1570..24210830ada 100644 --- a/conda/recipes/cudf/meta.yaml +++ b/conda/recipes/cudf/meta.yaml @@ -65,7 +65,7 @@ requirements: - setuptools - dlpack >=0.8,<1.0 - numpy 1.23 - - pyarrow ==14.0.2.* + - pyarrow ==16.0.0.* - libcudf ={{ version }} - rmm ={{ minor_version }} {% if cuda_major == "11" %} diff --git a/conda/recipes/libcudf/conda_build_config.yaml b/conda/recipes/libcudf/conda_build_config.yaml index ba5e96fb6cf..61ffcf3c3de 100644 --- a/conda/recipes/libcudf/conda_build_config.yaml +++ b/conda/recipes/libcudf/conda_build_config.yaml @@ -20,7 +20,7 @@ cmake_version: - ">=3.26.4" libarrow_version: - - "==14.0.2" + - "==16.0.0" dlpack_version: - ">=0.8,<1.0" diff --git a/cpp/cmake/thirdparty/get_arrow.cmake b/cpp/cmake/thirdparty/get_arrow.cmake index 892056959c8..70283efbd79 100644 --- a/cpp/cmake/thirdparty/get_arrow.cmake +++ b/cpp/cmake/thirdparty/get_arrow.cmake @@ -410,7 +410,7 @@ if(NOT DEFINED CUDF_VERSION_Arrow) set(CUDF_VERSION_Arrow # This version must be kept in sync with the libarrow version pinned for builds in # dependencies.yaml. 
- 14.0.2 + 16.0.0 CACHE STRING "The version of Arrow to find (or build)" ) endif() diff --git a/dependencies.yaml b/dependencies.yaml index 1508656471d..640c6e2dee9 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -266,7 +266,7 @@ dependencies: - cython>=3.0.3 # Hard pin the patch version used during the build. This must be kept # in sync with the version pinned in get_arrow.cmake. - - pyarrow==14.0.2.* + - pyarrow==16.0.0.* - output_types: conda packages: - scikit-build-core>=0.7.0 @@ -312,10 +312,10 @@ dependencies: packages: # Hard pin the Arrow patch version used during the build. This must # be kept in sync with the version pinned in get_arrow.cmake. - - libarrow-acero==14.0.2.* - - libarrow-dataset==14.0.2.* - - libarrow==14.0.2.* - - libparquet==14.0.2.* + - libarrow-acero==16.0.0.* + - libarrow-dataset==16.0.0.* + - libarrow==16.0.0.* + - libparquet==16.0.0.* libarrow_run: common: - output_types: conda @@ -631,7 +631,7 @@ dependencies: packages: - msgpack - &tokenizers tokenizers==0.15.2 - - &transformers transformers==4.38.1 + - &transformers transformers==4.39.3 - tzdata specific: - output_types: conda diff --git a/python/cudf/pyproject.toml b/python/cudf/pyproject.toml index fc3a243572f..9471a4b649b 100644 --- a/python/cudf/pyproject.toml +++ b/python/cudf/pyproject.toml @@ -7,7 +7,7 @@ requires = [ "cython>=3.0.3", "ninja", "numpy==1.23.*", - "pyarrow==14.0.2.*", + "pyarrow==16.0.0.*", "rmm==24.6.*", "scikit-build-core[pyproject]>=0.7.0", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. @@ -63,7 +63,7 @@ test = [ "pytest<8", "scipy", "tokenizers==0.15.2", - "transformers==4.38.1", + "transformers==4.39.3", "tzdata", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. 
pandas-tests = [ diff --git a/python/cudf_kafka/pyproject.toml b/python/cudf_kafka/pyproject.toml index eb48852202a..787dd8a97d7 100644 --- a/python/cudf_kafka/pyproject.toml +++ b/python/cudf_kafka/pyproject.toml @@ -7,7 +7,7 @@ requires = [ "cython>=3.0.3", "ninja", "numpy==1.23.*", - "pyarrow==14.0.2.*", + "pyarrow==16.0.0.*", "scikit-build-core[pyproject]>=0.7.0", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. From 595f8cff7ed555e8c00374e831ef91ad63da9419 Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Thu, 9 May 2024 02:20:50 +0000 Subject: [PATCH 2/6] runtime versions --- dependencies.yaml | 10 +++++----- python/cudf/pyproject.toml | 2 +- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/dependencies.yaml b/dependencies.yaml index 640c6e2dee9..32893ea495c 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -322,17 +322,17 @@ dependencies: packages: # Allow runtime version to float up to minor version # Disallow libarrow 14.0.0 due to a CVE - - libarrow-acero>=14.0.1,<15.0.0a0 - - libarrow-dataset>=14.0.1,<15.0.0a0 - - libarrow>=14.0.1,<15.0.0a0 - - libparquet>=14.0.1,<15.0.0a0 + - libarrow-acero>=16.0.0,<17.0.0a0 + - libarrow-dataset>=16.0.0,<17.0.0a0 + - libarrow>=16.0.0,<17.0.0a0 + - libparquet>=16.0.0,<17.0.0a0 pyarrow_run: common: - output_types: [conda, requirements, pyproject] packages: # Allow runtime version to float up to minor version # Disallow pyarrow 14.0.0 due to a CVE - - pyarrow>=14.0.1,<15.0.0a0 + - pyarrow>=16.0.0,<17.0.0a0 cuda_version: specific: - output_types: conda diff --git a/python/cudf/pyproject.toml b/python/cudf/pyproject.toml index 9471a4b649b..4b57bcd018a 100644 --- a/python/cudf/pyproject.toml +++ b/python/cudf/pyproject.toml @@ -34,7 +34,7 @@ dependencies = [ "packaging", "pandas>=2.0,<2.2.3dev0", "ptxcompiler", - "pyarrow>=14.0.1,<15.0.0a0", + "pyarrow>=16.0.0,<17.0.0a0", "rich", "rmm==24.6.*", 
"typing_extensions>=4.0.0", From dd755de8c43040eec4669c5b9c7f487676572985 Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Thu, 9 May 2024 03:05:23 +0000 Subject: [PATCH 3/6] fix code --- python/cudf/cudf/io/parquet.py | 5 ----- python/cudf/cudf/tests/test_parquet.py | 4 +--- 2 files changed, 1 insertion(+), 8 deletions(-) diff --git a/python/cudf/cudf/io/parquet.py b/python/cudf/cudf/io/parquet.py index e7f1ad0751f..dd1e59acaaa 100644 --- a/python/cudf/cudf/io/parquet.py +++ b/python/cudf/cudf/io/parquet.py @@ -993,15 +993,10 @@ def to_parquet( if index is None: index = True - # Convert partition_file_name to a call back - if partition_file_name: - partition_file_name = lambda x: partition_file_name # noqa: E731 - pa_table = df.to_arrow(preserve_index=index) return pq.write_to_dataset( pa_table, root_path=path, - partition_filename_cb=partition_file_name, partition_cols=partition_cols, *args, **kwargs, diff --git a/python/cudf/cudf/tests/test_parquet.py b/python/cudf/cudf/tests/test_parquet.py index 1e175f5ff0d..cf3c0e7f7a0 100644 --- a/python/cudf/cudf/tests/test_parquet.py +++ b/python/cudf/cudf/tests/test_parquet.py @@ -472,9 +472,7 @@ def test_parquet_read_filtered(tmpdir, rdg_seed): # Because of this, we aren't using PyArrow as a reference for testing our # row-group selection method since the only way to only select row groups # with PyArrow is with the method we use and intend to test. 
- tbl_filtered = pq.read_table( - fname, filters=[("1", ">", 60)], use_legacy_dataset=False - ) + tbl_filtered = pq.read_table(fname, filters=[("1", ">", 60)]) assert_eq(cudf.io.read_parquet_metadata(fname)[1], 2048 / 64) print(len(df_filtered)) From 6362432a618bc9dc6e464a64e3747ae7c6ac1104 Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Thu, 9 May 2024 03:24:09 +0000 Subject: [PATCH 4/6] Fix pytests --- .../tests/data/parquet/usec_timestamp.parquet | Bin 1128 -> 2323 bytes python/cudf/cudf/tests/test_dataframe.py | 14 +------------- python/cudf/cudf/tests/test_index.py | 8 +------- .../dask_cudf/io/tests/test_parquet.py | 4 +--- 4 files changed, 3 insertions(+), 23 deletions(-) diff --git a/python/cudf/cudf/tests/data/parquet/usec_timestamp.parquet b/python/cudf/cudf/tests/data/parquet/usec_timestamp.parquet index 20ef3cc5578426c7fa4fabe32692f43d3cd1c66c..efde6ff11bf97254c84827f1fe2703035fae4c34 100644 GIT binary patch literal 2323 zcmcguL2u$l6dnj6v}z=}!U!Y|$l67fO0qy8Evxh}kN~EH1hTP#)e3oR17>YQz=p6< zj=l9z^|Z%TkL{(uqsN|BJyzf;Zo>4#+}aKt1mF-IA2|y}f+XAx($z#=D ziuk9}K+#3U2ibV$g#G2g3j_4i-09Q;Eh_8`wHT&6D*7Wr~GBn8RByd^{aMj;>>cGi8|Egi` zXV;xC#di{5`?TYoD55!<*g4TuG7%>Wnv(HCoJ>Y@3N}+FQ*~reX?lM*wVPsrtn2!Y zGP4VR>@|vff}70~;f2#2kw3Sd)A4z~h%s)!q~mLhv$)X5C5-#H*tcbc!is)Qg-1*M zA*Scy9fpl!>p6_)7|y|JgTU$|CLzTkAt>Xi9xD^s753U3nQ zSUph-gwygp5o|lHL64OAOh>rv9hdSwPD@R>Vs%I|muVv8TpLYVaQj@z1iajU2=C_WcFw@AoJ*7PoZ{T@srTfHHn(-fYMSU8;5Ri+o%aDRO{(ev*+khUPh=8T0Gc3R z12Kn%Rlq2uYz9z7?EnkRw8^ Date: Wed, 8 May 2024 22:35:32 -0500 Subject: [PATCH 5/6] Apply suggestions from code review Co-authored-by: Bradley Dice --- dependencies.yaml | 2 -- 1 file changed, 2 deletions(-) diff --git a/dependencies.yaml b/dependencies.yaml index 32893ea495c..7fe67817f73 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -321,7 +321,6 @@ dependencies: - output_types: conda packages: # Allow runtime version to float up to minor version - # Disallow libarrow 14.0.0 due to a CVE - libarrow-acero>=16.0.0,<17.0.0a0 - 
libarrow-dataset>=16.0.0,<17.0.0a0 - libarrow>=16.0.0,<17.0.0a0 @@ -331,7 +330,6 @@ dependencies: - output_types: [conda, requirements, pyproject] packages: # Allow runtime version to float up to minor version - # Disallow pyarrow 14.0.0 due to a CVE - pyarrow>=16.0.0,<17.0.0a0 cuda_version: specific: From 68259ee702935390687cab060d88433948e0f601 Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Thu, 9 May 2024 13:42:28 +0000 Subject: [PATCH 6/6] document --- python/cudf/cudf/utils/ioutils.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/python/cudf/cudf/utils/ioutils.py b/python/cudf/cudf/utils/ioutils.py index 6bd7558d322..9c7c687a6ed 100644 --- a/python/cudf/cudf/utils/ioutils.py +++ b/python/cudf/cudf/utils/ioutils.py @@ -247,7 +247,8 @@ File name to use for partitioned datasets. Different partitions will be written to different directories, but all files will have this name. If nothing is specified, a random uuid4 hex string - will be used for each file. + will be used for each file. This parameter is only supported by the + 'cudf' engine, and will be ignored by other engines. partition_offsets : list, optional, default None Offsets to partition the dataframe by. Should be used when path is list of str. Should be a list of integers of size ``len(path) + 1``