merge

rapidsai · Mar 7, 2023 · 8ff7170 · 8ff7170
2 parents f200a5a + 97d8d12
commit 8ff7170
Show file tree

Hide file tree

Showing 62 changed files with 1,048 additions and 315 deletions.
diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml
@@ -113,9 +113,8 @@ jobs:
       build_type: pull-request
       package-name: cudf
       # Install cupy-cuda11x for arm from a special index url
-      # Install tokenizers last binary wheel to avoid a Rust compile from the latest sdist
-      test-before-arm64: "pip install tokenizers==0.10.2 cupy-cuda11x -f https://pip.cupy.dev/aarch64"
-      test-unittest: "pytest -v -n 8 ./python/cudf/cudf/tests"
+      test-before-arm64: "python -m pip install cupy-cuda11x -f https://pip.cupy.dev/aarch64"
+      test-unittest: "python -m pytest -v -n 8 ./python/cudf/cudf/tests"
       test-smoketest: "python ./ci/wheel_smoke_test_cudf.py"
   wheel-build-dask-cudf:
     needs: wheel-tests-cudf
@@ -125,7 +124,7 @@ jobs:
       build_type: pull-request
       package-name: dask_cudf
       package-dir: python/dask_cudf
-      before-wheel: "RAPIDS_PY_WHEEL_NAME=cudf_cu11 rapids-download-wheels-from-s3 ./local-cudf && pip install --no-deps ./local-cudf/cudf*.whl"
+      before-wheel: "RAPIDS_PY_WHEEL_NAME=cudf_cu11 rapids-download-wheels-from-s3 ./local-cudf && python -m pip install --no-deps ./local-cudf/cudf*.whl"
       uses-setup-env-vars: false
   wheel-tests-dask-cudf:
     needs: wheel-build-dask-cudf
@@ -134,5 +133,5 @@ jobs:
     with:
       build_type: pull-request
       package-name: dask_cudf
-      test-before: "RAPIDS_PY_WHEEL_NAME=cudf_cu11 rapids-download-wheels-from-s3 ./local-cudf-dep && pip install --no-deps ./local-cudf-dep/cudf*.whl"
-      test-unittest: "pytest -v -n 8 ./python/dask_cudf/dask_cudf/tests"
+      test-before: "RAPIDS_PY_WHEEL_NAME=cudf_cu11 rapids-download-wheels-from-s3 ./local-cudf-dep && python -m pip install --no-deps ./local-cudf-dep/cudf*.whl"
+      test-unittest: "python -m pytest -v -n 8 ./python/dask_cudf/dask_cudf/tests"
diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml
@@ -86,8 +86,8 @@ jobs:
       date: ${{ inputs.date }}
       sha: ${{ inputs.sha }}
       package-name: cudf
-      test-before-arm64: "pip install tokenizers==0.10.2 cupy-cuda11x -f https://pip.cupy.dev/aarch64"
-      test-unittest: "pytest -v -n 8 ./python/cudf/cudf/tests"
+      test-before-arm64: "python -m pip install cupy-cuda11x -f https://pip.cupy.dev/aarch64"
+      test-unittest: "python -m pytest -v -n 8 ./python/cudf/cudf/tests"
   wheel-tests-dask-cudf:
     secrets: inherit
     uses: rapidsai/shared-action-workflows/.github/workflows/[email protected]
@@ -97,4 +97,4 @@ jobs:
       date: ${{ inputs.date }}
       sha: ${{ inputs.sha }}
       package-name: dask_cudf
-      test-unittest: "pytest -v -n 8 ./python/dask_cudf/dask_cudf/tests"
+      test-unittest: "python -m pytest -v -n 8 ./python/dask_cudf/dask_cudf/tests"
diff --git a/ci/build_docs.sh b/ci/build_docs.sh
@@ -41,7 +41,8 @@ popd
 
 
 if [[ ${RAPIDS_BUILD_TYPE} == "branch" ]]; then
-  aws s3 sync --delete cpp/doxygen/html "s3://rapidsai-docs/libcudf/${VERSION_NUMBER}/html"
-  aws s3 sync --delete docs/cudf/_html "s3://rapidsai-docs/cudf/${VERSION_NUMBER}/html"
-  aws s3 sync --delete docs/cudf/_text "s3://rapidsai-docs/cudf/${VERSION_NUMBER}/txt"
+  rapids-logger "Upload Docs to S3"
+  aws s3 sync --no-progress --delete cpp/doxygen/html "s3://rapidsai-docs/libcudf/${VERSION_NUMBER}/html"
+  aws s3 sync --no-progress --delete docs/cudf/_html "s3://rapidsai-docs/cudf/${VERSION_NUMBER}/html"
+  aws s3 sync --no-progress --delete docs/cudf/_text "s3://rapidsai-docs/cudf/${VERSION_NUMBER}/txt"
 fi
diff --git a/ci/release/apply_wheel_modifications.sh b/ci/release/apply_wheel_modifications.sh
@@ -12,22 +12,19 @@ sed -i "s/__version__ = .*/__version__ = \"${VERSION}\"/g" python/dask_cudf/dask
 sed -i "s/__version__ = .*/__version__ = \"${VERSION}\"/g" python/cudf_kafka/cudf_kafka/__init__.py
 sed -i "s/__version__ = .*/__version__ = \"${VERSION}\"/g" python/custreamz/custreamz/__init__.py
 
-# setup.py versions
-sed -i "s/version=.*,/version=\"${VERSION}\",/g" python/cudf/setup.py
-sed -i "s/version=.*,/version=\"${VERSION}\",/g" python/dask_cudf/setup.py
-sed -i "s/version=.*,/version=\"${VERSION}\",/g" python/cudf_kafka/setup.py
-sed -i "s/version=.*,/version=\"${VERSION}\",/g" python/custreamz/setup.py
-
-# cudf setup.py cuda suffixes
-sed -i "s/name=\"cudf\"/name=\"cudf${CUDA_SUFFIX}\"/g" python/cudf/setup.py
-sed -i "s/rmm/rmm${CUDA_SUFFIX}/g" python/cudf/setup.py
-sed -i "s/ptxcompiler/ptxcompiler${CUDA_SUFFIX}/g" python/cudf/setup.py
-sed -i "s/cubinlinker/cubinlinker${CUDA_SUFFIX}/g" python/cudf/setup.py
+# pyproject.toml versions
+sed -i "s/^version = .*/version = \"${VERSION}\"/g" python/cudf/pyproject.toml
+sed -i "s/^version = .*/version = \"${VERSION}\"/g" python/dask_cudf/pyproject.toml
+sed -i "s/^version = .*/version = \"${VERSION}\"/g" python/cudf_kafka/pyproject.toml
+sed -i "s/^version = .*/version = \"${VERSION}\"/g" python/custreamz/pyproject.toml
 
 # cudf pyproject.toml cuda suffixes
+sed -i "s/^name = \"cudf\"/name = \"cudf${CUDA_SUFFIX}\"/g" python/cudf/pyproject.toml
 sed -i "s/rmm/rmm${CUDA_SUFFIX}/g" python/cudf/pyproject.toml
+sed -i "s/ptxcompiler/ptxcompiler${CUDA_SUFFIX}/g" python/cudf/pyproject.toml
+sed -i "s/cubinlinker/cubinlinker${CUDA_SUFFIX}/g" python/cudf/pyproject.toml
 
-# dask_cudf setup.py cuda suffixes
-sed -i "s/name=\"dask-cudf\"/name=\"dask-cudf${CUDA_SUFFIX}\"/g" python/dask_cudf/setup.py
+# dask_cudf pyproject.toml cuda suffixes
+sed -i "s/^name = \"dask_cudf\"/name = \"dask_cudf${CUDA_SUFFIX}\"/g" python/dask_cudf/pyproject.toml
 # Need to provide the == to avoid modifying the URL
-sed -i "s/\"cudf==/\"cudf${CUDA_SUFFIX}==/g" python/dask_cudf/setup.py
+sed -i "s/\"cudf==/\"cudf${CUDA_SUFFIX}==/g" python/dask_cudf/pyproject.toml
diff --git a/ci/release/update-version.sh b/ci/release/update-version.sh
@@ -49,11 +49,11 @@ sed_runner "s/__version__ = .*/__version__ = \"${NEXT_FULL_TAG}\"/g" python/dask
 sed_runner "s/__version__ = .*/__version__ = \"${NEXT_FULL_TAG}\"/g" python/cudf_kafka/cudf_kafka/__init__.py
 sed_runner "s/__version__ = .*/__version__ = \"${NEXT_FULL_TAG}\"/g" python/custreamz/custreamz/__init__.py
 
-# Python setup.py updates
-sed_runner "s/version=.*,/version=\"${NEXT_FULL_TAG}\",/g" python/cudf/setup.py
-sed_runner "s/version=.*,/version=\"${NEXT_FULL_TAG}\",/g" python/dask_cudf/setup.py
-sed_runner "s/version=.*,/version=\"${NEXT_FULL_TAG}\",/g" python/cudf_kafka/setup.py
-sed_runner "s/version=.*,/version=\"${NEXT_FULL_TAG}\",/g" python/custreamz/setup.py
+# Python pyproject.toml updates
+sed_runner "s/^version = .*/version = \"${NEXT_FULL_TAG}\"/g" python/cudf/pyproject.toml
+sed_runner "s/^version = .*/version = \"${NEXT_FULL_TAG}\"/g" python/dask_cudf/pyproject.toml
+sed_runner "s/^version = .*/version = \"${NEXT_FULL_TAG}\"/g" python/cudf_kafka/pyproject.toml
+sed_runner "s/^version = .*/version = \"${NEXT_FULL_TAG}\"/g" python/custreamz/pyproject.toml
 
 # rapids-cmake version
 sed_runner 's/'"branch-.*\/RAPIDS.cmake"'/'"branch-${NEXT_SHORT_TAG}\/RAPIDS.cmake"'/g' fetch_rapids.cmake
@@ -89,12 +89,9 @@ sed_runner "s/CUDF_TAG branch-${CURRENT_SHORT_TAG}/CUDF_TAG branch-${NEXT_SHORT_
 # Need to distutils-normalize the original version
 NEXT_SHORT_TAG_PEP440=$(python -c "from setuptools.extern import packaging; print(packaging.version.Version('${NEXT_SHORT_TAG}'))")
 
-# Dependency versions in setup.py
-sed_runner "s/rmm==.*\",/rmm==${NEXT_SHORT_TAG_PEP440}.*\",/g" python/cudf/setup.py
-sed_runner "s/cudf==.*\",/cudf==${NEXT_SHORT_TAG_PEP440}.*\",/g" python/dask_cudf/setup.py
-
 # Dependency versions in pyproject.toml
 sed_runner "s/rmm==.*\",/rmm==${NEXT_SHORT_TAG_PEP440}.*\",/g" python/cudf/pyproject.toml
+sed_runner "s/cudf==.*\",/cudf==${NEXT_SHORT_TAG_PEP440}.*\",/g" python/dask_cudf/pyproject.toml
 
 for FILE in .github/workflows/*.yaml; do
   sed_runner "/shared-action-workflows/ s/@.*/@branch-${NEXT_SHORT_TAG}/g" "${FILE}"

diff --git a/ci/test_python_other.sh b/ci/test_python_other.sh
@@ -17,31 +17,31 @@ trap "EXITCODE=1" ERR
 set +e
 
 rapids-logger "pytest dask_cudf"
-pushd python/dask_cudf
+pushd python/dask_cudf/dask_cudf
 pytest \
   --cache-clear \
   --junitxml="${RAPIDS_TESTS_DIR}/junit-dask-cudf.xml" \
   --numprocesses=8 \
   --dist=loadscope \
-  --cov-config=.coveragerc \
+  --cov-config=../.coveragerc \
   --cov=dask_cudf \
   --cov-report=xml:"${RAPIDS_COVERAGE_DIR}/dask-cudf-coverage.xml" \
   --cov-report=term \
-  dask_cudf
+  tests
 popd
 
 rapids-logger "pytest custreamz"
-pushd python/custreamz
+pushd python/custreamz/custreamz
 pytest \
   --cache-clear \
   --junitxml="${RAPIDS_TESTS_DIR}/junit-custreamz.xml" \
   --numprocesses=8 \
   --dist=loadscope \
-  --cov-config=.coveragerc \
+  --cov-config=../.coveragerc \
   --cov=custreamz \
   --cov-report=xml:"${RAPIDS_COVERAGE_DIR}/custreamz-coverage.xml" \
   --cov-report=term \
-  custreamz
+  tests
 popd
 
 rapids-logger "Test script exiting with value: $EXITCODE"

diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml
@@ -28,11 +28,12 @@ dependencies:
 - doxygen=1.8.20
 - fastavro>=0.22.9
 - fsspec>=0.6.0
-- gcc_linux-64=9.*
+- gcc_linux-64=11.*
 - hypothesis
 - ipython
 - libarrow=10
 - librdkafka=1.7.0
+- librmm=23.04.*
 - mimesis>=4.1.0
 - moto>=4.0.8
 - myst-nb
@@ -49,7 +50,7 @@ dependencies:
 - pandoc<=2.0.0
 - pip
 - pre-commit
-- protobuf=4.21
+- protobuf>=4.21.6,<4.22
 - ptxcompiler
 - pyarrow=10
 - pydata-sphinx-theme

diff --git a/conda/recipes/cudf/conda_build_config.yaml b/conda/recipes/cudf/conda_build_config.yaml
@@ -1,8 +1,8 @@
 c_compiler_version:
-  - 9
+  - 11
 
 cxx_compiler_version:
-  - 9
+  - 11
 
 sysroot_version:
   - "2.17"

diff --git a/conda/recipes/cudf/meta.yaml b/conda/recipes/cudf/meta.yaml
@@ -45,7 +45,7 @@ requirements:
     - ninja
     - sysroot_{{ target_platform }} {{ sysroot_version }}
   host:
-    - protobuf =4.21
+    - protobuf >=4.21.6,<4.22
     - python
     - cython >=0.29,<0.30
     - scikit-build >=0.13.1
@@ -57,7 +57,7 @@ requirements:
     - rmm ={{ minor_version }}
     - cudatoolkit ={{ cuda_version }}
   run:
-    - protobuf =4.21
+    - protobuf >=4.21.6,<4.22
     - python
     - typing_extensions
     - pandas >=1.0,<1.6.0dev0

diff --git a/conda/recipes/cudf_kafka/conda_build_config.yaml b/conda/recipes/cudf_kafka/conda_build_config.yaml
@@ -1,8 +1,8 @@
 c_compiler_version:
-  - 9
+  - 11
 
 cxx_compiler_version:
-  - 9
+  - 11
 
 sysroot_version:
   - "2.17"
diff --git a/conda/recipes/libcudf/conda_build_config.yaml b/conda/recipes/libcudf/conda_build_config.yaml
@@ -1,8 +1,8 @@
 c_compiler_version:
-  - 9
+  - 11
 
 cxx_compiler_version:
-  - 9
+  - 11
 
 cuda_compiler:
   - nvcc

diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt
@@ -301,7 +301,8 @@ ConfigureNVBench(STRINGS_NVBENCH string/like.cpp string/reverse.cpp string/lengt
 # * json benchmark -------------------------------------------------------------------
 ConfigureBench(JSON_BENCH string/json.cu)
 ConfigureNVBench(FST_NVBENCH io/fst.cu)
-ConfigureNVBench(NESTED_JSON_NVBENCH io/json/nested_json.cpp)
+ConfigureNVBench(JSON_READER_NVBENCH io/json/nested_json.cpp io/json/json_reader_input.cpp)
+ConfigureNVBench(JSON_WRITER_NVBENCH io/json/json_writer.cpp)
 
 # ##################################################################################################
 # * io benchmark ---------------------------------------------------------------------

diff --git a/cpp/benchmarks/io/json/json_reader_input.cpp b/cpp/benchmarks/io/json/json_reader_input.cpp
@@ -0,0 +1,128 @@
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <benchmarks/common/generate_input.hpp>
+#include <benchmarks/fixture/benchmark_fixture.hpp>
+#include <benchmarks/io/cuio_common.hpp>
+#include <benchmarks/io/nvbench_helpers.hpp>
+
+#include <cudf/io/json.hpp>
+#include <cudf/utilities/default_stream.hpp>
+
+#include <nvbench/nvbench.cuh>
+
+// Size of the data in the benchmark dataframe; chosen to be low enough to allow benchmarks to
+// run on most GPUs, but large enough to allow highest throughput
+constexpr size_t data_size         = 512 << 20;
+constexpr cudf::size_type num_cols = 64;
+
+void json_read_common(cudf::io::json_writer_options const& write_opts,  // Shared driver: write JSON via write_opts, then time reading it back.
+                      cuio_source_sink_pair& source_sink,  // Supplies the source info for the read and the size reported below.
+                      nvbench::state& state)  // nvbench state that runs the measurement and receives the summaries.
+{
+  cudf::io::write_json(write_opts);  // Untimed setup: produce the JSON input that is read back below.
+
+  cudf::io::json_reader_options read_opts =
+    cudf::io::json_reader_options::builder(source_sink.make_source_info());  // No other reader options are set here.
+
+  auto mem_stats_logger = cudf::memory_stats_logger();  // Created before the run; queried for peak usage afterwards.
+  state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value()));  // Measure on cudf's default stream.
+  state.exec(nvbench::exec_tag::sync | nvbench::exec_tag::timer,  // The lambda starts/stops the timer itself.
+             [&](nvbench::launch& launch, auto& timer) {  // `launch` is not used in the body.
+               try_drop_l3_cache();  // Called before timer.start(), so it is outside the timed region.
+
+               timer.start();
+               cudf::io::read_json(read_opts);  // The only call inside the timed region.
+               timer.stop();
+             });
+
+  auto const time = state.get_summary("nv/cold/time/gpu/mean").get_float64("value");  // Mean GPU time from nvbench's summary; presumably seconds - confirm.
+  state.add_element_count(static_cast<double>(data_size) / time, "bytes_per_second");  // Uses the requested table size (data_size), not the encoded JSON size.
+  state.add_buffer_size(
+    mem_stats_logger.peak_memory_usage(), "peak_memory_usage", "peak_memory_usage");  // Peak usage as reported by the logger created above.
+  state.add_buffer_size(source_sink.size(), "encoded_file_size", "encoded_file_size");  // Size reported by the source/sink pair.
+}
+
+template <cudf::io::io_type IO>  // Benchmark: read a JSON table mixing all type groups, parameterized by IO type.
+void BM_json_read_io(nvbench::state& state, nvbench::type_list<nvbench::enum_type<IO>>)  // The type_list argument only carries IO; it is unnamed and unused.
+{
+  auto const d_type = get_type_or_group({static_cast<int32_t>(data_type::INTEGRAL),  // Same eight groups as d_type_list, combined in one table.
+                                         static_cast<int32_t>(data_type::FLOAT),
+                                         static_cast<int32_t>(data_type::DECIMAL),
+                                         static_cast<int32_t>(data_type::TIMESTAMP),
+                                         static_cast<int32_t>(data_type::DURATION),
+                                         static_cast<int32_t>(data_type::STRING),
+                                         static_cast<int32_t>(data_type::LIST),
+                                         static_cast<int32_t>(data_type::STRUCT)});
+
+  auto const source_type = IO;
+
+  auto const tbl = create_random_table(  // num_cols columns cycling through d_type, sized by data_size, default-constructed profile builder.
+    cycle_dtypes(d_type, num_cols), table_size_bytes{data_size}, data_profile_builder());
+  auto const view = tbl->view();
+
+  cuio_source_sink_pair source_sink(source_type);  // One object provides both the write sink and the read source.
+  cudf::io::json_writer_options const write_opts =
+    cudf::io::json_writer_options::builder(source_sink.make_sink_info(), view).na_rep("null");  // na_rep is set to the string "null".
+
+  json_read_common(write_opts, source_sink, state);  // Writes the table, then times reading it back.
+}
+
+template <data_type DataType, cudf::io::io_type IO>  // Benchmark: read a JSON table built from a single data type group.
+void BM_json_read_data_type(
+  nvbench::state& state, nvbench::type_list<nvbench::enum_type<DataType>, nvbench::enum_type<IO>>)  // The type_list argument only carries DataType and IO; it is unnamed and unused.
+{
+  auto const d_type      = get_type_or_group(static_cast<int32_t>(DataType));  // Only the one group selected by DataType.
+  auto const source_type = IO;
+
+  auto const tbl = create_random_table(  // num_cols columns cycling through d_type, sized by data_size, default-constructed profile builder.
+    cycle_dtypes(d_type, num_cols), table_size_bytes{data_size}, data_profile_builder());
+  auto const view = tbl->view();
+
+  cuio_source_sink_pair source_sink(source_type);  // One object provides both the write sink and the read source.
+  cudf::io::json_writer_options const write_opts =
+    cudf::io::json_writer_options::builder(source_sink.make_sink_info(), view).na_rep("null");  // na_rep is set to the string "null".
+
+  json_read_common(write_opts, source_sink, state);  // Writes the table, then times reading it back.
+}
+
+using d_type_list = nvbench::enum_type_list<data_type::INTEGRAL,  // Type axis: one entry per data type group for json_read_data_type.
+                                            data_type::FLOAT,
+                                            data_type::DECIMAL,
+                                            data_type::TIMESTAMP,
+                                            data_type::DURATION,
+                                            data_type::STRING,
+                                            data_type::LIST,
+                                            data_type::STRUCT>;
+
+using io_list = nvbench::enum_type_list<cudf::io::io_type::FILEPATH,  // Type axis: IO types swept by json_read_io.
+                                        cudf::io::io_type::HOST_BUFFER,
+                                        cudf::io::io_type::DEVICE_BUFFER>;
+
+using compression_list =  // NOTE(review): not referenced by either registration in this file - confirm whether it is needed.
+  nvbench::enum_type_list<cudf::io::compression_type::SNAPPY, cudf::io::compression_type::NONE>;
+
+NVBENCH_BENCH_TYPES(BM_json_read_data_type,  // Sweeps every entry of d_type_list.
+                    NVBENCH_TYPE_AXES(d_type_list,
+                                      nvbench::enum_type_list<cudf::io::io_type::DEVICE_BUFFER>))  // IO axis is fixed to DEVICE_BUFFER here.
+  .set_name("json_read_data_type")
+  .set_type_axes_names({"data_type", "io"})
+  .set_min_samples(4);
+
+NVBENCH_BENCH_TYPES(BM_json_read_io, NVBENCH_TYPE_AXES(io_list))  // Sweeps every entry of io_list.
+  .set_name("json_read_io")
+  .set_type_axes_names({"io"})
+  .set_min_samples(4);