Commit 72a78be

Merge branch 'branch-22.12' of github.com:rapidsai/cudf into fix-csv_reader_options-cython-bug

karthikeyann committed Nov 2, 2022
2 parents dc228e5 + 03034af

Showing 194 changed files with 2,499 additions and 1,383 deletions.
20 changes: 20 additions & 0 deletions .github/workflows/add_to_project.yml
@@ -0,0 +1,20 @@
name: Add new issue/PR to project

on:
issues:
types:
- opened

pull_request_target:
types:
- opened

jobs:
add-to-project:
name: Add issue or PR to project
runs-on: ubuntu-latest
steps:
- uses: actions/[email protected]
with:
project-url: https://github.com/orgs/rapidsai/projects/51
github-token: ${{ secrets.ADD_TO_PROJECT_GITHUB_TOKEN }}
22 changes: 22 additions & 0 deletions .pre-commit-config.yaml
@@ -1,6 +1,14 @@
# Copyright (c) 2019-2022, NVIDIA CORPORATION.

repos:
- repo: https://github.com/pre-commit/pre-commit-hooks
rev: v4.3.0
hooks:
- id: end-of-file-fixer
exclude: |
(?x)^(
^python/cudf/cudf/tests/data/subword_tokenizer_data/.*
)
- repo: https://github.com/PyCQA/isort
rev: 5.10.1
hooks:
@@ -26,6 +34,10 @@ repos:
types: [file]
types_or: [python, cython]
additional_dependencies: ["flake8-force"]
- repo: https://github.com/MarcoGorelli/cython-lint
rev: v0.1.10
hooks:
- id: cython-lint
- repo: https://github.com/pre-commit/mirrors-mypy
rev: 'v0.971'
hooks:
@@ -48,6 +60,16 @@ repos:
- id: clang-format
types_or: [c, c++, cuda]
args: ["-fallback-style=none", "-style=file", "-i"]
- repo: https://github.com/sirosen/texthooks
rev: 0.4.0
hooks:
- id: fix-smartquotes
exclude: |
(?x)^(
^cpp/include/cudf_test/cxxopts.hpp|
^python/cudf/cudf/tests/data/subword_tokenizer_data/.*|
^python/cudf/cudf/tests/test_text.py
)
- repo: local
hooks:
- id: no-deprecationwarning
265 changes: 263 additions & 2 deletions CHANGELOG.md

Large diffs are not rendered by default.

31 changes: 10 additions & 21 deletions README.md
@@ -50,7 +50,7 @@ For additional examples, browse our complete [API documentation](https://docs.ra

## Quick Start

Please see the [Demo Docker Repository](https://hub.docker.com/r/rapidsai/rapidsai/), choosing a tag based on the NVIDIA CUDA version you’re running. This provides a ready to run Docker container with example notebooks and data, showcasing how you can utilize cuDF.
Please see the [Demo Docker Repository](https://hub.docker.com/r/rapidsai/rapidsai/), choosing a tag based on the NVIDIA CUDA version you're running. This provides a ready to run Docker container with example notebooks and data, showcasing how you can utilize cuDF.

## Installation

@@ -65,32 +65,21 @@ Please see the [Demo Docker Repository](https://hub.docker.com/r/rapidsai/rapids

cuDF can be installed with conda ([miniconda](https://conda.io/miniconda.html), or the full [Anaconda distribution](https://www.anaconda.com/download)) from the `rapidsai` channel:

For `cudf version == 22.06` :
```bash
# for CUDA 11.0
conda install -c rapidsai -c nvidia -c numba -c conda-forge \
cudf=22.06 python=3.9 cudatoolkit=11.0

# or, for CUDA 11.2
conda install -c rapidsai -c nvidia -c numba -c conda-forge \
cudf=22.06 python=3.9 cudatoolkit=11.2

# for CUDA 11.5
conda install -c rapidsai -c conda-forge -c nvidia \
cudf=22.10 python=3.9 cudatoolkit=11.5
# for CUDA 11.2
conda install -c rapidsai -c conda-forge -c nvidia \
cudf=22.10 python=3.9 cudatoolkit=11.2
```

For the nightly version of `cudf` :
```bash
# for CUDA 11.0
conda install -c rapidsai-nightly -c nvidia -c numba -c conda-forge \
cudf python=3.9 cudatoolkit=11.0

# or, for CUDA 11.2
conda install -c rapidsai-nightly -c nvidia -c numba -c conda-forge \
cudf python=3.9 cudatoolkit=11.2
```
We also provide [nightly Conda packages](https://anaconda.org/rapidsai-nightly) built from the HEAD
of our latest development branch.

Note: cuDF is supported only on Linux, and with Python versions 3.8 and later.

See the [Get RAPIDS version picker](https://rapids.ai/start.html) for more OS and version info.
See the [Get RAPIDS version picker](https://rapids.ai/start.html) for more OS and version info.

## Build/Install from Source
See build [instructions](CONTRIBUTING.md#setting-up-your-build-environment).
4 changes: 2 additions & 2 deletions conda/environments/cudf_dev_cuda11.5.yml
@@ -3,10 +3,10 @@
name: cudf_dev
channels:
- rapidsai
- nvidia
- rapidsai-nightly
- dask/label/dev
- conda-forge
- nvidia
dependencies:
- c-compiler
- cxx-compiler
@@ -38,7 +38,7 @@ dependencies:
- ipython
- pandoc<=2.0.0
- cudatoolkit=11.5
- cuda-python>=11.5,<11.7.1
- cuda-python>=11.7.1,<12.0
- pip
- doxygen=1.8.20
- typing_extensions
2 changes: 1 addition & 1 deletion conda/recipes/cudf/meta.yaml
@@ -62,7 +62,7 @@ requirements:
- packaging
- cachetools
- cubinlinker # [linux64] # CUDA enhanced compatibility.
- cuda-python >=11.5,<11.7.1
- cuda-python >=11.7.1,<12.0
test: # [linux64]
requires: # [linux64]
- cudatoolkit {{ cuda_version }}.* # [linux64]
4 changes: 3 additions & 1 deletion conda/recipes/libcudf/meta.yaml
@@ -114,9 +114,10 @@ outputs:
- test -f $PREFIX/include/cudf/detail/unary.hpp
- test -f $PREFIX/include/cudf/detail/utilities/alignment.hpp
- test -f $PREFIX/include/cudf/detail/utilities/default_stream.hpp
- test -f $PREFIX/include/cudf/detail/utilities/linked_column.hpp
- test -f $PREFIX/include/cudf/detail/utilities/int_fastdiv.h
- test -f $PREFIX/include/cudf/detail/utilities/integer_utils.hpp
- test -f $PREFIX/include/cudf/detail/utilities/linked_column.hpp
- test -f $PREFIX/include/cudf/detail/utilities/pinned_allocator.hpp
- test -f $PREFIX/include/cudf/detail/utilities/vector_factories.hpp
- test -f $PREFIX/include/cudf/detail/utilities/visitor_overload.hpp
- test -f $PREFIX/include/cudf/dictionary/detail/concatenate.hpp
@@ -149,6 +150,7 @@ outputs:
- test -f $PREFIX/include/cudf/io/json.hpp
- test -f $PREFIX/include/cudf/io/orc.hpp
- test -f $PREFIX/include/cudf/io/orc_metadata.hpp
- test -f $PREFIX/include/cudf/io/orc_types.hpp
- test -f $PREFIX/include/cudf/io/parquet.hpp
- test -f $PREFIX/include/cudf/io/text/byte_range_info.hpp
- test -f $PREFIX/include/cudf/io/text/data_chunk_source.hpp
2 changes: 1 addition & 1 deletion conda/recipes/strings_udf/meta.yaml
@@ -40,7 +40,7 @@ requirements:
- numba >=0.54
- libcudf ={{ version }}
- cudf ={{ version }}
- cudatoolkit ={{ cuda_version }}
- cudatoolkit {{ cuda_version }}.*
run:
- python
- typing_extensions
4 changes: 2 additions & 2 deletions cpp/benchmarks/io/csv/csv_writer.cpp
@@ -21,8 +21,8 @@

#include <cudf/io/csv.hpp>

// to enable, run cmake with -DBUILD_BENCHMARKS=ON

// Size of the data in the benchmark dataframe; chosen to be low enough to allow benchmarks to
// run on most GPUs, but large enough to allow highest throughput
constexpr size_t data_size = 256 << 20;
constexpr cudf::size_type num_cols = 64;

2 changes: 2 additions & 0 deletions cpp/benchmarks/io/orc/orc_reader_input.cpp
@@ -25,6 +25,8 @@

#include <nvbench/nvbench.cuh>

// Size of the data in the benchmark dataframe; chosen to be low enough to allow benchmarks to
// run on most GPUs, but large enough to allow highest throughput
constexpr int64_t data_size = 512 << 20;
constexpr cudf::size_type num_cols = 64;

47 changes: 33 additions & 14 deletions cpp/benchmarks/io/orc/orc_reader_options.cpp
@@ -21,17 +21,27 @@
#include <benchmarks/io/nvbench_helpers.hpp>

#include <cudf/io/orc.hpp>
#include <cudf/io/orc_metadata.hpp>
#include <cudf/utilities/default_stream.hpp>

#include <nvbench/nvbench.cuh>

// Size of the data in the benchmark dataframe; chosen to be low enough to allow benchmarks to
// run on most GPUs, but large enough to allow highest throughput
constexpr int64_t data_size = 512 << 20;
// The number of separate read calls to use when reading files in multiple chunks
// Each call reads roughly equal amounts of data
constexpr int32_t chunked_read_num_chunks = 8;

std::vector<std::string> get_col_names(cudf::io::source_info const& source)
{
cudf::io::orc_reader_options const read_options =
cudf::io::orc_reader_options::builder(source).num_rows(1);
return cudf::io::read_orc(read_options).metadata.column_names;
auto const top_lvl_cols = cudf::io::read_orc_metadata(source).schema().root().children();
std::vector<std::string> col_names;
std::transform(top_lvl_cols.cbegin(),
top_lvl_cols.cend(),
std::back_inserter(col_names),
[](auto const& col_meta) { return col_meta.name(); });
return col_names;
}
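The rewritten `get_col_names` above no longer reads any data rows: it walks the schema returned by `read_orc_metadata` and collects the top-level child names with `std::transform` plus a `std::back_inserter`. The idiom is worth seeing in isolation; in the sketch below a hypothetical `node` struct stands in for the ORC schema objects (it is not a cudf type):

```cpp
#include <algorithm>
#include <cassert>
#include <iterator>
#include <string>
#include <vector>

// Hypothetical stand-in for an ORC schema node, exposing only the
// accessor the idiom needs.
struct node {
  std::string name_;
  std::string const& name() const { return name_; }
};

std::vector<std::string> names_of(std::vector<node> const& children)
{
  std::vector<std::string> names;
  // back_inserter grows `names` as transform writes each projected name.
  std::transform(children.cbegin(),
                 children.cend(),
                 std::back_inserter(names),
                 [](auto const& n) { return n.name(); });
  return names;
}
```

The projection lambda is the only part that changes between uses; in the benchmark it calls `col_meta.name()` on each child of the schema root.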

template <column_selection ColSelection,
Expand All @@ -48,7 +58,7 @@ void BM_orc_read_varying_options(nvbench::state& state,
{
cudf::rmm_pool_raii rmm_pool;

auto constexpr num_chunks = 1;
auto const num_chunks = RowSelection == row_selection::ALL ? 1 : chunked_read_num_chunks;

auto const use_index = UsesIndex == uses_index::YES;
auto const use_np_dtypes = UsesNumpyDType == uses_numpy_dtype::YES;
@@ -79,7 +89,8 @@
.use_np_dtypes(use_np_dtypes)
.timestamp_type(ts_type);

auto const num_stripes = data_size / (64 << 20);
auto const num_stripes =
cudf::io::read_orc_metadata(source_sink.make_source_info()).num_stripes();
cudf::size_type const chunk_row_cnt = view.num_rows() / num_chunks;

auto mem_stats_logger = cudf::memory_stats_logger();
Expand All @@ -94,14 +105,9 @@ void BM_orc_read_varying_options(nvbench::state& state,
auto const is_last_chunk = chunk == (num_chunks - 1);
switch (RowSelection) {
case row_selection::ALL: break;
case row_selection::STRIPES: {
auto stripes_to_read = segments_in_chunk(num_stripes, num_chunks, chunk);
if (is_last_chunk) {
// Need to assume that an additional "overflow" stripe is present
stripes_to_read.push_back(num_stripes);
}
read_options.set_stripes({stripes_to_read});
} break;
case row_selection::STRIPES:
read_options.set_stripes({segments_in_chunk(num_stripes, num_chunks, chunk)});
break;
case row_selection::NROWS:
read_options.set_skip_rows(chunk * chunk_row_cnt);
read_options.set_num_rows(chunk_row_cnt);
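The deleted workaround above pushed an extra "overflow" stripe because `num_stripes` used to be estimated as `data_size / (64 << 20)`; now that the exact stripe count comes from `read_orc_metadata`, `segments_in_chunk` can cover every stripe on its own. A minimal host-only sketch of how such a helper could split `count` stripe indices into `num_chunks` contiguous, roughly equal ranges (the name mirrors the benchmark helper, but this body is an illustration, not cudf's implementation):

```cpp
#include <cassert>
#include <cstdint>
#include <vector>

// Illustrative sketch only: return the stripe indices belonging to
// chunk `chunk` when `count` stripes are split into `num_chunks`
// contiguous, roughly equal ranges.
std::vector<int32_t> segments_in_chunk_sketch(int32_t count, int32_t num_chunks, int32_t chunk)
{
  // Integer arithmetic spreads any remainder across the chunks, so the
  // union of all chunks is exactly [0, count) with no overflow segment.
  int32_t const begin = count * chunk / num_chunks;
  int32_t const end   = count * (chunk + 1) / num_chunks;
  std::vector<int32_t> indices;
  for (int32_t i = begin; i < end; ++i) { indices.push_back(i); }
  return indices;
}
```

With the metadata-reported stripe count, the last chunk's range ends exactly at `count`, which is why the manual `stripes_to_read.push_back(num_stripes)` entry for the last chunk is no longer needed.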
@@ -129,6 +135,8 @@ using col_selections = nvbench::enum_type_list<column_selection::ALL,
column_selection::ALTERNATE,
column_selection::FIRST_HALF,
column_selection::SECOND_HALF>;
using row_selections =
nvbench::enum_type_list<row_selection::ALL, row_selection::STRIPES, row_selection::NROWS>;

NVBENCH_BENCH_TYPES(BM_orc_read_varying_options,
NVBENCH_TYPE_AXES(col_selections,
Expand All @@ -141,11 +149,22 @@ NVBENCH_BENCH_TYPES(BM_orc_read_varying_options,
{"column_selection", "row_selection", "uses_index", "uses_numpy_dtype", "timestamp_type"})
.set_min_samples(4);

NVBENCH_BENCH_TYPES(BM_orc_read_varying_options,
NVBENCH_TYPE_AXES(nvbench::enum_type_list<column_selection::ALL>,
row_selections,
nvbench::enum_type_list<uses_index::YES>,
nvbench::enum_type_list<uses_numpy_dtype::YES>,
nvbench::enum_type_list<cudf::type_id::EMPTY>))
.set_name("orc_read_row_selection")
.set_type_axes_names(
{"column_selection", "row_selection", "uses_index", "uses_numpy_dtype", "timestamp_type"})
.set_min_samples(4);

NVBENCH_BENCH_TYPES(
BM_orc_read_varying_options,
NVBENCH_TYPE_AXES(
nvbench::enum_type_list<column_selection::ALL>,
nvbench::enum_type_list<row_selection::NROWS>,
nvbench::enum_type_list<row_selection::ALL>,
nvbench::enum_type_list<uses_index::YES, uses_index::NO>,
nvbench::enum_type_list<uses_numpy_dtype::YES, uses_numpy_dtype::NO>,
nvbench::enum_type_list<cudf::type_id::EMPTY, cudf::type_id::TIMESTAMP_NANOSECONDS>))
2 changes: 2 additions & 0 deletions cpp/benchmarks/io/orc/orc_writer.cpp
@@ -38,6 +38,8 @@ NVBENCH_DECLARE_ENUM_TYPE_STRINGS(
},
[](auto) { return std::string{}; })

// Size of the data in the benchmark dataframe; chosen to be low enough to allow benchmarks to
// run on most GPUs, but large enough to allow highest throughput
constexpr int64_t data_size = 512 << 20;
constexpr cudf::size_type num_cols = 64;

4 changes: 2 additions & 2 deletions cpp/benchmarks/io/orc/orc_writer_chunks.cpp
@@ -29,8 +29,8 @@

#include <nvbench/nvbench.cuh>

// to enable, run cmake with -DBUILD_BENCHMARKS=ON

// Size of the data in the benchmark dataframe; chosen to be low enough to allow benchmarks to
// run on most GPUs, but large enough to allow highest throughput
constexpr int64_t data_size = 512 << 20;

void nvbench_orc_write(nvbench::state& state)
2 changes: 2 additions & 0 deletions cpp/benchmarks/io/parquet/parquet_reader_input.cpp
@@ -25,6 +25,8 @@

#include <nvbench/nvbench.cuh>

// Size of the data in the benchmark dataframe; chosen to be low enough to allow benchmarks to
// run on most GPUs, but large enough to allow highest throughput
constexpr size_t data_size = 512 << 20;
constexpr cudf::size_type num_cols = 64;

2 changes: 2 additions & 0 deletions cpp/benchmarks/io/parquet/parquet_reader_options.cpp
@@ -25,6 +25,8 @@

#include <nvbench/nvbench.cuh>

// Size of the data in the benchmark dataframe; chosen to be low enough to allow benchmarks to
// run on most GPUs, but large enough to allow highest throughput
constexpr std::size_t data_size = 512 << 20;
constexpr std::size_t row_group_size = 128 << 20;

4 changes: 2 additions & 2 deletions cpp/benchmarks/io/parquet/parquet_writer.cpp
@@ -25,8 +25,6 @@

#include <nvbench/nvbench.cuh>

// to enable, run cmake with -DBUILD_BENCHMARKS=ON

NVBENCH_DECLARE_ENUM_TYPE_STRINGS(
cudf::io::statistics_freq,
[](auto value) {
Expand All @@ -39,6 +37,8 @@ NVBENCH_DECLARE_ENUM_TYPE_STRINGS(
},
[](auto) { return std::string{}; })

// Size of the data in the benchmark dataframe; chosen to be low enough to allow benchmarks to
// run on most GPUs, but large enough to allow highest throughput
constexpr size_t data_size = 512 << 20;
constexpr cudf::size_type num_cols = 64;

4 changes: 2 additions & 2 deletions cpp/benchmarks/io/parquet/parquet_writer_chunks.cpp
@@ -27,8 +27,8 @@

#include <nvbench/nvbench.cuh>

// to enable, run cmake with -DBUILD_BENCHMARKS=ON

// Size of the data in the benchmark dataframe; chosen to be low enough to allow benchmarks to
// run on most GPUs, but large enough to allow highest throughput
constexpr int64_t data_size = 512 << 20;

void PQ_write(nvbench::state& state)
9 changes: 4 additions & 5 deletions cpp/benchmarks/io/text/multibyte_split.cpp
@@ -23,6 +23,7 @@
#include <cudf_test/file_utilities.hpp>

#include <cudf/column/column_factories.hpp>
#include <cudf/detail/utilities/pinned_allocator.hpp>
#include <cudf/detail/utilities/vector_factories.hpp>
#include <cudf/io/text/data_chunk_source_factories.hpp>
#include <cudf/io/text/detail/bgzip_utils.hpp>
Expand All @@ -33,7 +34,6 @@
#include <cudf/utilities/default_stream.hpp>

#include <thrust/host_vector.h>
#include <thrust/system/cuda/experimental/pinned_allocator.h>
#include <thrust/transform.h>

#include <nvbench/nvbench.cuh>
@@ -136,10 +136,9 @@ static void bench_multibyte_split(nvbench::state& state,

auto const delim_factor = static_cast<double>(delim_percent) / 100;
std::unique_ptr<cudf::io::datasource> datasource;
auto device_input = create_random_input(file_size_approx, delim_factor, 0.05, delim);
auto host_input = std::vector<char>{};
auto host_pinned_input =
thrust::host_vector<char, thrust::system::cuda::experimental::pinned_allocator<char>>{};
auto device_input = create_random_input(file_size_approx, delim_factor, 0.05, delim);
auto host_input = std::vector<char>{};
auto host_pinned_input = thrust::host_vector<char, cudf::detail::pinned_allocator<char>>{};

if (source_type != data_chunk_source_type::device &&
source_type != data_chunk_source_type::host_pinned) {
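The hunk above swaps Thrust's `thrust::system::cuda::experimental::pinned_allocator` for `cudf::detail::pinned_allocator` as the allocator of the pinned `host_vector`. A pinned allocator is just a standard C++ allocator whose allocate/deallocate go through `cudaHostAlloc`/`cudaFreeHost`. The host-only sketch below shows the interface shape such containers require; plain heap allocation stands in for the CUDA calls, so this is an illustration, not cudf's implementation:

```cpp
#include <cassert>
#include <cstdlib>
#include <new>
#include <vector>

// Sketch of the minimal allocator interface that allocator-aware
// containers (thrust::host_vector, std::vector) require. A real pinned
// allocator would call cudaHostAlloc/cudaFreeHost; std::malloc/std::free
// stand in so the sketch compiles without CUDA.
template <typename T>
struct pinned_allocator_sketch {
  using value_type = T;

  T* allocate(std::size_t n)
  {
    if (auto* p = static_cast<T*>(std::malloc(n * sizeof(T)))) { return p; }
    throw std::bad_alloc{};
  }

  void deallocate(T* p, std::size_t) noexcept { std::free(p); }
};

// Mirrors the shape of the benchmark's host_pinned_input declaration.
using pinned_chars = std::vector<char, pinned_allocator_sketch<char>>;
```

Because the allocator is stateless (an empty class), `std::allocator_traits` supplies the remaining requirements, which is why the real and sketched types both plug straight into `host_vector`-style containers.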
2 changes: 1 addition & 1 deletion cpp/benchmarks/reduction/rank.cpp
@@ -61,4 +61,4 @@ NVBENCH_BENCH_TYPES(nvbench_reduction_scan, NVBENCH_TYPE_AXES(data_type))
1000000, // 1M
10000000, // 10M
100000000, // 100M
});
});