Merge branch 'branch-22.10' into fea-nvcomp-zstd-comp

rapidsai · Sep 12, 2022 · 1f60695 · 1f60695
2 parents f766f00 + dca285b
commit 1f60695
Show file tree

Hide file tree

Showing 72 changed files with 1,756 additions and 389 deletions.
diff --git a/.gitattributes b/.gitattributes
@@ -1,2 +1,4 @@
 python/cudf/cudf/_version.py export-subst
-CHANGELOG.md merge=union
+python/cudf_kafka/cudf_kafka/_version.py export-subst
+python/custreamz/custreamz/_version.py export-subst
+python/dask_cudf/dask_cudf/_version.py export-subst
diff --git a/.gitignore b/.gitignore
@@ -24,16 +24,16 @@ cudf.egg-info/
 python/build
 python/*/build
 python/cudf/cudf-coverage.xml
-python/cudf/*/_lib/**/*\.cpp
+python/cudf/*/_lib/**/*.cpp
 python/cudf/*/_lib/**/*.h
 python/cudf/*/_lib/.nfs*
-python/cudf/*/_cuda/*\.cpp
+python/cudf/*/_cuda/*.cpp
 python/cudf/*.ipynb
 python/cudf/.ipynb_checkpoints
 python/*/record.txt
-python/cudf_kafka/*/_lib/**/*\.cpp
+python/cudf_kafka/*/_lib/**/*.cpp
 python/cudf_kafka/*/_lib/**/*.h
-python/custreamz/*/_lib/**/*\.cpp
+python/custreamz/*/_lib/**/*.cpp
 python/custreamz/*/_lib/**/*.h
 .Python
 env/

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -26,10 +26,15 @@ repos:
                 files: python/.*\.(py|pyx|pxd)$
                 types: [file]
       - repo: https://github.com/pre-commit/mirrors-mypy
-        rev: 'v0.782'
+        rev: 'v0.971'
         hooks:
               - id: mypy
-                args: ["--config-file=setup.cfg", "python/cudf/cudf", "python/dask_cudf/dask_cudf", "python/custreamz/custreamz", "python/cudf_kafka/cudf_kafka"]
+                additional_dependencies: [types-cachetools]
+                args: ["--config-file=setup.cfg",
+                       "python/cudf/cudf",
+                       "python/custreamz/custreamz",
+                       "python/cudf_kafka/cudf_kafka",
+                       "python/dask_cudf/dask_cudf"]
                 pass_filenames: false
       - repo: https://github.com/PyCQA/pydocstyle
         rev: 6.1.1
@@ -88,6 +93,18 @@ repos:
                 language: system
                 pass_filenames: false
                 verbose: true
+              - id: headers-recipe-check
+                name: headers-recipe-check
+                entry: ./ci/checks/headers_test.sh
+                files: |
+                  (?x)^(
+                    ^cpp/include/|
+                    ^conda/.*/meta.yaml
+                  )
+                types_or: [file]
+                language: system
+                pass_filenames: false
+                verbose: false
 
 default_language_version:
       python: python3
diff --git a/ci/checks/headers_test.sh b/ci/checks/headers_test.sh
@@ -16,12 +16,9 @@ for DIRNAME in ${DIRNAMES[@]}; do
     LIB_RETVAL=$?
 
     if [ "$LIB_RETVAL" != "0" ]; then
-        echo -e "\n\n>>>> FAILED: lib${LIBNAME} header existence conda/recipes/lib${LIBNAME}/meta.yaml check; begin output\n\n"
+        echo -e ">>>> FAILED: lib${LIBNAME} has different headers in include/${DIRNAME}/ and conda/recipes/lib${LIBNAME}/meta.yaml. The diff is shown below:"
         echo -e "$HEADER_DIFF"
-        echo -e "\n\n>>>> FAILED: lib${LIBNAME} header existence conda/recipes/lib${LIBNAME}/meta.yaml check; end output\n\n"
         RETVAL=1
-    else
-        echo -e "\n\n>>>> PASSED: lib${LIBNAME} header existence conda/recipes/lib${LIBNAME}/meta.yaml check\n\n"
     fi
 done
 

diff --git a/ci/checks/style.sh b/ci/checks/style.sh
@@ -49,13 +49,8 @@ else
   echo -e "\n\n>>>> PASSED: clang format check\n\n"
 fi
 
-# Run header meta.yml check and get results/return code
-HEADER_META=`ci/checks/headers_test.sh`
-HEADER_META_RETVAL=$?
-echo -e "$HEADER_META"
-
 RETVALS=(
-  $CR_RETVAL $PRE_COMMIT_RETVAL $CLANG_FORMAT_RETVAL $HEADER_META_RETVAL
+  $CR_RETVAL $PRE_COMMIT_RETVAL $CLANG_FORMAT_RETVAL
 )
 IFS=$'\n'
 RETVAL=`echo "${RETVALS[*]}" | sort -nr | head -n1`

diff --git a/ci/gpu/build.sh b/ci/gpu/build.sh
@@ -152,6 +152,13 @@ if [[ -z "$PROJECT_FLASH" || "$PROJECT_FLASH" == "0" ]]; then
             echo "Running GoogleTest $test_name"
             ${gt} --gtest_output=xml:"$WORKSPACE/test-results/"
         done
+
+        # Test libcudf (csv, orc, and parquet) with `LIBCUDF_CUFILE_POLICY=KVIKIO`
+        for test_name in "CSV_TEST" "ORC_TEST" "PARQUET_TEST"; do
+            gt="$WORKSPACE/cpp/build/gtests/$test_name"
+            echo "Running GoogleTest $test_name (LIBCUDF_CUFILE_POLICY=KVIKIO)"
+            LIBCUDF_CUFILE_POLICY=KVIKIO ${gt} --gtest_output=xml:"$WORKSPACE/test-results/"
+        done
     fi
 else
     #Project Flash
@@ -182,10 +189,18 @@ else
     gpuci_logger "GoogleTests"
     # Run libcudf and libcudf_kafka gtests from libcudf-tests package
     for gt in "$CONDA_PREFIX/bin/gtests/libcudf"*/* ; do
+        test_name=$(basename ${gt})
         echo "Running GoogleTest $test_name"
         ${gt} --gtest_output=xml:"$WORKSPACE/test-results/"
     done
 
+    # Test libcudf (csv, orc, and parquet) with `LIBCUDF_CUFILE_POLICY=KVIKIO`
+    for test_name in "CSV_TEST" "ORC_TEST" "PARQUET_TEST"; do
+        gt="$CONDA_PREFIX/bin/gtests/libcudf/$test_name"
+        echo "Running GoogleTest $test_name (LIBCUDF_CUFILE_POLICY=KVIKIO)"
+        LIBCUDF_CUFILE_POLICY=KVIKIO ${gt} --gtest_output=xml:"$WORKSPACE/test-results/"
+    done
+
     export LIB_BUILD_DIR="$WORKSPACE/ci/artifacts/cudf/cpu/libcudf_work/cpp/build"
     # Copy libcudf build time results
     echo "Checking for build time log $LIB_BUILD_DIR/ninja_log.xml"

diff --git a/conda/environments/cudf_dev_cuda11.5.yml b/conda/environments/cudf_dev_cuda11.5.yml
@@ -36,14 +36,15 @@ dependencies:
   - nbsphinx
   - numpydoc
   - ipython
-  - pandoc=<2.0.0
+  - pandoc<=2.0.0
   - cudatoolkit=11.5
   - cuda-python >=11.5,<11.7.1
   - pip
   - flake8=3.8.3
   - black=22.3.0
   - isort=5.10.1
-  - mypy=0.782
+  - mypy=0.971
+  - types-cachetools
   - doxygen=1.8.20
   - pydocstyle=6.1.1
   - typing_extensions

diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
@@ -749,6 +749,11 @@ if(CUDF_BUILD_BENCHMARKS)
   add_subdirectory(benchmarks)
 endif()
 
+# build pretty-printer load script
+if(Thrust_SOURCE_DIR AND rmm_SOURCE_DIR)
+  configure_file(scripts/load-pretty-printers.in load-pretty-printers @ONLY)
+endif()
+
 # ##################################################################################################
 # * install targets -------------------------------------------------------------------------------
 rapids_cmake_install_lib_dir(lib_dir)

diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt
@@ -165,7 +165,7 @@ ConfigureNVBench(SEARCH_NVBENCH search/contains.cpp)
 # ##################################################################################################
 # * sort benchmark --------------------------------------------------------------------------------
 ConfigureBench(SORT_BENCH sort/rank.cpp sort/sort.cpp sort/sort_strings.cpp)
-ConfigureNVBench(SORT_NVBENCH sort/sort_structs.cpp)
+ConfigureNVBench(SORT_NVBENCH sort/sort_lists.cpp sort/sort_structs.cpp)
 
 # ##################################################################################################
 # * quantiles benchmark

diff --git a/cpp/benchmarks/sort/sort_lists.cpp b/cpp/benchmarks/sort/sort_lists.cpp
@@ -0,0 +1,49 @@
+/*
+ * Copyright (c) 2022, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <benchmarks/common/generate_input.hpp>
+#include <benchmarks/fixture/rmm_pool_raii.hpp>
+
+#include <cudf/detail/sorting.hpp>
+
+#include <nvbench/nvbench.cuh>
+
+void nvbench_sort_lists(nvbench::state& state)  // runs sorted_order on a random LIST table
+{
+  cudf::rmm_pool_raii pool_raii;  // NOTE(review): presumably an RMM pool guard — confirm
+  // Axis values are looked up by the names registered in the NVBENCH_BENCH declaration.
+  const size_t size_bytes(state.get_int64("size_bytes"));
+  const cudf::size_type depth{static_cast<cudf::size_type>(state.get_int64("depth"))};
+  auto const null_frequency{state.get_float64("null_frequency")};
+  // Build a single-column LIST table using the requested depth, null probability and byte size.
+  data_profile table_profile;
+  table_profile.set_distribution_params(cudf::type_id::LIST, distribution_id::UNIFORM, 0, 5);
+  table_profile.set_list_depth(depth);
+  table_profile.set_null_probability(null_frequency);
+  auto const table =
+    create_random_table({cudf::type_id::LIST}, table_size_bytes{size_bytes}, table_profile);
+  // Only the sorted_order call sits inside state.exec; its return value is discarded.
+  state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
+    rmm::cuda_stream_view stream_view{launch.get_stream()};
+    cudf::detail::sorted_order(*table, {}, {}, stream_view, rmm::mr::get_current_device_resource());
+  });
+}
+
+NVBENCH_BENCH(nvbench_sort_lists)
+  .set_name("sort_list")
+  .add_int64_power_of_two_axis("size_bytes", {10, 18, 24, 28})  // exponents, i.e. 2^10 .. 2^28
+  .add_int64_axis("depth", {1, 4})                              // passed to set_list_depth
+  .add_float64_axis("null_frequency", {0, 0.2});                // passed to set_null_probability
diff --git a/cpp/include/cudf/io/parquet.hpp b/cpp/include/cudf/io/parquet.hpp
@@ -57,6 +57,10 @@ class parquet_reader_options {
 
   // List of individual row groups to read (ignored if empty)
   std::vector<std::vector<size_type>> _row_groups;
+  // Number of rows to skip from the start
+  size_type _skip_rows = 0;
+  // Number of rows to read; -1 is all
+  size_type _num_rows = -1;
 
   // Whether to store string data as categorical type
   bool _convert_strings_to_categories = false;
@@ -127,6 +131,20 @@ class parquet_reader_options {
     return _reader_column_schema;
   }
 
+  /**
+   * @brief Returns number of rows to skip from the start.
+   *
+   * @return Number of rows to skip from the start (0 by default)
+   */
+  [[nodiscard]] size_type get_skip_rows() const { return _skip_rows; }
+
+  /**
+   * @brief Returns number of rows to read.
+   *
+   * @return Number of rows to read; -1 (the default) means all rows
+   */
+  [[nodiscard]] size_type get_num_rows() const { return _num_rows; }
+
   /**
    * @brief Returns names of column to be read, if set.
    *
@@ -162,6 +180,10 @@ class parquet_reader_options {
    */
   void set_row_groups(std::vector<std::vector<size_type>> row_groups)
   {  // A non-empty list is refused once skip_rows (0) or num_rows (-1) left its default.
+    if ((!row_groups.empty()) and ((_skip_rows != 0) or (_num_rows != -1))) {
+      CUDF_FAIL("row_groups can't be set along with skip_rows and num_rows");
+    }
+
     _row_groups = std::move(row_groups);
   }
 
@@ -190,6 +212,34 @@ class parquet_reader_options {
     _reader_column_schema = std::move(val);
   }
 
+  /**
+   * @brief Sets number of rows to skip.
+   *
+   * @param val Number of rows to skip from start; must be 0 while row_groups is non-empty
+   */
+  void set_skip_rows(size_type val)
+  {
+    if ((val != 0) and (!_row_groups.empty())) {
+      CUDF_FAIL("skip_rows can't be set along with a non-empty row_groups");
+    }
+
+    _skip_rows = val;
+  }
+
+  /**
+   * @brief Sets number of rows to read; -1 means all rows.
+   *
+   * @param val Number of rows to read after skip; must be -1 while row_groups is non-empty
+   */
+  void set_num_rows(size_type val)
+  {
+    if ((val != -1) and (!_row_groups.empty())) {
+      CUDF_FAIL("num_rows can't be set along with a non-empty row_groups");
+    }
+
+    _num_rows = val;
+  }
+
   /**
    * @brief Sets timestamp_type used to cast timestamp columns.
    *
@@ -279,6 +329,30 @@ class parquet_reader_options_builder {
     return *this;
   }
 
+  /**
+   * @brief Sets number of rows to skip by forwarding to `options.set_skip_rows`.
+   *
+   * @param val Number of rows to skip from start
+   * @return this for chaining
+   */
+  parquet_reader_options_builder& skip_rows(size_type val)
+  {
+    options.set_skip_rows(val);
+    return *this;
+  }
+
+  /**
+   * @brief Sets number of rows to read by forwarding to `options.set_num_rows`.
+   *
+   * @param val Number of rows to read after skip
+   * @return this for chaining
+   */
+  parquet_reader_options_builder& num_rows(size_type val)
+  {
+    options.set_num_rows(val);
+    return *this;
+  }
+
   /**
    * @brief timestamp_type used to cast timestamp columns.
    *

diff --git a/cpp/include/cudf/lists/detail/dremel.hpp b/cpp/include/cudf/lists/detail/dremel.hpp
@@ -28,15 +28,11 @@ namespace cudf::detail {
  * @see the `dremel_data` struct for more info.
  */
 struct dremel_device_view {
-  // TODO: These elements are default initializable to support default
-  // initialization of the object. This is currently exploited to create views
-  // that will never actually be used. We should consider whether this
-  // represents a serious issue that should be worked around more robustly.
-  size_type const* offsets{};
-  uint8_t const* rep_levels{};
-  uint8_t const* def_levels{};
-  size_type const leaf_data_size{};
-  uint8_t const max_def_level{};
+  size_type const* offsets;
+  uint8_t const* rep_levels;
+  uint8_t const* def_levels;
+  size_type const leaf_data_size;  // const, no default: supply a value when the view is built
+  uint8_t const max_def_level;     // const, no default: supply a value when the view is built
 };
 
 /**