Merge branch 'branch-22.06' into list-get-sequence-input

rapidsai · Apr 11, 2022 · 0facf27 · 0facf27
2 parents aee243d + 012af64
commit 0facf27
Show file tree

Hide file tree

Showing 42 changed files with 2,941 additions and 3,194 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
diff --git a/ci/benchmark/build.sh b/ci/benchmark/build.sh
@@ -37,7 +37,7 @@ export GBENCH_BENCHMARKS_DIR="$WORKSPACE/cpp/build/gbenchmarks/"
 export LIBCUDF_KERNEL_CACHE_PATH="$HOME/.jitify-cache"
 
 # Dask & Distributed option to install main(nightly) or `conda-forge` packages.
-export INSTALL_DASK_MAIN=0
+export INSTALL_DASK_MAIN=1
 
 function remove_libcudf_kernel_cache_dir {
     EXITCODE=$?
@@ -82,8 +82,8 @@ if [[ "${INSTALL_DASK_MAIN}" == 1 ]]; then
     gpuci_logger "gpuci_mamba_retry update dask"
     gpuci_mamba_retry update dask
 else
-    gpuci_logger "gpuci_mamba_retry install conda-forge::dask==2022.03.0 conda-forge::distributed==2022.03.0 conda-forge::dask-core==2022.03.0 --force-reinstall"
-    gpuci_mamba_retry install conda-forge::dask==2022.03.0 conda-forge::distributed==2022.03.0 conda-forge::dask-core==2022.03.0 --force-reinstall
+    # Each version spec is quoted for the install command: unquoted, the shell treats ">"
+    # as an output redirection (to a file named "=2022.03.0") and drops the constraint.
+    gpuci_logger "gpuci_mamba_retry install conda-forge::dask>=2022.03.0 conda-forge::distributed>=2022.03.0 conda-forge::dask-core>=2022.03.0 --force-reinstall"
+    gpuci_mamba_retry install "conda-forge::dask>=2022.03.0" "conda-forge::distributed>=2022.03.0" "conda-forge::dask-core>=2022.03.0" --force-reinstall
 fi
 
 # Install the master version of streamz

diff --git a/ci/gpu/build.sh b/ci/gpu/build.sh
@@ -31,7 +31,7 @@ export GIT_DESCRIBE_TAG=`git describe --tags`
 export MINOR_VERSION=`echo $GIT_DESCRIBE_TAG | grep -o -E '([0-9]+\.[0-9]+)'`
 
 # Dask & Distributed option to install main(nightly) or `conda-forge` packages.
-export INSTALL_DASK_MAIN=0
+export INSTALL_DASK_MAIN=1
 
 # ucx-py version
 export UCX_PY_VERSION='0.26.*'
@@ -112,8 +112,8 @@ function install_dask {
         gpuci_mamba_retry update dask
         conda list
     else
-        gpuci_logger "gpuci_mamba_retry install conda-forge::dask==2022.03.0 conda-forge::distributed==2022.03.0 conda-forge::dask-core==2022.03.0 --force-reinstall"
-        gpuci_mamba_retry install conda-forge::dask==2022.03.0 conda-forge::distributed==2022.03.0 conda-forge::dask-core==2022.03.0 --force-reinstall
+        # Each version spec is quoted for the install command: unquoted, the shell treats ">"
+        # as an output redirection (to a file named "=2022.03.0") and drops the constraint.
+        gpuci_logger "gpuci_mamba_retry install conda-forge::dask>=2022.03.0 conda-forge::distributed>=2022.03.0 conda-forge::dask-core>=2022.03.0 --force-reinstall"
+        gpuci_mamba_retry install "conda-forge::dask>=2022.03.0" "conda-forge::distributed>=2022.03.0" "conda-forge::dask-core>=2022.03.0" --force-reinstall
     fi
     # Install the main version of streamz
     gpuci_logger "Install the main version of streamz"

diff --git a/conda/environments/cudf_dev_cuda11.5.yml b/conda/environments/cudf_dev_cuda11.5.yml
@@ -43,8 +43,8 @@ dependencies:
   - pydocstyle=6.1.1
   - typing_extensions
   - pre-commit
-  - dask==2022.03.0
-  - distributed==2022.03.0
+  - dask>=2022.03.0
+  - distributed>=2022.03.0
   - streamz
   - arrow-cpp=7.0.0
   - dlpack>=0.5,<0.6.0a0

diff --git a/conda/recipes/custreamz/meta.yaml b/conda/recipes/custreamz/meta.yaml
@@ -32,8 +32,8 @@ requirements:
     - python
     - streamz
     - cudf {{ version }}
-    - dask==2022.03.0
-    - distributed==2022.03.0
+    - dask>=2022.03.0
+    - distributed>=2022.03.0
     - python-confluent-kafka >=1.7.0,<1.8.0a0
     - cudf_kafka {{ version }}
 

diff --git a/conda/recipes/dask-cudf/meta.yaml b/conda/recipes/dask-cudf/meta.yaml
@@ -27,14 +27,14 @@ requirements:
   host:
     - python
     - cudf {{ version }}
-    - dask==2022.03.0
-    - distributed==2022.03.0
+    - dask>=2022.03.0
+    - distributed>=2022.03.0
     - cudatoolkit {{ cuda_version }}
   run:
     - python
     - cudf {{ version }}
-    - dask==2022.03.0
-    - distributed==2022.03.0
+    - dask>=2022.03.0
+    - distributed>=2022.03.0
     - {{ pin_compatible('cudatoolkit', max_pin='x', min_pin='x') }}
 
 test:                                   # [linux64]

diff --git a/cpp/benchmarks/sort/sort_structs.cpp b/cpp/benchmarks/sort/sort_structs.cpp
@@ -80,5 +80,5 @@ void nvbench_sort_struct(nvbench::state& state)
 NVBENCH_BENCH(nvbench_sort_struct)
   .set_name("sort_struct")
   .add_int64_power_of_two_axis("NumRows", {10, 18, 26})
-  .add_int64_axis("Depth", {1, 8})
+  .add_int64_axis("Depth", {0, 1, 8})
   .add_int64_axis("Nulls", {0, 1});
diff --git a/cpp/benchmarks/string/contains.cpp b/cpp/benchmarks/string/contains.cpp
@@ -19,48 +19,99 @@
 #include <benchmarks/fixture/benchmark_fixture.hpp>
 #include <benchmarks/synchronization/synchronization.hpp>
 
+#include <cudf_test/column_wrapper.hpp>
+
+#include <cudf/filling.hpp>
 #include <cudf/strings/contains.hpp>
 #include <cudf/strings/findall.hpp>
 #include <cudf/strings/strings_column_view.hpp>
 
 class StringContains : public cudf::benchmark {
 };
 
+/**
+ * @brief Build the strings column used as benchmark input.
+ *
+ * Every row is one of the sample strings listed below. A random gather-map first
+ * picks rows from the samples after the first one; the first sample (the one noted
+ * as matching both patterns) is then scattered over that map at a fixed stride so
+ * that about `hit_rate` percent of the rows hold it.
+ *
+ * @param n_rows   Number of rows in the returned column
+ * @param hit_rate Percentage of rows that should hold the matching sample string
+ * @return Strings column gathered from the sample strings
+ */
+std::unique_ptr<cudf::column> build_input_column(cudf::size_type n_rows, int32_t hit_rate)
+{
+  // build input table using the following data
+  auto data      = cudf::test::strings_column_wrapper({
+    "123 abc 4567890 DEFGHI 0987 5W43",  // matches both patterns;
+    "012345 6789 01234 56789 0123 456",  // the rest do not match
+    "abc 4567890 DEFGHI 0987 Wxyz 123",
+    "abcdefghijklmnopqrstuvwxyz 01234",
+    "",
+    "AbcéDEFGHIJKLMNOPQRSTUVWXYZ 01",
+    "9876543210,abcdefghijklmnopqrstU",
+    "9876543210,abcdefghijklmnopqrstU",
+    "123 édf 4567890 DéFG 0987 X5",
+    "1",
+  });
+  auto data_view = cudf::column_view(data);
+
+  // compute number of rows in n_rows that should match; the product is formed in
+  // 64 bits so that a large row count cannot overflow a 32-bit intermediate
+  auto const matches =
+    static_cast<cudf::size_type>(static_cast<int64_t>(n_rows) * hit_rate / 100);
+
+  // Create a randomized gather-map to build a column out of the strings in data.
+  data_profile gather_profile;
+  gather_profile.set_distribution_params(
+    cudf::type_id::INT32, distribution_id::UNIFORM, 1, data_view.size() - 1);
+  gather_profile.set_null_frequency(0.0);  // no nulls for gather-map
+  gather_profile.set_cardinality(0);
+  auto gather_table =
+    create_random_table({cudf::type_id::INT32}, row_count{n_rows}, gather_profile);
+  gather_table->get_column(0).set_null_mask(rmm::device_buffer{}, 0);
+
+  // With no rows to mark as matching there is nothing to scatter. Returning here
+  // also keeps the stride computation below (n_rows / matches) from dividing by zero.
+  if (matches <= 0) {
+    auto table = cudf::gather(cudf::table_view({data_view}), gather_table->view().column(0));
+    return std::move(table->release().front());
+  }
+
+  // Create scatter map by placing 0-index values throughout the gather-map
+  auto scatter_data = cudf::sequence(
+    matches, cudf::numeric_scalar<int32_t>(0), cudf::numeric_scalar<int32_t>(n_rows / matches));
+  auto zero_scalar = cudf::numeric_scalar<int32_t>(0);
+  auto table       = cudf::scatter({zero_scalar}, scatter_data->view(), gather_table->view());
+  auto gather_map  = table->view().column(0);
+  table            = cudf::gather(cudf::table_view({data_view}), gather_map);
+
+  // front() yields an lvalue reference into the released vector, so the move is required
+  return std::move(table->release().front());
+}
+
 enum contains_type { contains, count, findall };
 
+// longer pattern lengths demand more working memory per string
+std::string patterns[] = {"^\\d+ [a-z]+", "[A-Z ]+\\d+ +\\d+[A-Z]+\\d+$"};
+
 static void BM_contains(benchmark::State& state, contains_type ct)
 {
-  cudf::size_type const n_rows{(cudf::size_type)state.range(0)};
-  auto const table = create_random_table({cudf::type_id::STRING}, row_count{n_rows});
-  cudf::strings_column_view input(table->view().column(0));
+  auto const n_rows        = static_cast<cudf::size_type>(state.range(0));
+  auto const pattern_index = static_cast<int32_t>(state.range(1));
+  auto const hit_rate      = static_cast<int32_t>(state.range(2));
+
+  auto col   = build_input_column(n_rows, hit_rate);
+  auto input = cudf::strings_column_view(col->view());
+
+  auto pattern = patterns[pattern_index];
 
   for (auto _ : state) {
     cuda_event_timer raii(state, true, rmm::cuda_stream_default);
-    // contains_re(), matches_re(), and count_re() all have similar functions
-    // with count_re() being the most regex intensive
     switch (ct) {
       case contains_type::contains:  // contains_re and matches_re use the same main logic
-        cudf::strings::contains_re(input, "\\d+");
+        cudf::strings::contains_re(input, pattern);
         break;
-      case contains_type::count:  // counts occurrences of pattern
-        cudf::strings::count_re(input, "\\d+");
+      case contains_type::count:  // counts occurrences of matches
+        cudf::strings::count_re(input, pattern);
         break;
-      case contains_type::findall:  // returns occurrences of matches
-        cudf::strings::findall(input, "\\d+");
+      case contains_type::findall:  // returns occurrences of all matches
+        cudf::strings::findall(input, pattern);
         break;
     }
   }
 
   state.SetBytesProcessed(state.iterations() * input.chars_size());
 }
 
-#define STRINGS_BENCHMARK_DEFINE(name, b)                          \
-  BENCHMARK_DEFINE_F(StringContains, name)                         \
-  (::benchmark::State & st) { BM_contains(st, contains_type::b); } \
-  BENCHMARK_REGISTER_F(StringContains, name)                       \
-    ->RangeMultiplier(8)                                           \
-    ->Ranges({{1 << 12, 1 << 24}})                                 \
-    ->UseManualTime()                                              \
+#define STRINGS_BENCHMARK_DEFINE(name, b)                                         \
+  BENCHMARK_DEFINE_F(StringContains, name)                                        \
+  (::benchmark::State & st) { BM_contains(st, contains_type::b); }                \
+  BENCHMARK_REGISTER_F(StringContains, name)                                      \
+    ->ArgsProduct({{4096, 32768, 262144, 2097152, 16777216}, /* row count */      \
+                   {0, 1},                                   /* patterns index */ \
+                   {1, 5, 10, 25, 70, 100}})                 /* hit rate */       \
+    ->UseManualTime()                                                             \
     ->Unit(benchmark::kMillisecond);
 
 STRINGS_BENCHMARK_DEFINE(contains_re, contains)

diff --git a/cpp/include/cudf/io/json.hpp b/cpp/include/cudf/io/json.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2022, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -44,8 +44,8 @@ class json_reader_options_builder;
 /**
  * @brief Input arguments to the `read_json` interface.
  *
- * Available parameters and are closely patterned after PANDAS' `read_json` API.
- * Not all parameters are unsupported. If the matching PANDAS' parameter
+ * Available parameters are closely patterned after PANDAS' `read_json` API.
+ * Not all parameters are supported. If the matching PANDAS' parameter
  * has a default value of `None`, then a default value of `-1` or `0` may be
  * used as the equivalent.
  *

diff --git a/cpp/include/cudf/table/experimental/row_operators.cuh b/cpp/include/cudf/table/experimental/row_operators.cuh
@@ -197,7 +197,11 @@ class device_row_comparator {
           return cuda::std::make_pair(state, depth);
         }
 
-        // Structs have been modified to only have 1 child when using this.
+        if (lcol.num_child_columns() == 0) {
+          return cuda::std::make_pair(weak_ordering::EQUIVALENT, depth);
+        }
+
+        // Non-empty structs have been modified to only have 1 child when using this.
         lcol = lcol.children()[0];
         rcol = rcol.children()[0];
         ++depth;

diff --git a/cpp/src/io/parquet/reader_impl.cu b/cpp/src/io/parquet/reader_impl.cu
@@ -1179,6 +1179,19 @@ rmm::device_buffer reader::impl::decompress_page_data(
                                     codec_stats{parquet::SNAPPY, 0, 0},
                                     codec_stats{parquet::BROTLI, 0, 0}};
 
+  // A codec is accepted when it is UNCOMPRESSED or is listed in `codecs`.
+  auto is_codec_supported = [&codecs](int8_t codec) {
+    auto const same_codec = [codec](auto& cstats) { return codec == cstats.compression_type; };
+    return codec == parquet::UNCOMPRESSED ||
+           std::any_of(codecs.begin(), codecs.end(), same_codec);
+  };
+  // Reject the request up front if any chunk uses a codec that is not accepted.
+  auto const all_chunks_supported =
+    std::all_of(chunks.begin(), chunks.end(), [&is_codec_supported](auto const& chunk) {
+      return is_codec_supported(chunk.codec);
+    });
+  CUDF_EXPECTS(all_chunks_supported, "Unsupported compression type");
+
+
   for (auto& codec : codecs) {
     for_each_codec_page(codec.compression_type, [&](size_t page) {
       auto page_uncomp_size = pages[page].uncompressed_page_size;

diff --git a/cpp/src/strings/contains.cu b/cpp/src/strings/contains.cu
@@ -14,6 +14,7 @@
  * limitations under the License.
  */
 
+#include <strings/count_matches.hpp>
 #include <strings/regex/dispatcher.hpp>
 #include <strings/regex/regex.cuh>
 #include <strings/utilities.hpp>
@@ -114,6 +115,26 @@ std::unique_ptr<column> matches_re(
   return regex_dispatcher(*d_prog, contains_dispatch_fn{*d_prog, true}, input, stream, mr);
 }
 
+/**
+ * @brief Compiles `pattern` into a device regex program and returns the column
+ * produced by `count_matches` over `input`.
+ *
+ * @param input   Strings column to process
+ * @param pattern Regex pattern passed to `reprog_device::create`
+ * @param flags   Regex flags passed to `reprog_device::create`
+ * @param stream  CUDA stream handed to every call made here
+ * @param mr      Memory resource handed to `count_matches` and to the bitmask copy
+ * @return The `count_matches` result, with a copy of the input's null mask attached
+ *         when `input` has nulls
+ */
+std::unique_ptr<column> count_re(strings_column_view const& input,
+                                 std::string const& pattern,
+                                 regex_flags const flags,
+                                 rmm::cuda_stream_view stream,
+                                 rmm::mr::device_memory_resource* mr)
+{
+  auto const strings_count = input.size();
+
+  // compile regex into device object
+  auto const device_prog = reprog_device::create(
+    pattern, flags, get_character_flags_table(), strings_count, stream);
+
+  auto const device_strings = column_device_view::create(input.parent(), stream);
+
+  auto counts = count_matches(*device_strings, *device_prog, strings_count, stream, mr);
+
+  // Without nulls in the input there is no mask to carry over.
+  if (!input.has_nulls()) { return counts; }
+
+  auto null_mask = cudf::detail::copy_bitmask(input.parent(), stream, mr);
+  counts->set_null_mask(std::move(null_mask), input.null_count());
+  return counts;
+}
+
+
 }  // namespace detail
 
 // external APIs
@@ -136,78 +157,6 @@ std::unique_ptr<column> matches_re(strings_column_view const& strings,
   return detail::matches_re(strings, pattern, flags, rmm::cuda_stream_default, mr);
 }
 
-namespace detail {
-namespace {
-/**
- * @brief This counts the number of times the regex pattern matches in each string.
- */
-template <int stack_size>
-struct count_fn {
-  reprog_device prog;
-  column_device_view const d_strings;
-
-  __device__ int32_t operator()(unsigned int idx)
-  {
-    if (d_strings.is_null(idx)) return 0;
-    auto const d_str   = d_strings.element<string_view>(idx);
-    auto const nchars  = d_str.length();
-    int32_t find_count = 0;
-    int32_t begin      = 0;
-    while (begin < nchars) {
-      auto end = static_cast<int32_t>(nchars);
-      if (prog.find<stack_size>(idx, d_str, begin, end) <= 0) break;
-      ++find_count;
-      begin = end > begin ? end : begin + 1;
-    }
-    return find_count;
-  }
-};
-
-struct count_dispatch_fn {
-  reprog_device d_prog;
-
-  template <int stack_size>
-  std::unique_ptr<column> operator()(strings_column_view const& input,
-                                     rmm::cuda_stream_view stream,
-                                     rmm::mr::device_memory_resource* mr)
-  {
-    auto results = make_numeric_column(data_type{type_id::INT32},
-                                       input.size(),
-                                       cudf::detail::copy_bitmask(input.parent(), stream, mr),
-                                       input.null_count(),
-                                       stream,
-                                       mr);
-
-    auto const d_strings = column_device_view::create(input.parent(), stream);
-    thrust::transform(rmm::exec_policy(stream),
-                      thrust::make_counting_iterator<size_type>(0),
-                      thrust::make_counting_iterator<size_type>(input.size()),
-                      results->mutable_view().data<int32_t>(),
-                      count_fn<stack_size>{d_prog, *d_strings});
-    return results;
-  }
-};
-
-}  // namespace
-
-std::unique_ptr<column> count_re(
-  strings_column_view const& input,
-  std::string const& pattern,
-  regex_flags const flags,
-  rmm::cuda_stream_view stream,
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource())
-{
-  // compile regex into device object
-  auto d_prog =
-    reprog_device::create(pattern, flags, get_character_flags_table(), input.size(), stream);
-
-  return regex_dispatcher(*d_prog, count_dispatch_fn{*d_prog}, input, stream, mr);
-}
-
-}  // namespace detail
-
-// external API
-
 std::unique_ptr<column> count_re(strings_column_view const& strings,
                                  std::string const& pattern,
                                  regex_flags const flags,

diff --git a/cpp/src/strings/convert/convert_datetime.cu b/cpp/src/strings/convert/convert_datetime.cu
@@ -298,12 +298,14 @@ struct parse_datetime {
         }
         case 'z': {
           // 'z' format is +hh:mm -- single sign char and 2 chars each for hour and minute
-          auto const sign     = *ptr == '-' ? 1 : -1;
-          auto const [hh, lh] = parse_int(ptr + 1, 2);
-          auto const [mm, lm] = parse_int(ptr + 3, 2);
-          // revert timezone back to UTC
-          timeparts.tz_minutes = sign * ((hh * 60) + mm);
-          bytes_read -= lh + lm;
+          if (item.length == 5) {
+            auto const sign     = *ptr == '-' ? 1 : -1;
+            auto const [hh, lh] = parse_int(ptr + 1, 2);
+            auto const [mm, lm] = parse_int(ptr + 3, 2);
+            // revert timezone back to UTC
+            timeparts.tz_minutes = sign * ((hh * 60) + mm);
+            bytes_read -= lh + lm;
+          }
           break;
         }
         case 'Z': break;  // skip
@@ -574,6 +576,8 @@ struct check_datetime_format {
             auto const cvm = check_value(ptr + 3, 2, 0, 59);
             result         = (*ptr == '-' || *ptr == '+') && cvh.first && cvm.first;
             bytes_read -= cvh.second + cvm.second;
+          } else if (item.length == 1) {
+            result = *ptr == 'Z';
           }
           break;
         }