Skip to content

Commit

Permalink
Merge remote-tracking branch 'origin/branch-0.18' into collect_list
Browse files Browse the repository at this point in the history
  • Loading branch information
mythrocks committed Jan 28, 2021
2 parents 4bcd852 + cbc0394 commit 6558ac9
Show file tree
Hide file tree
Showing 151 changed files with 7,333 additions and 3,127 deletions.
9 changes: 9 additions & 0 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,15 @@ repos:
language: system
files: \.(cu|cuh|h|hpp|cpp|inl)$
args: ['-fallback-style=none']
- repo: local
hooks:
- id: mypy
name: mypy
description: mypy
pass_filenames: false
entry: mypy --config-file=python/cudf/setup.cfg python/cudf/cudf
language: system
types: [python]

default_language_version:
python: python3
2 changes: 1 addition & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,8 @@
- PR #6929 Add `Index.set_names` api
- PR #6907 Add `replace_null` API with `replace_policy` parameter, `fixed_width` column support
- PR #6885 Share `factorize` implementation with Index and cudf module

- PR #6775 Implement cudf.DateOffset for months
- PR #7039 Support contains() on lists of primitives

## Improvements

Expand Down
14 changes: 13 additions & 1 deletion ci/checks/style.sh
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,10 @@ FLAKE_RETVAL=$?
FLAKE_CYTHON=`flake8 --config=python/.flake8.cython`
FLAKE_CYTHON_RETVAL=$?

# Run mypy and get results/return code
MYPY_CUDF=`mypy --config=python/cudf/setup.cfg python/cudf/cudf`
MYPY_CUDF_RETVAL=$?

# Run clang-format and check for a consistent code format
CLANG_FORMAT=`python cpp/scripts/run-clang-format.py 2>&1`
CLANG_FORMAT_RETVAL=$?
Expand Down Expand Up @@ -66,6 +70,14 @@ else
echo -e "\n\n>>>> PASSED: flake8-cython style check\n\n"
fi

if [ "$MYPY_CUDF_RETVAL" != "0" ]; then
echo -e "\n\n>>>> FAILED: mypy style check; begin output\n\n"
echo -e "$MYPY_CUDF"
echo -e "\n\n>>>> FAILED: mypy style check; end output\n\n"
else
echo -e "\n\n>>>> PASSED: mypy style check\n\n"
fi

if [ "$CLANG_FORMAT_RETVAL" != "0" ]; then
echo -e "\n\n>>>> FAILED: clang format check; begin output\n\n"
echo -e "$CLANG_FORMAT"
Expand All @@ -79,7 +91,7 @@ HEADER_META=`ci/checks/headers_test.sh`
HEADER_META_RETVAL=$?
echo -e "$HEADER_META"

RETVALS=($ISORT_RETVAL $BLACK_RETVAL $FLAKE_RETVAL $FLAKE_CYTHON_RETVAL $CLANG_FORMAT_RETVAL $HEADER_META_RETVAL)
RETVALS=($ISORT_RETVAL $BLACK_RETVAL $FLAKE_RETVAL $FLAKE_CYTHON_RETVAL $CLANG_FORMAT_RETVAL $HEADER_META_RETVAL $MYPY_CUDF_RETVAL)
IFS=$'\n'
RETVAL=`echo "${RETVALS[*]}" | sort -nr | head -n1`

Expand Down
2 changes: 2 additions & 0 deletions conda/environments/cudf_dev_cuda10.1.yml
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,8 @@ dependencies:
- flake8=3.8.3
- black=19.10
- isort=5.0.7
- mypy=0.782
- typing_extensions
- pre_commit
- dask>=2.22.0
- distributed>=2.22.0
Expand Down
2 changes: 2 additions & 0 deletions conda/environments/cudf_dev_cuda10.2.yml
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,8 @@ dependencies:
- flake8=3.8.3
- black=19.10
- isort=5.0.7
- mypy=0.782
- typing_extensions
- pre_commit
- dask>=2.22.0
- distributed>=2.22.0
Expand Down
2 changes: 2 additions & 0 deletions conda/environments/cudf_dev_cuda11.0.yml
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,8 @@ dependencies:
- flake8=3.8.3
- black=19.10
- isort=5.0.7
- mypy=0.782
- typing_extensions
- pre_commit
- dask>=2.22.0
- distributed>=2.22.0
Expand Down
1 change: 1 addition & 0 deletions conda/recipes/cudf/meta.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ requirements:
run:
- protobuf
- python
- typing_extensions
- pandas >=1.0,<1.2.0dev0
- cupy >7.1.0,<9.0.0a0
- numba >=0.49.0
Expand Down
4 changes: 3 additions & 1 deletion conda/recipes/libcudf/meta.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -123,7 +123,9 @@ test:
- test -f $PREFIX/include/cudf/join.hpp
- test -f $PREFIX/include/cudf/lists/detail/concatenate.hpp
- test -f $PREFIX/include/cudf/lists/detail/copying.hpp
- test -f $PREFIX/include/cudf/lists/count_elements.hpp
- test -f $PREFIX/include/cudf/lists/extract.hpp
- test -f $PREFIX/include/cudf/lists/contains.hpp
- test -f $PREFIX/include/cudf/lists/gather.hpp
- test -f $PREFIX/include/cudf/lists/lists_column_view.hpp
- test -f $PREFIX/include/cudf/merge.hpp
Expand Down Expand Up @@ -170,6 +172,7 @@ test:
- test -f $PREFIX/include/cudf/strings/replace_re.hpp
- test -f $PREFIX/include/cudf/strings/split/partition.hpp
- test -f $PREFIX/include/cudf/strings/split/split.hpp
- test -f $PREFIX/include/cudf/strings/string_view.hpp
- test -f $PREFIX/include/cudf/strings/strings_column_view.hpp
- test -f $PREFIX/include/cudf/strings/strip.hpp
- test -f $PREFIX/include/cudf/strings/substring.hpp
Expand Down Expand Up @@ -200,7 +203,6 @@ test:
- test -f $PREFIX/include/cudf_test/cudf_gtest.hpp
- test -f $PREFIX/include/cudf_test/cxxopts.hpp
- test -f $PREFIX/include/cudf_test/file_utilities.hpp
- test -f $PREFIX/include/cudf_test/scalar_utilities.hpp
- test -f $PREFIX/include/cudf_test/table_utilities.hpp
- test -f $PREFIX/include/cudf_test/timestamp_utilities.cuh
- test -f $PREFIX/include/cudf_test/type_list_utilities.hpp
Expand Down
4 changes: 2 additions & 2 deletions cpp/benchmarks/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -164,8 +164,8 @@ ConfigureBench(SEARCH_BENCH "${SEARCH_BENCH_SRC}")
# - sort benchmark --------------------------------------------------------------------------------

set(SORT_BENCH_SRC
"${CMAKE_CURRENT_SOURCE_DIR}/sort/sort_benchmark.cu"
"${CMAKE_CURRENT_SOURCE_DIR}/sort/sort_strings_benchmark.cu")
"${CMAKE_CURRENT_SOURCE_DIR}/sort/sort_benchmark.cpp"
"${CMAKE_CURRENT_SOURCE_DIR}/sort/sort_strings_benchmark.cpp")

ConfigureBench(SORT_BENCH "${SORT_BENCH_SRC}")

Expand Down
9 changes: 4 additions & 5 deletions cpp/benchmarks/io/parquet/parquet_writer_benchmark.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -63,9 +63,9 @@ void BM_parq_write_varying_inout(benchmark::State& state)

void BM_parq_write_varying_options(benchmark::State& state)
{
auto const compression = static_cast<cudf::io::compression_type>(state.range(0));
auto const enable_stats = static_cast<cudf::io::statistics_freq>(state.range(1));
auto const output_metadata = state.range(2) != 0;
auto const compression = static_cast<cudf::io::compression_type>(state.range(0));
auto const enable_stats = static_cast<cudf::io::statistics_freq>(state.range(1));
auto const file_path = state.range(2) != 0 ? "unused_path.parquet" : "";

auto const data_types = get_type_or_group({int32_t(type_group_id::INTEGRAL_SIGNED),
int32_t(type_group_id::FLOATING_POINT),
Expand All @@ -82,8 +82,7 @@ void BM_parq_write_varying_options(benchmark::State& state)
cudf_io::parquet_writer_options::builder(source_sink.make_sink_info(), view)
.compression(compression)
.stats_level(enable_stats)
.return_filemetadata(output_metadata)
.column_chunks_file_path("dummy_path.parquet");
.column_chunks_file_path(file_path);
cudf_io::write_parquet(options);
}

Expand Down
13 changes: 6 additions & 7 deletions cpp/benchmarks/io/parquet/parquet_writer_chunks_benchmark.cpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2020, NVIDIA CORPORATION.
* Copyright (c) 2020-2021, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -72,12 +72,11 @@ void PQ_write_chunked(benchmark::State& state)
cuda_event_timer raii(state, true); // flush_l2_cache = true, stream = 0
cudf_io::chunked_parquet_writer_options opts =
cudf_io::chunked_parquet_writer_options::builder(cudf_io::sink_info());
auto writer_state = cudf_io::write_parquet_chunked_begin(opts);
std::for_each(
tables.begin(), tables.end(), [&writer_state](std::unique_ptr<cudf::table> const& tbl) {
cudf_io::write_parquet_chunked(*tbl, writer_state);
});
cudf_io::write_parquet_chunked_end(writer_state);
cudf_io::parquet_chunked_writer writer(opts);
std::for_each(tables.begin(), tables.end(), [&writer](std::unique_ptr<cudf::table> const& tbl) {
writer.write(*tbl);
});
writer.close();
}

state.SetBytesProcessed(static_cast<int64_t>(state.iterations()) * state.range(0));
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2020, NVIDIA CORPORATION.
* Copyright (c) 2020-2021, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand All @@ -14,8 +14,6 @@
* limitations under the License.
*/

#include <benchmark/benchmark.h>

#include <cudf/sorting.hpp>

#include <cudf_test/base_fixture.hpp>
Expand All @@ -24,18 +22,17 @@
#include <cudf_test/cudf_gtest.hpp>
#include <cudf_test/table_utilities.hpp>

#include <cudf/types.hpp>

#include "../common/generate_benchmark_input.hpp"
#include "../fixture/benchmark_fixture.hpp"
#include "../synchronization/synchronization.hpp"
#include <benchmark/benchmark.h>
#include <benchmarks/common/generate_benchmark_input.hpp>
#include <benchmarks/fixture/benchmark_fixture.hpp>
#include <benchmarks/synchronization/synchronization.hpp>

template <bool stable>
class Sort : public cudf::benchmark {
};

template <bool stable>
static void BM_sort(benchmark::State& state)
static void BM_sort(benchmark::State& state, bool nulls)
{
using Type = int;
using column_wrapper = cudf::test::fixed_width_column_wrapper<Type>;
Expand All @@ -44,16 +41,16 @@ static void BM_sort(benchmark::State& state)

const cudf::size_type n_rows{(cudf::size_type)state.range(0)};
const cudf::size_type n_cols{(cudf::size_type)state.range(1)};
auto type_size = cudf::size_of(cudf::data_type(cudf::type_to_id<Type>()));

// Create columns with values in the range [0,100)
std::vector<column_wrapper> columns;
columns.reserve(n_cols);
std::generate_n(std::back_inserter(columns), n_cols, [&, n_rows]() {
auto valids = cudf::test::make_counting_transform_iterator(
0, [](auto i) { return i % 100 == 0 ? false : true; });
auto elements = cudf::test::make_counting_transform_iterator(
0, [&](auto row) { return distribution(generator); });
if (!nulls) return column_wrapper(elements, elements + n_rows);
auto valids = cudf::test::make_counting_transform_iterator(
0, [](auto i) { return i % 100 == 0 ? false : true; });
return column_wrapper(elements, elements + n_rows, valids);
});

Expand All @@ -70,14 +67,16 @@ static void BM_sort(benchmark::State& state)
}
}

#define SORT_BENCHMARK_DEFINE(name, stable) \
BENCHMARK_TEMPLATE_DEFINE_F(Sort, name, stable) \
(::benchmark::State & st) { BM_sort<stable>(st); } \
BENCHMARK_REGISTER_F(Sort, name) \
->RangeMultiplier(8) \
->Ranges({{1 << 10, 1 << 26}, {1, 8}}) \
->UseManualTime() \
#define SORT_BENCHMARK_DEFINE(name, stable, nulls) \
BENCHMARK_TEMPLATE_DEFINE_F(Sort, name, stable) \
(::benchmark::State & st) { BM_sort<stable>(st, nulls); } \
BENCHMARK_REGISTER_F(Sort, name) \
->RangeMultiplier(8) \
->Ranges({{1 << 10, 1 << 26}, {1, 8}}) \
->UseManualTime() \
->Unit(benchmark::kMillisecond);

SORT_BENCHMARK_DEFINE(sort_stable, true)
SORT_BENCHMARK_DEFINE(sort_unstable, false)
SORT_BENCHMARK_DEFINE(unstable_no_nulls, false, false)
SORT_BENCHMARK_DEFINE(stable_no_nulls, true, false)
SORT_BENCHMARK_DEFINE(unstable, false, true)
SORT_BENCHMARK_DEFINE(stable, true, true)
15 changes: 9 additions & 6 deletions cpp/include/cudf/detail/copy_if.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -99,7 +99,6 @@ __launch_bounds__(block_size) __global__
{
T* __restrict__ output_data = output_view.data<T>();
cudf::bitmask_type* __restrict__ output_valid = output_view.null_mask();
constexpr cudf::size_type leader_lane{0};
static_assert(block_size <= 1024, "Maximum thread block size exceeded");

int tid = threadIdx.x + per_thread * block_size * blockIdx.x;
Expand All @@ -109,8 +108,8 @@ __launch_bounds__(block_size) __global__
__shared__ bool temp_valids[has_validity ? block_size + cudf::detail::warp_size : 1];
__shared__ T temp_data[block_size];

cudf::size_type warp_valid_counts{0};
cudf::size_type block_sum = 0;
cudf::size_type warp_valid_counts{0}; // total valid sum over the `per_thread` loop below
cudf::size_type block_sum = 0; // count passing filter over the `per_thread` loop below

// Note that since the maximum gridDim.x on all supported GPUs is as big as
// cudf::size_type, this loop is sufficient to cover our maximum column size
Expand Down Expand Up @@ -160,6 +159,8 @@ __launch_bounds__(block_size) __global__
const int wid = threadIdx.x / cudf::detail::warp_size;
const int lane = threadIdx.x % cudf::detail::warp_size;

cudf::size_type tmp_warp_valid_counts{0};

if (tmp_block_sum > 0 && wid <= last_warp) {
int valid_index = (block_offset / cudf::detail::warp_size) + wid;

Expand All @@ -168,9 +169,8 @@ __launch_bounds__(block_size) __global__

// Note the atomicOr's below assume that output_valid has been set to
// all zero before the kernel

if (lane == 0 && valid_warp != 0) {
warp_valid_counts = __popc(valid_warp);
tmp_warp_valid_counts = __popc(valid_warp);
if (wid > 0 && wid < last_warp)
output_valid[valid_index] = valid_warp;
else {
Expand All @@ -182,19 +182,22 @@ __launch_bounds__(block_size) __global__
if ((wid == 0) && (last_warp == num_warps)) {
uint32_t valid_warp = __ballot_sync(0xffffffff, temp_valids[block_size + threadIdx.x]);
if (lane == 0 && valid_warp != 0) {
warp_valid_counts += __popc(valid_warp);
tmp_warp_valid_counts += __popc(valid_warp);
atomicOr(&output_valid[valid_index + num_warps], valid_warp);
}
}
}
warp_valid_counts += tmp_warp_valid_counts;
}

block_offset += tmp_block_sum;
tid += block_size;
}
// Compute total null_count for this block and add it to global count
constexpr cudf::size_type leader_lane{0};
cudf::size_type block_valid_count =
cudf::detail::single_lane_block_sum_reduce<block_size, leader_lane>(warp_valid_counts);

if (threadIdx.x == 0) { // one thread computes and adds to null count
atomicAdd(output_null_count, block_sum - block_valid_count);
}
Expand Down
15 changes: 15 additions & 0 deletions cpp/include/cudf/detail/iterator.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -174,6 +174,21 @@ auto inline make_validity_iterator(column_device_view const& column)
validity_accessor{column});
}

/**
 * @brief Creates a constant device iterator that yields a scalar's validity.
 *
 * Every dereference of the returned iterator produces the same `bool`:
 * whether `scalar_value` reported valid data when this function was called.
 *
 * For `p = *(iter + i)`, `p` is the validity of the scalar for any `i`.
 *
 * @param scalar_value The scalar whose validity the iterator repeats
 * @return Constant iterator over the scalar's validity flag
 */
auto inline make_validity_iterator(scalar const& scalar_value)
{
  auto const is_valid = scalar_value.is_valid();
  return thrust::make_constant_iterator(is_valid);
}

/**
* @brief value accessor for scalar with valid data.
* The unary functor returns data of Element type of the scalar.
Expand Down
18 changes: 8 additions & 10 deletions cpp/include/cudf/detail/utilities/trie.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -135,24 +135,22 @@ inline thrust::host_vector<SerialTrieNode> createSerializedTrie(
* @return Boolean value, true if string is found, false otherwise
*/
__host__ __device__ inline bool serialized_trie_contains(device_span<SerialTrieNode const> trie,
                                                         device_span<char const> key)
{
  // An unallocated or empty trie contains nothing
  if (trie.data() == nullptr || trie.empty()) return false;
  // The empty string is contained iff the root node ends a word
  if (key.empty()) return trie.front().is_leaf;
  auto curr_node = trie.begin() + 1;
  for (auto curr_key = key.begin(); curr_key < key.end(); ++curr_key) {
    // Don't jump away from root node
    if (curr_key != key.begin()) { curr_node += curr_node->children_offset; }
    // Search for the next character in the array of children nodes
    // Nodes are sorted - terminate search if the node is larger or equal
    while (curr_node->character != trie_terminating_character && curr_node->character < *curr_key) {
      ++curr_node;
    }
    // Could not find the next character, done with the search
    if (curr_node->character != *curr_key) { return false; }
  }
  // Even if the node is present, return true only if that node is at the end of a word
  return curr_node->is_leaf;
}
Loading

0 comments on commit 6558ac9

Please sign in to comment.