Merge remote-tracking branch 'upstream/branch-0.19' into mwilson/struct_join
hyperbolic2346 committed Mar 31, 2021
2 parents 681b7af + 684bb14 commit d9a7f52
Showing 67 changed files with 4,127 additions and 639 deletions.
2 changes: 1 addition & 1 deletion conda/environments/cudf_dev_cuda10.1.yml
@@ -43,7 +43,7 @@ dependencies:
   - mypy=0.782
   - typing_extensions
   - pre_commit
-  - dask>=2.22.0
+  - dask>=2021.3.1
   - distributed>=2.22.0
   - streamz
   - dlpack
2 changes: 1 addition & 1 deletion conda/environments/cudf_dev_cuda10.2.yml
@@ -43,7 +43,7 @@ dependencies:
   - mypy=0.782
   - typing_extensions
   - pre_commit
-  - dask>=2.22.0
+  - dask>=2021.3.1
   - distributed>=2.22.0
   - streamz
   - dlpack
2 changes: 1 addition & 1 deletion conda/environments/cudf_dev_cuda11.0.yml
@@ -43,7 +43,7 @@ dependencies:
   - mypy=0.782
   - typing_extensions
   - pre_commit
-  - dask>=2.22.0
+  - dask>=2021.3.1
   - distributed>=2.22.0
   - streamz
   - dlpack
2 changes: 1 addition & 1 deletion conda/recipes/cudf/meta.yaml
@@ -28,7 +28,7 @@ requirements:
     - numba >=0.49.0
     - dlpack
     - pyarrow 1.0.1
-    - libcudf {{ version }}
+    - libcudf {{ version }}=*_{{ GIT_DESCRIBE_NUMBER }}
     - rmm {{ minor_version }}
     - cudatoolkit {{ cuda_version }}
   run:
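A note on the pin introduced here and repeated in the recipes below: after conda renders the Jinja expressions, a spec such as libcudf {{ version }}=*_{{ GIT_DESCRIBE_NUMBER }} becomes something like libcudf 0.19.0=*_123 (version and build number illustrative). The trailing build-string glob restricts the solver to packages whose build string ends in the same git-describe number, i.e. packages built from the same commit, rather than any build of that version.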
8 changes: 4 additions & 4 deletions conda/recipes/cudf_kafka/meta.yaml
@@ -29,12 +29,12 @@ requirements:
     - python
     - cython >=0.29,<0.30
     - setuptools
-    - cudf {{ version }}
-    - libcudf_kafka {{ version }}
+    - cudf {{ version }}=*_{{ GIT_DESCRIBE_NUMBER }}
+    - libcudf_kafka {{ version }}=*_{{ GIT_DESCRIBE_NUMBER }}
   run:
-    - libcudf_kafka {{ version }}
+    - libcudf_kafka {{ version }}=*_{{ GIT_DESCRIBE_NUMBER }}
     - python-confluent-kafka
-    - cudf {{ version }}
+    - cudf {{ version }}=*_{{ GIT_DESCRIBE_NUMBER }}

 test:
   requires:
8 changes: 4 additions & 4 deletions conda/recipes/custreamz/meta.yaml
@@ -23,15 +23,15 @@ requirements:
   host:
     - python
     - python-confluent-kafka
-    - cudf_kafka {{ version }}
+    - cudf_kafka {{ version }}=*_{{ GIT_DESCRIBE_NUMBER }}
   run:
     - python
-    - streamz
-    - cudf {{ version }}
+    - streamz
+    - cudf {{ version }}=*_{{ GIT_DESCRIBE_NUMBER }}
     - dask >=2.22.0
     - distributed >=2.22.0
     - python-confluent-kafka
-    - cudf_kafka {{ version }}
+    - cudf_kafka {{ version }}=*_{{ GIT_DESCRIBE_NUMBER }}

 test:
   requires:
10 changes: 5 additions & 5 deletions conda/recipes/dask-cudf/meta.yaml
@@ -22,15 +22,15 @@ build:
 requirements:
   host:
     - python
-    - cudf {{ version }}
-    - dask >=2.22.0
+    - cudf {{ version }}=*_{{ GIT_DESCRIBE_NUMBER }}
+    - dask>=2021.3.1
     - distributed >=2.22.0
   run:
     - python
-    - cudf {{ version }}
-    - dask >=2.22.0
+    - cudf {{ version }}=*_{{ GIT_DESCRIBE_NUMBER }}
+    - dask>=2021.3.1
     - distributed >=2.22.0

 test:
   requires:
     - cudatoolkit {{ cuda_version }}.*
2 changes: 2 additions & 0 deletions conda/recipes/libcudf/meta.yaml
@@ -178,12 +178,14 @@ test:
     - test -f $PREFIX/include/cudf/strings/detail/converters.hpp
     - test -f $PREFIX/include/cudf/strings/detail/copying.hpp
     - test -f $PREFIX/include/cudf/strings/detail/fill.hpp
+    - test -f $PREFIX/include/cudf/strings/detail/json.hpp
     - test -f $PREFIX/include/cudf/strings/detail/replace.hpp
     - test -f $PREFIX/include/cudf/strings/detail/utilities.hpp
     - test -f $PREFIX/include/cudf/strings/extract.hpp
     - test -f $PREFIX/include/cudf/strings/findall.hpp
     - test -f $PREFIX/include/cudf/strings/find.hpp
     - test -f $PREFIX/include/cudf/strings/find_multiple.hpp
+    - test -f $PREFIX/include/cudf/strings/json.hpp
     - test -f $PREFIX/include/cudf/strings/padding.hpp
     - test -f $PREFIX/include/cudf/strings/replace.hpp
     - test -f $PREFIX/include/cudf/strings/replace_re.hpp
2 changes: 1 addition & 1 deletion conda/recipes/libcudf_kafka/meta.yaml
@@ -25,7 +25,7 @@ requirements:
   build:
     - cmake >=3.17.0
   host:
-    - libcudf {{ version }}
+    - libcudf {{ version }}=*_{{ GIT_DESCRIBE_NUMBER }}
     - librdkafka >=1.5.0,<1.5.3
   run:
     - {{ pin_compatible('librdkafka', max_pin='x.x') }} #TODO: librdkafka should be automatically included here by run_exports but is not
1 change: 1 addition & 0 deletions cpp/CMakeLists.txt
@@ -346,6 +346,7 @@ add_library(cudf
     src/strings/find.cu
     src/strings/find_multiple.cu
     src/strings/padding.cu
+    src/strings/json/json_path.cu
     src/strings/regex/regcomp.cpp
     src/strings/regex/regexec.cu
     src/strings/replace/backref_re.cu
5 changes: 5 additions & 0 deletions cpp/benchmarks/CMakeLists.txt
@@ -202,3 +202,8 @@ ConfigureBench(STRINGS_BENCH
   string/substring_benchmark.cpp
   string/translate_benchmark.cpp
   string/url_decode_benchmark.cpp)
+
+###################################################################################################
+# - json benchmark -------------------------------------------------------------------
+ConfigureBench(JSON_BENCH
+  string/json_benchmark.cpp)
140 changes: 140 additions & 0 deletions cpp/benchmarks/string/json_benchmark.cpp
@@ -0,0 +1,140 @@
/*
* Copyright (c) 2021, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include <benchmark/benchmark.h>
#include <benchmarks/common/generate_benchmark_input.hpp>
#include <benchmarks/fixture/benchmark_fixture.hpp>
#include <benchmarks/synchronization/synchronization.hpp>

#include <cudf_test/base_fixture.hpp>
#include <cudf_test/column_wrapper.hpp>

#include <cudf/strings/json.hpp>
#include <cudf/strings/strings_column_view.hpp>

#include <thrust/iterator/counting_iterator.h>
#include <thrust/iterator/transform_iterator.h>

#include <algorithm>
#include <cmath>
#include <cstdlib>
#include <string>
#include <vector>

class JsonPath : public cudf::benchmark {
};

float frand() { return static_cast<float>(rand()) / static_cast<float>(RAND_MAX); }

int rand_range(int min, int max) { return min + static_cast<int>(frand() * (max - min)); }

std::vector<std::string> Books{
  "{\n\"category\": \"reference\",\n\"author\": \"Nigel Rees\",\n\"title\": \"Sayings of the "
  "Century\",\n\"price\": 8.95\n}",
  "{\n\"category\": \"fiction\",\n\"author\": \"Evelyn Waugh\",\n\"title\": \"Sword of "
  "Honour\",\n\"price\": 12.99\n}",
  "{\n\"category\": \"fiction\",\n\"author\": \"Herman Melville\",\n\"title\": \"Moby "
  "Dick\",\n\"isbn\": \"0-553-21311-3\",\n\"price\": 8.99\n}",
  "{\n\"category\": \"fiction\",\n\"author\": \"J. R. R. Tolkien\",\n\"title\": \"The Lord of the "
  "Rings\",\n\"isbn\": \"0-395-19395-8\",\n\"price\": 22.99\n}"};
constexpr int Approx_book_size = 110;
std::vector<std::string> Bicycles{
  "{\"color\": \"red\", \"price\": 9.95}",
  "{\"color\": \"green\", \"price\": 29.95}",
  "{\"color\": \"blue\", \"price\": 399.95}",
  "{\"color\": \"yellow\", \"price\": 99.95}",
  "{\"color\": \"mauve\", \"price\": 199.95}",
};
constexpr int Approx_bicycle_size = 33;
std::string Misc{"\n\"expensive\": 10\n"};
std::string generate_field(std::vector<std::string> const& values, int num_values)
{
  std::string res;
  for (int idx = 0; idx < num_values; idx++) {
    if (idx > 0) { res += std::string(",\n"); }
    int vindex = std::min(static_cast<int>(floor(frand() * values.size())),
                          static_cast<int>(values.size() - 1));
    res += values[vindex];
  }
  return res;
}

std::string build_row(int desired_bytes)
{
  // always have at least 2 books and 2 bikes
  int num_books    = 2;
  int num_bicycles = 2;
  int remaining_bytes =
    desired_bytes - ((num_books * Approx_book_size) + (num_bicycles * Approx_bicycle_size));

  // divide up the remainder between books and bikes
  float book_pct    = frand();
  float bicycle_pct = 1.0f - book_pct;
  num_books += (remaining_bytes * book_pct) / Approx_book_size;
  num_bicycles += (remaining_bytes * bicycle_pct) / Approx_bicycle_size;

  std::string books    = "\"book\": [\n" + generate_field(Books, num_books) + "]\n";
  std::string bicycles = "\"bicycle\": [\n" + generate_field(Bicycles, num_bicycles) + "]\n";

  std::string store = "\"store\": {\n";
  if (frand() <= 0.5f) {
    store += books + std::string(",\n") + bicycles;
  } else {
    store += bicycles + std::string(",\n") + books;
  }
  store += std::string("}\n");

  std::string row = std::string("{\n");
  if (frand() <= 0.5f) {
    row += store + std::string(",\n") + Misc;
  } else {
    row += Misc + std::string(",\n") + store;
  }
  row += std::string("}\n");
  return row;
}

template <class... QueryArg>
static void BM_case(benchmark::State& state, QueryArg&&... query_arg)
{
  srand(5236);
  auto iter = thrust::make_transform_iterator(
    thrust::make_counting_iterator(0),
    [desired_bytes = state.range(1)](int index) { return build_row(desired_bytes); });
  int num_rows = state.range(0);
  cudf::test::strings_column_wrapper input(iter, iter + num_rows);
  cudf::strings_column_view scv(input);
  size_t num_chars = scv.chars().size();

  std::string json_path(query_arg...);

  for (auto _ : state) {
    cuda_event_timer raii(state, true, 0);
    auto result = cudf::strings::get_json_object(scv, json_path);
    cudaStreamSynchronize(0);
  }

  // This isn't strictly accurate: a given query won't necessarily visit every
  // single incoming character, but it is a close approximation.
  state.SetBytesProcessed(state.iterations() * num_chars);
}

#define JSON_BENCHMARK_DEFINE(name, query)                         \
  BENCHMARK_CAPTURE(BM_case, name, query)                          \
    ->ArgsProduct({{100, 1000, 100000, 400000}, {300, 600, 4096}}) \
    ->UseManualTime()                                              \
    ->Unit(benchmark::kMillisecond);

JSON_BENCHMARK_DEFINE(query0, "$");
JSON_BENCHMARK_DEFINE(query1, "$.store");
JSON_BENCHMARK_DEFINE(query2, "$.store.book");
JSON_BENCHMARK_DEFINE(query3, "$.store.*");
JSON_BENCHMARK_DEFINE(query4, "$.store.book[*]");
JSON_BENCHMARK_DEFINE(query5, "$.store.book[*].category");
JSON_BENCHMARK_DEFINE(query6, "$.store['bicycle']");
JSON_BENCHMARK_DEFINE(query7, "$.store.book[*]['isbn']");
JSON_BENCHMARK_DEFINE(query8, "$.store.bicycle[1]");
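For readers who want to exercise the new API outside the benchmark harness, a minimal sketch follows; the example() wrapper and the input rows are illustrative, but the entry point is the same cudf::strings::get_json_object call the benchmark times.

#include <cudf/strings/json.hpp>
#include <cudf/strings/strings_column_view.hpp>

#include <cudf_test/column_wrapper.hpp>

void example()
{
  // One JSON document per row of the strings column.
  cudf::test::strings_column_wrapper input{
    R"({"store": {"bicycle": {"color": "red"}}})",
    R"({"store": {"bicycle": {"color": "blue"}}})"};
  cudf::strings_column_view view(input);

  // Evaluate the JSONPath query against every row; each output row holds the
  // matched value, and rows the path does not match come back null.
  auto result = cudf::strings::get_json_object(view, "$.store.bicycle.color");
}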
5 changes: 4 additions & 1 deletion cpp/include/cudf/aggregation.hpp
@@ -230,10 +230,13 @@ std::unique_ptr<aggregation> make_collect_list_aggregation(
  * @param null_handling Indicates whether to include/exclude nulls during collection
  * @param nulls_equal Flag to specify whether null entries within each list should be considered
  * equal
+ * @param nans_equal Flag to specify whether NaN values in floating point column should be
+ * considered equal
  */
 std::unique_ptr<aggregation> make_collect_set_aggregation(
   null_policy null_handling = null_policy::INCLUDE,
-  null_equality null_equal = null_equality::EQUAL);
+  null_equality nulls_equal = null_equality::EQUAL,
+  nan_equality nans_equal = nan_equality::UNEQUAL);

 /// Factory to create a LAG aggregation
 std::unique_ptr<aggregation> make_lag_aggregation(size_type offset);
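A short usage sketch of the widened factory, assuming the non-default NaN enumerator is cudf::nan_equality::ALL_EQUAL (the diff itself only shows the UNEQUAL default):

#include <cudf/aggregation.hpp>

void example()
{
  // Collect distinct values per group: nulls are kept and compare equal, and
  // with the new flag repeated NaNs also collapse to a single list entry.
  auto agg = cudf::make_collect_set_aggregation(cudf::null_policy::INCLUDE,
                                                cudf::null_equality::EQUAL,
                                                cudf::nan_equality::ALL_EQUAL);
}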
18 changes: 13 additions & 5 deletions cpp/include/cudf/detail/aggregation/aggregation.hpp
@@ -345,24 +345,32 @@ struct collect_list_aggregation final : derived_aggregation<nunique_aggregation>
  */
 struct collect_set_aggregation final : derived_aggregation<collect_set_aggregation> {
   explicit collect_set_aggregation(null_policy null_handling = null_policy::INCLUDE,
-                                   null_equality null_equal = null_equality::EQUAL)
-    : derived_aggregation{COLLECT_SET}, _null_handling{null_handling}, _null_equal(null_equal)
+                                   null_equality nulls_equal = null_equality::EQUAL,
+                                   nan_equality nans_equal = nan_equality::UNEQUAL)
+    : derived_aggregation{COLLECT_SET},
+      _null_handling{null_handling},
+      _nulls_equal(nulls_equal),
+      _nans_equal(nans_equal)
   {
   }
   null_policy _null_handling;  ///< include or exclude nulls
-  null_equality _null_equal;   ///< whether to consider nulls as equal values
+  null_equality _nulls_equal;  ///< whether to consider nulls as equal values
+  nan_equality _nans_equal;    ///< whether to consider NaNs as equal value (applicable only to
+                               ///< floating point types)

  protected:
   friend class derived_aggregation<collect_set_aggregation>;

   bool operator==(collect_set_aggregation const& other) const
   {
-    return _null_handling == other._null_handling && _null_equal == other._null_equal;
+    return _null_handling == other._null_handling && _nulls_equal == other._nulls_equal &&
+           _nans_equal == other._nans_equal;
   }

   size_t hash_impl() const
   {
-    return std::hash<int>{}(static_cast<int>(_null_handling) ^ static_cast<int>(_null_equal));
+    return std::hash<int>{}(static_cast<int>(_null_handling) ^ static_cast<int>(_nulls_equal) ^
+                            static_cast<int>(_nans_equal));
   }
 };
32 changes: 32 additions & 0 deletions cpp/include/cudf/detail/utilities/hash_functions.cuh
@@ -542,6 +542,22 @@ hash_value_type CUDA_DEVICE_CALLABLE MurmurHash3_32<double>::operator()(double const& key) const
   return this->compute_floating_point(key);
 }

+template <>
+hash_value_type CUDA_DEVICE_CALLABLE
+MurmurHash3_32<cudf::list_view>::operator()(cudf::list_view const& key) const
+{
+  cudf_assert(false && "List column hashing is not supported");
+  return 0;
+}
+
+template <>
+hash_value_type CUDA_DEVICE_CALLABLE
+MurmurHash3_32<cudf::struct_view>::operator()(cudf::struct_view const& key) const
+{
+  cudf_assert(false && "Direct hashing of struct_view is not supported");
+  return 0;
+}
+
 template <typename Key>
 struct SparkMurmurHash3_32 {
   using argument_type = Key;
@@ -671,6 +687,22 @@ SparkMurmurHash3_32<numeric::decimal64>::operator()(numeric::decimal64 const& key) const
   return this->compute<uint64_t>(key.value());
 }

+template <>
+hash_value_type CUDA_DEVICE_CALLABLE
+SparkMurmurHash3_32<cudf::list_view>::operator()(cudf::list_view const& key) const
+{
+  cudf_assert(false && "List column hashing is not supported");
+  return 0;
+}
+
+template <>
+hash_value_type CUDA_DEVICE_CALLABLE
+SparkMurmurHash3_32<cudf::struct_view>::operator()(cudf::struct_view const& key) const
+{
+  cudf_assert(false && "Direct hashing of struct_view is not supported");
+  return 0;
+}
+
 /**
  * @brief Specialization of MurmurHash3_32 operator for strings.
  */
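Read from the code, these specializations are deliberately not hash implementations: they let the row hasher's type dispatch compile for tables containing list or struct columns, while any attempt to hash such an element directly trips the cudf_assert at run time.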
1 change: 1 addition & 0 deletions cpp/include/cudf/lists/detail/drop_list_duplicates.hpp
@@ -31,6 +31,7 @@ namespace detail {
 std::unique_ptr<column> drop_list_duplicates(
   lists_column_view const& lists_column,
   null_equality nulls_equal,
+  nan_equality nans_equal,
   rmm::cuda_stream_view stream,
   rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
 }  // namespace detail
3 changes: 3 additions & 0 deletions cpp/include/cudf/lists/drop_list_duplicates.hpp
@@ -41,6 +41,8 @@ namespace lists {
  *
  * @param lists_column The input lists_column_view
  * @param nulls_equal Flag to specify whether null entries should be considered equal
+ * @param nans_equal Flag to specify whether NaN entries should be considered as equal value (only
+ * applicable for floating point data column)
  * @param mr Device resource used to allocate memory
  *
  * @code{.pseudo}
@@ -56,6 +58,7 @@ namespace lists {
 std::unique_ptr<column> drop_list_duplicates(
   lists_column_view const& lists_column,
   null_equality nulls_equal = null_equality::EQUAL,
+  nan_equality nans_equal = nan_equality::UNEQUAL,
   rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

 /** @} */  // end of group
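As with collect_set above, a brief sketch of the extended signature, again assuming the non-default enumerator is cudf::nan_equality::ALL_EQUAL; the example() wrapper and input values are illustrative.

#include <cudf/lists/drop_list_duplicates.hpp>
#include <cudf/lists/lists_column_view.hpp>

#include <cudf_test/column_wrapper.hpp>

#include <limits>

void example()
{
  auto const nan = std::numeric_limits<float>::quiet_NaN();

  // A single row holding the list [0.0, NaN, NaN, 1.0, 1.0].
  cudf::test::lists_column_wrapper<float> input{{0.0f, nan, nan, 1.0f, 1.0f}};

  // With nans_equal = ALL_EQUAL the repeated NaNs count as duplicates and
  // collapse to one entry, just as the repeated 1.0 values do.
  auto result = cudf::lists::drop_list_duplicates(cudf::lists_column_view{input},
                                                  cudf::null_equality::EQUAL,
                                                  cudf::nan_equality::ALL_EQUAL);
}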
(The diff for the remaining changed files is truncated in this view.)