Merge remote-tracking branch 'upstream/branch-0.19' into mwilson/struct_join
hyperbolic2346 committed Mar 31, 2021
2 parents 681b7af + 684bb14 commit d9a7f52
Showing 67 changed files with 4,127 additions and 639 deletions.
2 changes: 1 addition & 1 deletion conda/environments/cudf_dev_cuda10.1.yml
@@ -43,7 +43,7 @@ dependencies:
   - mypy=0.782
   - typing_extensions
   - pre_commit
-  - dask>=2.22.0
+  - dask>=2021.3.1
   - distributed>=2.22.0
   - streamz
   - dlpack
2 changes: 1 addition & 1 deletion conda/environments/cudf_dev_cuda10.2.yml
@@ -43,7 +43,7 @@ dependencies:
   - mypy=0.782
   - typing_extensions
   - pre_commit
-  - dask>=2.22.0
+  - dask>=2021.3.1
   - distributed>=2.22.0
   - streamz
   - dlpack
2 changes: 1 addition & 1 deletion conda/environments/cudf_dev_cuda11.0.yml
@@ -43,7 +43,7 @@ dependencies:
   - mypy=0.782
   - typing_extensions
   - pre_commit
-  - dask>=2.22.0
+  - dask>=2021.3.1
   - distributed>=2.22.0
   - streamz
   - dlpack
2 changes: 1 addition & 1 deletion conda/recipes/cudf/meta.yaml
@@ -28,7 +28,7 @@ requirements:
     - numba >=0.49.0
     - dlpack
     - pyarrow 1.0.1
-    - libcudf {{ version }}
+    - libcudf {{ version }}=*_{{ GIT_DESCRIBE_NUMBER }}
     - rmm {{ minor_version }}
     - cudatoolkit {{ cuda_version }}
   run:
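A note on the pin introduced here and repeated in the recipes below: after conda renders the Jinja expressions, a spec such as libcudf {{ version }}=*_{{ GIT_DESCRIBE_NUMBER }} becomes something like libcudf 0.19.0=*_123 (version and build number illustrative). The trailing build-string glob restricts the solver to packages whose build string ends in the same git-describe number, i.e. packages built from the same commit, rather than any build of that version.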
8 changes: 4 additions & 4 deletions conda/recipes/cudf_kafka/meta.yaml
@@ -29,12 +29,12 @@ requirements:
     - python
     - cython >=0.29,<0.30
     - setuptools
-    - cudf {{ version }}
-    - libcudf_kafka {{ version }}
+    - cudf {{ version }}=*_{{ GIT_DESCRIBE_NUMBER }}
+    - libcudf_kafka {{ version }}=*_{{ GIT_DESCRIBE_NUMBER }}
   run:
-    - libcudf_kafka {{ version }}
+    - libcudf_kafka {{ version }}=*_{{ GIT_DESCRIBE_NUMBER }}
     - python-confluent-kafka
-    - cudf {{ version }}
+    - cudf {{ version }}=*_{{ GIT_DESCRIBE_NUMBER }}

 test:
   requires:
8 changes: 4 additions & 4 deletions conda/recipes/custreamz/meta.yaml
@@ -23,15 +23,15 @@ requirements:
   host:
     - python
     - python-confluent-kafka
-    - cudf_kafka {{ version }}
+    - cudf_kafka {{ version }}=*_{{ GIT_DESCRIBE_NUMBER }}
   run:
     - python
-    - streamz
-    - cudf {{ version }}
+    - streamz
+    - cudf {{ version }}=*_{{ GIT_DESCRIBE_NUMBER }}
     - dask >=2.22.0
     - distributed >=2.22.0
     - python-confluent-kafka
-    - cudf_kafka {{ version }}
+    - cudf_kafka {{ version }}=*_{{ GIT_DESCRIBE_NUMBER }}

 test:
   requires:
10 changes: 5 additions & 5 deletions conda/recipes/dask-cudf/meta.yaml
@@ -22,15 +22,15 @@ build:
 requirements:
   host:
     - python
-    - cudf {{ version }}
-    - dask >=2.22.0
+    - cudf {{ version }}=*_{{ GIT_DESCRIBE_NUMBER }}
+    - dask>=2021.3.1
     - distributed >=2.22.0
   run:
     - python
-    - cudf {{ version }}
-    - dask >=2.22.0
+    - cudf {{ version }}=*_{{ GIT_DESCRIBE_NUMBER }}
+    - dask>=2021.3.1
     - distributed >=2.22.0

 test:
   requires:
     - cudatoolkit {{ cuda_version }}.*
2 changes: 2 additions & 0 deletions conda/recipes/libcudf/meta.yaml
@@ -178,12 +178,14 @@ test:
     - test -f $PREFIX/include/cudf/strings/detail/converters.hpp
     - test -f $PREFIX/include/cudf/strings/detail/copying.hpp
     - test -f $PREFIX/include/cudf/strings/detail/fill.hpp
+    - test -f $PREFIX/include/cudf/strings/detail/json.hpp
     - test -f $PREFIX/include/cudf/strings/detail/replace.hpp
     - test -f $PREFIX/include/cudf/strings/detail/utilities.hpp
     - test -f $PREFIX/include/cudf/strings/extract.hpp
     - test -f $PREFIX/include/cudf/strings/findall.hpp
     - test -f $PREFIX/include/cudf/strings/find.hpp
     - test -f $PREFIX/include/cudf/strings/find_multiple.hpp
+    - test -f $PREFIX/include/cudf/strings/json.hpp
     - test -f $PREFIX/include/cudf/strings/padding.hpp
     - test -f $PREFIX/include/cudf/strings/replace.hpp
     - test -f $PREFIX/include/cudf/strings/replace_re.hpp
2 changes: 1 addition & 1 deletion conda/recipes/libcudf_kafka/meta.yaml
@@ -25,7 +25,7 @@ requirements:
   build:
     - cmake >=3.17.0
   host:
-    - libcudf {{ version }}
+    - libcudf {{ version }}=*_{{ GIT_DESCRIBE_NUMBER }}
     - librdkafka >=1.5.0,<1.5.3
   run:
     - {{ pin_compatible('librdkafka', max_pin='x.x') }} #TODO: librdkafka should be automatically included here by run_exports but is not
1 change: 1 addition & 0 deletions cpp/CMakeLists.txt
@@ -346,6 +346,7 @@ add_library(cudf
     src/strings/find.cu
     src/strings/find_multiple.cu
     src/strings/padding.cu
+    src/strings/json/json_path.cu
     src/strings/regex/regcomp.cpp
     src/strings/regex/regexec.cu
     src/strings/replace/backref_re.cu
5 changes: 5 additions & 0 deletions cpp/benchmarks/CMakeLists.txt
@@ -202,3 +202,8 @@ ConfigureBench(STRINGS_BENCH
   string/substring_benchmark.cpp
   string/translate_benchmark.cpp
   string/url_decode_benchmark.cpp)
+
+###################################################################################################
+# - json benchmark -------------------------------------------------------------------
+ConfigureBench(JSON_BENCH
+  string/json_benchmark.cpp)
140 changes: 140 additions & 0 deletions cpp/benchmarks/string/json_benchmark.cpp
@@ -0,0 +1,140 @@
/*
* Copyright (c) 2021, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include <benchmark/benchmark.h>
#include <benchmarks/common/generate_benchmark_input.hpp>
#include <benchmarks/fixture/benchmark_fixture.hpp>
#include <benchmarks/synchronization/synchronization.hpp>

#include <cudf_test/base_fixture.hpp>
#include <cudf_test/column_wrapper.hpp>

#include <cudf/strings/json.hpp>
#include <cudf/strings/strings_column_view.hpp>

#include <thrust/iterator/counting_iterator.h>
#include <thrust/iterator/transform_iterator.h>

#include <algorithm>
#include <cmath>
#include <cstdlib>
#include <string>
#include <vector>

class JsonPath : public cudf::benchmark {
};

float frand() { return static_cast<float>(rand()) / static_cast<float>(RAND_MAX); }

int rand_range(int min, int max) { return min + static_cast<int>(frand() * (max - min)); }

std::vector<std::string> Books{
  "{\n\"category\": \"reference\",\n\"author\": \"Nigel Rees\",\n\"title\": \"Sayings of the "
  "Century\",\n\"price\": 8.95\n}",
  "{\n\"category\": \"fiction\",\n\"author\": \"Evelyn Waugh\",\n\"title\": \"Sword of "
  "Honour\",\n\"price\": 12.99\n}",
  "{\n\"category\": \"fiction\",\n\"author\": \"Herman Melville\",\n\"title\": \"Moby "
  "Dick\",\n\"isbn\": \"0-553-21311-3\",\n\"price\": 8.99\n}",
  "{\n\"category\": \"fiction\",\n\"author\": \"J. R. R. Tolkien\",\n\"title\": \"The Lord of the "
  "Rings\",\n\"isbn\": \"0-395-19395-8\",\n\"price\": 22.99\n}"};
constexpr int Approx_book_size = 110;
std::vector<std::string> Bicycles{
  "{\"color\": \"red\", \"price\": 9.95}",
  "{\"color\": \"green\", \"price\": 29.95}",
  "{\"color\": \"blue\", \"price\": 399.95}",
  "{\"color\": \"yellow\", \"price\": 99.95}",
  "{\"color\": \"mauve\", \"price\": 199.95}",
};
constexpr int Approx_bicycle_size = 33;
std::string Misc{"\n\"expensive\": 10\n"};
std::string generate_field(std::vector<std::string> const& values, int num_values)
{
  std::string res;
  for (int idx = 0; idx < num_values; idx++) {
    if (idx > 0) { res += std::string(",\n"); }
    int vindex = std::min(static_cast<int>(floor(frand() * values.size())),
                          static_cast<int>(values.size() - 1));
    res += values[vindex];
  }
  return res;
}

std::string build_row(int desired_bytes)
{
  // always have at least 2 books and 2 bikes
  int num_books    = 2;
  int num_bicycles = 2;
  int remaining_bytes =
    desired_bytes - ((num_books * Approx_book_size) + (num_bicycles * Approx_bicycle_size));

  // divide up the remainder between books and bikes
  float book_pct    = frand();
  float bicycle_pct = 1.0f - book_pct;
  num_books += (remaining_bytes * book_pct) / Approx_book_size;
  num_bicycles += (remaining_bytes * bicycle_pct) / Approx_bicycle_size;

  std::string books    = "\"book\": [\n" + generate_field(Books, num_books) + "]\n";
  std::string bicycles = "\"bicycle\": [\n" + generate_field(Bicycles, num_bicycles) + "]\n";

  std::string store = "\"store\": {\n";
  if (frand() <= 0.5f) {
    store += books + std::string(",\n") + bicycles;
  } else {
    store += bicycles + std::string(",\n") + books;
  }
  store += std::string("}\n");

  std::string row = std::string("{\n");
  if (frand() <= 0.5f) {
    row += store + std::string(",\n") + Misc;
  } else {
    row += Misc + std::string(",\n") + store;
  }
  row += std::string("}\n");
  return row;
}

template <class... QueryArg>
static void BM_case(benchmark::State& state, QueryArg&&... query_arg)
{
  srand(5236);
  auto iter = thrust::make_transform_iterator(
    thrust::make_counting_iterator(0),
    [desired_bytes = state.range(1)](int index) { return build_row(desired_bytes); });
  int num_rows = state.range(0);
  cudf::test::strings_column_wrapper input(iter, iter + num_rows);
  cudf::strings_column_view scv(input);
  size_t num_chars = scv.chars().size();

  std::string json_path(query_arg...);

  for (auto _ : state) {
    cuda_event_timer raii(state, true, 0);
    auto result = cudf::strings::get_json_object(scv, json_path);
    cudaStreamSynchronize(0);
  }

  // This isn't strictly accurate: a given query won't necessarily visit every
  // single incoming character, but it is a close approximation.
  state.SetBytesProcessed(state.iterations() * num_chars);
}

#define JSON_BENCHMARK_DEFINE(name, query)                         \
  BENCHMARK_CAPTURE(BM_case, name, query)                          \
    ->ArgsProduct({{100, 1000, 100000, 400000}, {300, 600, 4096}}) \
    ->UseManualTime()                                              \
    ->Unit(benchmark::kMillisecond);

JSON_BENCHMARK_DEFINE(query0, "$");
JSON_BENCHMARK_DEFINE(query1, "$.store");
JSON_BENCHMARK_DEFINE(query2, "$.store.book");
JSON_BENCHMARK_DEFINE(query3, "$.store.*");
JSON_BENCHMARK_DEFINE(query4, "$.store.book[*]");
JSON_BENCHMARK_DEFINE(query5, "$.store.book[*].category");
JSON_BENCHMARK_DEFINE(query6, "$.store['bicycle']");
JSON_BENCHMARK_DEFINE(query7, "$.store.book[*]['isbn']");
JSON_BENCHMARK_DEFINE(query8, "$.store.bicycle[1]");
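For readers who want to exercise the new API outside the benchmark harness, a minimal sketch follows; the example() wrapper and the input rows are illustrative, but the entry point is the same cudf::strings::get_json_object call the benchmark times.

#include <cudf/strings/json.hpp>
#include <cudf/strings/strings_column_view.hpp>

#include <cudf_test/column_wrapper.hpp>

void example()
{
  // One JSON document per row of the strings column.
  cudf::test::strings_column_wrapper input{
    R"({"store": {"bicycle": {"color": "red"}}})",
    R"({"store": {"bicycle": {"color": "blue"}}})"};
  cudf::strings_column_view view(input);

  // Evaluate the JSONPath query against every row; each output row holds the
  // matched value, and rows the path does not match come back null.
  auto result = cudf::strings::get_json_object(view, "$.store.bicycle.color");
}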
5 changes: 4 additions & 1 deletion cpp/include/cudf/aggregation.hpp
@@ -230,10 +230,13 @@ std::unique_ptr<aggregation> make_collect_list_aggregation(
  * @param null_handling Indicates whether to include/exclude nulls during collection
  * @param nulls_equal Flag to specify whether null entries within each list should be considered
  * equal
+ * @param nans_equal Flag to specify whether NaN values in floating point column should be
+ * considered equal
  */
 std::unique_ptr<aggregation> make_collect_set_aggregation(
   null_policy null_handling = null_policy::INCLUDE,
-  null_equality null_equal = null_equality::EQUAL);
+  null_equality nulls_equal = null_equality::EQUAL,
+  nan_equality nans_equal = nan_equality::UNEQUAL);

 /// Factory to create a LAG aggregation
 std::unique_ptr<aggregation> make_lag_aggregation(size_type offset);
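A short usage sketch of the widened factory, assuming the non-default NaN enumerator is cudf::nan_equality::ALL_EQUAL (the diff itself only shows the UNEQUAL default):

#include <cudf/aggregation.hpp>

void example()
{
  // Collect distinct values per group: nulls are kept and compare equal, and
  // with the new flag repeated NaNs also collapse to a single list entry.
  auto agg = cudf::make_collect_set_aggregation(cudf::null_policy::INCLUDE,
                                                cudf::null_equality::EQUAL,
                                                cudf::nan_equality::ALL_EQUAL);
}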
18 changes: 13 additions & 5 deletions cpp/include/cudf/detail/aggregation/aggregation.hpp
@@ -345,24 +345,32 @@ struct collect_list_aggregation final : derived_aggregation<nunique_aggregation>
  */
 struct collect_set_aggregation final : derived_aggregation<collect_set_aggregation> {
   explicit collect_set_aggregation(null_policy null_handling = null_policy::INCLUDE,
-                                   null_equality null_equal = null_equality::EQUAL)
-    : derived_aggregation{COLLECT_SET}, _null_handling{null_handling}, _null_equal(null_equal)
+                                   null_equality nulls_equal = null_equality::EQUAL,
+                                   nan_equality nans_equal = nan_equality::UNEQUAL)
+    : derived_aggregation{COLLECT_SET},
+      _null_handling{null_handling},
+      _nulls_equal(nulls_equal),
+      _nans_equal(nans_equal)
   {
   }
   null_policy _null_handling;  ///< include or exclude nulls
-  null_equality _null_equal;   ///< whether to consider nulls as equal values
+  null_equality _nulls_equal;  ///< whether to consider nulls as equal values
+  nan_equality _nans_equal;    ///< whether to consider NaNs as equal value (applicable only to
+                               ///< floating point types)

  protected:
   friend class derived_aggregation<collect_set_aggregation>;

   bool operator==(collect_set_aggregation const& other) const
   {
-    return _null_handling == other._null_handling && _null_equal == other._null_equal;
+    return _null_handling == other._null_handling && _nulls_equal == other._nulls_equal &&
+           _nans_equal == other._nans_equal;
   }

   size_t hash_impl() const
   {
-    return std::hash<int>{}(static_cast<int>(_null_handling) ^ static_cast<int>(_null_equal));
+    return std::hash<int>{}(static_cast<int>(_null_handling) ^ static_cast<int>(_nulls_equal) ^
+                            static_cast<int>(_nans_equal));
   }
 };
32 changes: 32 additions & 0 deletions cpp/include/cudf/detail/utilities/hash_functions.cuh
@@ -542,6 +542,22 @@ hash_value_type CUDA_DEVICE_CALLABLE MurmurHash3_32<double>::operator()(double const& key) const
   return this->compute_floating_point(key);
 }

+template <>
+hash_value_type CUDA_DEVICE_CALLABLE
+MurmurHash3_32<cudf::list_view>::operator()(cudf::list_view const& key) const
+{
+  cudf_assert(false && "List column hashing is not supported");
+  return 0;
+}
+
+template <>
+hash_value_type CUDA_DEVICE_CALLABLE
+MurmurHash3_32<cudf::struct_view>::operator()(cudf::struct_view const& key) const
+{
+  cudf_assert(false && "Direct hashing of struct_view is not supported");
+  return 0;
+}
+
 template <typename Key>
 struct SparkMurmurHash3_32 {
   using argument_type = Key;
@@ -671,6 +687,22 @@ SparkMurmurHash3_32<numeric::decimal64>::operator()(numeric::decimal64 const& key) const
   return this->compute<uint64_t>(key.value());
 }

+template <>
+hash_value_type CUDA_DEVICE_CALLABLE
+SparkMurmurHash3_32<cudf::list_view>::operator()(cudf::list_view const& key) const
+{
+  cudf_assert(false && "List column hashing is not supported");
+  return 0;
+}
+
+template <>
+hash_value_type CUDA_DEVICE_CALLABLE
+SparkMurmurHash3_32<cudf::struct_view>::operator()(cudf::struct_view const& key) const
+{
+  cudf_assert(false && "Direct hashing of struct_view is not supported");
+  return 0;
+}
+
 /**
  * @brief Specialization of MurmurHash3_32 operator for strings.
  */
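Read from the code, these specializations are deliberately not hash implementations: they let the row hasher's type dispatch compile for tables containing list or struct columns, while any attempt to hash such an element directly trips the cudf_assert at run time.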
1 change: 1 addition & 0 deletions cpp/include/cudf/lists/detail/drop_list_duplicates.hpp
@@ -31,6 +31,7 @@ namespace detail {
 std::unique_ptr<column> drop_list_duplicates(
   lists_column_view const& lists_column,
   null_equality nulls_equal,
+  nan_equality nans_equal,
   rmm::cuda_stream_view stream,
   rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
 }  // namespace detail
3 changes: 3 additions & 0 deletions cpp/include/cudf/lists/drop_list_duplicates.hpp
@@ -41,6 +41,8 @@ namespace lists {
  *
  * @param lists_column The input lists_column_view
  * @param nulls_equal Flag to specify whether null entries should be considered equal
+ * @param nans_equal Flag to specify whether NaN entries should be considered as equal value (only
+ * applicable for floating point data column)
  * @param mr Device resource used to allocate memory
  *
  * @code{.pseudo}
@@ -56,6 +58,7 @@ namespace lists {
 std::unique_ptr<column> drop_list_duplicates(
   lists_column_view const& lists_column,
   null_equality nulls_equal = null_equality::EQUAL,
+  nan_equality nans_equal = nan_equality::UNEQUAL,
   rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

 /** @} */  // end of group
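As with collect_set above, a brief sketch of the extended signature, again assuming the non-default enumerator is cudf::nan_equality::ALL_EQUAL; the example() wrapper and input values are illustrative.

#include <cudf/lists/drop_list_duplicates.hpp>
#include <cudf/lists/lists_column_view.hpp>

#include <cudf_test/column_wrapper.hpp>

#include <limits>

void example()
{
  auto const nan = std::numeric_limits<float>::quiet_NaN();

  // A single row holding the list [0.0, NaN, NaN, 1.0, 1.0].
  cudf::test::lists_column_wrapper<float> input{{0.0f, nan, nan, 1.0f, 1.0f}};

  // With nans_equal = ALL_EQUAL the repeated NaNs count as duplicates and
  // collapse to one entry, just as the repeated 1.0 values do.
  auto result = cudf::lists::drop_list_duplicates(cudf::lists_column_view{input},
                                                  cudf::null_equality::EQUAL,
                                                  cudf::nan_equality::ALL_EQUAL);
}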
(The diff for the remaining changed files is truncated in this view.)