Skip to content

Commit

Permalink
Merge branch 'branch-22.06' into stod-exp-min-check
Browse files Browse the repository at this point in the history
  • Loading branch information
davidwendt committed Apr 18, 2022
2 parents f6f246c + 94a5d41 commit 75db733
Show file tree
Hide file tree
Showing 54 changed files with 728 additions and 287 deletions.
2 changes: 2 additions & 0 deletions cpp/benchmarks/io/csv/csv_reader.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,7 @@ void BM_csv_read_varying_input(benchmark::State& state)

auto mem_stats_logger = cudf::memory_stats_logger();
for (auto _ : state) {
try_drop_l3_cache();
cuda_event_timer raii(state, true); // flush_l2_cache = true, stream = 0
cudf_io::read_csv(read_options);
}
Expand Down Expand Up @@ -98,6 +99,7 @@ void BM_csv_read_varying_options(benchmark::State& state)
cudf::size_type const chunk_row_cnt = view.num_rows() / num_chunks;
auto mem_stats_logger = cudf::memory_stats_logger();
for (auto _ : state) {
try_drop_l3_cache();
cuda_event_timer raii(state, true); // flush_l2_cache = true, stream = 0
for (int32_t chunk = 0; chunk < num_chunks; ++chunk) {
// only read the header in the first chunk
Expand Down
28 changes: 28 additions & 0 deletions cpp/benchmarks/io/cuio_common.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -141,3 +141,31 @@ std::vector<cudf::size_type> segments_in_chunk(int num_segments, int num_chunks,

return selected_segments;
}

// Runs `cmd` through popen() and returns everything the command wrote to stderr.
// The command's stdout is not part of the returned string.
std::string exec_cmd(std::string_view cmd)
{
  // Wrap the command so that stderr and stdout are swapped (via fd 3); the pipe opened by
  // popen() then only carries what the command printed to stderr
  std::string shell_line{"( "};
  shell_line.append(cmd);
  shell_line.append(" 3>&2 2>&1 1>&3) 2>/dev/null");

  // pclose() is invoked automatically when `cmd_pipe` goes out of scope
  std::unique_ptr<FILE, decltype(&pclose)> cmd_pipe(popen(shell_line.c_str(), "r"), pclose);
  CUDF_EXPECTS(cmd_pipe != nullptr, "popen() failed");

  // Drain the pipe in fixed-size chunks until fgets() reports end-of-stream
  std::string captured;
  std::array<char, 128> chunk;
  for (;;) {
    if (fgets(chunk.data(), chunk.size(), cmd_pipe.get()) == nullptr) { break; }
    captured.append(chunk.data());
  }
  return captured;
}

// Runs the drop-cache command when `CUDF_BENCHMARK_DROP_CACHE` is set in the environment;
// does nothing otherwise. Fails via CUDF_EXPECTS when no candidate command succeeds.
void try_drop_l3_cache()
{
  // The environment is queried once, on the first call; later calls reuse the stored answer
  static bool is_drop_cache_enabled = std::getenv("CUDF_BENCHMARK_DROP_CACHE") != nullptr;
  if (is_drop_cache_enabled) {
    // Candidates are tried in order: plain sysctl first, then the sudo variant
    std::array<char const*, 2> const candidates{"/sbin/sysctl vm.drop_caches=3",
                                                "sudo /sbin/sysctl vm.drop_caches=3"};

    // A command counts as successful when it produced no stderr output;
    // stop at the first one that succeeds
    bool succeeded = false;
    for (char const* candidate : candidates) {
      if (exec_cmd(candidate).empty()) {
        succeeded = true;
        break;
      }
    }
    CUDF_EXPECTS(succeeded, "Failed to execute the drop cache command");
  }
}
10 changes: 10 additions & 0 deletions cpp/benchmarks/io/cuio_common.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -132,3 +132,13 @@ std::vector<std::string> select_column_names(std::vector<std::string> const& col
* The segments could be Parquet row groups or ORC stripes.
*/
std::vector<cudf::size_type> segments_in_chunk(int num_segments, int num_chunks, int chunk);

/**
* @brief Drops L3 cache if `CUDF_BENCHMARK_DROP_CACHE` environment variable is set.
*
* Has no effect if the environment variable is not set.
* May require sudo access to run successfully.
*
* Intended to be called at the start of each benchmark iteration, before the timed region.
*
* @throw cudf::logic_error if the environment variable is set and the command fails
*/
void try_drop_l3_cache();
2 changes: 2 additions & 0 deletions cpp/benchmarks/io/orc/orc_reader.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,7 @@ void BM_orc_read_varying_input(benchmark::State& state)

auto mem_stats_logger = cudf::memory_stats_logger();
for (auto _ : state) {
try_drop_l3_cache();
cuda_event_timer raii(state, true); // flush_l2_cache = true, stream = 0
cudf_io::read_orc(read_opts);
}
Expand Down Expand Up @@ -117,6 +118,7 @@ void BM_orc_read_varying_options(benchmark::State& state)
cudf::size_type const chunk_row_cnt = view.num_rows() / num_chunks;
auto mem_stats_logger = cudf::memory_stats_logger();
for (auto _ : state) {
try_drop_l3_cache();
cuda_event_timer raii(state, true); // flush_l2_cache = true, stream = 0

cudf::size_type rows_read = 0;
Expand Down
2 changes: 1 addition & 1 deletion cpp/benchmarks/io/orc/orc_writer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,6 @@
* limitations under the License.
*/

#include "cudf/io/types.hpp"
#include <benchmark/benchmark.h>

#include <benchmarks/common/generate_input.hpp>
Expand All @@ -23,6 +22,7 @@
#include <benchmarks/synchronization/synchronization.hpp>

#include <cudf/io/orc.hpp>
#include <cudf/io/types.hpp>

// to enable, run cmake with -DBUILD_BENCHMARKS=ON

Expand Down
2 changes: 2 additions & 0 deletions cpp/benchmarks/io/parquet/parquet_reader.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,7 @@ void BM_parq_read_varying_input(benchmark::State& state)

auto mem_stats_logger = cudf::memory_stats_logger();
for (auto _ : state) {
try_drop_l3_cache();
cuda_event_timer const raii(state, true); // flush_l2_cache = true, stream = 0
cudf_io::read_parquet(read_opts);
}
Expand Down Expand Up @@ -117,6 +118,7 @@ void BM_parq_read_varying_options(benchmark::State& state)
cudf::size_type const chunk_row_cnt = view.num_rows() / num_chunks;
auto mem_stats_logger = cudf::memory_stats_logger();
for (auto _ : state) {
try_drop_l3_cache();
cuda_event_timer raii(state, true); // flush_l2_cache = true, stream = 0

cudf::size_type rows_read = 0;
Expand Down
1 change: 1 addition & 0 deletions cpp/benchmarks/io/text/multibyte_split.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -137,6 +137,7 @@ static void BM_multibyte_split(benchmark::State& state)

auto mem_stats_logger = cudf::memory_stats_logger();
for (auto _ : state) {
try_drop_l3_cache();
cuda_event_timer raii(state, true);
auto output = cudf::io::text::multibyte_split(*source, delim);
}
Expand Down
2 changes: 1 addition & 1 deletion cpp/benchmarks/sort/rank.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
* limitations under the License.
*/

#include "cudf/column/column_view.hpp"
#include <cudf/column/column_view.hpp>
#include <cudf/sorting.hpp>

#include <cudf_test/base_fixture.hpp>
Expand Down
15 changes: 7 additions & 8 deletions cpp/benchmarks/string/convert_durations.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -13,25 +13,24 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include <benchmark/benchmark.h>

#include <cudf/column/column_view.hpp>
#include <cudf/strings/convert/convert_durations.hpp>
#include <cudf/types.hpp>
#include <cudf/wrappers/durations.hpp>

#include <cudf_test/base_fixture.hpp>
#include <cudf_test/column_utilities.hpp>
#include <cudf_test/column_wrapper.hpp>
#include <cudf_test/cudf_gtest.hpp>

#include <benchmark/benchmark.h>

#include <benchmarks/fixture/benchmark_fixture.hpp>
#include <benchmarks/synchronization/synchronization.hpp>

#include <algorithm>
#include <random>

#include "../fixture/benchmark_fixture.hpp"
#include "../synchronization/synchronization.hpp"
#include "cudf/column/column_view.hpp"
#include "cudf/wrappers/durations.hpp"

class DurationsToString : public cudf::benchmark {
};
template <class TypeParam>
Expand Down
3 changes: 2 additions & 1 deletion cpp/benchmarks/text/subword.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
#include <cudf_test/column_utilities.hpp>
#include <cudf_test/column_wrapper.hpp>

#include <filesystem>
#include <fstream>
#include <iostream>
#include <vector>
Expand All @@ -29,7 +30,7 @@

static std::string create_hash_vocab_file()
{
std::string dir_template("/tmp");
std::string dir_template{std::filesystem::temp_directory_path().string()};
if (const char* env_p = std::getenv("WORKSPACE")) dir_template = env_p;
std::string hash_file = dir_template + "/hash_vocab.txt";
// create a fake hashed vocab text file for this test
Expand Down
4 changes: 3 additions & 1 deletion cpp/cmake/thirdparty/get_cucollections.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -21,12 +21,14 @@ function(find_and_configure_cucollections)
cuco 0.0.1
GLOBAL_TARGETS cuco::cuco
BUILD_EXPORT_SET cudf-exports
INSTALL_EXPORT_SET cudf-exports
CPM_ARGS GITHUB_REPOSITORY NVIDIA/cuCollections
GIT_TAG fb58a38701f1c24ecfe07d8f1f208bbe80930da5
EXCLUDE_FROM_ALL ${BUILD_SHARED_LIBS}
OPTIONS "BUILD_TESTS OFF" "BUILD_BENCHMARKS OFF" "BUILD_EXAMPLES OFF"
)
if(NOT BUILD_SHARED_LIBS)
rapids_export_package(INSTALL cuco cudf-exports)
endif()

endfunction()

Expand Down
2 changes: 1 addition & 1 deletion cpp/include/cudf/detail/reduction_functions.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -17,9 +17,9 @@
#pragma once

#include <cudf/column/column_view.hpp>
#include <cudf/lists/lists_column_view.hpp>
#include <cudf/scalar/scalar.hpp>

#include "cudf/lists/lists_column_view.hpp"
#include <rmm/cuda_stream_view.hpp>

namespace cudf {
Expand Down
27 changes: 26 additions & 1 deletion cpp/include/cudf/io/types.hpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2019-2020, NVIDIA CORPORATION.
* Copyright (c) 2019-2022, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -244,6 +244,7 @@ class column_in_metadata {
bool _use_int96_timestamp = false;
// bool _output_as_binary = false;
thrust::optional<uint8_t> _decimal_precision;
thrust::optional<int32_t> _parquet_field_id;
std::vector<column_in_metadata> children;

public:
Expand Down Expand Up @@ -324,6 +325,18 @@ class column_in_metadata {
return *this;
}

/**
* @brief Set the parquet field id of this column.
*
* Once set, `is_parquet_field_id_set()` returns true and `get_parquet_field_id()` returns
* the stored value.
*
* @param field_id The parquet field id to set
* @return this for chaining
*/
column_in_metadata& set_parquet_field_id(int32_t field_id)
{
_parquet_field_id = field_id;
return *this;
}

/**
* @brief Get reference to a child of this column
*
Expand Down Expand Up @@ -379,6 +392,18 @@ class column_in_metadata {
*/
[[nodiscard]] uint8_t get_decimal_precision() const { return _decimal_precision.value(); }

/**
* @brief Get whether parquet field id has been set for this column.
*
* @return true if a parquet field id is currently stored for this column, false otherwise
*/
[[nodiscard]] bool is_parquet_field_id_set() const { return _parquet_field_id.has_value(); }

/**
* @brief Get the parquet field id that was set for this column.
*
* @throws If parquet field id was not set for this column.
* Check using `is_parquet_field_id_set()` first.
* @return The parquet field id stored for this column
*/
[[nodiscard]] int32_t get_parquet_field_id() const { return _parquet_field_id.value(); }

/**
* @brief Get the number of children of this column
*/
Expand Down
20 changes: 7 additions & 13 deletions cpp/include/cudf_test/file_utilities.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@

#include <cstdio>
#include <cstdlib>
#include <filesystem>
#include <string>

#include <ftw.h>
Expand All @@ -34,29 +35,22 @@ class temp_directory {
public:
temp_directory(const std::string& base_name)
{
std::string dir_template("/tmp");
if (const char* env_p = std::getenv("WORKSPACE")) dir_template = env_p;
std::string dir_template{std::filesystem::temp_directory_path().string()};
if (auto env_p = std::getenv("WORKSPACE")) dir_template = env_p;

dir_template += "/" + base_name + ".XXXXXX";
auto const tmpdirptr = mkdtemp(const_cast<char*>(dir_template.data()));
if (tmpdirptr == nullptr) CUDF_FAIL("Temporary directory creation failure: " + dir_template);
_path = dir_template + "/";
}
CUDF_EXPECTS(tmpdirptr != nullptr, "Temporary directory creation failure: " + dir_template);

static int rm_files(const char* pathname, const struct stat* sbuf, int type, struct FTW* ftwb)
{
return std::remove(pathname);
_path = dir_template + "/";
}

temp_directory& operator=(temp_directory const&) = delete;
temp_directory(temp_directory const&) = delete;
temp_directory& operator=(temp_directory&&) = default;
temp_directory(temp_directory&&) = default;

~temp_directory()
{
// TODO: should use std::filesystem instead, once C++17 support added
nftw(_path.c_str(), rm_files, 10, FTW_DEPTH | FTW_MOUNT | FTW_PHYS);
}
~temp_directory() { std::filesystem::remove_all(std::filesystem::path{_path}); }

/**
* @brief Returns the path of the temporary directory
Expand Down
2 changes: 1 addition & 1 deletion cpp/libcudf_kafka/src/kafka_callback.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "cudf_kafka/kafka_callback.hpp"
#include <cudf_kafka/kafka_callback.hpp>

#include <librdkafka/rdkafkacpp.h>

Expand Down
2 changes: 1 addition & 1 deletion cpp/libcudf_kafka/src/kafka_consumer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "cudf_kafka/kafka_consumer.hpp"
#include <cudf_kafka/kafka_consumer.hpp>

#include <librdkafka/rdkafkacpp.h>

Expand Down
1 change: 1 addition & 0 deletions cpp/src/io/parquet/compact_protocol_reader.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -156,6 +156,7 @@ bool CompactProtocolReader::read(SchemaElement* s)
ParquetFieldEnum<ConvertedType>(6, s->converted_type),
ParquetFieldInt32(7, s->decimal_scale),
ParquetFieldInt32(8, s->decimal_precision),
ParquetFieldOptionalInt32(9, s->field_id),
ParquetFieldStruct(10, s->logical_type));
return function_builder(this, op);
}
Expand Down
24 changes: 24 additions & 0 deletions cpp/src/io/parquet/compact_protocol_reader.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,8 @@

#include "parquet.hpp"

#include <thrust/optional.h>

#include <algorithm>
#include <cstddef>
#include <string>
Expand Down Expand Up @@ -137,6 +139,7 @@ class CompactProtocolReader {
friend class ParquetFieldBool;
friend class ParquetFieldInt8;
friend class ParquetFieldInt32;
friend class ParquetFieldOptionalInt32;
friend class ParquetFieldInt64;
template <typename T>
friend class ParquetFieldStructListFunctor;
Expand Down Expand Up @@ -216,6 +219,27 @@ class ParquetFieldInt32 {
int field() { return field_val; }
};

/**
* @brief Functor to set value to optional 32 bit integer read from CompactProtocolReader
*
* @return True if field type is not int32
*/
class ParquetFieldOptionalInt32 {
int field_val;
thrust::optional<int32_t>& val;

public:
ParquetFieldOptionalInt32(int f, thrust::optional<int32_t>& v) : field_val(f), val(v) {}

inline bool operator()(CompactProtocolReader* cpr, int field_type)
{
val = cpr->get_i32();
return (field_type != ST_FLD_I32);
}

int field() { return field_val; }
};

/**
* @brief Functor to set value to 64 bit integer read from CompactProtocolReader
*
Expand Down
1 change: 1 addition & 0 deletions cpp/src/io/parquet/compact_protocol_writer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -144,6 +144,7 @@ size_t CompactProtocolWriter::write(const SchemaElement& s)
c.field_int(8, s.decimal_precision);
}
}
if (s.field_id) { c.field_int(9, s.field_id.value()); }
auto const isset = s.logical_type.isset;
// TODO: add handling for all logical types
// if (isset.STRING or isset.MAP or isset.LIST or isset.ENUM or isset.DECIMAL or isset.DATE or
Expand Down
Loading

0 comments on commit 75db733

Please sign in to comment.