Skip to content

Commit

Permalink
Merge remote-tracking branch 'upstream/branch-22.04' into insert_opti…
Browse files Browse the repository at this point in the history
…mizations
  • Loading branch information
galipremsagar committed Feb 16, 2022
2 parents d3e062f + f263820 commit 42515d2
Show file tree
Hide file tree
Showing 39 changed files with 613 additions and 377 deletions.
7 changes: 7 additions & 0 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,13 @@ repos:
# of dependencies, so we'll have to update this manually.
additional_dependencies:
- cmake-format==0.6.11
- id: copyright-check
name: copyright-check
# This hook's use of Git tools appears to conflict with
# existing CI invocations so we don't invoke it during CI runs.
stages: [commit]
entry: python ./ci/checks/copyright.py --git-modified-only
language: python

default_language_version:
python: python3
2 changes: 1 addition & 1 deletion cpp/benchmarks/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -265,7 +265,7 @@ ConfigureBench(
string/split.cpp
string/substring.cpp
string/translate.cpp
string/url_decode.cpp
string/url_decode.cu
)

# ##################################################################################################
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,55 +16,68 @@

#include <benchmark/benchmark.h>
#include <benchmarks/fixture/benchmark_fixture.hpp>
#include <benchmarks/fixture/templated_benchmark_fixture.hpp>
#include <benchmarks/synchronization/synchronization.hpp>

#include <cudf/column/column_view.hpp>
#include <cudf/filling.hpp>
#include <cudf/strings/convert/convert_urls.hpp>
#include <cudf/strings/strings_column_view.hpp>
#include <cudf/types.hpp>

#include <cudf_test/base_fixture.hpp>
#include <cudf_test/column_utilities.hpp>
#include <cudf_test/column_wrapper.hpp>
#include <cudf_test/cudf_gtest.hpp>

#include <algorithm>
#include <random>
#include <thrust/execution_policy.h>
#include <thrust/for_each.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/random.h>

struct url_string_generator {
size_t num_chars;
std::bernoulli_distribution dist;

url_string_generator(size_t num_chars, double esc_seq_chance)
: num_chars{num_chars}, dist{esc_seq_chance}
char* chars;
double esc_seq_chance;
thrust::minstd_rand engine;
thrust::uniform_real_distribution<float> esc_seq_dist;
url_string_generator(char* c, double esc_seq_chance, thrust::minstd_rand& engine)
: chars(c), esc_seq_chance(esc_seq_chance), engine(engine), esc_seq_dist(0, 1)
{
}

std::string operator()(std::mt19937& engine)
__device__ void operator()(thrust::tuple<cudf::size_type, cudf::size_type> str_begin_end)
{
std::string str;
str.reserve(num_chars);
while (str.size() < num_chars) {
if (str.size() < num_chars - 3 && dist(engine)) {
str += "%20";
auto begin = thrust::get<0>(str_begin_end);
auto end = thrust::get<1>(str_begin_end);
engine.discard(begin);
for (auto i = begin; i < end; ++i) {
if (esc_seq_dist(engine) < esc_seq_chance and i < end - 3) {
chars[i] = '%';
chars[i + 1] = '2';
chars[i + 2] = '0';
i += 2;
} else {
str.push_back('a');
chars[i] = 'a';
}
}
return str;
}
};

cudf::test::strings_column_wrapper generate_column(cudf::size_type num_rows,
cudf::size_type chars_per_row,
double esc_seq_chance)
auto generate_column(cudf::size_type num_rows, cudf::size_type chars_per_row, double esc_seq_chance)
{
std::mt19937 engine(1);
url_string_generator url_gen(chars_per_row, esc_seq_chance);
std::vector<std::string> strings;
strings.reserve(num_rows);
std::generate_n(std::back_inserter(strings), num_rows, [&]() { return url_gen(engine); });
return cudf::test::strings_column_wrapper(strings.begin(), strings.end());
std::vector<std::string> strings{std::string(chars_per_row, 'a')};
auto col_1a = cudf::test::strings_column_wrapper(strings.begin(), strings.end());
auto table_a = cudf::repeat(cudf::table_view{{col_1a}}, num_rows);
auto result_col = std::move(table_a->release()[0]); // string column with num_rows aaa...
auto chars_col = result_col->child(cudf::strings_column_view::chars_column_index).mutable_view();
auto offset_col = result_col->child(cudf::strings_column_view::offsets_column_index).view();

auto engine = thrust::default_random_engine{};
thrust::for_each_n(thrust::device,
thrust::make_zip_iterator(offset_col.begin<cudf::size_type>(),
offset_col.begin<cudf::size_type>() + 1),
num_rows,
url_string_generator{chars_col.begin<char>(), esc_seq_chance, engine});
return result_col;
}

class UrlDecode : public cudf::benchmark {
Expand All @@ -76,7 +89,7 @@ void BM_url_decode(benchmark::State& state, int esc_seq_pct)
cudf::size_type const chars_per_row = state.range(1);

auto column = generate_column(num_rows, chars_per_row, esc_seq_pct / 100.0);
auto strings_view = cudf::strings_column_view(column);
auto strings_view = cudf::strings_column_view(column->view());

for (auto _ : state) {
cuda_event_timer raii(state, true, rmm::cuda_stream_default);
Expand Down
24 changes: 11 additions & 13 deletions cpp/benchmarks/type_dispatcher/type_dispatcher.cu
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2019-2021, NVIDIA CORPORATION.
* Copyright (c) 2019-2022, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand All @@ -14,15 +14,16 @@
* limitations under the License.
*/

#include "../fixture/benchmark_fixture.hpp"
#include "../synchronization/synchronization.hpp"
#include <benchmarks/fixture/benchmark_fixture.hpp>
#include <benchmarks/synchronization/synchronization.hpp>

#include <cudf_test/column_wrapper.hpp>

#include <cudf/column/column_device_view.cuh>
#include <cudf/column/column_view.hpp>
#include <cudf/detail/iterator.cuh>
#include <cudf/detail/utilities/cuda.cuh>
#include <cudf/filling.hpp>
#include <cudf/scalar/scalar_factories.hpp>
#include <cudf/table/table_device_view.cuh>
#include <cudf/table/table_view.hpp>

Expand Down Expand Up @@ -170,21 +171,18 @@ void launch_kernel(cudf::mutable_table_view input, T** d_ptr, int work_per_threa
template <class TypeParam, FunctorType functor_type, DispatchingType dispatching_type>
void type_dispatcher_benchmark(::benchmark::State& state)
{
const auto source_size = static_cast<cudf::size_type>(state.range(1));

const auto n_cols = static_cast<cudf::size_type>(state.range(0));

const auto n_cols = static_cast<cudf::size_type>(state.range(0));
const auto source_size = static_cast<cudf::size_type>(state.range(1));
const auto work_per_thread = static_cast<cudf::size_type>(state.range(2));

auto data = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i; });
auto init = cudf::make_fixed_width_scalar<TypeParam>(static_cast<TypeParam>(0));

std::vector<cudf::test::fixed_width_column_wrapper<TypeParam>> source_column_wrappers;
std::vector<std::unique_ptr<cudf::column>> source_column_wrappers;
std::vector<cudf::mutable_column_view> source_columns;

for (int i = 0; i < n_cols; ++i) {
source_column_wrappers.push_back(
cudf::test::fixed_width_column_wrapper<TypeParam>(data, data + source_size));
source_columns.push_back(source_column_wrappers[i]);
source_column_wrappers.push_back(cudf::sequence(source_size, *init));
source_columns.push_back(*source_column_wrappers[i]);
}
cudf::mutable_table_view source_table{source_columns};

Expand Down
36 changes: 22 additions & 14 deletions cpp/src/io/utilities/file_io_utilities.cpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2021, NVIDIA CORPORATION.
* Copyright (c) 2021-2022, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -194,20 +194,13 @@ template <typename DataT,
std::vector<std::future<ResultT>> make_sliced_tasks(
F function, DataT* ptr, size_t offset, size_t size, cudf::detail::thread_pool& pool)
{
constexpr size_t default_max_slice_size = 4 * 1024 * 1024;
static auto const max_slice_size = getenv_or("LIBCUDF_CUFILE_SLICE_SIZE", default_max_slice_size);
auto const slices = make_file_io_slices(size, max_slice_size);
std::vector<std::future<ResultT>> slice_tasks;
constexpr size_t default_max_slice_bytes = 4 * 1024 * 1024;
static auto const max_slice_bytes =
getenv_or("LIBCUDF_CUFILE_SLICE_SIZE", default_max_slice_bytes);
size_t const n_slices = util::div_rounding_up_safe(size, max_slice_bytes);
size_t slice_offset = 0;
for (size_t t = 0; t < n_slices; ++t) {
DataT* ptr_slice = ptr + slice_offset;

size_t const slice_size = (t == n_slices - 1) ? size % max_slice_bytes : max_slice_bytes;
slice_tasks.push_back(pool.submit(function, ptr_slice, slice_size, offset + slice_offset));

slice_offset += slice_size;
}
std::transform(slices.cbegin(), slices.cend(), std::back_inserter(slice_tasks), [&](auto& slice) {
return pool.submit(function, ptr + slice.offset, slice.size, offset + slice.offset);
});
return slice_tasks;
}

Expand Down Expand Up @@ -318,6 +311,21 @@ std::unique_ptr<cufile_output_impl> make_cufile_output(std::string const& filepa
return nullptr;
}

/**
 * @brief Split `size` bytes into consecutive slices of at most `max_slice_size` bytes.
 *
 * Slices are produced in increasing offset order, starting at offset zero; the last slice holds
 * whatever remains. An empty vector is returned when `size` is zero.
 *
 * @param size Total number of bytes to split
 * @param max_slice_size Requested upper bound on a slice's size; values below 1024 are raised to
 * 1024
 * @return Slices (offset and size pairs) that together cover `size` bytes
 */
std::vector<file_io_slice> make_file_io_slices(size_t size, size_t max_slice_size)
{
  // Explicit template argument: an `unsigned long` literal is not `size_t` on every platform, and
  // `std::max` requires both arguments to have the same type.
  max_slice_size      = std::max<size_t>(1024, max_slice_size);
  auto const n_slices = util::div_rounding_up_safe(size, max_slice_size);
  std::vector<file_io_slice> slices;
  slices.reserve(n_slices);
  // The counter is `size_t` so the offset computation is done entirely in unsigned arithmetic.
  std::generate_n(std::back_inserter(slices), n_slices, [&, idx = size_t{0}]() mutable {
    auto const slice_offset = idx++ * max_slice_size;
    // The last slice may be shorter than `max_slice_size`.
    auto const slice_size = std::min(size - slice_offset, max_slice_size);
    return file_io_slice{slice_offset, slice_size};
  });

  return slices;
}

} // namespace detail
} // namespace io
} // namespace cudf
17 changes: 16 additions & 1 deletion cpp/src/io/utilities/file_io_utilities.hpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2021, NVIDIA CORPORATION.
* Copyright (c) 2021-2022, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -291,6 +291,21 @@ std::unique_ptr<cufile_input_impl> make_cufile_input(std::string const& filepath
*/
std::unique_ptr<cufile_output_impl> make_cufile_output(std::string const& filepath);

/**
 * @brief Byte range to be read/written in a single operation.
 *
 * Members are zero-initialized by default, so a default-constructed slice is a well-defined empty
 * range. The type remains an aggregate: `file_io_slice{offset, size}` still works.
 */
struct file_io_slice {
  size_t offset{0};  ///< Start of the range, in bytes
  size_t size{0};    ///< Length of the range, in bytes
};

/**
* @brief Split the total number of bytes to read/write into slices to enable parallel IO.
*
* If `max_slice_size` is below 1024, 1024 will be used instead to prevent potential misuse.
*/
std::vector<file_io_slice> make_file_io_slices(size_t size, size_t max_slice_size);

} // namespace detail
} // namespace io
} // namespace cudf
1 change: 1 addition & 0 deletions cpp/tests/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -199,6 +199,7 @@ ConfigureTest(
ConfigureTest(DECOMPRESSION_TEST io/comp/decomp_test.cpp)

ConfigureTest(CSV_TEST io/csv_test.cpp)
ConfigureTest(FILE_IO_TEST io/file_io_test.cpp)
ConfigureTest(ORC_TEST io/orc_test.cpp)
ConfigureTest(PARQUET_TEST io/parquet_test.cpp)
ConfigureTest(JSON_TEST io/json_test.cpp)
Expand Down
46 changes: 46 additions & 0 deletions cpp/tests/io/file_io_test.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
/*
* Copyright (c) 2022, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include <cudf_test/base_fixture.hpp>
#include <cudf_test/cudf_gtest.hpp>

#include <src/io/utilities/file_io_utilities.hpp>

#include <type_traits>

// Fixture shared by the file IO utility tests; adds nothing on top of the cuDF base fixture.
struct CuFileIOTest : cudf::test::BaseFixture {
};

// Checks that `make_file_io_slices` covers [0, size) with contiguous, non-empty slices, none of
// which is larger than the effective maximum slice size.
TEST_F(CuFileIOTest, SliceSize)
{
  // Each case is {total size, requested max slice size}; covers exact and inexact division, a max
  // larger than the total, empty input, and a zero max.
  std::vector<std::pair<size_t, size_t>> test_cases{
    {1 << 20, 1 << 18}, {1 << 18, 1 << 20}, {1 << 20, 3333}, {0, 1 << 18}, {0, 0}, {1 << 20, 0}};
  for (auto const& test_case : test_cases) {
    auto const total_size = test_case.first;
    // `make_file_io_slices` raises requested maxima below 1024 to 1024.
    auto const max_size = test_case.second < 1024 ? size_t{1024} : test_case.second;

    auto const slices = cudf::io::detail::make_file_io_slices(total_size, test_case.second);
    if (slices.empty()) {
      // No slices are expected only when there is nothing to read/write.
      ASSERT_EQ(total_size, 0u);
    } else {
      // The slices must start at zero and end exactly at the total size...
      ASSERT_EQ(slices.front().offset, 0u);
      ASSERT_EQ(slices.back().offset + slices.back().size, total_size);
      for (auto i = 0u; i < slices.size(); ++i) {
        // ...each one must be non-empty and within the size limit...
        ASSERT_GT(slices[i].size, 0u);
        ASSERT_LE(slices[i].size, max_size);
        // ...and each one must begin where the previous one ended.
        if (i > 0) { ASSERT_EQ(slices[i].offset, slices[i - 1].offset + slices[i - 1].size); }
      }
    }
  }
}

CUDF_TEST_PROGRAM_MAIN()
9 changes: 6 additions & 3 deletions java/src/main/java/ai/rapids/cudf/ColumnWriterOptions.java
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
/*
*
* Copyright (c) 2021, NVIDIA CORPORATION.
* Copyright (c) 2021-2022, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -40,6 +40,9 @@ private ColumnWriterOptions(AbstractStructBuilder builder) {
(ColumnWriterOptions[]) builder.children.toArray(new ColumnWriterOptions[0]);
}

// The sentinel value of unknown precision (default value)
public static int UNKNOWN_PRECISION = -1;

/**
* Constructor used for list
*/
Expand Down Expand Up @@ -103,7 +106,7 @@ protected ColumnWriterOptions withDecimal(String name, int precision,

protected ColumnWriterOptions withTimestamp(String name, boolean isInt96,
boolean isNullable) {
return new ColumnWriterOptions(name, isInt96, 0, isNullable);
return new ColumnWriterOptions(name, isInt96, UNKNOWN_PRECISION, isNullable);
}

/**
Expand Down Expand Up @@ -243,7 +246,7 @@ public ColumnWriterOptions(String columnName, boolean isTimestampTypeInt96,

public ColumnWriterOptions(String columnName, boolean isNullable) {
this.isTimestampTypeInt96 = false;
this.precision = 0;
this.precision = UNKNOWN_PRECISION;
this.isNullable = isNullable;
this.columnName = columnName;
}
Expand Down
13 changes: 8 additions & 5 deletions java/src/main/native/src/TableJni.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -676,9 +676,10 @@ int set_column_metadata(cudf::io::column_in_metadata &column_metadata,
int write_index = 0;
for (int i = 0; i < num_children; i++, write_index++) {
cudf::io::column_in_metadata child;
child.set_name(col_names[read_index])
.set_decimal_precision(precisions[read_index])
.set_nullability(nullability[read_index]);
child.set_name(col_names[read_index]).set_nullability(nullability[read_index]);
if (precisions[read_index] > -1) {
child.set_decimal_precision(precisions[read_index]);
}
if (!is_int96.is_null()) {
child.set_int96_timestamps(is_int96[read_index]);
}
Expand Down Expand Up @@ -717,8 +718,10 @@ void createTableMetaData(JNIEnv *env, jint num_children, jobjectArray &j_col_nam
for (int i = read_index, write_index = 0; i < top_level_children; i++, write_index++) {
metadata.column_metadata[write_index]
.set_name(cpp_names[read_index])
.set_nullability(col_nullability[read_index])
.set_decimal_precision(precisions[read_index]);
.set_nullability(col_nullability[read_index]);
if (precisions[read_index] > -1) {
metadata.column_metadata[write_index].set_decimal_precision(precisions[read_index]);
}
if (!is_int96.is_null()) {
metadata.column_metadata[write_index].set_int96_timestamps(is_int96[read_index]);
}
Expand Down
2 changes: 1 addition & 1 deletion python/cudf/cudf/core/_base_index.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
from __future__ import annotations

import pickle
from functools import cached_property
from typing import Any, Set

import pandas as pd
Expand Down Expand Up @@ -31,7 +32,6 @@
is_mixed_with_object_dtype,
numeric_normalize_types,
)
from cudf.utils.utils import cached_property


class BaseIndex(Serializable):
Expand Down
Loading

0 comments on commit 42515d2

Please sign in to comment.