Skip to content

Commit

Permalink
Merge branch 'branch-24.08' of github.com:rapidsai/cudf into pylibcud…
Browse files Browse the repository at this point in the history
…f-io-writers
  • Loading branch information
lithomas1 committed Jun 7, 2024
2 parents 15daaaa + 9bd16bb commit 72204f1
Show file tree
Hide file tree
Showing 50 changed files with 1,689 additions and 574 deletions.
2 changes: 1 addition & 1 deletion .devcontainer/cuda11.8-conda/devcontainer.json
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
"runArgs": [
"--rm",
"--name",
"${localEnv:USER}-rapids-${localWorkspaceFolderBasename}-24.08-cuda11.8-conda"
"${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-24.08-cuda11.8-conda"
],
"hostRequirements": {"gpu": "optional"},
"features": {
Expand Down
2 changes: 1 addition & 1 deletion .devcontainer/cuda11.8-pip/devcontainer.json
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
"runArgs": [
"--rm",
"--name",
"${localEnv:USER}-rapids-${localWorkspaceFolderBasename}-24.08-cuda11.8-pip"
"${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-24.08-cuda11.8-pip"
],
"hostRequirements": {"gpu": "optional"},
"features": {
Expand Down
2 changes: 1 addition & 1 deletion .devcontainer/cuda12.2-conda/devcontainer.json
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
"runArgs": [
"--rm",
"--name",
"${localEnv:USER}-rapids-${localWorkspaceFolderBasename}-24.08-cuda12.2-conda"
"${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-24.08-cuda12.2-conda"
],
"hostRequirements": {"gpu": "optional"},
"features": {
Expand Down
2 changes: 1 addition & 1 deletion .devcontainer/cuda12.2-pip/devcontainer.json
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
"runArgs": [
"--rm",
"--name",
"${localEnv:USER}-rapids-${localWorkspaceFolderBasename}-24.08-cuda12.2-pip"
"${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-24.08-cuda12.2-pip"
],
"hostRequirements": {"gpu": "optional"},
"features": {
Expand Down
2 changes: 1 addition & 1 deletion .github/CODEOWNERS
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ java/ @rapidsai/cudf-java-codeowners
/.pre-commit-config.yaml @rapidsai/ci-codeowners

#packaging code owners
/.devcontainers/ @rapidsai/packaging-codeowners
/.devcontainer/ @rapidsai/packaging-codeowners
/conda/ @rapidsai/packaging-codeowners
/dependencies.yaml @rapidsai/packaging-codeowners
/build.sh @rapidsai/packaging-codeowners
Expand Down
2 changes: 1 addition & 1 deletion .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -134,7 +134,7 @@ repos:
- id: rapids-dependency-file-generator
args: ["--clean"]
- repo: https://github.com/astral-sh/ruff-pre-commit
rev: v0.4.3
rev: v0.4.8
hooks:
- id: ruff
files: python/.*$
Expand Down
6 changes: 0 additions & 6 deletions ci/build_docs.sh
Original file line number Diff line number Diff line change
Expand Up @@ -46,19 +46,13 @@ pushd docs/cudf
make dirhtml
mkdir -p "${RAPIDS_DOCS_DIR}/cudf/html"
mv build/dirhtml/* "${RAPIDS_DOCS_DIR}/cudf/html"
make text
mkdir -p "${RAPIDS_DOCS_DIR}/cudf/txt"
mv build/text/* "${RAPIDS_DOCS_DIR}/cudf/txt"
popd

rapids-logger "Build dask-cuDF Sphinx docs"
pushd docs/dask_cudf
make dirhtml
mkdir -p "${RAPIDS_DOCS_DIR}/dask-cudf/html"
mv build/dirhtml/* "${RAPIDS_DOCS_DIR}/dask-cudf/html"
make text
mkdir -p "${RAPIDS_DOCS_DIR}/dask-cudf/txt"
mv build/text/* "${RAPIDS_DOCS_DIR}/dask-cudf/txt"
popd

rapids-upload-docs
6 changes: 3 additions & 3 deletions cpp/include/cudf/detail/offsets_iterator.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@ struct input_offsetalator : base_normalator<input_offsetalator, int64_t> {
*/
__device__ inline int64_t operator[](size_type idx) const
{
void const* tp = p_ + (idx * this->width_);
void const* tp = p_ + (static_cast<int64_t>(idx) * this->width_);
return this->width_ == sizeof(int32_t) ? static_cast<int64_t>(*static_cast<int32_t const*>(tp))
: *static_cast<int64_t const*>(tp);
}
Expand All @@ -79,7 +79,7 @@ struct input_offsetalator : base_normalator<input_offsetalator, int64_t> {
cudf_assert((dtype.id() == type_id::INT32 || dtype.id() == type_id::INT64) &&
"Unexpected offsets type");
#endif
p_ += (this->width_ * offset);
p_ += (this->width_ * static_cast<int64_t>(offset));
}

protected:
Expand Down Expand Up @@ -121,7 +121,7 @@ struct output_offsetalator : base_normalator<output_offsetalator, int64_t> {
__device__ inline output_offsetalator const operator[](size_type idx) const
{
output_offsetalator tmp{*this};
tmp.p_ += (idx * this->width_);
tmp.p_ += (static_cast<int64_t>(idx) * this->width_);
return tmp;
}

Expand Down
56 changes: 32 additions & 24 deletions cpp/src/io/utilities/data_casting.cu
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
#include <cudf/column/column_factories.hpp>
#include <cudf/detail/null_mask.hpp>
#include <cudf/detail/nvtx/ranges.hpp>
#include <cudf/detail/offsets_iterator_factory.cuh>
#include <cudf/detail/utilities/cuda.cuh>
#include <cudf/detail/utilities/integer_utils.hpp>
#include <cudf/strings/detail/strings_children.cuh>
Expand Down Expand Up @@ -417,6 +418,7 @@ struct bitfield_block {
* @param null_mask Null mask
* @param null_count_data pointer to store null count
* @param options Settings for controlling string processing behavior
* @param d_sizes Output size of each row
* @param d_offsets Offsets to identify where to store the results for each string
* @param d_chars Character array to store the characters of strings
*/
Expand All @@ -427,7 +429,8 @@ CUDF_KERNEL void parse_fn_string_parallel(str_tuple_it str_tuples,
bitmask_type* null_mask,
size_type* null_count_data,
cudf::io::parse_options_view const options,
size_type* d_offsets,
size_type* d_sizes,
cudf::detail::input_offsetalator d_offsets,
char* d_chars)
{
constexpr auto BLOCK_SIZE =
Expand Down Expand Up @@ -455,7 +458,7 @@ CUDF_KERNEL void parse_fn_string_parallel(str_tuple_it str_tuples,
istring = get_next_string()) {
// skip nulls
if (null_mask != nullptr && not bit_is_set(null_mask, istring)) {
if (!d_chars && lane == 0) d_offsets[istring] = 0;
if (!d_chars && lane == 0) { d_sizes[istring] = 0; }
continue; // grid-stride return;
}

Expand All @@ -476,7 +479,7 @@ CUDF_KERNEL void parse_fn_string_parallel(str_tuple_it str_tuples,
if (lane == 0) {
clear_bit(null_mask, istring);
atomicAdd(null_count_data, 1);
if (!d_chars) d_offsets[istring] = 0;
if (!d_chars) { d_sizes[istring] = 0; }
}
continue; // grid-stride return;
}
Expand All @@ -491,7 +494,7 @@ CUDF_KERNEL void parse_fn_string_parallel(str_tuple_it str_tuples,
// Copy literal/numeric value
if (not is_string_value) {
if (!d_chars) {
if (lane == 0) { d_offsets[istring] = in_end - in_begin; }
if (lane == 0) { d_sizes[istring] = in_end - in_begin; }
} else {
for (thread_index_type char_index = lane; char_index < (in_end - in_begin);
char_index += BLOCK_SIZE) {
Expand Down Expand Up @@ -621,8 +624,8 @@ CUDF_KERNEL void parse_fn_string_parallel(str_tuple_it str_tuples,
clear_bit(null_mask, istring);
atomicAdd(null_count_data, 1);
}
last_offset = 0;
d_offsets[istring] = 0;
last_offset = 0;
d_sizes[istring] = 0;
}
if constexpr (!is_warp) { __syncthreads(); }
break; // grid-stride return;
Expand Down Expand Up @@ -729,7 +732,7 @@ CUDF_KERNEL void parse_fn_string_parallel(str_tuple_it str_tuples,
}
}
} // char for-loop
if (!d_chars && lane == 0) { d_offsets[istring] = last_offset; }
if (!d_chars && lane == 0) { d_sizes[istring] = last_offset; }
} // grid-stride for-loop
}

Expand All @@ -739,13 +742,14 @@ struct string_parse {
bitmask_type* null_mask;
size_type* null_count_data;
cudf::io::parse_options_view const options;
size_type* d_offsets{};
size_type* d_sizes{};
cudf::detail::input_offsetalator d_offsets;
char* d_chars{};

__device__ void operator()(size_type idx)
{
if (null_mask != nullptr && not bit_is_set(null_mask, idx)) {
if (!d_chars) d_offsets[idx] = 0;
if (!d_chars) { d_sizes[idx] = 0; }
return;
}
auto const in_begin = str_tuples[idx].first;
Expand All @@ -761,7 +765,7 @@ struct string_parse {
if (is_null_literal && null_mask != nullptr) {
clear_bit(null_mask, idx);
atomicAdd(null_count_data, 1);
if (!d_chars) d_offsets[idx] = 0;
if (!d_chars) { d_sizes[idx] = 0; }
return;
}
}
Expand All @@ -773,9 +777,9 @@ struct string_parse {
clear_bit(null_mask, idx);
atomicAdd(null_count_data, 1);
}
if (!d_chars) d_offsets[idx] = 0;
if (!d_chars) { d_sizes[idx] = 0; }
} else {
if (!d_chars) d_offsets[idx] = str_process_info.bytes;
if (!d_chars) { d_sizes[idx] = str_process_info.bytes; }
}
}
};
Expand Down Expand Up @@ -811,13 +815,12 @@ static std::unique_ptr<column> parse_string(string_view_pair_it str_tuples,
size_type{0},
thrust::maximum<size_type>{});

auto offsets = cudf::make_numeric_column(
data_type{type_to_id<size_type>()}, col_size + 1, cudf::mask_state::UNALLOCATED, stream, mr);
auto d_offsets = offsets->mutable_view().data<size_type>();
auto sizes = rmm::device_uvector<size_type>(col_size, stream);
auto d_sizes = sizes.data();
auto null_count_data = d_null_count.data();

auto single_thread_fn = string_parse<decltype(str_tuples)>{
str_tuples, static_cast<bitmask_type*>(null_mask.data()), null_count_data, options, d_offsets};
str_tuples, static_cast<bitmask_type*>(null_mask.data()), null_count_data, options, d_sizes};
thrust::for_each_n(rmm::exec_policy(stream),
thrust::make_counting_iterator<size_type>(0),
col_size,
Expand All @@ -838,7 +841,8 @@ static std::unique_ptr<column> parse_string(string_view_pair_it str_tuples,
static_cast<bitmask_type*>(null_mask.data()),
null_count_data,
options,
d_offsets,
d_sizes,
cudf::detail::input_offsetalator{},
nullptr);
}

Expand All @@ -853,20 +857,22 @@ static std::unique_ptr<column> parse_string(string_view_pair_it str_tuples,
static_cast<bitmask_type*>(null_mask.data()),
null_count_data,
options,
d_offsets,
d_sizes,
cudf::detail::input_offsetalator{},
nullptr);
}
auto const bytes =
cudf::detail::sizes_to_offsets(d_offsets, d_offsets + col_size + 1, d_offsets, stream);
CUDF_EXPECTS(bytes <= std::numeric_limits<size_type>::max(),
"Size of output exceeds the column size limit",
std::overflow_error);

auto [offsets, bytes] =
cudf::strings::detail::make_offsets_child_column(sizes.begin(), sizes.end(), stream, mr);
auto d_offsets = cudf::detail::offsetalator_factory::make_input_iterator(offsets->view());

// CHARS column
rmm::device_uvector<char> chars(bytes, stream, mr);
auto d_chars = chars.data();

single_thread_fn.d_chars = d_chars;
single_thread_fn.d_chars = d_chars;
single_thread_fn.d_offsets = d_offsets;

thrust::for_each_n(rmm::exec_policy(stream),
thrust::make_counting_iterator<size_type>(0),
col_size,
Expand All @@ -882,6 +888,7 @@ static std::unique_ptr<column> parse_string(string_view_pair_it str_tuples,
static_cast<bitmask_type*>(null_mask.data()),
null_count_data,
options,
d_sizes,
d_offsets,
d_chars);
}
Expand All @@ -897,6 +904,7 @@ static std::unique_ptr<column> parse_string(string_view_pair_it str_tuples,
static_cast<bitmask_type*>(null_mask.data()),
null_count_data,
options,
d_sizes,
d_offsets,
d_chars);
}
Expand Down
4 changes: 3 additions & 1 deletion cpp/src/quantiles/quantiles.cu
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@
#include <thrust/iterator/transform_iterator.h>

#include <memory>
#include <stdexcept>
#include <vector>

namespace cudf {
Expand Down Expand Up @@ -78,7 +79,8 @@ std::unique_ptr<table> quantiles(table_view const& input,

CUDF_EXPECTS(interp == interpolation::HIGHER || interp == interpolation::LOWER ||
interp == interpolation::NEAREST,
"multi-column quantiles require a non-arithmetic interpolation strategy.");
"multi-column quantiles require a non-arithmetic interpolation strategy.",
std::invalid_argument);

CUDF_EXPECTS(input.num_rows() > 0, "multi-column quantiles require at least one input row.");

Expand Down
1 change: 1 addition & 0 deletions cpp/tests/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -570,6 +570,7 @@ ConfigureTest(
large_strings/concatenate_tests.cpp
large_strings/case_tests.cpp
large_strings/large_strings_fixture.cpp
large_strings/many_strings_tests.cpp
large_strings/merge_tests.cpp
large_strings/parquet_tests.cpp
large_strings/reshape_tests.cpp
Expand Down
1 change: 0 additions & 1 deletion cpp/tests/io/json_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2374,7 +2374,6 @@ TEST_F(JsonReaderTest, MapTypes)
EXPECT_EQ(col.type().id(), types[i]) << "column[" << i << "].type";
i++;
}
std::cout << "\n";
};

// json
Expand Down
11 changes: 11 additions & 0 deletions cpp/tests/large_strings/large_strings_fixture.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -95,6 +95,17 @@ cudf::column_view StringsLargeTest::long_column()
return g_ls_data->get_column(name);
}

/**
 * @brief Returns the shared "very long" strings column, creating it on first use.
 *
 * The column is stored in `g_ls_data` under the name "long2". If no entry with
 * that name exists yet, a strings column of 30,000,000 rows, each holding the
 * 5-byte string "12345", is built and added before the lookup.
 *
 * @return View of the column registered under "long2"
 */
cudf::column_view StringsLargeTest::very_long_column()
{
  std::string key("long2");

  // Already registered: hand back the existing column without rebuilding it.
  if (g_ls_data->has_key(key)) { return g_ls_data->get_column(key); }

  // Every row is the same literal, so a constant iterator supplies all values.
  constexpr auto num_rows = 30'000'000;
  auto first              = thrust::constant_iterator<std::string_view>("12345");
  auto wrapper            = cudf::test::strings_column_wrapper(first, first + num_rows);
  g_ls_data->add_column(key, wrapper.release());

  return g_ls_data->get_column(key);
}

std::unique_ptr<LargeStringsData> StringsLargeTest::get_ls_data()
{
CUDF_EXPECTS(g_ls_data == nullptr, "invalid call to get_ls_data");
Expand Down
11 changes: 11 additions & 0 deletions cpp/tests/large_strings/large_strings_fixture.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -33,14 +33,25 @@ class LargeStringsData;
struct StringsLargeTest : public cudf::test::BaseFixture {
/**
* @brief Returns a column of long strings
*
* This returns 8 rows of 400 bytes
*/
cudf::column_view wide_column();

/**
* @brief Returns a long column of strings
*
* This returns 5 million rows of 50 bytes
*/
cudf::column_view long_column();

/**
* @brief Returns a very long column of strings
*
* This returns 30 million rows of 5 bytes
*/
cudf::column_view very_long_column();

large_strings_enabler g_ls_enabler;
static LargeStringsData* g_ls_data;

Expand Down
Loading

0 comments on commit 72204f1

Please sign in to comment.