Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

cudf resolve nvcc 11.0 compiler crashes during codegen #9028

Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions cpp/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -255,6 +255,7 @@ add_library(cudf
src/interop/dlpack.cpp
src/interop/from_arrow.cu
src/interop/to_arrow.cu
src/interop/detail/arrow_allocator.cpp
src/io/avro/avro.cpp
src/io/avro/avro_gpu.cu
src/io/avro/reader_impl.cu
Expand Down
51 changes: 51 additions & 0 deletions cpp/src/interop/detail/arrow_allocator.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
/*
* Copyright (c) 2021, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include <cudf/detail/interop.hpp>

namespace cudf {
namespace detail {

std::unique_ptr<arrow::Buffer> allocate_arrow_buffer(const int64_t size, arrow::MemoryPool* ar_mr)
{
/*
nvcc 11.0 generates Internal Compiler Error during codegen when arrow::AllocateBuffer
and `ValueOrDie` are used inside a CUDA compilation unit.

To work around this issue we compile an allocation shim in C++ and use
that from our cuda sources
*/
auto result = arrow::AllocateBuffer(size, ar_mr);
CUDF_EXPECTS(result.ok(), "Failed to allocate Arrow buffer");
return std::move(result).ValueOrDie();
}

std::shared_ptr<arrow::Buffer> allocate_arrow_bitmap(const int64_t size, arrow::MemoryPool* ar_mr)
{
/*
nvcc 11.0 generates Internal Compiler Error during codegen when arrow::AllocateBuffer
and `ValueOrDie` are used inside a CUDA compilation unit.

To work around this issue we compile an allocation shim in C++ and use
that from our cuda sources
*/
auto result = arrow::AllocateBitmap(size, ar_mr);
CUDF_EXPECTS(result.ok(), "Failed to allocate Arrow bitmap");
return std::move(result).ValueOrDie();
}

} // namespace detail
} // namespace cudf
31 changes: 31 additions & 0 deletions cpp/src/interop/detail/arrow_allocator.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
/*
* Copyright (c) 2021, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#pragma once

#include <cudf/detail/interop.hpp>

namespace cudf {
namespace detail {

// unique_ptr because that is what AllocateBuffer returns
std::unique_ptr<arrow::Buffer> allocate_arrow_buffer(const int64_t size, arrow::MemoryPool* ar_mr);

// shared_ptr because that is what AllocateBitmap returns
std::shared_ptr<arrow::Buffer> allocate_arrow_bitmap(const int64_t size, arrow::MemoryPool* ar_mr);

} // namespace detail
} // namespace cudf
43 changes: 14 additions & 29 deletions cpp/src/interop/to_arrow.cu
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,8 @@
#include <thrust/for_each.h>
#include <thrust/iterator/counting_iterator.h>

#include "detail/arrow_allocator.hpp"

namespace cudf {
namespace detail {
namespace {
Expand All @@ -48,18 +50,15 @@ std::shared_ptr<arrow::Buffer> fetch_data_buffer(column_view input_view,
{
const int64_t data_size_in_bytes = sizeof(T) * input_view.size();

auto result = arrow::AllocateBuffer(data_size_in_bytes, ar_mr);
CUDF_EXPECTS(result.ok(), "Failed to allocate Arrow buffer for data");

std::shared_ptr<arrow::Buffer> data_buffer = std::move(result.ValueOrDie());
auto data_buffer = allocate_arrow_buffer(data_size_in_bytes, ar_mr);

CUDA_TRY(cudaMemcpyAsync(data_buffer->mutable_data(),
input_view.data<T>(),
data_size_in_bytes,
cudaMemcpyDeviceToHost,
stream.value()));

return data_buffer;
return std::move(data_buffer);
}

/**
Expand All @@ -72,9 +71,7 @@ std::shared_ptr<arrow::Buffer> fetch_mask_buffer(column_view input_view,
const int64_t mask_size_in_bytes = cudf::bitmask_allocation_size_bytes(input_view.size());

if (input_view.has_nulls()) {
auto result = arrow::AllocateBitmap(static_cast<int64_t>(input_view.size()), ar_mr);
CUDF_EXPECTS(result.ok(), "Failed to allocate Arrow buffer for mask");
std::shared_ptr<arrow::Buffer> mask_buffer = std::move(result.ValueOrDie());
auto mask_buffer = allocate_arrow_bitmap(static_cast<int64_t>(input_view.size()), ar_mr);
CUDA_TRY(cudaMemcpyAsync(
mask_buffer->mutable_data(),
(input_view.offset() > 0) ? cudf::copy_bitmask(input_view).data() : input_view.null_mask(),
Expand Down Expand Up @@ -163,10 +160,7 @@ std::shared_ptr<arrow::Array> dispatch_to_arrow::operator()<numeric::decimal64>(
});

auto const buf_size_in_bytes = buf.size() * sizeof(DeviceType);
auto result = arrow::AllocateBuffer(buf_size_in_bytes, ar_mr);
CUDF_EXPECTS(result.ok(), "Failed to allocate Arrow buffer for data");

std::shared_ptr<arrow::Buffer> data_buffer = std::move(result.ValueOrDie());
auto data_buffer = allocate_arrow_buffer(buf_size_in_bytes, ar_mr);

CUDA_TRY(cudaMemcpyAsync(data_buffer->mutable_data(),
buf.data(),
Expand All @@ -176,7 +170,7 @@ std::shared_ptr<arrow::Array> dispatch_to_arrow::operator()<numeric::decimal64>(

auto type = arrow::decimal(18, -input.type().scale());
auto mask = fetch_mask_buffer(input, ar_mr, stream);
auto buffers = std::vector<std::shared_ptr<arrow::Buffer>>{mask, data_buffer};
auto buffers = std::vector<std::shared_ptr<arrow::Buffer>>{mask, std::move(data_buffer)};
auto data = std::make_shared<arrow::ArrayData>(type, input.size(), buffers);

return std::make_shared<arrow::Decimal128Array>(data);
Expand All @@ -191,10 +185,7 @@ std::shared_ptr<arrow::Array> dispatch_to_arrow::operator()<bool>(column_view in
{
auto bitmask = bools_to_mask(input, stream);

auto result = arrow::AllocateBuffer(static_cast<int64_t>(bitmask.first->size()), ar_mr);
CUDF_EXPECTS(result.ok(), "Failed to allocate Arrow buffer for data");

std::shared_ptr<arrow::Buffer> data_buffer = std::move(result.ValueOrDie());
auto data_buffer = allocate_arrow_buffer(static_cast<int64_t>(bitmask.first->size()), ar_mr);

CUDA_TRY(cudaMemcpyAsync(data_buffer->mutable_data(),
bitmask.first->data(),
Expand All @@ -203,7 +194,7 @@ std::shared_ptr<arrow::Array> dispatch_to_arrow::operator()<bool>(column_view in
stream.value()));
return to_arrow_array(id,
static_cast<int64_t>(input.size()),
data_buffer,
std::move(data_buffer),
fetch_mask_buffer(input, ar_mr, stream),
static_cast<int64_t>(input.null_count()));
}
Expand All @@ -225,19 +216,13 @@ std::shared_ptr<arrow::Array> dispatch_to_arrow::operator()<cudf::string_view>(
column_view input_view = (tmp_column != nullptr) ? tmp_column->view() : input;
auto child_arrays = fetch_child_array(input_view, {{}, {}}, ar_mr, stream);
if (child_arrays.empty()) {
arrow::Result<std::unique_ptr<arrow::Buffer>> result;

// Empty string will have only one value in offset of 4 bytes
result = arrow::AllocateBuffer(4, ar_mr);
CUDF_EXPECTS(result.ok(), "Failed to allocate buffer");
std::shared_ptr<arrow::Buffer> tmp_offset_buffer = std::move(result.ValueOrDie());
tmp_offset_buffer->mutable_data()[0] = 0;

result = arrow::AllocateBuffer(0, ar_mr);
CUDF_EXPECTS(result.ok(), "Failed to allocate buffer");
std::shared_ptr<arrow::Buffer> tmp_data_buffer = std::move(result.ValueOrDie());
auto tmp_offset_buffer = allocate_arrow_buffer(4, ar_mr);
auto tmp_data_buffer = allocate_arrow_buffer(0, ar_mr);
tmp_offset_buffer->mutable_data()[0] = 0;

return std::make_shared<arrow::StringArray>(0, tmp_offset_buffer, tmp_data_buffer);
return std::make_shared<arrow::StringArray>(
0, std::move(tmp_offset_buffer), std::move(tmp_data_buffer));
}
auto offset_buffer = child_arrays[0]->data()->buffers[1];
auto data_buffer = child_arrays[1]->data()->buffers[1];
Expand Down