diff --git a/cpp/benchmarks/column/concatenate.cpp b/cpp/benchmarks/column/concatenate.cpp
index 67ea6129a74..21e5db8ca8f 100644
--- a/cpp/benchmarks/column/concatenate.cpp
+++ b/cpp/benchmarks/column/concatenate.cpp
@@ -45,7 +45,7 @@ static void BM_concatenate(benchmark::State& state)
   auto input_columns = input->view();
   std::vector<cudf::column_view> column_views(input_columns.begin(), input_columns.end());
 
-  CUDF_CHECK_CUDA(0);
+  CHECK_CUDA(0);
 
   for (auto _ : state) {
     cuda_event_timer raii(state, true, rmm::cuda_stream_default);
@@ -87,7 +87,7 @@ static void BM_concatenate_tables(benchmark::State& state)
     return table->view();
   });
 
-  CUDF_CHECK_CUDA(0);
+  CHECK_CUDA(0);
 
   for (auto _ : state) {
     cuda_event_timer raii(state, true, rmm::cuda_stream_default);
@@ -146,7 +146,7 @@ static void BM_concatenate_strings(benchmark::State& state)
     return static_cast<cudf::column_view>(col);
   });
 
-  CUDF_CHECK_CUDA(0);
+  CHECK_CUDA(0);
 
   for (auto _ : state) {
     cuda_event_timer raii(state, true, rmm::cuda_stream_default);
diff --git a/cpp/benchmarks/join/generate_input_tables.cuh b/cpp/benchmarks/join/generate_input_tables.cuh
index 5df77ac4319..e846317f472 100644
--- a/cpp/benchmarks/join/generate_input_tables.cuh
+++ b/cpp/benchmarks/join/generate_input_tables.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2021, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -139,18 +139,18 @@ void generate_input_tables(key_type* const build_tbl,
   // Maximize exposed parallelism while minimizing storage for curand state
   int num_blocks_init_build_tbl{-1};
 
-  CUDF_CUDA_TRY(cudaOccupancyMaxActiveBlocksPerMultiprocessor(
+  CUDA_TRY(cudaOccupancyMaxActiveBlocksPerMultiprocessor(
     &num_blocks_init_build_tbl, init_build_tbl<key_type, size_type>, block_size, 0));
 
   int num_blocks_init_probe_tbl{-1};
 
-  CUDF_CUDA_TRY(cudaOccupancyMaxActiveBlocksPerMultiprocessor(
+  CUDA_TRY(cudaOccupancyMaxActiveBlocksPerMultiprocessor(
     &num_blocks_init_probe_tbl, init_probe_tbl<key_type, size_type>, block_size, 0));
 
   int dev_id{-1};
-  CUDF_CUDA_TRY(cudaGetDevice(&dev_id));
+  CUDA_TRY(cudaGetDevice(&dev_id));
 
   int num_sms{-1};
-  CUDF_CUDA_TRY(cudaDeviceGetAttribute(&num_sms, cudaDevAttrMultiProcessorCount, dev_id));
+  CUDA_TRY(cudaDeviceGetAttribute(&num_sms, cudaDevAttrMultiProcessorCount, dev_id));
 
   const int num_states =
     num_sms * std::max(num_blocks_init_build_tbl, num_blocks_init_probe_tbl) * block_size;
@@ -158,12 +158,12 @@ void generate_input_tables(key_type* const build_tbl,
 
   init_curand<<<(num_states - 1) / block_size + 1, block_size>>>(devStates.data(), num_states);
 
-  CUDF_CHECK_CUDA(0);
+  CHECK_CUDA(0);
 
   init_build_tbl<key_type, size_type><<<num_sms * num_blocks_init_build_tbl, block_size>>>(
     build_tbl, build_tbl_size, multiplicity, devStates.data(), num_states);
 
-  CUDF_CHECK_CUDA(0);
+  CHECK_CUDA(0);
 
   auto const rand_max = std::numeric_limits<key_type>::max();
@@ -177,5 +177,5 @@ void generate_input_tables(key_type* const build_tbl,
     devStates.data(),
     num_states);
 
-  CUDF_CHECK_CUDA(0);
+  CHECK_CUDA(0);
 }
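Note for reviewers: `generate_input_tables` sizes its grids from the occupancy API rather than from the input size. A minimal standalone sketch of that pattern follows; `fill_kernel` and `saturating_grid_size` are hypothetical names, not part of this diff.

```cpp
#include <algorithm>
#include <cuda_runtime.h>

__global__ void fill_kernel(int* out, int n)
{
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) out[i] = i;
}

// Pick a grid size that saturates the device for fill_kernel: resident
// blocks per SM (at this block size) times the number of SMs.
int saturating_grid_size(int block_size)
{
  int blocks_per_sm = 0;
  cudaOccupancyMaxActiveBlocksPerMultiprocessor(&blocks_per_sm, fill_kernel, block_size, 0);

  int dev = 0, num_sms = 0;
  cudaGetDevice(&dev);
  cudaDeviceGetAttribute(&num_sms, cudaDevAttrMultiProcessorCount, dev);

  return std::max(1, blocks_per_sm * num_sms);
}
```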
diff --git a/cpp/benchmarks/join/join_common.hpp b/cpp/benchmarks/join/join_common.hpp
index 6ff2543cf7d..27339248968 100644
--- a/cpp/benchmarks/join/join_common.hpp
+++ b/cpp/benchmarks/join/join_common.hpp
@@ -116,7 +116,7 @@ static void BM_join(state_type& state, Join JoinFunc)
   auto build_payload_column = cudf::sequence(build_table_size, *init);
   auto probe_payload_column = cudf::sequence(probe_table_size, *init);
 
-  CUDF_CHECK_CUDA(0);
+  CHECK_CUDA(0);
 
   cudf::table_view build_table({build_key_column->view(), *build_payload_column});
   cudf::table_view probe_table({probe_key_column->view(), *probe_payload_column});
diff --git a/cpp/benchmarks/synchronization/synchronization.cpp b/cpp/benchmarks/synchronization/synchronization.cpp
index bbf90e6f68a..bd8a4d1de76 100644
--- a/cpp/benchmarks/synchronization/synchronization.cpp
+++ b/cpp/benchmarks/synchronization/synchronization.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2020, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -29,32 +29,32 @@ cuda_event_timer::cuda_event_timer(benchmark::State& state,
   // flush all of L2$
   if (flush_l2_cache) {
     int current_device = 0;
-    CUDF_CUDA_TRY(cudaGetDevice(&current_device));
+    CUDA_TRY(cudaGetDevice(&current_device));
 
     int l2_cache_bytes = 0;
-    CUDF_CUDA_TRY(cudaDeviceGetAttribute(&l2_cache_bytes, cudaDevAttrL2CacheSize, current_device));
+    CUDA_TRY(cudaDeviceGetAttribute(&l2_cache_bytes, cudaDevAttrL2CacheSize, current_device));
 
     if (l2_cache_bytes > 0) {
       const int memset_value = 0;
       rmm::device_buffer l2_cache_buffer(l2_cache_bytes, stream);
-      CUDF_CUDA_TRY(
+      CUDA_TRY(
         cudaMemsetAsync(l2_cache_buffer.data(), memset_value, l2_cache_bytes, stream.value()));
     }
   }
 
-  CUDF_CUDA_TRY(cudaEventCreate(&start));
-  CUDF_CUDA_TRY(cudaEventCreate(&stop));
-  CUDF_CUDA_TRY(cudaEventRecord(start, stream.value()));
+  CUDA_TRY(cudaEventCreate(&start));
+  CUDA_TRY(cudaEventCreate(&stop));
+  CUDA_TRY(cudaEventRecord(start, stream.value()));
 }
 
 cuda_event_timer::~cuda_event_timer()
 {
-  CUDF_CUDA_TRY(cudaEventRecord(stop, stream.value()));
-  CUDF_CUDA_TRY(cudaEventSynchronize(stop));
+  CUDA_TRY(cudaEventRecord(stop, stream.value()));
+  CUDA_TRY(cudaEventSynchronize(stop));
 
   float milliseconds = 0.0f;
-  CUDF_CUDA_TRY(cudaEventElapsedTime(&milliseconds, start, stop));
+  CUDA_TRY(cudaEventElapsedTime(&milliseconds, start, stop));
   p_state->SetIterationTime(milliseconds / (1000.0f));
-  CUDF_CUDA_TRY(cudaEventDestroy(start));
-  CUDF_CUDA_TRY(cudaEventDestroy(stop));
+  CUDA_TRY(cudaEventDestroy(start));
+  CUDA_TRY(cudaEventDestroy(stop));
 }
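For context, `cuda_event_timer` above is the standard CUDA event-timing idiom. A minimal sketch of the same pattern outside the RAII wrapper (the function name is hypothetical):

```cpp
#include <cuda_runtime.h>

// Record start, enqueue work, record stop, then synchronize only on the
// stop event; cudaEventElapsedTime reports milliseconds between the events.
float time_stream_work(cudaStream_t stream)
{
  cudaEvent_t start, stop;
  cudaEventCreate(&start);
  cudaEventCreate(&stop);

  cudaEventRecord(start, stream);
  // ... enqueue kernels / async copies on `stream` here ...
  cudaEventRecord(stop, stream);
  cudaEventSynchronize(stop);  // blocks until all work before `stop` finishes

  float ms = 0.0f;
  cudaEventElapsedTime(&ms, start, stop);

  cudaEventDestroy(start);
  cudaEventDestroy(stop);
  return ms;
}
```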
diff --git a/cpp/benchmarks/type_dispatcher/type_dispatcher.cu b/cpp/benchmarks/type_dispatcher/type_dispatcher.cu
index aba78dad3fe..6ab6f9a2095 100644
--- a/cpp/benchmarks/type_dispatcher/type_dispatcher.cu
+++ b/cpp/benchmarks/type_dispatcher/type_dispatcher.cu
@@ -196,13 +196,13 @@ void type_dispatcher_benchmark(::benchmark::State& state)
   rmm::device_uvector<TypeParam*> d_vec(n_cols, rmm::cuda_stream_default);
 
   if (dispatching_type == NO_DISPATCHING) {
-    CUDF_CUDA_TRY(cudaMemcpy(
+    CUDA_TRY(cudaMemcpy(
       d_vec.data(), h_vec_p.data(), sizeof(TypeParam*) * n_cols, cudaMemcpyHostToDevice));
   }
 
   // Warm up
   launch_kernel(source_table, d_vec.data(), work_per_thread);
-  CUDF_CUDA_TRY(cudaDeviceSynchronize());
+  CUDA_TRY(cudaDeviceSynchronize());
 
   for (auto _ : state) {
     cuda_event_timer raii(state, true);  // flush_l2_cache = true, stream = 0
diff --git a/cpp/docs/DEVELOPER_GUIDE.md b/cpp/docs/DEVELOPER_GUIDE.md
index 1599c81cbe5..eeebe38d873 100644
--- a/cpp/docs/DEVELOPER_GUIDE.md
+++ b/cpp/docs/DEVELOPER_GUIDE.md
@@ -373,7 +373,7 @@ namespace detail{
 void external_function(..., rmm::cuda_stream_view stream){
   // Implementation uses the stream with async APIs.
   rmm::device_buffer buff(..., stream);
-  CUDF_CUDA_TRY(cudaMemcpyAsync(..., stream.value()));
+  CUDA_TRY(cudaMemcpyAsync(..., stream.value()));
   kernel<<<..., stream>>>(...);
   thrust::algorithm(rmm::exec_policy(stream), ...);
 }
@@ -777,7 +777,7 @@ CUDF_FAIL("This code path should not be reached.");
 
 ### CUDA Error Checking
 
-Use the `CUDF_CUDA_TRY` macro to check for the successful completion of CUDA runtime API functions. This
+Use the `CUDA_TRY` macro to check for the successful completion of CUDA runtime API functions. This
 macro throws a `cudf::cuda_error` exception if the CUDA API return value is not `cudaSuccess`. The
 thrown exception includes a description of the CUDA error code in its `what()` message.
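A minimal sketch of the usage the developer guide describes, under the post-revert macro name (`copy_to_device` is a hypothetical helper):

```cpp
#include <cudf/utilities/error.hpp>
#include <rmm/cuda_stream_view.hpp>

// CUDA_TRY wraps a runtime API call and throws cudf::cuda_error on failure,
// so callers rely on ordinary C++ exception handling instead of checking
// cudaError_t return codes by hand.
void copy_to_device(void* dst, void const* src, std::size_t bytes, rmm::cuda_stream_view stream)
{
  CUDA_TRY(cudaMemcpyAsync(dst, src, bytes, cudaMemcpyHostToDevice, stream.value()));
}
```

A caller can then `catch (cudf::cuda_error const& e)` and inspect `e.what()` for the decoded CUDA error string.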
diff --git a/cpp/include/cudf/detail/copy_if.cuh b/cpp/include/cudf/detail/copy_if.cuh
index ecaa4a30cf0..0087dd1b173 100644
--- a/cpp/include/cudf/detail/copy_if.cuh
+++ b/cpp/include/cudf/detail/copy_if.cuh
@@ -254,10 +254,10 @@ struct scatter_gather_functor {
     if (output.nullable()) {
       // Have to initialize the output mask to all zeros because we may update
       // it with atomicOr().
-      CUDF_CUDA_TRY(cudaMemsetAsync(static_cast<void*>(output.null_mask()),
-                                    0,
-                                    cudf::bitmask_allocation_size_bytes(output.size()),
-                                    stream.value()));
+      CUDA_TRY(cudaMemsetAsync(static_cast<void*>(output.null_mask()),
+                               0,
+                               cudf::bitmask_allocation_size_bytes(output.size()),
+                               stream.value()));
     }
 
     auto output_device_view = cudf::mutable_column_device_view::create(output, stream);
@@ -344,7 +344,7 @@ std::unique_ptr<table> copy_if(
 
   // initialize just the first element of block_offsets to 0 since the InclusiveSum below
   // starts at the second element.
-  CUDF_CUDA_TRY(cudaMemsetAsync(block_offsets.begin(), 0, sizeof(cudf::size_type), stream.value()));
+  CUDA_TRY(cudaMemsetAsync(block_offsets.begin(), 0, sizeof(cudf::size_type), stream.value()));
 
   // 2. Find the offset for each block's output using a scan of block counts
   if (grid.num_blocks > 1) {
@@ -370,7 +370,7 @@ std::unique_ptr<table> copy_if(
   // As it is InclusiveSum, last value in block_offsets will be output_size
   // unless num_blocks == 1, in which case output_size is just block_counts[0]
   cudf::size_type output_size{0};
-  CUDF_CUDA_TRY(cudaMemcpyAsync(
+  CUDA_TRY(cudaMemcpyAsync(
     &output_size,
     grid.num_blocks > 1 ? block_offsets.begin() + grid.num_blocks : block_counts.begin(),
     sizeof(cudf::size_type),
diff --git a/cpp/include/cudf/detail/copy_range.cuh b/cpp/include/cudf/detail/copy_range.cuh
index 6703db305a1..ac59b429a2c 100644
--- a/cpp/include/cudf/detail/copy_range.cuh
+++ b/cpp/include/cudf/detail/copy_range.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2020, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -185,7 +185,7 @@ void copy_range(SourceValueIterator source_value_begin,
       nullptr);
   }
 
-  CUDF_CHECK_CUDA(stream.value());
+  CHECK_CUDA(stream.value());
 }
 
 /**
diff --git a/cpp/include/cudf/detail/get_value.cuh b/cpp/include/cudf/detail/get_value.cuh
index 49a406ab5f0..56c0289dc0a 100644
--- a/cpp/include/cudf/detail/get_value.cuh
+++ b/cpp/include/cudf/detail/get_value.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2020, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -49,11 +49,11 @@ T get_value(column_view const& col_view, size_type element_index, rmm::cuda_stre
   CUDF_EXPECTS(element_index >= 0 && element_index < col_view.size(),
                "invalid element_index value");
   T result;
-  CUDF_CUDA_TRY(cudaMemcpyAsync(&result,
-                                col_view.data<T>() + element_index,
-                                sizeof(T),
-                                cudaMemcpyDeviceToHost,
-                                stream.value()));
+  CUDA_TRY(cudaMemcpyAsync(&result,
+                           col_view.data<T>() + element_index,
+                           sizeof(T),
+                           cudaMemcpyDeviceToHost,
+                           stream.value()));
   stream.synchronize();
   return result;
 }
diff --git a/cpp/include/cudf/detail/null_mask.cuh b/cpp/include/cudf/detail/null_mask.cuh
index 7aec56fdc51..be010689847 100644
--- a/cpp/include/cudf/detail/null_mask.cuh
+++ b/cpp/include/cudf/detail/null_mask.cuh
@@ -170,20 +170,20 @@ size_type inplace_bitmask_binop(
   rmm::device_uvector<bitmask_type const*> d_masks(masks.size(), stream, mr);
   rmm::device_uvector<size_type> d_begin_bits(masks_begin_bits.size(), stream, mr);
 
-  CUDF_CUDA_TRY(cudaMemcpyAsync(
+  CUDA_TRY(cudaMemcpyAsync(
     d_masks.data(), masks.data(), masks.size_bytes(), cudaMemcpyHostToDevice, stream.value()));
-  CUDF_CUDA_TRY(cudaMemcpyAsync(d_begin_bits.data(),
-                                masks_begin_bits.data(),
-                                masks_begin_bits.size_bytes(),
-                                cudaMemcpyHostToDevice,
-                                stream.value()));
+  CUDA_TRY(cudaMemcpyAsync(d_begin_bits.data(),
+                           masks_begin_bits.data(),
+                           masks_begin_bits.size_bytes(),
+                           cudaMemcpyHostToDevice,
+                           stream.value()));
 
   auto constexpr block_size = 256;
   cudf::detail::grid_1d config(dest_mask.size(), block_size);
   offset_bitmask_binop<block_size>
     <<<config.num_blocks, config.num_threads_per_block, 0, stream.value()>>>(
       op, dest_mask, d_masks, d_begin_bits, mask_size_bits, d_counter.data());
-  CUDF_CHECK_CUDA(stream.value());
+  CHECK_CUDA(stream.value());
 
   return d_counter.value(stream);
 }
@@ -298,25 +298,27 @@ rmm::device_uvector<size_type> segmented_count_bits(bitmask_type const* bitmask,
   // Allocate temporary memory.
   size_t temp_storage_bytes{0};
-  CUDF_CUDA_TRY(cub::DeviceSegmentedReduce::Sum(nullptr,
-                                                temp_storage_bytes,
-                                                num_set_bits_in_word,
-                                                d_bit_counts.begin(),
-                                                num_ranges,
-                                                first_word_indices,
-                                                last_word_indices,
-                                                stream.value()));
+  CUDA_TRY(cub::DeviceSegmentedReduce::Sum(nullptr,
+                                           temp_storage_bytes,
+                                           num_set_bits_in_word,
+                                           d_bit_counts.begin(),
+                                           num_ranges,
+                                           first_word_indices,
+                                           last_word_indices,
+                                           stream.value()));
   rmm::device_buffer d_temp_storage(temp_storage_bytes, stream);
 
   // Perform segmented reduction.
-  CUDF_CUDA_TRY(cub::DeviceSegmentedReduce::Sum(d_temp_storage.data(),
-                                                temp_storage_bytes,
-                                                num_set_bits_in_word,
-                                                d_bit_counts.begin(),
-                                                num_ranges,
-                                                first_word_indices,
-                                                last_word_indices,
-                                                stream.value()));
+  CUDA_TRY(cub::DeviceSegmentedReduce::Sum(d_temp_storage.data(),
+                                           temp_storage_bytes,
+                                           num_set_bits_in_word,
+                                           d_bit_counts.begin(),
+                                           num_ranges,
+                                           first_word_indices,
+                                           last_word_indices,
+                                           stream.value()));
+
+  CHECK_CUDA(stream.value());
 
   // Adjust counts in segment boundaries (if segments are not word-aligned).
   constexpr size_type block_size{256};
@@ -348,7 +350,7 @@ rmm::device_uvector<size_type> segmented_count_bits(bitmask_type const* bitmask,
     });
   }
 
-  CUDF_CHECK_CUDA(stream.value());
+  CHECK_CUDA(stream.value());
 
   return d_bit_counts;
 }
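The two back-to-back `cub::DeviceSegmentedReduce::Sum` calls above are CUB's standard two-phase idiom. A minimal sketch of the same pattern with a plain device-wide sum, under the assumption that `cub::DeviceReduce::Sum` and RMM are available (`sum_on_device` is a hypothetical helper):

```cpp
#include <cub/device/device_reduce.cuh>
#include <rmm/cuda_stream_view.hpp>
#include <rmm/device_buffer.hpp>
#include <rmm/device_uvector.hpp>

// First call (null temp storage) only computes the scratch size;
// the second call performs the reduction using that scratch buffer.
int sum_on_device(rmm::device_uvector<int> const& d_in, rmm::cuda_stream_view stream)
{
  rmm::device_uvector<int> d_out(1, stream);

  size_t temp_storage_bytes = 0;
  cub::DeviceReduce::Sum(nullptr,
                         temp_storage_bytes,
                         d_in.begin(),
                         d_out.begin(),
                         static_cast<int>(d_in.size()),
                         stream.value());

  rmm::device_buffer d_temp_storage(temp_storage_bytes, stream);
  cub::DeviceReduce::Sum(d_temp_storage.data(),
                         temp_storage_bytes,
                         d_in.begin(),
                         d_out.begin(),
                         static_cast<int>(d_in.size()),
                         stream.value());

  return d_out.front_element(stream);  // synchronizing device-to-host read
}
```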
diff --git a/cpp/include/cudf/detail/utilities/cuda.cuh b/cpp/include/cudf/detail/utilities/cuda.cuh
index 23d0ff26e0f..a00bd64caa3 100644
--- a/cpp/include/cudf/detail/utilities/cuda.cuh
+++ b/cpp/include/cudf/detail/utilities/cuda.cuh
@@ -127,12 +127,12 @@ cudf::size_type elements_per_thread(Kernel kernel,
 
   // calculate theoretical occupancy
   int max_blocks = 0;
-  CUDF_CUDA_TRY(cudaOccupancyMaxActiveBlocksPerMultiprocessor(&max_blocks, kernel, block_size, 0));
+  CUDA_TRY(cudaOccupancyMaxActiveBlocksPerMultiprocessor(&max_blocks, kernel, block_size, 0));
 
   int device = 0;
-  CUDF_CUDA_TRY(cudaGetDevice(&device));
+  CUDA_TRY(cudaGetDevice(&device));
   int num_sms = 0;
-  CUDF_CUDA_TRY(cudaDeviceGetAttribute(&num_sms, cudaDevAttrMultiProcessorCount, device));
+  CUDA_TRY(cudaDeviceGetAttribute(&num_sms, cudaDevAttrMultiProcessorCount, device));
   int per_thread = total_size / (max_blocks * num_sms * block_size);
   return std::clamp(per_thread, 1, max_per_thread);
 }
diff --git a/cpp/include/cudf/detail/utilities/vector_factories.hpp b/cpp/include/cudf/detail/utilities/vector_factories.hpp
index 63ac48f6060..e3f44ce0bee 100644
--- a/cpp/include/cudf/detail/utilities/vector_factories.hpp
+++ b/cpp/include/cudf/detail/utilities/vector_factories.hpp
@@ -53,7 +53,7 @@ rmm::device_uvector<T> make_zeroed_device_uvector_async(
   rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource())
 {
   rmm::device_uvector<T> ret(size, stream, mr);
-  CUDF_CUDA_TRY(cudaMemsetAsync(ret.data(), 0, size * sizeof(T), stream.value()));
+  CUDA_TRY(cudaMemsetAsync(ret.data(), 0, size * sizeof(T), stream.value()));
   return ret;
 }
@@ -75,7 +75,7 @@ rmm::device_uvector<T> make_zeroed_device_uvector_sync(
   rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource())
 {
   rmm::device_uvector<T> ret(size, stream, mr);
-  CUDF_CUDA_TRY(cudaMemsetAsync(ret.data(), 0, size * sizeof(T), stream.value()));
+  CUDA_TRY(cudaMemsetAsync(ret.data(), 0, size * sizeof(T), stream.value()));
   stream.synchronize();
   return ret;
 }
@@ -99,11 +99,11 @@ rmm::device_uvector<T> make_device_uvector_async(
   rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource())
 {
   rmm::device_uvector<T> ret(source_data.size(), stream, mr);
-  CUDF_CUDA_TRY(cudaMemcpyAsync(ret.data(),
-                                source_data.data(),
-                                source_data.size() * sizeof(T),
-                                cudaMemcpyDefault,
-                                stream.value()));
+  CUDA_TRY(cudaMemcpyAsync(ret.data(),
+                           source_data.data(),
+                           source_data.size() * sizeof(T),
+                           cudaMemcpyDefault,
+                           stream.value()));
   return ret;
 }
@@ -151,11 +151,11 @@ rmm::device_uvector<T> make_device_uvector_async(
   rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource())
 {
   rmm::device_uvector<T> ret(source_data.size(), stream, mr);
-  CUDF_CUDA_TRY(cudaMemcpyAsync(ret.data(),
-                                source_data.data(),
-                                source_data.size() * sizeof(T),
-                                cudaMemcpyDefault,
-                                stream.value()));
+  CUDA_TRY(cudaMemcpyAsync(ret.data(),
+                           source_data.data(),
+                           source_data.size() * sizeof(T),
+                           cudaMemcpyDefault,
+                           stream.value()));
   return ret;
 }
@@ -286,7 +286,7 @@ template <typename T, typename OutContainer>
 OutContainer make_vector_async(device_span<T const> v, rmm::cuda_stream_view stream)
 {
   OutContainer result(v.size());
-  CUDF_CUDA_TRY(cudaMemcpyAsync(
+  CUDA_TRY(cudaMemcpyAsync(
     result.data(), v.data(), v.size() * sizeof(T), cudaMemcpyDeviceToHost, stream.value()));
   return result;
 }
diff --git a/cpp/include/cudf/io/text/data_chunk_source_factories.hpp b/cpp/include/cudf/io/text/data_chunk_source_factories.hpp
index 56db3fd6216..ffe159b59dc 100644
--- a/cpp/include/cudf/io/text/data_chunk_source_factories.hpp
+++ b/cpp/include/cudf/io/text/data_chunk_source_factories.hpp
@@ -78,14 +78,14 @@ class istream_data_chunk_reader : public data_chunk_reader {
   {
     // create an event to track the completion of the last device-to-host copy.
     for (std::size_t i = 0; i < _tickets.size(); i++) {
-      CUDF_CUDA_TRY(cudaEventCreate(&(_tickets[i].event)));
+      CUDA_TRY(cudaEventCreate(&(_tickets[i].event)));
     }
   }
 
   ~istream_data_chunk_reader()
   {
     for (std::size_t i = 0; i < _tickets.size(); i++) {
-      CUDF_CUDA_TRY(cudaEventDestroy(_tickets[i].event));
+      CUDA_TRY(cudaEventDestroy(_tickets[i].event));
     }
   }
 
@@ -101,7 +101,7 @@ class istream_data_chunk_reader : public data_chunk_reader {
     _next_ticket_idx = (_next_ticket_idx + 1) % _tickets.size();
 
     // synchronize on the last host-to-device copy, so we don't clobber the host buffer.
-    CUDF_CUDA_TRY(cudaEventSynchronize(h_ticket.event));
+    CUDA_TRY(cudaEventSynchronize(h_ticket.event));
 
     // resize the host buffer as necessary to contain the requested number of bytes
     if (h_ticket.buffer.size() < read_size) { h_ticket.buffer.resize(read_size); }
@@ -116,7 +116,7 @@ class istream_data_chunk_reader : public data_chunk_reader {
     auto chunk = rmm::device_uvector<char>(read_size, stream);
 
     // copy the host-pinned data on to device
-    CUDF_CUDA_TRY(cudaMemcpyAsync(  //
+    CUDA_TRY(cudaMemcpyAsync(  //
       chunk.data(),
      h_ticket.buffer.data(),
      read_size,
@@ -124,7 +124,7 @@ class istream_data_chunk_reader : public data_chunk_reader {
       stream.value()));
 
     // record the host-to-device copy.
-    CUDF_CUDA_TRY(cudaEventRecord(h_ticket.event, stream.value()));
+    CUDA_TRY(cudaEventRecord(h_ticket.event, stream.value()));
 
     // return the view over device memory so it can be processed.
     return std::make_unique<device_uvector_data_chunk>(std::move(chunk));
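The `istream_data_chunk_reader` above rotates through event-guarded staging buffers so reads and copies overlap. A minimal standalone sketch of that double-buffering pattern (all names here are hypothetical, and cuDF uses a pinned host allocator where this sketch uses `std::vector`):

```cpp
#include <cuda_runtime.h>
#include <algorithm>
#include <array>
#include <vector>

struct staging_ticket {
  cudaEvent_t event;
  std::vector<char> buffer;  // stand-in for a pinned host buffer
};

void stage_chunks(char* d_dst, char const* src, size_t chunk, size_t n_chunks, cudaStream_t stream)
{
  std::array<staging_ticket, 2> tickets;
  for (auto& t : tickets) {
    cudaEventCreate(&t.event);
    t.buffer.resize(chunk);
  }

  for (size_t i = 0; i < n_chunks; ++i) {
    auto& t = tickets[i % tickets.size()];
    cudaEventSynchronize(t.event);  // don't clobber a buffer still being copied
    std::copy(src + i * chunk, src + (i + 1) * chunk, t.buffer.data());
    cudaMemcpyAsync(d_dst + i * chunk, t.buffer.data(), chunk, cudaMemcpyHostToDevice, stream);
    cudaEventRecord(t.event, stream);  // marks when this buffer is reusable
  }
  for (auto& t : tickets) {
    cudaEventSynchronize(t.event);
    cudaEventDestroy(t.event);
  }
}
```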
diff --git a/cpp/include/cudf/strings/detail/utilities.cuh b/cpp/include/cudf/strings/detail/utilities.cuh
index b9ea2d9ecff..4b036fb7f0e 100644
--- a/cpp/include/cudf/strings/detail/utilities.cuh
+++ b/cpp/include/cudf/strings/detail/utilities.cuh
@@ -67,7 +67,7 @@ std::unique_ptr<column> make_offsets_child_column(
   // we use inclusive-scan on a shifted output (d_offsets+1) and then set the first
   // offset values to zero manually.
   thrust::inclusive_scan(rmm::exec_policy(stream), begin, end, d_offsets + 1);
-  CUDF_CUDA_TRY(cudaMemsetAsync(d_offsets, 0, sizeof(int32_t), stream.value()));
+  CUDA_TRY(cudaMemsetAsync(d_offsets, 0, sizeof(int32_t), stream.value()));
   return offsets_column;
 }
diff --git a/cpp/include/cudf/strings/string_view.cuh b/cpp/include/cudf/strings/string_view.cuh
index a486a5a765c..27ee5cf95cd 100644
--- a/cpp/include/cudf/strings/string_view.cuh
+++ b/cpp/include/cudf/strings/string_view.cuh
@@ -99,8 +99,7 @@ CUDF_HOST_DEVICE inline string_view string_view::max()
 #if defined(__CUDA_ARCH__)
   psentinel = &cudf::strings::detail::max_string_sentinel[0];
 #else
-  CUDF_CUDA_TRY(
-    cudaGetSymbolAddress((void**)&psentinel, cudf::strings::detail::max_string_sentinel));
+  CUDA_TRY(cudaGetSymbolAddress((void**)&psentinel, cudf::strings::detail::max_string_sentinel));
 #endif
   return string_view(psentinel, 4);
 }
diff --git a/cpp/include/cudf/table/table_device_view.cuh b/cpp/include/cudf/table/table_device_view.cuh
index 8d08a3fd0b0..3ed18099463 100644
--- a/cpp/include/cudf/table/table_device_view.cuh
+++ b/cpp/include/cudf/table/table_device_view.cuh
@@ -145,7 +145,7 @@ auto contiguous_copy_column_device_views(HostTableView source_view, rmm::cuda_st
   auto d_columns = detail::child_columns_to_device_array<ColumnDeviceView>(
     source_view.begin(), source_view.end(), h_ptr, d_ptr);
 
-  CUDF_CUDA_TRY(cudaMemcpyAsync(d_ptr, h_ptr, views_size_bytes, cudaMemcpyDefault, stream.value()));
+  CUDA_TRY(cudaMemcpyAsync(d_ptr, h_ptr, views_size_bytes, cudaMemcpyDefault, stream.value()));
   stream.synchronize();
   return std::make_tuple(std::move(descendant_storage), d_columns);
 }
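The shifted-scan trick in `make_offsets_child_column` above is worth spelling out. A minimal sketch under the assumption that per-row sizes already live on the device (`sizes_to_offsets` is a hypothetical helper, not the cuDF API):

```cpp
#include <rmm/cuda_stream_view.hpp>
#include <rmm/device_uvector.hpp>
#include <rmm/exec_policy.hpp>
#include <thrust/scan.h>

// Turn per-row sizes [s0, s1, ...] into offsets [0, s0, s0+s1, ...]:
// inclusive-scan into offsets[1..n], then zero offsets[0] with a 4-byte memset.
rmm::device_uvector<int32_t> sizes_to_offsets(rmm::device_uvector<int32_t> const& sizes,
                                              rmm::cuda_stream_view stream)
{
  rmm::device_uvector<int32_t> offsets(sizes.size() + 1, stream);
  thrust::inclusive_scan(rmm::exec_policy(stream), sizes.begin(), sizes.end(), offsets.begin() + 1);
  cudaMemsetAsync(offsets.data(), 0, sizeof(int32_t), stream.value());
  return offsets;
}
```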
diff --git a/cpp/include/cudf/utilities/error.hpp b/cpp/include/cudf/utilities/error.hpp
index 8be1a7e3a32..2036723a6ed 100644
--- a/cpp/include/cudf/utilities/error.hpp
+++ b/cpp/include/cudf/utilities/error.hpp
@@ -1,19 +1,3 @@
-/*
- * Copyright (c) 2019-2022, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
 #pragma once
 
 #include <cuda_runtime_api.h>
@@ -115,7 +99,7 @@ inline void throw_cuda_error(cudaError_t error, const char* file, unsigned int l
  * cudaSuccess, invokes cudaGetLastError() to clear the error and throws an
  * exception detailing the CUDA error that occurred
  */
-#define CUDF_CUDA_TRY(call)                                               \
+#define CUDA_TRY(call)                                                    \
   do {                                                                    \
     cudaError_t const status = (call);                                    \
     if (cudaSuccess != status) {                                          \
@@ -138,12 +122,12 @@ inline void throw_cuda_error(cudaError_t error, const char* file, unsigned int l
  * asynchronous kernel launch.
  */
 #ifndef NDEBUG
-#define CUDF_CHECK_CUDA(stream)                   \
-  do {                                            \
-    CUDF_CUDA_TRY(cudaStreamSynchronize(stream)); \
-    CUDF_CUDA_TRY(cudaPeekAtLastError());         \
+#define CHECK_CUDA(stream)                   \
+  do {                                       \
+    CUDA_TRY(cudaStreamSynchronize(stream)); \
+    CUDA_TRY(cudaPeekAtLastError());         \
   } while (0);
 #else
-#define CUDF_CHECK_CUDA(stream) CUDF_CUDA_TRY(cudaPeekAtLastError());
+#define CHECK_CUDA(stream) CUDA_TRY(cudaPeekAtLastError());
 #endif
 /** @} */
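Note the `NDEBUG` split in the macro above: debug builds synchronize the stream so asynchronous kernel failures surface at the launch site, while release builds only peek at the last error and never block. A minimal sketch of what that means at a call site (`my_kernel` and `launch_checked` are hypothetical):

```cpp
#include <cudf/utilities/error.hpp>

__global__ void my_kernel(int* out) { out[threadIdx.x] = threadIdx.x; }

void launch_checked(int* d_out, cudaStream_t stream)
{
  my_kernel<<<1, 256, 0, stream>>>(d_out);
  // Debug build: cudaStreamSynchronize + throw on any error from the kernel.
  // Release build: non-blocking cudaPeekAtLastError only.
  CHECK_CUDA(stream);
}
```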
diff --git a/cpp/include/cudf_test/column_utilities.hpp b/cpp/include/cudf_test/column_utilities.hpp
index b28ed4f70fa..4c2d4d429eb 100644
--- a/cpp/include/cudf_test/column_utilities.hpp
+++ b/cpp/include/cudf_test/column_utilities.hpp
@@ -183,8 +183,7 @@ template <typename T, std::enable_if_t<not cudf::is_fixed_point<T>()>* = nullptr>
 std::pair<thrust::host_vector<T>, std::vector<bitmask_type>> to_host(column_view c)
 {
   thrust::host_vector<T> host_data(c.size());
-  CUDF_CUDA_TRY(
-    cudaMemcpy(host_data.data(), c.data<T>(), c.size() * sizeof(T), cudaMemcpyDeviceToHost));
+  CUDA_TRY(cudaMemcpy(host_data.data(), c.data<T>(), c.size() * sizeof(T), cudaMemcpyDeviceToHost));
   return {host_data, bitmask_to_host(c)};
 }
 
@@ -207,7 +206,7 @@ std::pair<thrust::host_vector<T>, std::vector<bitmask_type>> to_host(column_view
   auto host_rep_types = thrust::host_vector<Rep>(c.size());
 
-  CUDF_CUDA_TRY(cudaMemcpy(
+  CUDA_TRY(cudaMemcpy(
     host_rep_types.data(), c.begin<Rep>(), c.size() * sizeof(Rep), cudaMemcpyDeviceToHost));
 
   auto to_fp = [&](Rep val) { return T{scaled_integer<Rep>{val, scale_type{c.type().scale()}}}; };
diff --git a/cpp/src/binaryop/compiled/binary_ops.cuh b/cpp/src/binaryop/compiled/binary_ops.cuh
index ec41fbb8883..9b3e33f491e 100644
--- a/cpp/src/binaryop/compiled/binary_ops.cuh
+++ b/cpp/src/binaryop/compiled/binary_ops.cuh
@@ -265,7 +265,7 @@ void for_each(rmm::cuda_stream_view stream, cudf::size_type size, Functor f)
 {
   int block_size;
   int min_grid_size;
-  CUDF_CUDA_TRY(
+  CUDA_TRY(
     cudaOccupancyMaxPotentialBlockSize(&min_grid_size, &block_size, for_each_kernel<decltype(f)>));
   // 2 elements per thread.
   const int grid_size = util::div_rounding_up_safe(size, 2 * block_size);
diff --git a/cpp/src/bitmask/is_element_valid.cpp b/cpp/src/bitmask/is_element_valid.cpp
index 4a94ec9759c..47870e01567 100644
--- a/cpp/src/bitmask/is_element_valid.cpp
+++ b/cpp/src/bitmask/is_element_valid.cpp
@@ -1,6 +1,6 @@
 /*
- * Copyright (c) 2021-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2021, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -34,11 +34,11 @@ bool is_element_valid_sync(column_view const& col_view,
   bitmask_type word;
   // null_mask() returns device ptr to bitmask without offset
   size_type index = element_index + col_view.offset();
-  CUDF_CUDA_TRY(cudaMemcpyAsync(&word,
-                                col_view.null_mask() + word_index(index),
-                                sizeof(bitmask_type),
-                                cudaMemcpyDeviceToHost,
-                                stream.value()));
+  CUDA_TRY(cudaMemcpyAsync(&word,
+                           col_view.null_mask() + word_index(index),
+                           sizeof(bitmask_type),
+                           cudaMemcpyDeviceToHost,
+                           stream.value()));
   stream.synchronize();
   return static_cast<bool>(word & (bitmask_type{1} << intra_word_index(index)));
 }
diff --git a/cpp/src/bitmask/null_mask.cu b/cpp/src/bitmask/null_mask.cu
index 756cf3421c9..d1107ad3cfd 100644
--- a/cpp/src/bitmask/null_mask.cu
+++ b/cpp/src/bitmask/null_mask.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2021, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -90,7 +90,7 @@ rmm::device_buffer create_null_mask(size_type size,
   if (state != mask_state::UNINITIALIZED) {
     uint8_t fill_value = (state == mask_state::ALL_VALID) ? 0xff : 0x00;
-    CUDF_CUDA_TRY(cudaMemsetAsync(
+    CUDA_TRY(cudaMemsetAsync(
       static_cast<bitmask_type*>(mask.data()), fill_value, mask_size, stream.value()));
   }
 
@@ -146,7 +146,7 @@ void set_null_mask(bitmask_type* bitmask,
     cudf::detail::grid_1d config(number_of_mask_words, 256);
     set_null_mask_kernel<<<config.num_blocks, config.num_threads_per_block, 0, stream.value()>>>(
       static_cast<bitmask_type*>(bitmask), begin_bit, end_bit, valid, number_of_mask_words);
-    CUDF_CHECK_CUDA(stream.value());
+    CHECK_CUDA(stream.value());
   }
 }
@@ -220,7 +220,7 @@ rmm::device_buffer copy_bitmask(bitmask_type const* mask,
     cudf::detail::grid_1d config(number_of_mask_words, 256);
     copy_offset_bitmask<<<config.num_blocks, config.num_threads_per_block, 0, stream.value()>>>(
       static_cast<bitmask_type*>(dest_mask.data()), mask, begin_bit, end_bit, number_of_mask_words);
-    CUDF_CHECK_CUDA(stream.value());
+    CHECK_CUDA(stream.value());
   }
   return dest_mask;
 }
diff --git a/cpp/src/column/column_device_view.cu b/cpp/src/column/column_device_view.cu
index fc244521617..dd1803f4b90 100644
--- a/cpp/src/column/column_device_view.cu
+++ b/cpp/src/column/column_device_view.cu
@@ -77,11 +77,11 @@ create_device_view_from_view(ColumnView const& source, rmm::cuda_stream_view str
     new ColumnDeviceView(source, staging_buffer.data(), descendant_storage->data()), deleter};
 
   // copy the CPU memory with all the children into device memory
-  CUDF_CUDA_TRY(cudaMemcpyAsync(descendant_storage->data(),
-                                staging_buffer.data(),
-                                descendant_storage->size(),
-                                cudaMemcpyDefault,
-                                stream.value()));
+  CUDA_TRY(cudaMemcpyAsync(descendant_storage->data(),
+                           staging_buffer.data(),
+                           descendant_storage->size(),
+                           cudaMemcpyDefault,
+                           stream.value()));
 
   stream.synchronize();
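The `set_null_mask_kernel` and `copy_offset_bitmask` launches in null_mask.cu above all follow the same `grid_1d` + `CHECK_CUDA` shape. A minimal sketch of that launch pattern (`my_word_kernel` and `launch_over_words` are hypothetical names):

```cpp
#include <cudf/detail/utilities/cuda.cuh>
#include <cudf/utilities/error.hpp>

__global__ void my_word_kernel(cudf::bitmask_type* words, cudf::size_type n)
{
  auto i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) words[i] = ~words[i];  // toy per-word transform
}

void launch_over_words(cudf::bitmask_type* words, cudf::size_type n, rmm::cuda_stream_view stream)
{
  cudf::detail::grid_1d config(n, 256);  // n total threads, 256 per block
  my_word_kernel<<<config.num_blocks, config.num_threads_per_block, 0, stream.value()>>>(words, n);
  CHECK_CUDA(stream.value());  // surfaces kernel errors in debug builds
}
```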
diff --git a/cpp/src/copying/contiguous_split.cu b/cpp/src/copying/contiguous_split.cu
index 514374d450d..46470e69611 100644
--- a/cpp/src/copying/contiguous_split.cu
+++ b/cpp/src/copying/contiguous_split.cu
@@ -1046,11 +1046,11 @@ std::vector<packed_table> contiguous_split(cudf::table_view const& input,
   setup_source_buf_info(input.begin(), input.end(), h_src_buf_info, h_src_buf_info);
 
   // HtoD indices and source buf info to device
-  CUDF_CUDA_TRY(cudaMemcpyAsync(d_indices,
-                                h_indices,
-                                indices_size + src_buf_info_size,
-                                cudaMemcpyHostToDevice,
-                                stream.value()));
+  CUDA_TRY(cudaMemcpyAsync(d_indices,
+                           h_indices,
+                           indices_size + src_buf_info_size,
+                           cudaMemcpyHostToDevice,
+                           stream.value()));
 
   // packed block of memory 2. partition buffer sizes and dst_buf_info structs
   std::size_t const buf_sizes_size =
@@ -1180,11 +1180,11 @@ std::vector<packed_table> contiguous_split(cudf::table_view const& input,
   }
 
   // DtoH buf sizes and col info back to the host
-  CUDF_CUDA_TRY(cudaMemcpyAsync(h_buf_sizes,
-                                d_buf_sizes,
-                                buf_sizes_size + dst_buf_info_size,
-                                cudaMemcpyDeviceToHost,
-                                stream.value()));
+  CUDA_TRY(cudaMemcpyAsync(h_buf_sizes,
+                           d_buf_sizes,
+                           buf_sizes_size + dst_buf_info_size,
+                           cudaMemcpyDeviceToHost,
+                           stream.value()));
 
   stream.synchronize();
 
   // allocate output partition buffers
@@ -1224,14 +1224,14 @@ std::vector<packed_table> contiguous_split(cudf::table_view const& input,
   });
 
   // HtoD src and dest buffers
-  CUDF_CUDA_TRY(cudaMemcpyAsync(
+  CUDA_TRY(cudaMemcpyAsync(
     d_src_bufs, h_src_bufs, src_bufs_size + dst_bufs_size, cudaMemcpyHostToDevice, stream.value()));
 
   // perform the copy.
   copy_data(num_bufs, num_src_bufs, d_src_bufs, d_dst_bufs, d_dst_buf_info, stream);
 
   // DtoH dst info (to retrieve null counts)
-  CUDF_CUDA_TRY(cudaMemcpyAsync(
+  CUDA_TRY(cudaMemcpyAsync(
     h_dst_buf_info, d_dst_buf_info, dst_buf_info_size, cudaMemcpyDeviceToHost, stream.value()));
 
   stream.synchronize();
diff --git a/cpp/src/hash/concurrent_unordered_map.cuh b/cpp/src/hash/concurrent_unordered_map.cuh
index 76f3fba4689..0ae0baa9908 100644
--- a/cpp/src/hash/concurrent_unordered_map.cuh
+++ b/cpp/src/hash/concurrent_unordered_map.cuh
@@ -432,11 +432,11 @@ class concurrent_unordered_map {
       m_hashtbl_values = m_allocator.allocate(m_capacity, stream);
     }
-    CUDF_CUDA_TRY(cudaMemcpyAsync(m_hashtbl_values,
-                                  other.m_hashtbl_values,
-                                  m_capacity * sizeof(value_type),
-                                  cudaMemcpyDefault,
-                                  stream.value()));
+    CUDA_TRY(cudaMemcpyAsync(m_hashtbl_values,
+                             other.m_hashtbl_values,
+                             m_capacity * sizeof(value_type),
+                             cudaMemcpyDefault,
+                             stream.value()));
   }
 
   void clear_async(rmm::cuda_stream_view stream = rmm::cuda_stream_default)
@@ -460,10 +460,10 @@ class concurrent_unordered_map {
     cudaError_t status = cudaPointerGetAttributes(&hashtbl_values_ptr_attributes, m_hashtbl_values);
 
     if (cudaSuccess == status && isPtrManaged(hashtbl_values_ptr_attributes)) {
-      CUDF_CUDA_TRY(cudaMemPrefetchAsync(
+      CUDA_TRY(cudaMemPrefetchAsync(
         m_hashtbl_values, m_capacity * sizeof(value_type), dev_id, stream.value()));
     }
-    CUDF_CUDA_TRY(cudaMemPrefetchAsync(this, sizeof(*this), dev_id, stream.value()));
+    CUDA_TRY(cudaMemPrefetchAsync(this, sizeof(*this), dev_id, stream.value()));
   }
 
@@ -532,8 +532,8 @@ class concurrent_unordered_map {
     if (cudaSuccess == status && isPtrManaged(hashtbl_values_ptr_attributes)) {
       int dev_id = 0;
-      CUDF_CUDA_TRY(cudaGetDevice(&dev_id));
-      CUDF_CUDA_TRY(cudaMemPrefetchAsync(
+      CUDA_TRY(cudaGetDevice(&dev_id));
+      CUDA_TRY(cudaMemPrefetchAsync(
         m_hashtbl_values, m_capacity * sizeof(value_type), dev_id, stream.value()));
     }
   }
@@ -543,6 +543,6 @@ class concurrent_unordered_map {
       m_hashtbl_values, m_capacity, m_unused_key, m_unused_element);
   }
 
-    CUDF_CHECK_CUDA(stream.value());
+    CUDA_TRY(cudaGetLastError());
   }
 };
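The prefetch calls in `concurrent_unordered_map` only make sense for managed allocations, which is why they are guarded by a pointer-attribute check. A minimal sketch of that guard using the raw runtime API (cuDF's `isPtrManaged` helper is replaced here by a direct check of `cudaPointerAttributes::type`; `prefetch_if_managed` is a hypothetical name):

```cpp
#include <cuda_runtime.h>

// Ask the driver to migrate a managed allocation to the current device ahead
// of the kernels that will touch it; do nothing for ordinary device memory.
void prefetch_if_managed(void const* p, size_t bytes, cudaStream_t stream)
{
  cudaPointerAttributes attrs{};
  if (cudaPointerGetAttributes(&attrs, p) == cudaSuccess &&
      attrs.type == cudaMemoryTypeManaged) {
    int dev = 0;
    cudaGetDevice(&dev);
    cudaMemPrefetchAsync(p, bytes, dev, stream);
  }
}
```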
diff --git a/cpp/src/interop/dlpack.cpp b/cpp/src/interop/dlpack.cpp
index e5da4794ca3..01ca32e6a2f 100644
--- a/cpp/src/interop/dlpack.cpp
+++ b/cpp/src/interop/dlpack.cpp
@@ -144,7 +144,7 @@ std::unique_ptr<table> from_dlpack(DLManagedTensor const* managed_tensor,
   // Make sure the current device ID matches the Tensor's device ID
   if (tensor.device.device_type != kDLCPU) {
     int device_id = 0;
-    CUDF_CUDA_TRY(cudaGetDevice(&device_id));
+    CUDA_TRY(cudaGetDevice(&device_id));
     CUDF_EXPECTS(tensor.device.device_id == device_id, "DLTensor device ID must be current device");
   }
@@ -184,11 +184,11 @@ std::unique_ptr<table> from_dlpack(DLManagedTensor const* managed_tensor,
   for (auto& col : columns) {
     col = make_numeric_column(dtype, num_rows, mask_state::UNALLOCATED, stream, mr);
 
-    CUDF_CUDA_TRY(cudaMemcpyAsync(col->mutable_view().head(),
-                                  reinterpret_cast<void const*>(tensor_data),
-                                  bytes,
-                                  cudaMemcpyDefault,
-                                  stream.value()));
+    CUDA_TRY(cudaMemcpyAsync(col->mutable_view().head(),
+                             reinterpret_cast<void const*>(tensor_data),
+                             bytes,
+                             cudaMemcpyDefault,
+                             stream.value()));
 
     tensor_data += col_stride;
   }
@@ -234,7 +234,7 @@ DLManagedTensor* to_dlpack(table_view const& input,
     tensor.strides[1] = num_rows;
   }
 
-  CUDF_CUDA_TRY(cudaGetDevice(&tensor.device.device_id));
+  CUDA_TRY(cudaGetDevice(&tensor.device.device_id));
   tensor.device.device_type = kDLCUDA;
 
   // If there is only one column, then a 1D tensor can just copy the pointer
@@ -254,11 +254,11 @@ DLManagedTensor* to_dlpack(table_view const& input,
   auto tensor_data = reinterpret_cast<uintptr_t>(tensor.data);
   for (auto const& col : input) {
-    CUDF_CUDA_TRY(cudaMemcpyAsync(reinterpret_cast<void*>(tensor_data),
-                                  get_column_data(col),
-                                  stride_bytes,
-                                  cudaMemcpyDefault,
-                                  stream.value()));
+    CUDA_TRY(cudaMemcpyAsync(reinterpret_cast<void*>(tensor_data),
+                             get_column_data(col),
+                             stride_bytes,
+                             cudaMemcpyDefault,
+                             stream.value()));
     tensor_data += stride_bytes;
   }
diff --git a/cpp/src/interop/from_arrow.cu b/cpp/src/interop/from_arrow.cu
index 6c5cd56d2a7..99b657fb9d5 100644
--- a/cpp/src/interop/from_arrow.cu
+++ b/cpp/src/interop/from_arrow.cu
@@ -108,11 +108,11 @@ struct dispatch_to_cudf_column {
       stream,
       mr);
     auto mask_buffer = array.null_bitmap();
-    CUDF_CUDA_TRY(cudaMemcpyAsync(mask->data(),
-                                  reinterpret_cast<const uint8_t*>(mask_buffer->address()),
-                                  array.null_bitmap()->size(),
-                                  cudaMemcpyDefault,
-                                  stream.value()));
+    CUDA_TRY(cudaMemcpyAsync(mask->data(),
+                             reinterpret_cast<const uint8_t*>(mask_buffer->address()),
+                             array.null_bitmap()->size(),
+                             cudaMemcpyDefault,
+                             stream.value()));
     return mask;
   }
@@ -135,7 +135,7 @@ struct dispatch_to_cudf_column {
     auto const has_nulls = skip_mask ? false : array.null_bitmap_data() != nullptr;
     auto col = make_fixed_width_column(type, num_rows, mask_state::UNALLOCATED, stream, mr);
     auto mutable_column_view = col->mutable_view();
-    CUDF_CUDA_TRY(cudaMemcpyAsync(
+    CUDA_TRY(cudaMemcpyAsync(
       mutable_column_view.data<T>(),
       reinterpret_cast<const uint8_t*>(data_buffer->address()) + array.offset() * sizeof(T),
       sizeof(T) * num_rows,
@@ -191,7 +191,7 @@ std::unique_ptr<column> dispatch_to_cudf_column::operator()(
   auto col = make_fixed_width_column(type, num_rows, mask_state::UNALLOCATED, stream, mr);
   auto mutable_column_view = col->mutable_view();
-  CUDF_CUDA_TRY(cudaMemcpyAsync(
+  CUDA_TRY(cudaMemcpyAsync(
     mutable_column_view.data<DeviceType>(),
     reinterpret_cast<const uint8_t*>(data_buffer->address()) + array.offset() * sizeof(DeviceType),
     sizeof(DeviceType) * num_rows,
@@ -227,11 +227,11 @@ std::unique_ptr<column> dispatch_to_cudf_column::operator()(
 {
   auto data_buffer = array.data()->buffers[1];
   auto data = rmm::device_buffer(data_buffer->size(), stream, mr);
-  CUDF_CUDA_TRY(cudaMemcpyAsync(data.data(),
-                                reinterpret_cast<const uint8_t*>(data_buffer->address()),
-                                data_buffer->size(),
-                                cudaMemcpyDefault,
-                                stream.value()));
+  CUDA_TRY(cudaMemcpyAsync(data.data(),
+                           reinterpret_cast<const uint8_t*>(data_buffer->address()),
+                           data_buffer->size(),
+                           cudaMemcpyDefault,
+                           stream.value()));
   auto out_col = mask_to_bools(static_cast<bitmask_type const*>(data.data()),
                                array.offset(),
                                array.offset() + array.length(),
diff --git a/cpp/src/interop/to_arrow.cu b/cpp/src/interop/to_arrow.cu
index 517a83c716e..c7409978bb2 100644
--- a/cpp/src/interop/to_arrow.cu
+++ b/cpp/src/interop/to_arrow.cu
@@ -53,11 +53,11 @@ std::shared_ptr<arrow::Buffer> fetch_data_buffer(column_view input_view,
   auto data_buffer = allocate_arrow_buffer(data_size_in_bytes, ar_mr);
 
-  CUDF_CUDA_TRY(cudaMemcpyAsync(data_buffer->mutable_data(),
-                                input_view.data<T>(),
-                                data_size_in_bytes,
-                                cudaMemcpyDeviceToHost,
-                                stream.value()));
+  CUDA_TRY(cudaMemcpyAsync(data_buffer->mutable_data(),
+                           input_view.data<T>(),
+                           data_size_in_bytes,
+                           cudaMemcpyDeviceToHost,
+                           stream.value()));
 
   return std::move(data_buffer);
 }
@@ -73,7 +73,7 @@ std::shared_ptr<arrow::Buffer> fetch_mask_buffer(column_view input_view,
   if (input_view.has_nulls()) {
     auto mask_buffer = allocate_arrow_bitmap(static_cast<int64_t>(input_view.size()), ar_mr);
-    CUDF_CUDA_TRY(cudaMemcpyAsync(
+    CUDA_TRY(cudaMemcpyAsync(
       mask_buffer->mutable_data(),
       (input_view.offset() > 0) ? cudf::copy_bitmask(input_view).data() : input_view.null_mask(),
      mask_size_in_bytes,
@@ -163,11 +163,11 @@ std::shared_ptr<arrow::Array> dispatch_to_arrow::operator()(
   auto const buf_size_in_bytes = buf.size() * sizeof(DeviceType);
   auto data_buffer = allocate_arrow_buffer(buf_size_in_bytes, ar_mr);
 
-  CUDF_CUDA_TRY(cudaMemcpyAsync(data_buffer->mutable_data(),
-                                buf.data(),
-                                buf_size_in_bytes,
-                                cudaMemcpyDeviceToHost,
-                                stream.value()));
+  CUDA_TRY(cudaMemcpyAsync(data_buffer->mutable_data(),
+                           buf.data(),
+                           buf_size_in_bytes,
+                           cudaMemcpyDeviceToHost,
+                           stream.value()));
 
   auto type = arrow::decimal(18, -input.type().scale());
   auto mask = fetch_mask_buffer(input, ar_mr, stream);
@@ -197,11 +197,11 @@ std::shared_ptr<arrow::Array> dispatch_to_arrow::operator()(
   auto const buf_size_in_bytes = buf.size() * sizeof(DeviceType);
   auto data_buffer = allocate_arrow_buffer(buf_size_in_bytes, ar_mr);
 
-  CUDF_CUDA_TRY(cudaMemcpyAsync(data_buffer->mutable_data(),
-                                buf.data(),
-                                buf_size_in_bytes,
-                                cudaMemcpyDeviceToHost,
-                                stream.value()));
+  CUDA_TRY(cudaMemcpyAsync(data_buffer->mutable_data(),
+                           buf.data(),
+                           buf_size_in_bytes,
+                           cudaMemcpyDeviceToHost,
+                           stream.value()));
 
   auto type = arrow::decimal(18, -input.type().scale());
   auto mask = fetch_mask_buffer(input, ar_mr, stream);
@@ -222,11 +222,11 @@ std::shared_ptr<arrow::Array> dispatch_to_arrow::operator()(column_view input,
   auto data_buffer = allocate_arrow_buffer(static_cast<int64_t>(bitmask.first->size()), ar_mr);
 
-  CUDF_CUDA_TRY(cudaMemcpyAsync(data_buffer->mutable_data(),
-                                bitmask.first->data(),
-                                bitmask.first->size(),
-                                cudaMemcpyDeviceToHost,
-                                stream.value()));
+  CUDA_TRY(cudaMemcpyAsync(data_buffer->mutable_data(),
+                           bitmask.first->data(),
+                           bitmask.first->size(),
+                           cudaMemcpyDeviceToHost,
+                           stream.value()));
   return to_arrow_array(id,
                         static_cast<int64_t>(input.size()),
                         std::move(data_buffer),
diff --git a/cpp/src/io/avro/reader_impl.cu b/cpp/src/io/avro/reader_impl.cu
index 5885b61b35b..b5b76c2def8 100644
--- a/cpp/src/io/avro/reader_impl.cu
+++ b/cpp/src/io/avro/reader_impl.cu
@@ -192,9 +192,9 @@ rmm::device_buffer decompress_data(datasource& source,
     for (int loop_cnt = 0; loop_cnt < 2; loop_cnt++) {
       inflate_in.host_to_device(stream);
-      CUDF_CUDA_TRY(
+      CUDA_TRY(
         cudaMemsetAsync(inflate_out.device_ptr(), 0, inflate_out.memory_size(), stream.value()));
-      CUDF_CUDA_TRY(gpuinflate(
+      CUDA_TRY(gpuinflate(
         inflate_in.device_ptr(), inflate_out.device_ptr(), inflate_in.size(), 0, stream));
       inflate_out.device_to_host(stream, true);
@@ -424,11 +424,11 @@ std::vector<column_buffer> decode_data(metadata& meta,
   // Copy valid bits that are shared between columns
   for (size_t i = 0; i < out_buffers.size(); i++) {
     if (valid_alias[i] != nullptr) {
-      CUDF_CUDA_TRY(cudaMemcpyAsync(out_buffers[i].null_mask(),
-                                    valid_alias[i],
-                                    out_buffers[i].null_mask_size(),
-                                    cudaMemcpyHostToDevice,
-                                    stream.value()));
+      CUDA_TRY(cudaMemcpyAsync(out_buffers[i].null_mask(),
+                               valid_alias[i],
+                               out_buffers[i].null_mask_size(),
+                               cudaMemcpyHostToDevice,
+                               stream.value()));
     }
   }
   schema_desc.device_to_host(stream, true);
diff --git a/cpp/src/io/comp/debrotli.cu b/cpp/src/io/comp/debrotli.cu
index 631cf19b2aa..b4a42a66133 100644
--- a/cpp/src/io/comp/debrotli.cu
+++ b/cpp/src/io/comp/debrotli.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2018-2020, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -2048,7 +2048,7 @@ size_t __host__ get_gpu_debrotli_scratch_size(int max_num_inputs)
   int sm_count = 0;
   int dev = 0;
   uint32_t max_fb_size, min_fb_size, fb_size;
-  CUDF_CUDA_TRY(cudaGetDevice(&dev));
+  CUDA_TRY(cudaGetDevice(&dev));
   if (cudaSuccess == cudaDeviceGetAttribute(&sm_count, cudaDevAttrMultiProcessorCount, dev)) {
     // printf("%d SMs on device %d\n", sm_count, dev);
     max_num_inputs =
@@ -2092,14 +2092,14 @@ cudaError_t __host__ gpu_debrotli(gpu_inflate_input_s* inputs,
   scratch_size = min(scratch_size, (size_t)0xffffffffu);
   fb_heap_size = (uint32_t)((scratch_size - sizeof(brotli_dictionary_s)) & ~0xf);
 
-  CUDF_CUDA_TRY(cudaMemsetAsync(scratch_u8, 0, 2 * sizeof(uint32_t), stream.value()));
+  CUDA_TRY(cudaMemsetAsync(scratch_u8, 0, 2 * sizeof(uint32_t), stream.value()));
   // NOTE: The 128KB dictionary copy can have a relatively large overhead since source isn't
   // page-locked
-  CUDF_CUDA_TRY(cudaMemcpyAsync(scratch_u8 + fb_heap_size,
-                                get_brotli_dictionary(),
-                                sizeof(brotli_dictionary_s),
-                                cudaMemcpyHostToDevice,
-                                stream.value()));
+  CUDA_TRY(cudaMemcpyAsync(scratch_u8 + fb_heap_size,
+                           get_brotli_dictionary(),
+                           sizeof(brotli_dictionary_s),
+                           cudaMemcpyHostToDevice,
+                           stream.value()));
   gpu_debrotli_kernel<<>>(
     inputs, outputs, scratch_u8, fb_heap_size, count32);
 #if DUMP_FB_HEAP
@@ -2107,7 +2107,7 @@ cudaError_t __host__ gpu_debrotli(gpu_inflate_input_s* inputs,
   uint32_t cur = 0;
   printf("heap dump (%d bytes)\n", fb_heap_size);
   while (cur < fb_heap_size && !(cur & 3)) {
-    CUDF_CUDA_TRY(cudaMemcpyAsync(
+    CUDA_TRY(cudaMemcpyAsync(
       &dump[0], scratch_u8 + cur, 2 * sizeof(uint32_t), cudaMemcpyDeviceToHost, stream.value()));
     stream.synchronize();
     printf("@%d: next = %d, size = %d\n", cur, dump[0], dump[1]);
diff --git a/cpp/src/io/csv/reader_impl.cu b/cpp/src/io/csv/reader_impl.cu
index cd070d28f38..ae9738164f3 100644
--- a/cpp/src/io/csv/reader_impl.cu
+++ b/cpp/src/io/csv/reader_impl.cu
@@ -260,11 +260,11 @@ std::pair<rmm::device_uvector<char>, selected_rows_offsets> load_data_and_gather
   auto const previous_data_size = d_data.size();
   d_data.resize(target_pos - buffer_pos, stream);
-  CUDF_CUDA_TRY(cudaMemcpyAsync(d_data.begin() + previous_data_size,
-                                data.begin() + buffer_pos + previous_data_size,
-                                target_pos - buffer_pos - previous_data_size,
-                                cudaMemcpyDefault,
-                                stream.value()));
+  CUDA_TRY(cudaMemcpyAsync(d_data.begin() + previous_data_size,
+                           data.begin() + buffer_pos + previous_data_size,
+                           target_pos - buffer_pos - previous_data_size,
+                           cudaMemcpyDefault,
+                           stream.value()));
 
   // Pass 1: Count the potential number of rows in each character block for each
   // possible parser state at the beginning of the block.
@@ -280,11 +280,11 @@ std::pair<rmm::device_uvector<char>, selected_rows_offsets> load_data_and_gather
     range_end,
     skip_rows,
     stream);
-  CUDF_CUDA_TRY(cudaMemcpyAsync(row_ctx.host_ptr(),
-                                row_ctx.device_ptr(),
-                                num_blocks * sizeof(uint64_t),
-                                cudaMemcpyDeviceToHost,
-                                stream.value()));
+  CUDA_TRY(cudaMemcpyAsync(row_ctx.host_ptr(),
+                           row_ctx.device_ptr(),
+                           num_blocks * sizeof(uint64_t),
+                           cudaMemcpyDeviceToHost,
+                           stream.value()));
   stream.synchronize();
 
   // Sum up the rows in each character block, selecting the row count that
@@ -300,11 +300,11 @@ std::pair<rmm::device_uvector<char>, selected_rows_offsets> load_data_and_gather
       // At least one row in range in this batch
       all_row_offsets.resize(total_rows - skip_rows, stream);
 
-      CUDF_CUDA_TRY(cudaMemcpyAsync(row_ctx.device_ptr(),
-                                    row_ctx.host_ptr(),
-                                    num_blocks * sizeof(uint64_t),
-                                    cudaMemcpyHostToDevice,
-                                    stream.value()));
+      CUDA_TRY(cudaMemcpyAsync(row_ctx.device_ptr(),
+                               row_ctx.host_ptr(),
+                               num_blocks * sizeof(uint64_t),
+                               cudaMemcpyHostToDevice,
+                               stream.value()));
 
       // Pass 2: Output row offsets
       cudf::io::csv::gpu::gather_row_offsets(parse_opts.view(),
@@ -321,11 +321,11 @@ std::pair<rmm::device_uvector<char>, selected_rows_offsets> load_data_and_gather
         stream);
       // With byte range, we want to keep only one row out of the specified range
       if (range_end < data.size()) {
-        CUDF_CUDA_TRY(cudaMemcpyAsync(row_ctx.host_ptr(),
-                                      row_ctx.device_ptr(),
-                                      num_blocks * sizeof(uint64_t),
-                                      cudaMemcpyDeviceToHost,
-                                      stream.value()));
+        CUDA_TRY(cudaMemcpyAsync(row_ctx.host_ptr(),
+                                 row_ctx.device_ptr(),
+                                 num_blocks * sizeof(uint64_t),
+                                 cudaMemcpyDeviceToHost,
+                                 stream.value()));
         stream.synchronize();
 
         size_t rows_out_of_range = 0;
@@ -370,11 +370,11 @@ std::pair<rmm::device_uvector<char>, selected_rows_offsets> load_data_and_gather
   // Remove header rows and extract header
   const size_t header_row_index = std::max(header_rows, 1) - 1;
   if (header_row_index + 1 < row_offsets.size()) {
-    CUDF_CUDA_TRY(cudaMemcpyAsync(row_ctx.host_ptr(),
-                                  row_offsets.data() + header_row_index,
-                                  2 * sizeof(uint64_t),
-                                  cudaMemcpyDeviceToHost,
-                                  stream.value()));
+    CUDA_TRY(cudaMemcpyAsync(row_ctx.host_ptr(),
+                             row_offsets.data() + header_row_index,
+                             2 * sizeof(uint64_t),
+                             cudaMemcpyDeviceToHost,
+                             stream.value()));
     stream.synchronize();
 
     const auto header_start = buffer_pos + row_ctx[0];
diff --git a/cpp/src/io/csv/writer_impl.cu b/cpp/src/io/csv/writer_impl.cu
index 2aa93ae4d0f..cb2197cf755 100644
--- a/cpp/src/io/csv/writer_impl.cu
+++ b/cpp/src/io/csv/writer_impl.cu
@@ -378,11 +378,11 @@ void write_chunked(data_sink* out_sink,
   } else {
     // copy the bytes to host to write them out
     thrust::host_vector<char> h_bytes(total_num_bytes);
-    CUDF_CUDA_TRY(cudaMemcpyAsync(h_bytes.data(),
-                                  ptr_all_bytes,
-                                  total_num_bytes * sizeof(char),
-                                  cudaMemcpyDeviceToHost,
-                                  stream.value()));
+    CUDA_TRY(cudaMemcpyAsync(h_bytes.data(),
+                             ptr_all_bytes,
+                             total_num_bytes * sizeof(char),
+                             cudaMemcpyDeviceToHost,
+                             stream.value()));
     stream.synchronize();
 
     out_sink->host_write(h_bytes.data(), total_num_bytes);
diff --git a/cpp/src/io/json/json_gpu.cu b/cpp/src/io/json/json_gpu.cu
index 56a00191ae4..d26831b9112 100644
--- a/cpp/src/io/json/json_gpu.cu
+++ b/cpp/src/io/json/json_gpu.cu
@@ -689,7 +689,7 @@ void convert_json_to_columns(parse_options_view const& opts,
 {
   int block_size;
   int min_grid_size;
-  CUDF_CUDA_TRY(cudaOccupancyMaxPotentialBlockSize(
+  CUDA_TRY(cudaOccupancyMaxPotentialBlockSize(
     &min_grid_size, &block_size, convert_data_to_columns_kernel));
 
   const int grid_size = (row_offsets.size() + block_size - 1) / block_size;
convert_json_to_columns(parse_options_view const& opts, valid_fields, num_valid_fields); - CUDF_CHECK_CUDA(stream.value()); + CUDA_TRY(cudaGetLastError()); } /** @@ -721,7 +721,7 @@ std::vector detect_data_types( { int block_size; int min_grid_size; - CUDF_CUDA_TRY( + CUDA_TRY( cudaOccupancyMaxPotentialBlockSize(&min_grid_size, &block_size, detect_data_types_kernel)); auto d_column_infos = [&]() { @@ -763,7 +763,7 @@ void collect_keys_info(parse_options_view const& options, { int block_size; int min_grid_size; - CUDF_CUDA_TRY( + CUDA_TRY( cudaOccupancyMaxPotentialBlockSize(&min_grid_size, &block_size, collect_keys_info_kernel)); // Calculate actual block count to use based on records count @@ -772,7 +772,7 @@ void collect_keys_info(parse_options_view const& options, collect_keys_info_kernel<<>>( options, data, row_offsets, keys_cnt, keys_info); - CUDF_CHECK_CUDA(stream.value()); + CUDA_TRY(cudaGetLastError()); } } // namespace gpu diff --git a/cpp/src/io/json/reader_impl.cu b/cpp/src/io/json/reader_impl.cu index 20eeec267b1..5ca947f3ee5 100644 --- a/cpp/src/io/json/reader_impl.cu +++ b/cpp/src/io/json/reader_impl.cu @@ -274,7 +274,7 @@ rmm::device_uvector find_record_starts(json_reader_options const& read // Manually adding an extra row to account for the first row in the file if (reader_opts.get_byte_range_offset() == 0) { find_result_ptr++; - CUDF_CUDA_TRY(cudaMemsetAsync(rec_starts.data(), 0ull, sizeof(uint64_t), stream.value())); + CUDA_TRY(cudaMemsetAsync(rec_starts.data(), 0ull, sizeof(uint64_t), stream.value())); } std::vector chars_to_find{'\n'}; @@ -356,18 +356,18 @@ std::pair, col_map_ptr_type> get_column_names_and_map( uint64_t first_row_len = d_data.size(); if (rec_starts.size() > 1) { // Set first_row_len to the offset of the second row, if it exists - CUDF_CUDA_TRY(cudaMemcpyAsync(&first_row_len, - rec_starts.data() + 1, - sizeof(uint64_t), - cudaMemcpyDeviceToHost, - stream.value())); + CUDA_TRY(cudaMemcpyAsync(&first_row_len, + rec_starts.data() + 1, + sizeof(uint64_t), + cudaMemcpyDeviceToHost, + stream.value())); } std::vector first_row(first_row_len); - CUDF_CUDA_TRY(cudaMemcpyAsync(first_row.data(), - d_data.data(), - first_row_len * sizeof(char), - cudaMemcpyDeviceToHost, - stream.value())); + CUDA_TRY(cudaMemcpyAsync(first_row.data(), + d_data.data(), + first_row_len * sizeof(char), + cudaMemcpyDeviceToHost, + stream.value())); stream.synchronize(); // Determine the row format between: diff --git a/cpp/src/io/orc/reader_impl.cu b/cpp/src/io/orc/reader_impl.cu index 83c23774362..059df283c94 100644 --- a/cpp/src/io/orc/reader_impl.cu +++ b/cpp/src/io/orc/reader_impl.cu @@ -431,7 +431,7 @@ rmm::device_buffer reader::impl::decompress_stripe_data( device_span inflate_out_view(inflate_out.data(), num_compressed_blocks); switch (decompressor->GetKind()) { case orc::ZLIB: - CUDF_CUDA_TRY( + CUDA_TRY( gpuinflate(inflate_in.data(), inflate_out.data(), num_compressed_blocks, 0, stream)); break; case orc::SNAPPY: @@ -440,7 +440,7 @@ rmm::device_buffer reader::impl::decompress_stripe_data( num_compressed_blocks}; snappy_decompress(inflate_in_view, inflate_out_view, max_uncomp_block_size, stream); } else { - CUDF_CUDA_TRY( + CUDA_TRY( gpu_unsnap(inflate_in.data(), inflate_out.data(), num_compressed_blocks, stream)); } break; @@ -449,7 +449,7 @@ rmm::device_buffer reader::impl::decompress_stripe_data( decompress_check(inflate_out_view, any_block_failure.device_ptr(), stream); } if (num_uncompressed_blocks > 0) { - CUDF_CUDA_TRY(gpu_copy_uncompressed_blocks( + 
diff --git a/cpp/src/io/orc/reader_impl.cu b/cpp/src/io/orc/reader_impl.cu
index 83c23774362..059df283c94 100644
--- a/cpp/src/io/orc/reader_impl.cu
+++ b/cpp/src/io/orc/reader_impl.cu
@@ -431,7 +431,7 @@ rmm::device_buffer reader::impl::decompress_stripe_data(
   device_span<gpu_inflate_status_s> inflate_out_view(inflate_out.data(), num_compressed_blocks);
   switch (decompressor->GetKind()) {
     case orc::ZLIB:
-      CUDF_CUDA_TRY(
+      CUDA_TRY(
         gpuinflate(inflate_in.data(), inflate_out.data(), num_compressed_blocks, 0, stream));
       break;
     case orc::SNAPPY:
      if (nvcomp_integration::is_stable_enabled()) {
        device_span<gpu_inflate_input_s> inflate_in_view{inflate_in.data(),
                                                         num_compressed_blocks};
        snappy_decompress(inflate_in_view, inflate_out_view, max_uncomp_block_size, stream);
      } else {
-        CUDF_CUDA_TRY(
+        CUDA_TRY(
          gpu_unsnap(inflate_in.data(), inflate_out.data(), num_compressed_blocks, stream));
      }
      break;
@@ -449,7 +449,7 @@ rmm::device_buffer reader::impl::decompress_stripe_data(
     decompress_check(inflate_out_view, any_block_failure.device_ptr(), stream);
   }
   if (num_uncompressed_blocks > 0) {
-    CUDF_CUDA_TRY(gpu_copy_uncompressed_blocks(
+    CUDA_TRY(gpu_copy_uncompressed_blocks(
       inflate_in.data() + num_compressed_blocks, num_uncompressed_blocks, stream));
   }
   gpu::PostDecompressionReassemble(compinfo.device_ptr(), compinfo.size(), stream);
@@ -1129,7 +1129,7 @@ table_with_metadata reader::impl::read(size_type skip_rows,
             _metadata.per_file_metadata[stripe_source_mapping.source_idx].source->host_read(
               offset, len);
           CUDF_EXPECTS(buffer->size() == len, "Unexpected discrepancy in bytes read.");
-          CUDF_CUDA_TRY(cudaMemcpyAsync(
+          CUDA_TRY(cudaMemcpyAsync(
             d_dst, buffer->data(), len, cudaMemcpyHostToDevice, stream.value()));
           stream.synchronize();
         }
diff --git a/cpp/src/io/orc/writer_impl.cu b/cpp/src/io/orc/writer_impl.cu
index d0c1cea97a8..30385d395f1 100644
--- a/cpp/src/io/orc/writer_impl.cu
+++ b/cpp/src/io/orc/writer_impl.cu
@@ -706,11 +706,11 @@ std::vector<std::vector<rowgroup_rows>> calculate_aligned_rowgroup_bounds(
   auto aligned_rgs = hostdevice_2dvector<rowgroup_rows>(
     segmentation.num_rowgroups(), orc_table.num_columns(), stream);
-  CUDF_CUDA_TRY(cudaMemcpyAsync(aligned_rgs.base_device_ptr(),
-                                segmentation.rowgroups.base_device_ptr(),
-                                aligned_rgs.count() * sizeof(rowgroup_rows),
-                                cudaMemcpyDefault,
-                                stream.value()));
+  CUDA_TRY(cudaMemcpyAsync(aligned_rgs.base_device_ptr(),
+                           segmentation.rowgroups.base_device_ptr(),
+                           aligned_rgs.count() * sizeof(rowgroup_rows),
+                           cudaMemcpyDefault,
+                           stream.value()));
   auto const d_stripes = cudf::detail::make_device_uvector_async(segmentation.stripes, stream);
 
   // One thread per column, per stripe
@@ -1330,7 +1330,7 @@ std::future<void> writer::impl::write_data_stream(gpu::StripeStream const& strm_
   if (out_sink_->is_device_write_preferred(length)) {
     return out_sink_->device_write_async(stream_in, length, stream);
   } else {
-    CUDF_CUDA_TRY(
+    CUDA_TRY(
       cudaMemcpyAsync(stream_out, stream_in, length, cudaMemcpyDeviceToHost, stream.value()));
     stream.synchronize();
@@ -1419,10 +1419,10 @@ void pushdown_lists_null_mask(orc_column_view const& col,
                               rmm::cuda_stream_view stream)
 {
   // Set all bits - correct unless there's a mismatch between offsets and null mask
-  CUDF_CUDA_TRY(cudaMemsetAsync(static_cast<void*>(out_mask.data()),
-                                255,
-                                out_mask.size() * sizeof(bitmask_type),
-                                stream.value()));
+  CUDA_TRY(cudaMemsetAsync(static_cast<void*>(out_mask.data()),
+                           255,
+                           out_mask.size() * sizeof(bitmask_type),
+                           stream.value()));
 
   // Reset bits where a null list element has rows in the child column
   thrust::for_each_n(
@@ -1946,7 +1946,7 @@ void writer::impl::write(table_view const& table)
   } else {
     return pinned_buffer{[](size_t size) {
                            uint8_t* ptr = nullptr;
-                           CUDF_CUDA_TRY(cudaMallocHost(&ptr, size));
+                           CUDA_TRY(cudaMallocHost(&ptr, size));
                            return ptr;
                          }(max_stream_size),
                          cudaFreeHost};
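The `pinned_buffer` lambda above pairs `cudaMallocHost` with `cudaFreeHost` through a smart pointer. A minimal standalone sketch of the same idea (`make_pinned` is a hypothetical helper):

```cpp
#include <cuda_runtime.h>
#include <memory>

// Page-locked host memory makes device-to-host copies truly asynchronous;
// a unique_ptr with cudaFreeHost as its deleter ties the allocation to scope.
using pinned_buffer = std::unique_ptr<uint8_t, decltype(&cudaFreeHost)>;

pinned_buffer make_pinned(size_t size)
{
  uint8_t* ptr = nullptr;
  cudaMallocHost(&ptr, size);
  return pinned_buffer{ptr, cudaFreeHost};
}
```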
diff --git a/cpp/src/io/parquet/reader_impl.cu b/cpp/src/io/parquet/reader_impl.cu
index 46b3206f731..df940bb15d3 100644
--- a/cpp/src/io/parquet/reader_impl.cu
+++ b/cpp/src/io/parquet/reader_impl.cu
@@ -1239,24 +1239,24 @@ rmm::device_buffer reader::impl::decompress_page_data(
       argc++;
     });
 
-    CUDF_CUDA_TRY(cudaMemcpyAsync(inflate_in.device_ptr(start_pos),
-                                  inflate_in.host_ptr(start_pos),
-                                  sizeof(decltype(inflate_in)::value_type) * (argc - start_pos),
-                                  cudaMemcpyHostToDevice,
-                                  stream.value()));
-    CUDF_CUDA_TRY(cudaMemcpyAsync(inflate_out.device_ptr(start_pos),
-                                  inflate_out.host_ptr(start_pos),
-                                  sizeof(decltype(inflate_out)::value_type) * (argc - start_pos),
-                                  cudaMemcpyHostToDevice,
-                                  stream.value()));
+    CUDA_TRY(cudaMemcpyAsync(inflate_in.device_ptr(start_pos),
+                             inflate_in.host_ptr(start_pos),
+                             sizeof(decltype(inflate_in)::value_type) * (argc - start_pos),
+                             cudaMemcpyHostToDevice,
+                             stream.value()));
+    CUDA_TRY(cudaMemcpyAsync(inflate_out.device_ptr(start_pos),
+                             inflate_out.host_ptr(start_pos),
+                             sizeof(decltype(inflate_out)::value_type) * (argc - start_pos),
+                             cudaMemcpyHostToDevice,
+                             stream.value()));
 
     switch (codec.compression_type) {
       case parquet::GZIP:
-        CUDF_CUDA_TRY(gpuinflate(inflate_in.device_ptr(start_pos),
-                                 inflate_out.device_ptr(start_pos),
-                                 argc - start_pos,
-                                 1,
-                                 stream))
+        CUDA_TRY(gpuinflate(inflate_in.device_ptr(start_pos),
+                            inflate_out.device_ptr(start_pos),
+                            argc - start_pos,
+                            1,
+                            stream))
         break;
       case parquet::SNAPPY:
         if (nvcomp_integration::is_stable_enabled()) {
@@ -1265,27 +1265,27 @@ rmm::device_buffer reader::impl::decompress_page_data(
             codec.max_decompressed_size,
             stream);
         } else {
-          CUDF_CUDA_TRY(gpu_unsnap(inflate_in.device_ptr(start_pos),
-                                   inflate_out.device_ptr(start_pos),
-                                   argc - start_pos,
-                                   stream));
+          CUDA_TRY(gpu_unsnap(inflate_in.device_ptr(start_pos),
+                              inflate_out.device_ptr(start_pos),
+                              argc - start_pos,
+                              stream));
         }
         break;
       case parquet::BROTLI:
-        CUDF_CUDA_TRY(gpu_debrotli(inflate_in.device_ptr(start_pos),
-                                   inflate_out.device_ptr(start_pos),
-                                   debrotli_scratch.data(),
-                                   debrotli_scratch.size(),
-                                   argc - start_pos,
-                                   stream));
+        CUDA_TRY(gpu_debrotli(inflate_in.device_ptr(start_pos),
+                              inflate_out.device_ptr(start_pos),
+                              debrotli_scratch.data(),
+                              debrotli_scratch.size(),
+                              argc - start_pos,
+                              stream));
         break;
       default: CUDF_FAIL("Unexpected decompression dispatch"); break;
     }
-    CUDF_CUDA_TRY(cudaMemcpyAsync(inflate_out.host_ptr(start_pos),
-                                  inflate_out.device_ptr(start_pos),
-                                  sizeof(decltype(inflate_out)::value_type) * (argc - start_pos),
-                                  cudaMemcpyDeviceToHost,
-                                  stream.value()));
+    CUDA_TRY(cudaMemcpyAsync(inflate_out.host_ptr(start_pos),
+                             inflate_out.device_ptr(start_pos),
+                             sizeof(decltype(inflate_out)::value_type) * (argc - start_pos),
+                             cudaMemcpyDeviceToHost,
+                             stream.value()));
   }
 }
diff --git a/cpp/src/io/parquet/writer_impl.cu b/cpp/src/io/parquet/writer_impl.cu
index cb1acb4d9ec..a29164ba051 100644
--- a/cpp/src/io/parquet/writer_impl.cu
+++ b/cpp/src/io/parquet/writer_impl.cu
@@ -1076,7 +1076,7 @@ void writer::impl::encode_pages(hostdevice_2dvector<gpu::EncColumnChunk>& chunks
       if (nvcomp_integration::is_stable_enabled()) {
         snappy_compress(comp_in, comp_stat, max_page_uncomp_data_size, stream);
       } else {
-        CUDF_CUDA_TRY(gpu_snap(comp_in.data(), comp_stat.data(), pages_in_batch, stream));
+        CUDA_TRY(gpu_snap(comp_in.data(), comp_stat.data(), pages_in_batch, stream));
       }
       break;
     default: break;
@@ -1089,11 +1089,11 @@ void writer::impl::encode_pages(hostdevice_2dvector<gpu::EncColumnChunk>& chunks
   GatherPages(d_chunks_in_batch.flat_view(), pages, stream);
 
   auto h_chunks_in_batch = chunks.host_view().subspan(first_rowgroup, rowgroups_in_batch);
-  CUDF_CUDA_TRY(cudaMemcpyAsync(h_chunks_in_batch.data(),
-                                d_chunks_in_batch.data(),
-                                d_chunks_in_batch.flat_view().size_bytes(),
-                                cudaMemcpyDeviceToHost,
-                                stream.value()));
+  CUDA_TRY(cudaMemcpyAsync(h_chunks_in_batch.data(),
+                           d_chunks_in_batch.data(),
+                           d_chunks_in_batch.flat_view().size_bytes(),
+                           cudaMemcpyDeviceToHost,
+                           stream.value()));
 
   stream.synchronize();
 }
@@ -1532,28 +1532,28 @@ void writer::impl::write(table_view const& table, std::vector co
           // we still need to do a (much smaller) memcpy for the statistics.
           if (ck.ck_stat_size != 0) {
             column_chunk_meta.statistics_blob.resize(ck.ck_stat_size);
-            CUDF_CUDA_TRY(cudaMemcpyAsync(column_chunk_meta.statistics_blob.data(),
-                                          dev_bfr,
-                                          ck.ck_stat_size,
-                                          cudaMemcpyDeviceToHost,
-                                          stream.value()));
+            CUDA_TRY(cudaMemcpyAsync(column_chunk_meta.statistics_blob.data(),
+                                     dev_bfr,
+                                     ck.ck_stat_size,
+                                     cudaMemcpyDeviceToHost,
+                                     stream.value()));
             stream.synchronize();
           }
         } else {
           if (!host_bfr) {
             host_bfr = pinned_buffer{[](size_t size) {
                                        uint8_t* ptr = nullptr;
-                                       CUDF_CUDA_TRY(cudaMallocHost(&ptr, size));
+                                       CUDA_TRY(cudaMallocHost(&ptr, size));
                                        return ptr;
                                      }(max_chunk_bfr_size),
                                      cudaFreeHost};
           }
           // copy the full data
-          CUDF_CUDA_TRY(cudaMemcpyAsync(host_bfr.get(),
-                                        dev_bfr,
-                                        ck.ck_stat_size + ck.compressed_size,
-                                        cudaMemcpyDeviceToHost,
-                                        stream.value()));
+          CUDA_TRY(cudaMemcpyAsync(host_bfr.get(),
+                                   dev_bfr,
+                                   ck.ck_stat_size + ck.compressed_size,
+                                   cudaMemcpyDeviceToHost,
+                                   stream.value()));
           stream.synchronize();
           out_sink_[p]->host_write(host_bfr.get() + ck.ck_stat_size, ck.compressed_size);
           if (ck.ck_stat_size != 0) {
diff --git a/cpp/src/io/utilities/column_buffer.hpp b/cpp/src/io/utilities/column_buffer.hpp
index fd510466477..34d8307b024 100644
--- a/cpp/src/io/utilities/column_buffer.hpp
+++ b/cpp/src/io/utilities/column_buffer.hpp
@@ -55,7 +55,7 @@ inline rmm::device_buffer create_data(data_type type,
   std::size_t data_size = size_of(type) * size;
 
   rmm::device_buffer data(data_size, stream, mr);
-  CUDF_CUDA_TRY(cudaMemsetAsync(data.data(), 0, data_size, stream.value()));
+  CUDA_TRY(cudaMemsetAsync(data.data(), 0, data_size, stream.value()));
 
   return data;
 }
diff --git a/cpp/src/io/utilities/hostdevice_vector.hpp b/cpp/src/io/utilities/hostdevice_vector.hpp
index 5c73cf31428..367bbfcbdfa 100644
--- a/cpp/src/io/utilities/hostdevice_vector.hpp
+++ b/cpp/src/io/utilities/hostdevice_vector.hpp
@@ -54,7 +54,7 @@ class hostdevice_vector {
     : num_elements(initial_size), max_elements(max_size)
   {
     if (max_elements != 0) {
-      CUDF_CUDA_TRY(cudaMallocHost(&h_data, sizeof(T) * max_elements));
+      CUDA_TRY(cudaMallocHost(&h_data, sizeof(T) * max_elements));
       d_data.resize(sizeof(T) * max_elements, stream);
     }
   }
@@ -101,14 +101,14 @@ class hostdevice_vector {
   void host_to_device(rmm::cuda_stream_view stream, bool synchronize = false)
   {
-    CUDF_CUDA_TRY(cudaMemcpyAsync(
+    CUDA_TRY(cudaMemcpyAsync(
       d_data.data(), h_data, memory_size(), cudaMemcpyHostToDevice, stream.value()));
     if (synchronize) { stream.synchronize(); }
   }
 
   void device_to_host(rmm::cuda_stream_view stream, bool synchronize = false)
   {
-    CUDF_CUDA_TRY(cudaMemcpyAsync(
+    CUDA_TRY(cudaMemcpyAsync(
       h_data, d_data.data(), memory_size(), cudaMemcpyDeviceToHost, stream.value()));
     if (synchronize) { stream.synchronize(); }
   }
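`hostdevice_vector` above is essentially a pinned host mirror paired with a device buffer. A stripped-down sketch of that idea, assuming RMM is available (`host_device_pair` is a hypothetical name, not the cuDF class):

```cpp
#include <cuda_runtime.h>
#include <rmm/cuda_stream_view.hpp>
#include <rmm/device_buffer.hpp>

template <typename T>
struct host_device_pair {
  T* h_data{nullptr};          // pinned host mirror
  rmm::device_buffer d_data;   // device storage
  size_t count{0};

  host_device_pair(size_t n, rmm::cuda_stream_view stream)
    : d_data(n * sizeof(T), stream), count(n)
  {
    cudaMallocHost(&h_data, n * sizeof(T));  // pinned, so the copies below can be async
  }
  ~host_device_pair() { cudaFreeHost(h_data); }

  void to_device(rmm::cuda_stream_view stream)
  {
    cudaMemcpyAsync(d_data.data(), h_data, count * sizeof(T), cudaMemcpyHostToDevice, stream.value());
  }
  void to_host(rmm::cuda_stream_view stream)
  {
    cudaMemcpyAsync(h_data, d_data.data(), count * sizeof(T), cudaMemcpyDeviceToHost, stream.value());
    stream.synchronize();  // host reads h_data immediately after
  }
};
```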
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
 #include
 #include
 #include
@@ -122,7 +106,7 @@ cudf::size_type find_all_from_set(device_span data,
 {
   int block_size    = 0;  // suggested thread count to use
   int min_grid_size = 0;  // minimum block count required
-  CUDF_CUDA_TRY(
+  CUDA_TRY(
     cudaOccupancyMaxPotentialBlockSize(&min_grid_size, &block_size, count_and_set_positions));
   const int grid_size = divCeil(data.size(), (size_t)block_size);
 
@@ -147,7 +131,7 @@ cudf::size_type find_all_from_set(host_span data,
   int block_size    = 0;  // suggested thread count to use
   int min_grid_size = 0;  // minimum block count required
-  CUDF_CUDA_TRY(
+  CUDA_TRY(
     cudaOccupancyMaxPotentialBlockSize(&min_grid_size, &block_size, count_and_set_positions));
 
   const size_t chunk_count = divCeil(data.size(), max_chunk_bytes);
@@ -159,7 +143,7 @@ cudf::size_type find_all_from_set(host_span data,
     const int grid_size = divCeil(chunk_bits, block_size);
 
     // Copy chunk to device
-    CUDF_CUDA_TRY(
+    CUDA_TRY(
       cudaMemcpyAsync(d_chunk.data(), h_chunk, chunk_bytes, cudaMemcpyDefault, stream.value()));
 
     for (char key : keys) {
diff --git a/cpp/src/jit/cache.cpp b/cpp/src/jit/cache.cpp
index 8228ff6da1f..159681eaffc 100644
--- a/cpp/src/jit/cache.cpp
+++ b/cpp/src/jit/cache.cpp
@@ -77,9 +77,9 @@ std::filesystem::path get_cache_dir()
     int device;
     int cc_major;
     int cc_minor;
-    CUDF_CUDA_TRY(cudaGetDevice(&device));
-    CUDF_CUDA_TRY(cudaDeviceGetAttribute(&cc_major, cudaDevAttrComputeCapabilityMajor, device));
-    CUDF_CUDA_TRY(cudaDeviceGetAttribute(&cc_minor, cudaDevAttrComputeCapabilityMinor, device));
+    CUDA_TRY(cudaGetDevice(&device));
+    CUDA_TRY(cudaDeviceGetAttribute(&cc_major, cudaDevAttrComputeCapabilityMajor, device));
+    CUDA_TRY(cudaDeviceGetAttribute(&cc_minor, cudaDevAttrComputeCapabilityMinor, device));
     int cc = cc_major * 10 + cc_minor;
 
     kernel_cache_path /= std::to_string(cc);
diff --git a/cpp/src/merge/merge.cu b/cpp/src/merge/merge.cu
index 01a94457b69..043c04b409e 100644
--- a/cpp/src/merge/merge.cu
+++ b/cpp/src/merge/merge.cu
@@ -139,7 +139,7 @@ void materialize_bitmask(column_view const& left_col,
     }
   }
 
-  CUDF_CHECK_CUDA(stream.value());
+  CHECK_CUDA(stream.value());
 }
 
 struct side_index_generator {
@@ -212,7 +212,7 @@ index_vector generate_merged_indices(table_view const& left_table,
                       ineq_op);
   }
 
-  CUDF_CHECK_CUDA(stream.value());
+  CHECK_CUDA(stream.value());
   return merged_indices;
 }
diff --git a/cpp/src/quantiles/quantiles_util.hpp b/cpp/src/quantiles/quantiles_util.hpp
index 280a42d9e20..171b81152ff 100644
--- a/cpp/src/quantiles/quantiles_util.hpp
+++ b/cpp/src/quantiles/quantiles_util.hpp
@@ -29,7 +29,7 @@ CUDF_HOST_DEVICE inline Result get_array_value(T const* devarr, size_type locati
 #if defined(__CUDA_ARCH__)
   result = devarr[location];
 #else
-  CUDF_CUDA_TRY(cudaMemcpy(&result, devarr + location, sizeof(T), cudaMemcpyDeviceToHost));
+  CUDA_TRY(cudaMemcpy(&result, devarr + location, sizeof(T), cudaMemcpyDeviceToHost));
 #endif
   return static_cast(result);
 }
diff --git a/cpp/src/reductions/minmax.cu b/cpp/src/reductions/minmax.cu
index 454a8c9d694..61f728447e8 100644
--- a/cpp/src/reductions/minmax.cu
+++ b/cpp/src/reductions/minmax.cu
@@ -216,7 +216,7 @@ struct minmax_functor {
     // copy the minmax_pair to the host; does not copy the strings
     using OutputType = minmax_pair;
     OutputType host_result;
-    CUDF_CUDA_TRY(cudaMemcpyAsync(
+    CUDA_TRY(cudaMemcpyAsync(
      &host_result, dev_result.data(), sizeof(OutputType), cudaMemcpyDeviceToHost, stream.value()));
     // strings are copied to create the scalars here
     return {std::make_unique(host_result.min_val, true, stream, mr),
@@ -235,7 +235,7 @@ struct minmax_functor {
     // copy the minmax_pair to the host to call get_element
     using OutputType = minmax_pair;
     OutputType host_result;
-    CUDF_CUDA_TRY(cudaMemcpyAsync(
+    CUDA_TRY(cudaMemcpyAsync(
      &host_result, dev_result.data(), sizeof(OutputType), cudaMemcpyDeviceToHost, stream.value()));
     // get the keys for those indexes
     auto const keys = dictionary_column_view(col).keys();
diff --git a/cpp/src/reductions/scan/scan_exclusive.cu b/cpp/src/reductions/scan/scan_exclusive.cu
index 885d7e904b4..3b8cc17c4aa 100644
--- a/cpp/src/reductions/scan/scan_exclusive.cu
+++ b/cpp/src/reductions/scan/scan_exclusive.cu
@@ -67,7 +67,7 @@ struct scan_dispatcher {
     thrust::exclusive_scan(
       rmm::exec_policy(stream), begin, begin + input.size(), output.data(), identity, Op{});
 
-    CUDF_CHECK_CUDA(stream.value());
+    CHECK_CUDA(stream.value());
     return output_column;
   }
diff --git a/cpp/src/reductions/scan/scan_inclusive.cu b/cpp/src/reductions/scan/scan_inclusive.cu
index 5ffdf1f5c56..9d07f340ebf 100644
--- a/cpp/src/reductions/scan/scan_inclusive.cu
+++ b/cpp/src/reductions/scan/scan_inclusive.cu
@@ -120,7 +120,7 @@ struct scan_functor {
     thrust::inclusive_scan(
       rmm::exec_policy(stream), begin, begin + input_view.size(), result.data(), Op{});
 
-    CUDF_CHECK_CUDA(stream.value());
+    CHECK_CUDA(stream.value());
     return output_column;
   }
 };
diff --git a/cpp/src/rolling/rolling_detail.cuh b/cpp/src/rolling/rolling_detail.cuh
index ca07d60f426..d704b18774f 100644
--- a/cpp/src/rolling/rolling_detail.cuh
+++ b/cpp/src/rolling/rolling_detail.cuh
@@ -1281,7 +1281,7 @@ std::unique_ptr rolling_window_udf(column_view const& input,
   output->set_null_count(output->size() - device_valid_count.value(stream));
 
   // check the stream for debugging
-  CUDF_CHECK_CUDA(stream.value());
+  CHECK_CUDA(stream.value());
 
   return output;
 }
diff --git a/cpp/src/scalar/scalar.cpp b/cpp/src/scalar/scalar.cpp
index 19bb60ef1a8..76ec171052a 100644
--- a/cpp/src/scalar/scalar.cpp
+++ b/cpp/src/scalar/scalar.cpp
@@ -114,7 +114,7 @@ std::string string_scalar::to_string(rmm::cuda_stream_view stream) const
 {
   std::string result;
   result.resize(_data.size());
-  CUDF_CUDA_TRY(cudaMemcpyAsync(
+  CUDA_TRY(cudaMemcpyAsync(
     &result[0], _data.data(), _data.size(), cudaMemcpyDeviceToHost, stream.value()));
   stream.synchronize();
   return result;
diff --git a/cpp/src/search/search.cu b/cpp/src/search/search.cu
index 29eddf703df..477666d93ae 100644
--- a/cpp/src/search/search.cu
+++ b/cpp/src/search/search.cu
@@ -97,8 +97,7 @@ std::unique_ptr search_ordered(table_view const& t,
   // Handle empty inputs
   if (t.num_rows() == 0) {
-    CUDF_CUDA_TRY(
-      cudaMemsetAsync(result_out, 0, values.num_rows() * sizeof(size_type), stream.value()));
+    CUDA_TRY(cudaMemsetAsync(result_out, 0, values.num_rows() * sizeof(size_type), stream.value()));
     return result;
   }
diff --git a/cpp/src/strings/combine/join.cu b/cpp/src/strings/combine/join.cu
index 6a90a605ca3..adfd24f1ca2 100644
--- a/cpp/src/strings/combine/join.cu
+++ b/cpp/src/strings/combine/join.cu
@@ -87,11 +87,11 @@ std::unique_ptr join_strings(strings_column_view const& strings,
   auto offsets_view = offsets_column->mutable_view();
   // set the first entry to 0 and the last entry to bytes
   int32_t new_offsets[] = {0, static_cast(bytes)};
-  CUDF_CUDA_TRY(cudaMemcpyAsync(offsets_view.data(),
-                                new_offsets,
-                                sizeof(new_offsets),
-                                cudaMemcpyHostToDevice,
-                                stream.value()));
+  CUDA_TRY(cudaMemcpyAsync(offsets_view.data(),
+                           new_offsets,
+                           sizeof(new_offsets),
+                           cudaMemcpyHostToDevice,
+                           stream.value()));
 
   // build null mask
   // only one entry so it is either all valid or all null
diff --git a/cpp/src/strings/convert/convert_durations.cu b/cpp/src/strings/convert/convert_durations.cu
index 1a423ef8eec..ac3c4df6aeb 100644
--- a/cpp/src/strings/convert/convert_durations.cu
+++ b/cpp/src/strings/convert/convert_durations.cu
@@ -153,11 +153,11 @@ struct format_compiler {
     // create program in device memory
     d_items.resize(items.size(), stream);
-    CUDF_CUDA_TRY(cudaMemcpyAsync(d_items.data(),
-                                  items.data(),
-                                  items.size() * sizeof(items[0]),
-                                  cudaMemcpyHostToDevice,
-                                  stream.value()));
+    CUDA_TRY(cudaMemcpyAsync(d_items.data(),
+                             items.data(),
+                             items.size() * sizeof(items[0]),
+                             cudaMemcpyHostToDevice,
+                             stream.value()));
   }
 
   format_item const* compiled_format_items() { return d_items.data(); }
diff --git a/cpp/src/strings/copying/concatenate.cu b/cpp/src/strings/copying/concatenate.cu
index fedb8d38a08..9fa033e9f9a 100644
--- a/cpp/src/strings/copying/concatenate.cu
+++ b/cpp/src/strings/copying/concatenate.cu
@@ -297,7 +297,7 @@ std::unique_ptr concatenate(host_span columns,
         cudf::detail::get_value(offsets_child, column_size + column_offset, stream) - bytes_offset;
 
-      CUDF_CUDA_TRY(
+      CUDA_TRY(
         cudaMemcpyAsync(d_new_chars, d_chars, bytes, cudaMemcpyDeviceToDevice, stream.value()));
 
       // get ready for the next column
diff --git a/cpp/src/strings/regex/regexec.cu b/cpp/src/strings/regex/regexec.cu
index 3bcf55cf069..b286812226b 100644
--- a/cpp/src/strings/regex/regexec.cu
+++ b/cpp/src/strings/regex/regexec.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2019-2021, NVIDIA CORPORATION. All rights reserved.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -163,7 +163,7 @@ std::unique_ptr> reprog_devic
   }
 
   // copy flat prog to device memory
-  CUDF_CUDA_TRY(cudaMemcpyAsync(
+  CUDA_TRY(cudaMemcpyAsync(
     d_buffer->data(), h_buffer.data(), memsize, cudaMemcpyHostToDevice, stream.value()));
 
   //
   auto deleter = [d_buffer, d_relists](reprog_device* t) {
diff --git a/cpp/src/strings/repeat_strings.cu b/cpp/src/strings/repeat_strings.cu
index d496b46bc36..c0673a5e2b5 100644
--- a/cpp/src/strings/repeat_strings.cu
+++ b/cpp/src/strings/repeat_strings.cu
@@ -81,10 +81,10 @@ auto generate_empty_output(strings_column_view const& input,
   auto offsets_column = make_numeric_column(
     data_type{type_to_id()}, strings_count + 1, mask_state::UNALLOCATED, stream, mr);
-  CUDF_CUDA_TRY(cudaMemsetAsync(offsets_column->mutable_view().template data(),
-                                0,
-                                offsets_column->size() * sizeof(offset_type),
-                                stream.value()));
+  CUDA_TRY(cudaMemsetAsync(offsets_column->mutable_view().template data(),
+                           0,
+                           offsets_column->size() * sizeof(offset_type),
+                           stream.value()));
 
   return make_strings_column(strings_count,
                              std::move(offsets_column),
@@ -264,7 +264,7 @@ auto make_strings_children(Func fn,
   } else {
     // Compute the offsets values from the provided output string sizes.
     auto const string_sizes = output_strings_sizes.value();
-    CUDF_CUDA_TRY(cudaMemsetAsync(d_offsets, 0, sizeof(offset_type), stream.value()));
+    CUDA_TRY(cudaMemsetAsync(d_offsets, 0, sizeof(offset_type), stream.value()));
     thrust::inclusive_scan(rmm::exec_policy(stream),
                            string_sizes.template begin(),
                            string_sizes.template end(),
diff --git a/cpp/src/strings/utilities.cu b/cpp/src/strings/utilities.cu
index d7cc72fdfff..825f09c66e6 100644
--- a/cpp/src/strings/utilities.cu
+++ b/cpp/src/strings/utilities.cu
@@ -130,9 +130,9 @@ const character_flags_table_type* get_character_flags_table()
 {
   return d_character_codepoint_flags.find_or_initialize([&](void) {
     character_flags_table_type* table = nullptr;
-    CUDF_CUDA_TRY(cudaMemcpyToSymbol(
+    CUDA_TRY(cudaMemcpyToSymbol(
       character_codepoint_flags, g_character_codepoint_flags, sizeof(g_character_codepoint_flags)));
-    CUDF_CUDA_TRY(cudaGetSymbolAddress((void**)&table, character_codepoint_flags));
+    CUDA_TRY(cudaGetSymbolAddress((void**)&table, character_codepoint_flags));
     return table;
   });
 }
@@ -144,9 +144,9 @@ const character_cases_table_type* get_character_cases_table()
 {
   return d_character_cases_table.find_or_initialize([&](void) {
     character_cases_table_type* table = nullptr;
-    CUDF_CUDA_TRY(cudaMemcpyToSymbol(
+    CUDA_TRY(cudaMemcpyToSymbol(
       character_cases_table, g_character_cases_table, sizeof(g_character_cases_table)));
-    CUDF_CUDA_TRY(cudaGetSymbolAddress((void**)&table, character_cases_table));
+    CUDA_TRY(cudaGetSymbolAddress((void**)&table, character_cases_table));
     return table;
   });
 }
@@ -158,9 +158,9 @@ const special_case_mapping* get_special_case_mapping_table()
 {
   return d_special_case_mappings.find_or_initialize([&](void) {
     special_case_mapping* table = nullptr;
-    CUDF_CUDA_TRY(cudaMemcpyToSymbol(
+    CUDA_TRY(cudaMemcpyToSymbol(
       character_special_case_mappings, g_special_case_mappings, sizeof(g_special_case_mappings)));
-    CUDF_CUDA_TRY(cudaGetSymbolAddress((void**)&table, character_special_case_mappings));
+    CUDA_TRY(cudaGetSymbolAddress((void**)&table, character_special_case_mappings));
     return table;
   });
 }
diff --git a/cpp/src/text/edit_distance.cu b/cpp/src/text/edit_distance.cu
index b69d735f612..6ec364cc048 100644
--- a/cpp/src/text/edit_distance.cu
+++ b/cpp/src/text/edit_distance.cu
@@ -231,7 +231,7 @@ std::unique_ptr edit_distance_matrix(cudf::strings_column_view con
   cudf::size_type n_upper = (strings_count * (strings_count - 1)) / 2;
   rmm::device_uvector offsets(n_upper, stream);
   auto d_offsets = offsets.data();
-  CUDF_CUDA_TRY(cudaMemsetAsync(d_offsets, 0, n_upper * sizeof(cudf::size_type), stream.value()));
+  CUDA_TRY(cudaMemsetAsync(d_offsets, 0, n_upper * sizeof(cudf::size_type), stream.value()));
   thrust::for_each_n(
     rmm::exec_policy(stream),
     thrust::make_counting_iterator(0),
diff --git a/cpp/src/text/subword/load_hash_file.cu b/cpp/src/text/subword/load_hash_file.cu
index 00094f2de71..9ab769f9edd 100644
--- a/cpp/src/text/subword/load_hash_file.cu
+++ b/cpp/src/text/subword/load_hash_file.cu
@@ -52,12 +52,12 @@ rmm::device_uvector get_codepoint_metadata(rmm::cuda_st
                    table + cp_section1_end,
                    table + codepoint_metadata_size,
                    codepoint_metadata_default_value);
-  CUDF_CUDA_TRY(cudaMemcpyAsync(table,
-                                codepoint_metadata,
-                                cp_section1_end * sizeof(codepoint_metadata[0]),  // 1st section
-                                cudaMemcpyHostToDevice,
-                                stream.value()));
-  CUDF_CUDA_TRY(cudaMemcpyAsync(
+  CUDA_TRY(cudaMemcpyAsync(table,
+                           codepoint_metadata,
+                           cp_section1_end * sizeof(codepoint_metadata[0]),  // 1st section
+                           cudaMemcpyHostToDevice,
+                           stream.value()));
+  CUDA_TRY(cudaMemcpyAsync(
     table + cp_section2_begin,
     cp_metadata_917505_917999,
     (cp_section2_end - cp_section2_begin + 1) * sizeof(codepoint_metadata[0]),  // 2nd section
@@ -80,24 +80,24 @@ rmm::device_uvector get_aux_codepoint_data(rmm::cuda_st
                    table + aux_section1_end,
                    table + aux_codepoint_data_size,
                    aux_codepoint_default_value);
-  CUDF_CUDA_TRY(cudaMemcpyAsync(table,
-                                aux_codepoint_data,
-                                aux_section1_end * sizeof(aux_codepoint_data[0]),  // 1st section
-                                cudaMemcpyHostToDevice,
-                                stream.value()));
-  CUDF_CUDA_TRY(cudaMemcpyAsync(
+  CUDA_TRY(cudaMemcpyAsync(table,
+                           aux_codepoint_data,
+                           aux_section1_end * sizeof(aux_codepoint_data[0]),  // 1st section
+                           cudaMemcpyHostToDevice,
+                           stream.value()));
+  CUDA_TRY(cudaMemcpyAsync(
     table + aux_section2_begin,
    aux_cp_data_44032_55203,
     (aux_section2_end - aux_section2_begin + 1) * sizeof(aux_codepoint_data[0]),  // 2nd section
     cudaMemcpyHostToDevice,
     stream.value()));
-  CUDF_CUDA_TRY(cudaMemcpyAsync(
+  CUDA_TRY(cudaMemcpyAsync(
     table + aux_section3_begin,
     aux_cp_data_70475_71099,
     (aux_section3_end - aux_section3_begin + 1) * sizeof(aux_codepoint_data[0]),  // 3rd section
     cudaMemcpyHostToDevice,
     stream.value()));
-  CUDF_CUDA_TRY(cudaMemcpyAsync(
+  CUDA_TRY(cudaMemcpyAsync(
     table + aux_section4_begin,
     aux_cp_data_119134_119232,
     (aux_section4_end - aux_section4_begin + 1) * sizeof(aux_codepoint_data[0]),  // 4th section
@@ -236,33 +236,33 @@ std::unique_ptr load_vocabulary_file(
                                             cudf::mask_state::UNALLOCATED,
                                             stream,
                                             mr);
-  CUDF_CUDA_TRY(cudaMemcpyAsync(result.table->mutable_view().data(),
-                                table.data(),
-                                table.size() * sizeof(uint64_t),
-                                cudaMemcpyHostToDevice,
-                                stream.value()));
+  CUDA_TRY(cudaMemcpyAsync(result.table->mutable_view().data(),
+                           table.data(),
+                           table.size() * sizeof(uint64_t),
+                           cudaMemcpyHostToDevice,
+                           stream.value()));
 
   result.bin_coefficients = cudf::make_numeric_column(cudf::data_type{cudf::type_id::UINT64},
                                                       bin_coefficients.size(),
                                                       cudf::mask_state::UNALLOCATED,
                                                       stream,
                                                       mr);
-  CUDF_CUDA_TRY(cudaMemcpyAsync(result.bin_coefficients->mutable_view().data(),
-                                bin_coefficients.data(),
-                                bin_coefficients.size() * sizeof(uint64_t),
-                                cudaMemcpyHostToDevice,
-                                stream.value()));
+  CUDA_TRY(cudaMemcpyAsync(result.bin_coefficients->mutable_view().data(),
+                           bin_coefficients.data(),
+                           bin_coefficients.size() * sizeof(uint64_t),
+                           cudaMemcpyHostToDevice,
+                           stream.value()));
 
   result.bin_offsets = cudf::make_numeric_column(cudf::data_type{cudf::type_id::UINT16},
                                                  bin_offsets.size(),
                                                  cudf::mask_state::UNALLOCATED,
                                                  stream,
                                                  mr);
-  CUDF_CUDA_TRY(cudaMemcpyAsync(result.bin_offsets->mutable_view().data(),
-                                bin_offsets.data(),
-                                bin_offsets.size() * sizeof(uint16_t),
-                                cudaMemcpyHostToDevice,
-                                stream.value()));
+  CUDA_TRY(cudaMemcpyAsync(result.bin_offsets->mutable_view().data(),
+                           bin_offsets.data(),
+                           bin_offsets.size() * sizeof(uint16_t),
+                           cudaMemcpyHostToDevice,
+                           stream.value()));
 
   auto cp_metadata            = detail::get_codepoint_metadata(stream);
   auto const cp_metadata_size = static_cast(cp_metadata.size());
diff --git a/cpp/src/text/subword/wordpiece_tokenizer.cu b/cpp/src/text/subword/wordpiece_tokenizer.cu
index 7d8df583039..82bb50c6aaa 100644
--- a/cpp/src/text/subword/wordpiece_tokenizer.cu
+++ b/cpp/src/text/subword/wordpiece_tokenizer.cu
@@ -457,7 +457,7 @@ void wordpiece_tokenizer::tokenize(uvector_pair& cps_and_offsets, rmm::cuda_stre
                                                                  num_code_points,
                                                                  device_token_ids.data(),
                                                                  device_tokens_per_word.data());
-  CUDF_CHECK_CUDA(stream.value());
+  CHECK_CUDA(stream.value());
 
   cudf::detail::grid_1d const grid_mark{static_cast(num_strings + 1),
                                        THREADS_PER_BLOCK};
@@ -469,7 +469,7 @@ void wordpiece_tokenizer::tokenize(uvector_pair& cps_and_offsets, rmm::cuda_stre
                                                            device_start_word_indices,
                                                            device_end_word_indices,
                                                            num_strings);
-  CUDF_CHECK_CUDA(stream.value());
+  CHECK_CUDA(stream.value());
 
   // check for special tokens and adjust indices
   thrust::for_each_n(
@@ -512,7 +512,7 @@ void wordpiece_tokenizer::tokenize(uvector_pair& cps_and_offsets, rmm::cuda_stre
                                                        num_words,
                                                        device_token_ids.data(),
                                                        device_tokens_per_word.data());
-  CUDF_CHECK_CUDA(stream.value());
+  CHECK_CUDA(stream.value());
 
   // Repurpose the input array for the token ids. In the worst case, each code point ends up being a
   // token so this will always have enough memory to store the contiguous tokens.
diff --git a/cpp/src/transform/compute_column.cu b/cpp/src/transform/compute_column.cu
index 74433af9f05..bc3678380be 100644
--- a/cpp/src/transform/compute_column.cu
+++ b/cpp/src/transform/compute_column.cu
@@ -102,9 +102,9 @@ std::unique_ptr compute_column(table_view const& table,
   // Configure kernel parameters
   auto const& device_expression_data = parser.device_expression_data;
   int device_id;
-  CUDF_CUDA_TRY(cudaGetDevice(&device_id));
+  CUDA_TRY(cudaGetDevice(&device_id));
   int shmem_limit_per_block;
-  CUDF_CUDA_TRY(
+  CUDA_TRY(
     cudaDeviceGetAttribute(&shmem_limit_per_block, cudaDevAttrMaxSharedMemoryPerBlock, device_id));
   auto constexpr MAX_BLOCK_SIZE = 128;
   auto const block_size =
@@ -125,7 +125,7 @@ std::unique_ptr compute_column(table_view const& table,
       <<>>(
         *table_device, device_expression_data, *mutable_output_device);
   }
-  CUDF_CHECK_CUDA(stream.value());
+  CHECK_CUDA(stream.value());
 
   return output_column;
 }
diff --git a/cpp/src/transform/row_bit_count.cu b/cpp/src/transform/row_bit_count.cu
index 744cec90fd9..0f06be0149e 100644
--- a/cpp/src/transform/row_bit_count.cu
+++ b/cpp/src/transform/row_bit_count.cu
@@ -505,9 +505,9 @@ std::unique_ptr row_bit_count(table_view const& t,
   // of memory of size (# input rows * sizeof(row_span) * max_branch_depth).
   auto const shmem_per_thread = sizeof(row_span) * h_info.max_branch_depth;
   int device_id;
-  CUDF_CUDA_TRY(cudaGetDevice(&device_id));
+  CUDA_TRY(cudaGetDevice(&device_id));
   int shmem_limit_per_block;
-  CUDF_CUDA_TRY(
+  CUDA_TRY(
     cudaDeviceGetAttribute(&shmem_limit_per_block, cudaDevAttrMaxSharedMemoryPerBlock, device_id));
   constexpr int max_block_size = 256;
   auto const block_size =
diff --git a/cpp/src/unary/unary_ops.cuh b/cpp/src/unary/unary_ops.cuh
index 08b68cc0591..19d78b010ec 100644
--- a/cpp/src/unary/unary_ops.cuh
+++ b/cpp/src/unary/unary_ops.cuh
@@ -70,7 +70,7 @@ struct launcher {
     thrust::transform(
       rmm::exec_policy(stream), input.begin(), input.end(), output_view.begin(), F{});
 
-    CUDF_CHECK_CUDA(stream.value());
+    CHECK_CUDA(stream.value());
 
     return output;
   }
diff --git a/cpp/tests/bitmask/bitmask_tests.cpp b/cpp/tests/bitmask/bitmask_tests.cpp
index 6c2c0716331..3a479f0860b 100644
--- a/cpp/tests/bitmask/bitmask_tests.cpp
+++ b/cpp/tests/bitmask/bitmask_tests.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2021, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
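
The compute_column and row_bit_count hunks above share a launch-configuration idiom: query the device's shared-memory-per-block limit, then size thread blocks so the kernel's dynamic shared-memory request fits. A minimal self-contained sketch of that idiom, not cudf's exact code; the function name and the halving strategy are illustrative only:

#include <cuda_runtime.h>
#include <cstddef>

// Sketch: shrink the block size until the per-block dynamic shared-memory
// request fits under the device limit.
int pick_block_size(std::size_t shmem_per_thread, int max_block_size = 256)
{
  int device_id = 0;
  cudaGetDevice(&device_id);
  int shmem_limit_per_block = 0;
  cudaDeviceGetAttribute(&shmem_limit_per_block, cudaDevAttrMaxSharedMemoryPerBlock, device_id);
  int block_size = max_block_size;
  while (block_size > 32 &&
         shmem_per_thread * block_size > static_cast<std::size_t>(shmem_limit_per_block)) {
    block_size /= 2;  // halve until the shared-memory request fits
  }
  return block_size;
}
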
@@ -88,10 +88,10 @@ rmm::device_uvector make_mask(cudf::size_type size, bool fil
     return cudf::detail::make_zeroed_device_uvector_sync(size);
   } else {
     auto ret = rmm::device_uvector(size, rmm::cuda_stream_default);
-    CUDF_CUDA_TRY(cudaMemsetAsync(ret.data(),
-                                  ~cudf::bitmask_type{0},
-                                  size * sizeof(cudf::bitmask_type),
-                                  rmm::cuda_stream_default.value()));
+    CUDA_TRY(cudaMemsetAsync(ret.data(),
+                             ~cudf::bitmask_type{0},
+                             size * sizeof(cudf::bitmask_type),
+                             rmm::cuda_stream_default.value()));
     return ret;
   }
 }
@@ -530,10 +530,10 @@ void cleanEndWord(rmm::device_buffer& mask, int begin_bit, int end_bit)
   auto number_of_bits = end_bit - begin_bit;
   if (number_of_bits % 32 != 0) {
     cudf::bitmask_type end_mask = 0;
-    CUDF_CUDA_TRY(cudaMemcpy(
+    CUDA_TRY(cudaMemcpy(
       &end_mask, ptr + number_of_mask_words - 1, sizeof(end_mask), cudaMemcpyDeviceToHost));
     end_mask = end_mask & ((1 << (number_of_bits % 32)) - 1);
-    CUDF_CUDA_TRY(cudaMemcpy(
+    CUDA_TRY(cudaMemcpy(
       ptr + number_of_mask_words - 1, &end_mask, sizeof(end_mask), cudaMemcpyHostToDevice));
   }
 }
diff --git a/cpp/tests/copying/concatenate_tests.cu b/cpp/tests/copying/concatenate_tests.cu
index 4d76008fd13..93e4e588e0e 100644
--- a/cpp/tests/copying/concatenate_tests.cu
+++ b/cpp/tests/copying/concatenate_tests.cu
@@ -62,9 +62,9 @@ struct TypedColumnTest : public cudf::test::BaseFixture {
     std::iota(h_data.begin(), h_data.end(), char{0});
     std::vector h_mask(mask.size());
     std::iota(h_mask.begin(), h_mask.end(), char{0});
-    CUDF_CUDA_TRY(cudaMemcpyAsync(
+    CUDA_TRY(cudaMemcpyAsync(
       typed_data, h_data.data(), data.size(), cudaMemcpyHostToDevice, stream.value()));
-    CUDF_CUDA_TRY(cudaMemcpyAsync(
+    CUDA_TRY(cudaMemcpyAsync(
       typed_mask, h_mask.data(), mask.size(), cudaMemcpyHostToDevice, stream.value()));
     stream.synchronize();
   }
diff --git a/cpp/tests/device_atomics/device_atomics_test.cu b/cpp/tests/device_atomics/device_atomics_test.cu
index 1067366d010..581268f26f4 100644
--- a/cpp/tests/device_atomics/device_atomics_test.cu
+++ b/cpp/tests/device_atomics/device_atomics_test.cu
@@ -155,7 +155,7 @@ struct AtomicsTest : public cudf::test::BaseFixture {
 
     auto host_result = cudf::detail::make_host_vector_sync(dev_result);
-    CUDF_CHECK_CUDA(rmm::cuda_stream_default.value());
+    CHECK_CUDA(rmm::cuda_stream_default.value());
 
     if (!is_timestamp_sum()) {
       EXPECT_EQ(host_result[0], exact[0]) << "atomicAdd test failed";
@@ -302,7 +302,7 @@ struct AtomicsBitwiseOpTest : public cudf::test::BaseFixture {
 
     auto host_result = cudf::detail::make_host_vector_sync(dev_result);
-    CUDF_CHECK_CUDA(rmm::cuda_stream_default.value());
+    CHECK_CUDA(rmm::cuda_stream_default.value());
 
     // print_exact(exact, "exact");
     // print_exact(host_result.data(), "result");
diff --git a/cpp/tests/error/error_handling_test.cu b/cpp/tests/error/error_handling_test.cu
index 4327a8b694b..da9509e94a6 100644
--- a/cpp/tests/error/error_handling_test.cu
+++ b/cpp/tests/error/error_handling_test.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2018-2020, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
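
The error_handling_test.cu hunks that follow exercise the CUDA_TRY and CHECK_CUDA macros this whole diff renames. As a reading aid only, here is a minimal sketch of how such an error-checking macro is commonly written; it is not the library's actual definition (the name and exception type below are stand-ins), but it matches the failure messages the tests assert on, an error name followed by the cudaGetErrorString text:

#include <cuda_runtime.h>
#include <stdexcept>
#include <string>

// Stand-in for the library's CUDA exception type.
struct cuda_error_sketch : std::runtime_error {
  using std::runtime_error::runtime_error;
};

// Wrap a CUDA runtime call and throw on any non-success status.
#define CUDA_TRY_SKETCH(call)                                               \
  do {                                                                      \
    cudaError_t const status = (call);                                      \
    if (status != cudaSuccess) {                                            \
      throw cuda_error_sketch(std::string{cudaGetErrorName(status)} + " " + \
                              cudaGetErrorString(status));                  \
    }                                                                       \
  } while (0)
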
@@ -36,28 +36,28 @@ TEST(ExpectsTest, TryCatch)
 
 TEST(CudaTryTest, Error)
 {
-  CUDA_EXPECT_THROW_MESSAGE(CUDF_CUDA_TRY(cudaErrorLaunchFailure),
+  CUDA_EXPECT_THROW_MESSAGE(CUDA_TRY(cudaErrorLaunchFailure),
                             "cudaErrorLaunchFailure unspecified launch failure");
 }
 
-TEST(CudaTryTest, Success) { EXPECT_NO_THROW(CUDF_CUDA_TRY(cudaSuccess)); }
+TEST(CudaTryTest, Success) { EXPECT_NO_THROW(CUDA_TRY(cudaSuccess)); }
 
 TEST(CudaTryTest, TryCatch)
 {
-  CUDA_EXPECT_THROW_MESSAGE(CUDF_CUDA_TRY(cudaErrorMemoryAllocation),
+  CUDA_EXPECT_THROW_MESSAGE(CUDA_TRY(cudaErrorMemoryAllocation),
                             "cudaErrorMemoryAllocation out of memory");
 }
 
-TEST(StreamCheck, success) { EXPECT_NO_THROW(CUDF_CHECK_CUDA(0)); }
+TEST(StreamCheck, success) { EXPECT_NO_THROW(CHECK_CUDA(0)); }
 
 namespace {
 // Some silly kernel that will cause an error
 void __global__ test_kernel(int* data) { data[threadIdx.x] = threadIdx.x; }
 }  // namespace
 
-// In a release build and without explicit synchronization, CUDF_CHECK_CUDA may
+// In a release build and without explicit synchronization, CHECK_CUDA may
 // or may not fail on erroneous asynchronous CUDA calls. Invoke
 // cudaStreamSynchronize to guarantee failure on error. In a non-release build,
-// CUDF_CHECK_CUDA deterministically fails on erroneous asynchronous CUDA
+// CHECK_CUDA deterministically fails on erroneous asynchronous CUDA
 // calls.
 TEST(StreamCheck, FailedKernel)
 {
@@ -67,7 +67,7 @@ TEST(StreamCheck, FailedKernel)
 #ifdef NDEBUG
   stream.synchronize();
 #endif
-  EXPECT_THROW(CUDF_CHECK_CUDA(stream.value()), cudf::cuda_error);
+  EXPECT_THROW(CHECK_CUDA(stream.value()), cudf::cuda_error);
 }
 
 TEST(StreamCheck, CatchFailedKernel)
@@ -78,7 +78,7 @@ TEST(StreamCheck, CatchFailedKernel)
 #ifndef NDEBUG
   stream.synchronize();
 #endif
-  CUDA_EXPECT_THROW_MESSAGE(CUDF_CHECK_CUDA(stream.value()),
+  CUDA_EXPECT_THROW_MESSAGE(CHECK_CUDA(stream.value()),
                             "cudaErrorInvalidConfiguration "
                             "invalid configuration argument");
 }
diff --git a/cpp/tests/io/parquet_test.cpp b/cpp/tests/io/parquet_test.cpp
index cd0aab3caeb..1f4a8a7e508 100644
--- a/cpp/tests/io/parquet_test.cpp
+++ b/cpp/tests/io/parquet_test.cpp
@@ -1101,11 +1101,11 @@ class custom_test_data_sink : public cudf::io::data_sink {
   {
     return std::async(std::launch::deferred, [=] {
       char* ptr = nullptr;
-      CUDF_CUDA_TRY(cudaMallocHost(&ptr, size));
-      CUDF_CUDA_TRY(cudaMemcpyAsync(ptr, gpu_data, size, cudaMemcpyDeviceToHost, stream.value()));
+      CUDA_TRY(cudaMallocHost(&ptr, size));
+      CUDA_TRY(cudaMemcpyAsync(ptr, gpu_data, size, cudaMemcpyDeviceToHost, stream.value()));
       stream.synchronize();
       outfile_.write(ptr, size);
-      CUDF_CUDA_TRY(cudaFreeHost(ptr));
+      CUDA_TRY(cudaFreeHost(ptr));
     });
   }
@@ -2166,11 +2166,11 @@ class custom_test_memmap_sink : public cudf::io::data_sink {
   {
     return std::async(std::launch::deferred, [=] {
       char* ptr = nullptr;
-      CUDF_CUDA_TRY(cudaMallocHost(&ptr, size));
-      CUDF_CUDA_TRY(cudaMemcpyAsync(ptr, gpu_data, size, cudaMemcpyDeviceToHost, stream.value()));
+      CUDA_TRY(cudaMallocHost(&ptr, size));
+      CUDA_TRY(cudaMemcpyAsync(ptr, gpu_data, size, cudaMemcpyDeviceToHost, stream.value()));
       stream.synchronize();
       mm_writer->host_write(ptr, size);
-      CUDF_CUDA_TRY(cudaFreeHost(ptr));
+      CUDA_TRY(cudaFreeHost(ptr));
     });
   }
diff --git a/cpp/tests/join/join_tests.cpp b/cpp/tests/join/join_tests.cpp
index f560ce7f20c..57041e448a2 100644
--- a/cpp/tests/join/join_tests.cpp
+++ b/cpp/tests/join/join_tests.cpp
@@ -1423,7 +1423,7 @@ TEST_F(JoinTest, HashJoinLargeOutputSize)
   // self-join a table of zeroes to generate an output row count that would overflow int32_t
   std::size_t col_size = 65567;
   rmm::device_buffer zeroes(col_size * sizeof(int32_t), rmm::cuda_stream_default);
-  CUDF_CUDA_TRY(cudaMemsetAsync(zeroes.data(), 0, zeroes.size(), rmm::cuda_stream_default.value()));
+  CUDA_TRY(cudaMemsetAsync(zeroes.data(), 0, zeroes.size(), rmm::cuda_stream_default.value()));
   cudf::column_view col_zeros(cudf::data_type{cudf::type_id::INT32}, col_size, zeroes.data());
   cudf::table_view tview{{col_zeros}};
   cudf::hash_join hash_join(tview, cudf::null_equality::UNEQUAL);
diff --git a/cpp/tests/scalar/scalar_device_view_test.cu b/cpp/tests/scalar/scalar_device_view_test.cu
index 30c843a91c4..ee4c878726f 100644
--- a/cpp/tests/scalar/scalar_device_view_test.cu
+++ b/cpp/tests/scalar/scalar_device_view_test.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2021, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -60,13 +60,13 @@ TYPED_TEST(TypedScalarDeviceViewTest, Value)
   rmm::device_scalar result{rmm::cuda_stream_default};
 
   test_set_value<<<1, 1>>>(scalar_device_view, scalar_device_view1);
-  CUDF_CHECK_CUDA(0);
+  CHECK_CUDA(0);
 
   EXPECT_EQ(s1.value(), value);
   EXPECT_TRUE(s1.is_valid());
 
   test_value<<<1, 1>>>(scalar_device_view, scalar_device_view1, result.data());
-  CUDF_CHECK_CUDA(0);
+  CHECK_CUDA(0);
 
   EXPECT_TRUE(result.value(rmm::cuda_stream_default));
 }
@@ -85,7 +85,7 @@ TYPED_TEST(TypedScalarDeviceViewTest, ConstructNull)
   rmm::device_scalar result{rmm::cuda_stream_default};
 
   test_null<<<1, 1>>>(scalar_device_view, result.data());
-  CUDF_CHECK_CUDA(0);
+  CHECK_CUDA(0);
 
   EXPECT_FALSE(result.value(rmm::cuda_stream_default));
 }
@@ -105,7 +105,7 @@ TYPED_TEST(TypedScalarDeviceViewTest, SetNull)
   EXPECT_TRUE(s.is_valid());
 
   test_setnull<<<1, 1>>>(scalar_device_view);
-  CUDF_CHECK_CUDA(0);
+  CHECK_CUDA(0);
 
   EXPECT_FALSE(s.is_valid());
 }
@@ -131,7 +131,7 @@ TEST_F(StringScalarDeviceViewTest, Value)
   auto value_v = cudf::detail::make_device_uvector_sync(value);
 
   test_string_value<<<1, 1>>>(scalar_device_view, value_v.data(), value.size(), result.data());
-  CUDF_CHECK_CUDA(0);
+  CHECK_CUDA(0);
 
   EXPECT_TRUE(result.value(rmm::cuda_stream_default));
 }
diff --git a/cpp/tests/sort/sort_test.cpp b/cpp/tests/sort/sort_test.cpp
index 1dd7e21b821..f79c77ce429 100644
--- a/cpp/tests/sort/sort_test.cpp
+++ b/cpp/tests/sort/sort_test.cpp
@@ -86,7 +86,7 @@ TYPED_TEST(Sort, WithNullMax)
   // the rest of the values are equivalent and yields random sorted order.
   auto to_host = [](column_view const& col) {
     thrust::host_vector h_data(col.size());
-    CUDF_CUDA_TRY(cudaMemcpy(
+    CUDA_TRY(cudaMemcpy(
       h_data.data(), col.data(), h_data.size() * sizeof(int32_t), cudaMemcpyDefault));
     return h_data;
   };
@@ -124,7 +124,7 @@ TYPED_TEST(Sort, WithNullMin)
   // the rest of the values are equivalent and yields random sorted order.
   auto to_host = [](column_view const& col) {
     thrust::host_vector h_data(col.size());
-    CUDF_CUDA_TRY(cudaMemcpy(
+    CUDA_TRY(cudaMemcpy(
       h_data.data(), col.data(), h_data.size() * sizeof(int32_t), cudaMemcpyDefault));
     return h_data;
   };
@@ -160,7 +160,7 @@ TYPED_TEST(Sort, WithMixedNullOrder)
   // the rest of the values are equivalent and yields random sorted order.
   auto to_host = [](column_view const& col) {
     thrust::host_vector h_data(col.size());
-    CUDF_CUDA_TRY(cudaMemcpy(
+    CUDA_TRY(cudaMemcpy(
       h_data.data(), col.data(), h_data.size() * sizeof(int32_t), cudaMemcpyDefault));
     return h_data;
   };
diff --git a/cpp/tests/sort/stable_sort_tests.cpp b/cpp/tests/sort/stable_sort_tests.cpp
index ee43c9e7b4b..b6b7495136e 100644
--- a/cpp/tests/sort/stable_sort_tests.cpp
+++ b/cpp/tests/sort/stable_sort_tests.cpp
@@ -94,7 +94,7 @@ TYPED_TEST(StableSort, WithNullMax)
   // the rest of the values are equivalent and yields random sorted order.
   auto to_host = [](column_view const& col) {
     thrust::host_vector h_data(col.size());
-    CUDF_CUDA_TRY(cudaMemcpy(
+    CUDA_TRY(cudaMemcpy(
       h_data.data(), col.data(), h_data.size() * sizeof(int32_t), cudaMemcpyDefault));
     return h_data;
   };
@@ -130,7 +130,7 @@ TYPED_TEST(StableSort, WithNullMin)
   // the rest of the values are equivalent and yields random sorted order.
   auto to_host = [](column_view const& col) {
     thrust::host_vector h_data(col.size());
-    CUDF_CUDA_TRY(cudaMemcpy(
+    CUDA_TRY(cudaMemcpy(
       h_data.data(), col.data(), h_data.size() * sizeof(int32_t), cudaMemcpyDefault));
     return h_data;
   };
diff --git a/cpp/tests/strings/factories_test.cu b/cpp/tests/strings/factories_test.cu
index 6861737bfb5..0ba4b268c70 100644
--- a/cpp/tests/strings/factories_test.cu
+++ b/cpp/tests/strings/factories_test.cu
@@ -78,7 +78,7 @@ TEST_F(StringsFactoriesTest, CreateColumnFromPair)
     h_offsets[idx + 1] = offset;
   }
   auto d_strings = cudf::detail::make_device_uvector_sync(strings);
-  CUDF_CUDA_TRY(cudaMemcpy(d_buffer.data(), h_buffer.data(), memsize, cudaMemcpyHostToDevice));
+  CUDA_TRY(cudaMemcpy(d_buffer.data(), h_buffer.data(), memsize, cudaMemcpyHostToDevice));
   auto column = cudf::make_strings_column(d_strings);
   EXPECT_EQ(column->type(), cudf::data_type{cudf::type_id::STRING});
   EXPECT_EQ(column->null_count(), nulls);
diff --git a/cpp/tests/strings/integers_tests.cpp b/cpp/tests/strings/integers_tests.cpp
index 5802a1ddc0a..7f8a31ef9bb 100644
--- a/cpp/tests/strings/integers_tests.cpp
+++ b/cpp/tests/strings/integers_tests.cpp
@@ -302,10 +302,10 @@ TYPED_TEST(StringsIntegerConvertTest, FromToInteger)
   auto integers = cudf::make_numeric_column(cudf::data_type{cudf::type_to_id()},
                                             (cudf::size_type)d_integers.size());
   auto integers_view = integers->mutable_view();
-  CUDF_CUDA_TRY(cudaMemcpy(integers_view.data(),
-                           d_integers.data(),
-                           d_integers.size() * sizeof(TypeParam),
-                           cudaMemcpyDeviceToDevice));
+  CUDA_TRY(cudaMemcpy(integers_view.data(),
+                      d_integers.data(),
+                      d_integers.size() * sizeof(TypeParam),
+                      cudaMemcpyDeviceToDevice));
   integers_view.set_null_count(0);
 
   // convert to strings
diff --git a/cpp/tests/types/type_dispatcher_test.cu b/cpp/tests/types/type_dispatcher_test.cu
index d8b2a736bde..dca80b597c0 100644
--- a/cpp/tests/types/type_dispatcher_test.cu
+++ b/cpp/tests/types/type_dispatcher_test.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2019, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
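
The to_host lambdas in the sort-test hunks above appear to have lost their template arguments during extraction (thrust::host_vector and col.data() are shown without type parameters). A restored sketch of that helper, assuming int32_t as implied by the adjacent sizeof(int32_t):

#include <cuda_runtime.h>
#include <thrust/host_vector.h>
#include <cudf/column/column_view.hpp>

// Restored sketch of the test helper; int32_t is an assumption.
auto to_host = [](cudf::column_view const& col) {
  thrust::host_vector<int32_t> h_data(col.size());
  cudaMemcpy(
    h_data.data(), col.data<int32_t>(), h_data.size() * sizeof(int32_t), cudaMemcpyDefault);
  return h_data;
};
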
@@ -71,7 +71,7 @@ TYPED_TEST(TypedDispatcherTest, DeviceDispatch)
 {
   auto result = cudf::detail::make_zeroed_device_uvector_sync(1);
   dispatch_test_kernel<<<1, 1>>>(cudf::type_to_id(), result.data());
-  CUDF_CUDA_TRY(cudaDeviceSynchronize());
+  CUDA_TRY(cudaDeviceSynchronize());
   EXPECT_EQ(true, result.front_element(rmm::cuda_stream_default));
 }
 
@@ -132,7 +132,7 @@ TYPED_TEST(TypedDoubleDispatcherTest, DeviceDoubleDispatch)
   auto result = cudf::detail::make_zeroed_device_uvector_sync(1);
   double_dispatch_test_kernel<<<1, 1>>>(
     cudf::type_to_id(), cudf::type_to_id(), result.data());
-  CUDF_CUDA_TRY(cudaDeviceSynchronize());
+  CUDA_TRY(cudaDeviceSynchronize());
   EXPECT_EQ(true, result.front_element(rmm::cuda_stream_default));
 }
diff --git a/cpp/tests/utilities/column_utilities.cu b/cpp/tests/utilities/column_utilities.cu
index 015178f8c7c..68626c2d4d3 100644
--- a/cpp/tests/utilities/column_utilities.cu
+++ b/cpp/tests/utilities/column_utilities.cu
@@ -824,16 +824,16 @@ std::vector bitmask_to_host(cudf::column_view const& c)
     auto num_bitmasks = num_bitmask_words(c.size());
     std::vector host_bitmask(num_bitmasks);
     if (c.offset() == 0) {
-      CUDF_CUDA_TRY(cudaMemcpy(host_bitmask.data(),
-                               c.null_mask(),
-                               num_bitmasks * sizeof(bitmask_type),
-                               cudaMemcpyDeviceToHost));
+      CUDA_TRY(cudaMemcpy(host_bitmask.data(),
+                          c.null_mask(),
+                          num_bitmasks * sizeof(bitmask_type),
+                          cudaMemcpyDeviceToHost));
     } else {
       auto mask = copy_bitmask(c.null_mask(), c.offset(), c.offset() + c.size());
-      CUDF_CUDA_TRY(cudaMemcpy(host_bitmask.data(),
-                               mask.data(),
-                               num_bitmasks * sizeof(bitmask_type),
-                               cudaMemcpyDeviceToHost));
+      CUDA_TRY(cudaMemcpy(host_bitmask.data(),
+                          mask.data(),
+                          num_bitmasks * sizeof(bitmask_type),
+                          cudaMemcpyDeviceToHost));
     }
 
     return host_bitmask;
diff --git a/java/src/main/native/src/TableJni.cpp b/java/src/main/native/src/TableJni.cpp
index cebe476dd87..78ac8a18107 100644
--- a/java/src/main/native/src/TableJni.cpp
+++ b/java/src/main/native/src/TableJni.cpp
@@ -137,8 +137,8 @@ class jni_writer_data_sink final : public cudf::io::data_sink {
           left_to_copy < buffer_amount_available ? left_to_copy : buffer_amount_available;
       char *copy_to = current_buffer_data + current_buffer_written;
 
-      CUDF_CUDA_TRY(cudaMemcpyAsync(copy_to, copy_from, amount_to_copy, cudaMemcpyDeviceToHost,
-                                    stream.value()));
+      CUDA_TRY(cudaMemcpyAsync(copy_to, copy_from, amount_to_copy, cudaMemcpyDeviceToHost,
+                               stream.value()));
 
       copy_from = copy_from + amount_to_copy;
       current_buffer_written += amount_to_copy;
diff --git a/java/src/main/native/src/map_lookup.cu b/java/src/main/native/src/map_lookup.cu
index 13d1a5a94a9..683651799e7 100644
--- a/java/src/main/native/src/map_lookup.cu
+++ b/java/src/main/native/src/map_lookup.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2021, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
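
The jni_writer_data_sink hunk above copies device output into a bounded host buffer one piece at a time. A minimal standalone sketch of that chunked device-to-host pattern, with a hypothetical helper name; the real sink additionally hands each filled chunk to a Java-side buffer before reusing it:

#include <cuda_runtime.h>
#include <algorithm>
#include <cstddef>

// Hypothetical helper illustrating the pattern, not the JNI sink itself.
void copy_device_to_host_chunked(char* dst_host, char const* src_dev, std::size_t total_bytes,
                                 std::size_t chunk_bytes, cudaStream_t stream)
{
  for (std::size_t offset = 0; offset < total_bytes; offset += chunk_bytes) {
    std::size_t const this_chunk = std::min(chunk_bytes, total_bytes - offset);
    cudaMemcpyAsync(dst_host + offset, src_dev + offset, this_chunk, cudaMemcpyDeviceToHost,
                    stream);
    cudaStreamSynchronize(stream);  // the chunk is safe to consume on the host after this
  }
}
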
@@ -123,7 +123,7 @@ get_gather_map_for_map_values(column_view const &input, string_scalar &lookup_ke
   gpu_find_first<<>>(
       *input_device_view, *output_view, lookup_key_device_view);
 
-  CUDF_CHECK_CUDA(stream.value());
+  CHECK_CUDA(stream.value());
 
   return gather_map;
 }
diff --git a/java/src/main/native/src/row_conversion.cu b/java/src/main/native/src/row_conversion.cu
index 96ee95c476d..4d78f416134 100644
--- a/java/src/main/native/src/row_conversion.cu
+++ b/java/src/main/native/src/row_conversion.cu
@@ -1766,9 +1766,9 @@ std::vector> convert_to_rows(
     std::optional> variable_width_offsets,
     rmm::cuda_stream_view stream, rmm::mr::device_memory_resource *mr) {
   int device_id;
-  CUDF_CUDA_TRY(cudaGetDevice(&device_id));
+  CUDA_TRY(cudaGetDevice(&device_id));
   int total_shmem_in_bytes;
-  CUDF_CUDA_TRY(
+  CUDA_TRY(
       cudaDeviceGetAttribute(&total_shmem_in_bytes, cudaDevAttrMaxSharedMemoryPerBlock, device_id));
 
 #ifndef __CUDA_ARCH__ // __host__ code.
@@ -2097,9 +2097,9 @@ std::unique_ptr convert_from_rows(lists_column_view const &input,
   auto const num_rows = input.parent().size();
 
   int device_id;
-  CUDF_CUDA_TRY(cudaGetDevice(&device_id));
+  CUDA_TRY(cudaGetDevice(&device_id));
   int total_shmem_in_bytes;
-  CUDF_CUDA_TRY(
+  CUDA_TRY(
      cudaDeviceGetAttribute(&total_shmem_in_bytes, cudaDevAttrMaxSharedMemoryPerBlock, device_id));
 
 #ifndef __CUDA_ARCH__ // __host__ code.
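
Several call sites in this diff, for example find_all_from_set in parsing_utils.cu, pick launch parameters with cudaOccupancyMaxPotentialBlockSize rather than hard-coding them. A small self-contained sketch of that pattern, using a stand-in kernel (the name and empty body below are illustrative, not the library's kernel):

#include <cuda_runtime.h>
#include <cstddef>

// Stand-in for a real worker kernel such as count_and_set_positions.
__global__ void scan_positions_sketch(char const* data, std::size_t n)
{
  (void)data;  // a real kernel would process data[0..n)
  (void)n;
}

void launch_with_occupancy(char const* d_data, std::size_t n)
{
  int block_size    = 0;  // suggested thread count per block
  int min_grid_size = 0;  // minimum block count needed to saturate the device
  cudaOccupancyMaxPotentialBlockSize(&min_grid_size, &block_size, scan_positions_sketch);
  int const grid_size = static_cast<int>((n + block_size - 1) / block_size);
  scan_positions_sketch<<<grid_size, block_size>>>(d_data, n);
}
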