Skip to content

Commit

Permalink
Merge branch 'branch-24.08' into pq_writer_opts_refactor
Browse files Browse the repository at this point in the history
  • Loading branch information
hyperbolic2346 authored Jun 7, 2024
2 parents 6bde2e7 + 8e40fe7 commit 9ca4c45
Show file tree
Hide file tree
Showing 17 changed files with 121 additions and 345 deletions.
1 change: 0 additions & 1 deletion cpp/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -439,7 +439,6 @@ add_library(
src/io/utilities/data_sink.cpp
src/io/utilities/datasource.cpp
src/io/utilities/file_io_utilities.cpp
src/io/utilities/parsing_utils.cu
src/io/utilities/row_selection.cpp
src/io/utilities/type_inference.cu
src/io/utilities/trie.cu
Expand Down
221 changes: 0 additions & 221 deletions cpp/src/io/utilities/parsing_utils.cu

This file was deleted.

76 changes: 0 additions & 76 deletions cpp/src/io/utilities/parsing_utils.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -414,82 +414,6 @@ __device__ __inline__ cudf::size_type* infer_integral_field_counter(char const*

} // namespace gpu

/**
* @brief Searches the input character array for each of the characters in a set.
* Sums up the number of occurrences. If the 'positions' parameter is not void*,
* positions of all occurrences are stored in the output device array.
*
* @tparam T Element type of the `positions` output array
*
* @param[in] data Input character array in device memory
* @param[in] keys Vector containing the keys to count in the buffer
* @param[in] result_offset Offset to add to the output positions
* @param[out] positions Array containing the output positions
* @param[in] stream CUDA stream used for device memory operations and kernel launches
*
* @return cudf::size_type total number of occurrences
*/
template <class T>
cudf::size_type find_all_from_set(device_span<char const> data,
std::vector<char> const& keys,
uint64_t result_offset,
T* positions,
rmm::cuda_stream_view stream);

/**
* @brief Searches the input character array for each of the characters in a set.
* Sums up the number of occurrences. If the 'positions' parameter is not void*,
* positions of all occurrences are stored in the output device array.
*
* Does not load the entire file into the GPU memory at any time, so it can
* be used to parse large files. Output array needs to be preallocated.
*
* @tparam T Element type of the `positions` output array
*
* @param[in] data Input character array in host memory (the span carries both the pointer and
* the size in bytes)
* @param[in] keys Vector containing the keys to count in the buffer
* @param[in] result_offset Offset to add to the output positions
* @param[out] positions Array containing the output positions
* @param[in] stream CUDA stream used for device memory operations and kernel launches
*
* @return cudf::size_type total number of occurrences
*/
template <class T>
cudf::size_type find_all_from_set(host_span<char const> data,
std::vector<char> const& keys,
uint64_t result_offset,
T* positions,
rmm::cuda_stream_view stream);

/**
* @brief Searches the input character array for each of the characters in a set
* and sums up the number of occurrences.
*
* @param data Input data buffer in device memory
* @param keys Vector containing the keys to count in the buffer
* @param stream CUDA stream used for device memory operations and kernel launches
*
* @return cudf::size_type total number of occurrences
*/
cudf::size_type count_all_from_set(device_span<char const> data,
std::vector<char> const& keys,
rmm::cuda_stream_view stream);

/**
* @brief Searches the input character array for each of the characters in a set
* and sums up the number of occurrences.
*
* Does not load the entire buffer into the GPU memory at any time, so it can
* be used with buffers of any size.
*
* @param data Input data in host memory (the span carries both the pointer and the size in
* bytes)
* @param keys Vector containing the keys to count in the buffer
* @param stream CUDA stream used for device memory operations and kernel launches
*
* @return cudf::size_type total number of occurrences
*/
cudf::size_type count_all_from_set(host_span<char const> data,
std::vector<char> const& keys,
rmm::cuda_stream_view stream);

/**
* @brief Checks whether the given character is a whitespace character.
*
Expand Down
43 changes: 41 additions & 2 deletions cpp/src/unary/cast_ops.cu
Original file line number Diff line number Diff line change
Expand Up @@ -15,11 +15,13 @@
*/

#include <cudf/column/column.hpp>
#include <cudf/column/column_device_view.cuh>
#include <cudf/detail/binaryop.hpp>
#include <cudf/detail/fill.hpp>
#include <cudf/detail/null_mask.hpp>
#include <cudf/detail/nvtx/ranges.hpp>
#include <cudf/detail/unary.hpp>
#include <cudf/detail/valid_if.cuh>
#include <cudf/fixed_point/fixed_point.hpp>
#include <cudf/null_mask.hpp>
#include <cudf/scalar/scalar_factories.hpp>
Expand Down Expand Up @@ -219,6 +221,28 @@ std::unique_ptr<column> rescale(column_view input,
}
};

/**
 * @brief Predicate telling whether a floating point row can be converted to a fixed point type.
 *
 * A row is reported as convertible when it is not null and holds a finite value, i.e. it is
 * neither `NaN` nor `inf`.
 *
 * A convertible value may still fall outside the representable range of the target fixed point
 * type; this predicate does not check the range, so that has to be verified separately.
 *
 * @tparam FloatType Floating point element type read from the input column
 */
template <typename FloatType>
struct is_convertible_floating_point {
  column_device_view d_input;

  bool __device__ operator()(size_type idx) const
  {
    static_assert(std::is_floating_point_v<FloatType>);

    // The null check comes first so that the element of a null row is never read.
    return !d_input.is_null(idx) && std::isfinite(d_input.element<FloatType>(idx));
  }
};

template <typename _SourceT>
struct dispatch_unary_cast_to {
column_view input;
Expand Down Expand Up @@ -294,8 +318,8 @@ struct dispatch_unary_cast_to {
std::make_unique<column>(type,
size,
rmm::device_buffer{size * cudf::size_of(type), stream, mr},
detail::copy_bitmask(input, stream, mr),
input.null_count());
rmm::device_buffer{},
0);

mutable_column_view output_mutable = *output;

Expand All @@ -308,6 +332,21 @@ struct dispatch_unary_cast_to {
output_mutable.begin<DeviceT>(),
fixed_point_unary_cast<SourceT, TargetT>{scale});

if constexpr (cudf::is_floating_point<SourceT>()) {
// For floating-point values, besides input nulls, we also need to set nulls for the output
// rows corresponding to NaN and inf in the input.
auto const d_input_ptr = column_device_view::create(input, stream);
auto [null_mask, null_count] =
cudf::detail::valid_if(thrust::make_counting_iterator(0),
thrust::make_counting_iterator(size),
is_convertible_floating_point<SourceT>{*d_input_ptr},
stream,
mr);
if (null_count > 0) { output->set_null_mask(std::move(null_mask), null_count); }
} else {
output->set_null_mask(detail::copy_bitmask(input, stream, mr), input.null_count());
}

return output;
}

Expand Down
Loading

0 comments on commit 9ca4c45

Please sign in to comment.