Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Adding support for Decimal/Fixed-point to ORC reader #7970

Merged
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
54 changes: 0 additions & 54 deletions cpp/include/cudf/io/orc.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -63,12 +63,6 @@ class orc_reader_options {
// Cast timestamp columns to a specific type
data_type _timestamp_type{type_id::EMPTY};

// Whether to convert decimals to float64
bool _decimals_as_float64 = true;
// For decimals as int, optional forced decimal scale;
// -1 is auto (column scale), >=0: number of fractional digits
size_type _forced_decimals_scale = -1;

friend orc_reader_options_builder;

/**
Expand Down Expand Up @@ -134,16 +128,6 @@ class orc_reader_options {
*/
data_type get_timestamp_type() const { return _timestamp_type; }

/**
* @brief Whether to convert decimals to float64.
*
* @return The stored `_decimals_as_float64` flag.
*/
bool is_enabled_decimals_as_float64() const { return _decimals_as_float64; }

/**
* @brief Returns the forced decimal scale used when decimals are read as integers.
*
* @return -1 when the scale is automatic (the column's own scale is used), otherwise the
* forced number of fractional digits (>= 0).
*/
size_type get_forced_decimals_scale() const { return _forced_decimals_scale; }

// Setters

/**
Expand Down Expand Up @@ -207,20 +191,6 @@ class orc_reader_options {
* @param type Type of timestamp.
*/
void set_timestamp_type(data_type type) { _timestamp_type = type; }

/**
* @brief Enable/Disable conversion of decimals to float64.
*
* @param val `true` to read decimal columns as float64, `false` to read them as integers.
*/
void set_decimals_as_float64(bool val) { _decimals_as_float64 = val; }

/**
* @brief Sets the forced decimal scale used when decimals are read as integers.
*
* @param val -1 to use each column's own scale (auto), or a value >= 0 to force that number
* of fractional digits.
*/
void set_forced_decimals_scale(size_type val) { _forced_decimals_scale = val; }
};

class orc_reader_options_builder {
Expand Down Expand Up @@ -325,30 +295,6 @@ class orc_reader_options_builder {
return *this;
}

/**
* @brief Enable/Disable conversion of decimals to float64.
*
* @param val `true` to read decimal columns as float64, `false` to read them as integers.
* @return this for chaining.
*/
orc_reader_options_builder& decimals_as_float64(bool val)
{
options._decimals_as_float64 = val;
return *this;
}

/**
* @brief Sets the forced decimal scale used when decimals are read as integers.
*
* @param val -1 to use each column's own scale (auto), or a value >= 0 to force that number
* of fractional digits.
* @return this for chaining.
*/
orc_reader_options_builder& forced_decimals_scale(size_type val)
{
options._forced_decimals_scale = val;
return *this;
}

/**
* @brief move orc_reader_options member once it's built.
*/
Expand Down
7 changes: 1 addition & 6 deletions cpp/src/io/orc/orc_gpu.h
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2019-2020, NVIDIA CORPORATION.
* Copyright (c) 2019-2021, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -83,11 +83,6 @@ struct DictionaryEntry {
uint32_t len; // Length in data stream
};

/**
* @brief Mask to indicate conversion from decimals to float64
*
* The reader ORs this bit into a chunk's `decimal_scale` when decimals are to be converted to
* float64; the decimal decoder tests the bit and masks it off to recover the actual scale.
*/
constexpr int orc_decimal2float64_scale = 0x80;

/**
* @brief Struct to describe per stripe's column information
*/
Expand Down
35 changes: 12 additions & 23 deletions cpp/src/io/orc/reader_impl.cu
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2019-2020, NVIDIA CORPORATION.
* Copyright (c) 2019-2021, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -50,8 +50,7 @@ namespace {
*/
constexpr type_id to_type_id(const orc::SchemaType &schema,
bool use_np_dtypes,
type_id timestamp_type_id,
bool decimals_as_float64)
type_id timestamp_type_id)
{
switch (schema.kind) {
case orc::BOOLEAN: return type_id::BOOL8;
Expand All @@ -73,9 +72,7 @@ constexpr type_id to_type_id(const orc::SchemaType &schema,
case orc::DATE:
// There isn't a (DAYS -> np.dtype) mapping
return (use_np_dtypes) ? type_id::TIMESTAMP_MILLISECONDS : type_id::TIMESTAMP_DAYS;
case orc::DECIMAL:
// There isn't an arbitrary-precision type in cuDF, so map as float or int
return (decimals_as_float64) ? type_id::FLOAT64 : type_id::INT64;
case orc::DECIMAL: return type_id::DECIMAL64;
default: break;
}

Expand Down Expand Up @@ -406,10 +403,6 @@ reader::impl::impl(std::unique_ptr<datasource> source,

// Enable or disable the conversion to numpy-compatible dtypes
_use_np_dtypes = options.is_enabled_use_np_dtypes();

// Control decimals conversion (float64 or int64 with optional scale)
_decimals_as_float64 = options.is_enabled_decimals_as_float64();
_decimals_as_int_scale = options.get_forced_decimals_scale();
}

table_with_metadata reader::impl::read(size_type skip_rows,
Expand All @@ -432,10 +425,13 @@ table_with_metadata reader::impl::read(size_type skip_rows,
// Get a list of column data types
std::vector<data_type> column_types;
for (const auto &col : _selected_columns) {
auto col_type = to_type_id(
_metadata->ff.types[col], _use_np_dtypes, _timestamp_type.id(), _decimals_as_float64);
auto col_type = to_type_id(_metadata->ff.types[col], _use_np_dtypes, _timestamp_type.id());
auto scale = (col_type == type_id::DECIMAL64) ? _metadata->ff.types[col].scale : 0;
CUDF_EXPECTS(col_type != type_id::EMPTY, "Unknown type");
column_types.emplace_back(col_type);
// Remove this once we support Decimal128 data type
CUDF_EXPECTS((col_type != type_id::DECIMAL64) or (_metadata->ff.types[col].precision <= 18),
"Decimal data has precision > 18, Decimal64 data type doesn't support it.");
column_types.emplace_back(col_type, -1 * static_cast<int32_t>(scale));
rgsl888prabhu marked this conversation as resolved.
Show resolved Hide resolved

// Map each ORC column to its column
orc_col_map[col] = column_types.size() - 1;
Expand Down Expand Up @@ -517,16 +513,9 @@ table_with_metadata reader::impl::read(size_type skip_rows,
chunk.num_rows = stripe_info->numberOfRows;
chunk.encoding_kind = stripe_footer->columns[_selected_columns[j]].kind;
chunk.type_kind = _metadata->ff.types[_selected_columns[j]].kind;
if (_decimals_as_float64) {
chunk.decimal_scale =
_metadata->ff.types[_selected_columns[j]].scale | orc::gpu::orc_decimal2float64_scale;
} else if (_decimals_as_int_scale < 0) {
chunk.decimal_scale = _metadata->ff.types[_selected_columns[j]].scale;
} else {
chunk.decimal_scale = _decimals_as_int_scale;
}
chunk.rowgroup_id = num_rowgroups;
chunk.dtype_len = (column_types[j].id() == type_id::STRING)
chunk.decimal_scale = _metadata->ff.types[_selected_columns[j]].scale;
chunk.rowgroup_id = num_rowgroups;
chunk.dtype_len = (column_types[j].id() == type_id::STRING)
? sizeof(std::pair<const char *, size_t>)
: cudf::size_of(column_types[j]);
if (chunk.type_kind == orc::TIMESTAMP) {
Expand Down
8 changes: 3 additions & 5 deletions cpp/src/io/orc/reader_impl.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -130,11 +130,9 @@ class reader::impl {
std::unique_ptr<cudf::io::orc::metadata> _metadata;

std::vector<int> _selected_columns;
bool _use_index = true;
bool _use_np_dtypes = true;
bool _has_timestamp_column = false;
bool _decimals_as_float64 = true;
size_type _decimals_as_int_scale = -1;
bool _use_index = true;
bool _use_np_dtypes = true;
bool _has_timestamp_column = false;
data_type _timestamp_type{type_id::EMPTY};
};

Expand Down
65 changes: 23 additions & 42 deletions cpp/src/io/orc/stripe_data.cu
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2019-2020, NVIDIA CORPORATION.
* Copyright (c) 2019-2021, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -957,15 +957,6 @@ static __device__ uint32_t Byte_RLE(orc_bytestream_s *bs,
return rle->num_vals;
}

/**
* @brief Powers of 10
*
* `kPow10[i]` holds 10^i as a double for i in [0, 39]. The decimal decoder clamps the scale
* to 39 before indexing and divides or multiplies by the entry to produce a float64 value.
*/
static const __device__ __constant__ double kPow10[40] = {
1.0, 1.e1, 1.e2, 1.e3, 1.e4, 1.e5, 1.e6, 1.e7, 1.e8, 1.e9, 1.e10, 1.e11, 1.e12, 1.e13,
1.e14, 1.e15, 1.e16, 1.e17, 1.e18, 1.e19, 1.e20, 1.e21, 1.e22, 1.e23, 1.e24, 1.e25, 1.e26, 1.e27,
1.e28, 1.e29, 1.e30, 1.e31, 1.e32, 1.e33, 1.e34, 1.e35, 1.e36, 1.e37, 1.e38, 1.e39,
};

static const __device__ __constant__ int64_t kPow5i[28] = {1,
5,
25,
Expand Down Expand Up @@ -1036,39 +1027,29 @@ static __device__ int Decode_Decimals(orc_bytestream_s *bs,
if (t >= num_vals_read and t < num_vals_to_read) {
auto const pos = static_cast<int>(vals.i64[t]);
int128_s v = decode_varint128(bs, pos);

if (col_scale & orc_decimal2float64_scale) {
double f = Int128ToDouble_rn(v.lo, v.hi);
int32_t scale = (t < numvals) ? val_scale : 0;
if (scale >= 0)
vals.f64[t] = f / kPow10[min(scale, 39)];
else
vals.f64[t] = f * kPow10[min(-scale, 39)];
} else {
int32_t scale = (t < numvals) ? (col_scale & ~orc_decimal2float64_scale) - val_scale : 0;
if (scale >= 0) {
scale = min(scale, 27);
vals.i64[t] = ((int64_t)v.lo * kPow5i[scale]) << scale;
} else // if (scale < 0)
{
bool is_negative = (v.hi < 0);
uint64_t hi = v.hi, lo = v.lo;
scale = min(-scale, 27);
if (is_negative) {
hi = (~hi) + (lo == 0);
lo = (~lo) + 1;
}
lo = (lo >> (uint32_t)scale) | ((uint64_t)hi << (64 - scale));
hi >>= (int32_t)scale;
if (hi != 0) {
// Use intermediate float
lo = __double2ull_rn(Int128ToDouble_rn(lo, hi) / __ll2double_rn(kPow5i[scale]));
hi = 0;
} else {
lo /= kPow5i[scale];
}
vals.i64[t] = (is_negative) ? -(int64_t)lo : (int64_t)lo;
int32_t scale = (t < numvals) ? col_scale - val_scale : 0;
rgsl888prabhu marked this conversation as resolved.
Show resolved Hide resolved
if (scale >= 0) {
scale = min(scale, 27);
vals.i64[t] = ((int64_t)v.lo * kPow5i[scale]) << scale;
} else // if (scale < 0)
{
bool is_negative = (v.hi < 0);
uint64_t hi = v.hi, lo = v.lo;
scale = min(-scale, 27);
if (is_negative) {
hi = (~hi) + (lo == 0);
lo = (~lo) + 1;
}
lo = (lo >> (uint32_t)scale) | ((uint64_t)hi << (64 - scale));
hi >>= (int32_t)scale;
if (hi != 0) {
// Use intermediate float
lo = __double2ull_rn(Int128ToDouble_rn(lo, hi) / __ll2double_rn(kPow5i[scale]));
hi = 0;
} else {
lo /= kPow5i[scale];
}
vals.i64[t] = (is_negative) ? -(int64_t)lo : (int64_t)lo;
}
}
// There is nothing to read, so break
Expand Down
6 changes: 0 additions & 6 deletions python/cudf/cudf/_lib/cpp/io/orc.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -34,8 +34,6 @@ cdef extern from "cudf/io/orc.hpp" \
void enable_use_index(bool val) except+
void enable_use_np_dtypes(bool val) except+
void set_timestamp_type(data_type type) except+
void enable_decimals_as_float64(bool val) except+
void set_forced_decimals_scale(size_type scale) except+

@staticmethod
orc_reader_options_builder builder(
Expand All @@ -53,10 +51,6 @@ cdef extern from "cudf/io/orc.hpp" \
orc_reader_options_builder& use_index(bool val) except+
orc_reader_options_builder& use_np_dtypes(bool val) except+
orc_reader_options_builder& timestamp_type(data_type type) except+
orc_reader_options_builder& decimals_as_float64(bool val) except+
orc_reader_options_builder& forced_decimals_scale(
size_type scale
) except+

orc_reader_options build() except+

Expand Down
10 changes: 1 addition & 9 deletions python/cudf/cudf/_lib/orc.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -71,8 +71,6 @@ cpdef read_orc(object filepath_or_buffer,
object skip_rows=None,
object num_rows=None,
bool use_index=True,
bool decimals_as_float=True,
object force_decimal_scale=None,
object timestamp_type=None):
"""
Cython function to call into libcudf API, see `read_orc`.
Expand All @@ -96,9 +94,7 @@ cpdef read_orc(object filepath_or_buffer,
)
)
),
use_index,
decimals_as_float,
get_size_t_arg(force_decimal_scale, "force_decimal_scale")
use_index
)

cdef table_with_metadata c_result
Expand Down Expand Up @@ -173,8 +169,6 @@ cdef orc_reader_options make_orc_reader_options(
size_type num_rows,
type_id timestamp_type,
bool use_index,
bool decimals_as_float,
size_type force_decimal_scale
) except*:

cdef vector[string] c_column_names
Expand All @@ -192,8 +186,6 @@ cdef orc_reader_options make_orc_reader_options(
.num_rows(num_rows)
.timestamp_type(data_type(timestamp_type))
.use_index(use_index)
.decimals_as_float64(decimals_as_float)
.forced_decimals_scale(force_decimal_scale)
.build()
)

Expand Down
4 changes: 0 additions & 4 deletions python/cudf/cudf/io/orc.py
Original file line number Diff line number Diff line change
Expand Up @@ -223,8 +223,6 @@ def read_orc(
skiprows=None,
num_rows=None,
use_index=True,
decimals_as_float=True,
force_decimal_scale=None,
timestamp_type=None,
**kwargs,
):
Expand Down Expand Up @@ -266,8 +264,6 @@ def read_orc(
skiprows,
num_rows,
use_index,
decimals_as_float,
force_decimal_scale,
timestamp_type,
)
)
Expand Down
Binary file not shown.
Loading