Skip to content

Commit

Permalink
Enable strict_decimal_types in parquet reading(#6969)
Browse files Browse the repository at this point in the history
This pull request is to address #6909.

Authors:
  - sperlingxx <[email protected]>
  - Alfred Xu <[email protected]>

Approvers:
  - Robert (Bobby) Evans
  - Mike Wilson
  - Devavret Makkar

URL: #6969
  • Loading branch information
sperlingxx authored Dec 15, 2020
1 parent 515a173 commit 6d1b076
Show file tree
Hide file tree
Showing 5 changed files with 402 additions and 207 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
- PR #6275 Update to official libcu++ on Github
- PR #6838 Fix `columns` & `index` handling in dataframe constructor
- PR #6750 Remove **kwargs from string/categorical methods
- PR #6909 Support reading byte array backed decimal columns from parquet files
- PR #6939 Use simplified `rmm::exec_policy`
- PR #6982 Disable some pragma unroll statements in thrust `sort.h`

Expand Down
42 changes: 41 additions & 1 deletion cpp/src/io/parquet/page_data.cu
Original file line number Diff line number Diff line change
Expand Up @@ -815,6 +815,39 @@ inline __device__ void gpuOutputDecimalAsFloat(volatile page_state_s *s,
*dst = (scale < 0) ? (d * kPow10[min(-scale, 39)]) : (d / kPow10[min(scale, 39)]);
}

/**
 * @brief Output a fixed-length byte array (len <= 8) as a 64-bit int
 *
 * The bytes are consumed most-significant byte first and the result is
 * interpreted as a two's complement integer; values narrower than 8 bytes are
 * sign-extended to 64 bits. Any byte whose offset is at or beyond
 * `s->dict_size` is read as zero instead of being loaded from memory.
 *
 * @param[in,out] s Page state input/output
 * @param[in] src_pos Source position
 * @param[in] dst Pointer to row output data
 */
inline __device__ void gpuOutputFixedLenByteArrayAsInt64(volatile page_state_s *s,
                                                         int src_pos,
                                                         int64_t *dst)
{
  uint32_t const dtype_len_in = s->dtype_len_in;
  uint8_t const *data         = s->dict_base ? s->dict_base : s->data_start;
  // Byte offset of the value: dictionary index (or 0 when dict_bits == 0) for
  // dictionary pages, otherwise the source position, scaled by the value width.
  uint32_t const pos =
    (s->dict_base ? ((s->dict_bits > 0) ? s->dict_idx[src_pos & (non_zero_buffer_size - 1)] : 0)
                  : src_pos) *
    dtype_len_in;
  uint32_t const dict_size = s->dict_size;

  // Accumulate in an unsigned type: left-shifting a signed value so that set
  // bits reach or pass the sign bit is undefined behaviour before C++20.
  uint64_t unscaled_bits = 0;
  for (uint32_t i = 0; i < dtype_len_in; i++) {
    uint64_t const v = (pos + i < dict_size) ? data[pos + i] : 0;
    unscaled_bits    = (unscaled_bits << 8) | v;
  }

  int64_t unscaled64 = static_cast<int64_t>(unscaled_bits);
  // Shift the unscaled value up (while still unsigned) and back down (as a
  // signed value) when it isn't all 8 bytes, which sign-extends it so that
  // negative numbers are represented correctly. A zero-length input is
  // excluded because it would require a shift by 64 bits, which is undefined.
  if (dtype_len_in > 0 && dtype_len_in < 8) {
    uint32_t const shift = 64 - dtype_len_in * 8;
    unscaled64           = static_cast<int64_t>(unscaled_bits << shift) >> shift;
  }
  *dst = unscaled64;
}

/**
* @brief Output a small fixed-length value
*
Expand Down Expand Up @@ -1690,7 +1723,14 @@ extern "C" __global__ void __launch_bounds__(block_size)
switch (dtype) {
case INT32: gpuOutputFast(s, src_pos, static_cast<uint32_t *>(dst)); break;
case INT64: gpuOutputFast(s, src_pos, static_cast<uint2 *>(dst)); break;
default: gpuOutputDecimalAsFloat(s, src_pos, static_cast<double *>(dst), dtype); break;
default:
// we currently do not support reading byte arrays larger than DECIMAL64
if (s->dtype_len_in <= 8) {
gpuOutputFixedLenByteArrayAsInt64(s, src_pos, static_cast<int64_t *>(dst));
} else {
gpuOutputDecimalAsFloat(s, src_pos, static_cast<double *>(dst), dtype);
}
break;
}
} else if (dtype == INT96)
gpuOutputInt96Timestamp(s, src_pos, static_cast<int64_t *>(dst));
Expand Down
4 changes: 3 additions & 1 deletion cpp/src/io/parquet/reader_impl.cu
Original file line number Diff line number Diff line change
Expand Up @@ -137,7 +137,9 @@ type_id to_type_id(SchemaElement const &schema,
return type_id::DECIMAL32;
else if (physical == parquet::INT64)
return type_id::DECIMAL64;
else {
else if (physical == parquet::FIXED_LEN_BYTE_ARRAY && schema.type_length <= 8) {
return type_id::DECIMAL64;
} else {
CUDF_EXPECTS(strict_decimal_types == false, "Unsupported decimal type read!");
return type_id::FLOAT64;
}
Expand Down
Loading

0 comments on commit 6d1b076

Please sign in to comment.