Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix timestamp truncation/overflow bugs in orc/parquet #9382

Merged
merged 10 commits into from
Oct 7, 2021
6 changes: 3 additions & 3 deletions cpp/src/io/orc/orc_gpu.h
Original file line number Diff line number Diff line change
Expand Up @@ -110,9 +110,9 @@ struct ColumnDesc {
uint32_t rowgroup_id; // row group position
ColumnEncodingKind encoding_kind; // column encoding kind
TypeKind type_kind; // column data type
uint8_t dtype_len; // data type length (for types that can be mapped to different sizes)
int32_t decimal_scale; // number of fractional decimal digits for decimal type
int32_t ts_clock_rate; // output timestamp clock frequency (0=default, 1000=ms, 1000000000=ns)
uint8_t dtype_len; // data type length (for types that can be mapped to different sizes)
int32_t decimal_scale; // number of fractional decimal digits for decimal type
type_id timestamp_type_id; // output timestamp type id (type_id::EMPTY by default)
column_validity_info parent_validity_info; // consists of parent column valid_map and null count
uint32_t* parent_null_count_prefix_sums; // per-stripe prefix sums of parent column's null count
};
Expand Down
2 changes: 1 addition & 1 deletion cpp/src/io/orc/reader_impl.cu
Original file line number Diff line number Diff line change
Expand Up @@ -1411,7 +1411,7 @@ table_with_metadata reader::impl::read(size_type skip_rows,
: cudf::size_of(column_types[col_idx]);
chunk.num_rowgroups = stripe_num_rowgroups;
if (chunk.type_kind == orc::TIMESTAMP) {
chunk.ts_clock_rate = to_clockrate(_timestamp_type.id());
chunk.timestamp_type_id = _timestamp_type.id();
}
if (not is_data_empty) {
for (int k = 0; k < gpu::CI_NUM_STREAMS; k++) {
Expand Down
34 changes: 23 additions & 11 deletions cpp/src/io/orc/stripe_data.cu
Original file line number Diff line number Diff line change
Expand Up @@ -1789,17 +1789,29 @@ __global__ void __launch_bounds__(block_size)
seconds += get_gmt_offset(tz_table.ttimes, tz_table.offsets, seconds);
}
if (seconds < 0 && nanos != 0) { seconds -= 1; }
if (s->chunk.ts_clock_rate) {
duration_ns d_ns{nanos};
d_ns += duration_s{seconds};
static_cast<int64_t*>(data_out)[row] =
d_ns.count() * s->chunk.ts_clock_rate /
duration_ns::period::den; // Output to desired clock rate
} else {
cudf::duration_s d{seconds};
static_cast<int64_t*>(data_out)[row] =
cuda::std::chrono::duration_cast<cudf::duration_ns>(d).count() + nanos;
}

duration_ns d_ns{nanos};
duration_s d_s{seconds};

static_cast<int64_t*>(data_out)[row] = [&]() {
using cuda::std::chrono::duration_cast;
switch (s->chunk.timestamp_type_id) {
case type_id::TIMESTAMP_SECONDS:
return d_s.count() + duration_cast<duration_s>(d_ns).count();
case type_id::TIMESTAMP_MILLISECONDS:
return duration_cast<duration_ms>(d_s).count() +
duration_cast<duration_ms>(d_ns).count();
case type_id::TIMESTAMP_MICROSECONDS:
return duration_cast<duration_us>(d_s).count() +
duration_cast<duration_us>(d_ns).count();
case type_id::TIMESTAMP_NANOSECONDS:
default:
return duration_cast<duration_ns>(d_s).count() +
d_ns.count(); // nanoseconds as output in case of `type_id::EMPTY` and
// `type_id::TIMESTAMP_NANOSECONDS`
}
}();

break;
}
}
Expand Down
25 changes: 17 additions & 8 deletions cpp/src/io/parquet/page_data.cu
Original file line number Diff line number Diff line change
Expand Up @@ -663,15 +663,24 @@ inline __device__ void gpuOutputInt96Timestamp(volatile page_state_s* s, int src
nanos |= v.x;
// Convert from Julian day at noon to UTC seconds
days = static_cast<int32_t>(v.z);
cudf::duration_D d{
cudf::duration_D d_d{
days - 2440588}; // TBD: Should be noon instead of midnight, but this matches pyarrow
if (s->col.ts_clock_rate) {
int64_t secs = duration_cast<cudf::duration_s>(d).count() +
duration_cast<cudf::duration_s>(cudf::duration_ns{nanos}).count();
ts = secs * s->col.ts_clock_rate; // Output to desired clock rate
} else {
ts = duration_cast<cudf::duration_ns>(d).count() + nanos;
}

ts = [&]() {
switch (s->col.ts_clock_rate) {
case 1: // seconds
return duration_cast<duration_s>(d_d).count() +
duration_cast<duration_s>(duration_ns{nanos}).count();
case 1'000: // milliseconds
return duration_cast<duration_ms>(d_d).count() +
duration_cast<duration_ms>(duration_ns{nanos}).count();
case 1'000'000: // microseconds
return duration_cast<duration_us>(d_d).count() +
duration_cast<duration_us>(duration_ns{nanos}).count();
case 1'000'000'000: // nanoseconds
default: return duration_cast<cudf::duration_ns>(d_d).count() + nanos;
}
}();
PointKernel marked this conversation as resolved.
Show resolved Hide resolved
} else {
ts = 0;
}
Expand Down
23 changes: 23 additions & 0 deletions cpp/tests/io/orc_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1101,6 +1101,29 @@ TEST_F(OrcReaderTest, MultipleInputs)
CUDF_TEST_EXPECT_TABLES_EQUAL(*result.tbl, *full_table);
}

TEST_F(OrcReaderTest, SimpleTimestamps)
PointKernel marked this conversation as resolved.
Show resolved Hide resolved
{
auto int_data = random_values<int64_t>(5);
PointKernel marked this conversation as resolved.
Show resolved Hide resolved
auto validity = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return true; });

column_wrapper<int64_t> const intcol{int_data.begin(), int_data.end(), validity};
auto tscol = cudf::bit_cast(intcol, cudf::data_type{cudf::type_id::TIMESTAMP_NANOSECONDS});
table_view expected({tscol});

auto filepath = temp_env->get_temp_filepath("OrcSimpleTimestamps.orc");
cudf_io::orc_writer_options out_opts =
cudf_io::orc_writer_options::builder(cudf_io::sink_info{filepath}, expected);
cudf_io::write_orc(out_opts);

cudf_io::orc_reader_options in_opts =
cudf_io::orc_reader_options::builder(cudf_io::source_info{filepath})
.use_index(false)
.timestamp_type(cudf::data_type{cudf::type_id::TIMESTAMP_NANOSECONDS});
PointKernel marked this conversation as resolved.
Show resolved Hide resolved
auto result = cudf_io::read_orc(in_opts);

CUDF_TEST_EXPECT_TABLES_EQUAL(expected, result.tbl->view());
}

struct OrcWriterTestDecimal : public OrcWriterTest,
public ::testing::WithParamInterface<std::tuple<int, int>> {
};
Expand Down