Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix timestamp truncation/overflow bugs in orc/parquet #9382

Merged
merged 10 commits into from
Oct 7, 2021
6 changes: 3 additions & 3 deletions cpp/src/io/orc/orc_gpu.h
Original file line number Diff line number Diff line change
Expand Up @@ -110,9 +110,9 @@ struct ColumnDesc {
uint32_t rowgroup_id; // row group position
ColumnEncodingKind encoding_kind; // column encoding kind
TypeKind type_kind; // column data type
uint8_t dtype_len; // data type length (for types that can be mapped to different sizes)
int32_t decimal_scale; // number of fractional decimal digits for decimal type
int32_t ts_clock_rate; // output timestamp clock frequency (0=default, 1000=ms, 1000000000=ns)
uint8_t dtype_len; // data type length (for types that can be mapped to different sizes)
int32_t decimal_scale; // number of fractional decimal digits for decimal type
type_id timestamp_type_id; // output timestamp type id (type_id::EMPTY by default)
column_validity_info parent_validity_info; // consists of parent column valid_map and null count
uint32_t* parent_null_count_prefix_sums; // per-stripe prefix sums of parent column's null count
};
Expand Down
2 changes: 1 addition & 1 deletion cpp/src/io/orc/reader_impl.cu
Original file line number Diff line number Diff line change
Expand Up @@ -1411,7 +1411,7 @@ table_with_metadata reader::impl::read(size_type skip_rows,
: cudf::size_of(column_types[col_idx]);
chunk.num_rowgroups = stripe_num_rowgroups;
if (chunk.type_kind == orc::TIMESTAMP) {
chunk.ts_clock_rate = to_clockrate(_timestamp_type.id());
chunk.timestamp_type_id = _timestamp_type.id();
}
if (not is_data_empty) {
for (int k = 0; k < gpu::CI_NUM_STREAMS; k++) {
Expand Down
32 changes: 22 additions & 10 deletions cpp/src/io/orc/stripe_data.cu
Original file line number Diff line number Diff line change
Expand Up @@ -1789,17 +1789,29 @@ __global__ void __launch_bounds__(block_size)
seconds += get_gmt_offset(tz_table.ttimes, tz_table.offsets, seconds);
}
if (seconds < 0 && nanos != 0) { seconds -= 1; }
if (s->chunk.ts_clock_rate) {
duration_ns d_ns{nanos};
d_ns += duration_s{seconds};
static_cast<int64_t*>(data_out)[row] =
d_ns.count() * s->chunk.ts_clock_rate /
duration_ns::period::den; // Output to desired clock rate
} else {
cudf::duration_s d{seconds};
static_cast<int64_t*>(data_out)[row] =
cuda::std::chrono::duration_cast<cudf::duration_ns>(d).count() + nanos;

duration_ns d_ns{nanos};
d_ns += duration_s{seconds};
vuule marked this conversation as resolved.
Show resolved Hide resolved

int64_t res;
switch (s->chunk.timestamp_type_id) {
case type_id::TIMESTAMP_SECONDS: {
res = cuda::std::chrono::duration_cast<duration_s>(d_ns).count();
break;
}
case type_id::TIMESTAMP_MILLISECONDS: {
res = cuda::std::chrono::duration_cast<duration_ms>(d_ns).count();
break;
}
case type_id::TIMESTAMP_MICROSECONDS: {
res = cuda::std::chrono::duration_cast<duration_us>(d_ns).count();
break;
}
default:
res = d_ns.count(); // nanoseconds as output in case of `type_id::EMPTY` and
// `type_id::TIMESTAMP_NANOSECONDS`
}
static_cast<int64_t*>(data_out)[row] = res;
PointKernel marked this conversation as resolved.
Show resolved Hide resolved
break;
}
}
Expand Down
23 changes: 23 additions & 0 deletions cpp/tests/io/orc_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1101,6 +1101,29 @@ TEST_F(OrcReaderTest, MultipleInputs)
CUDF_TEST_EXPECT_TABLES_EQUAL(*result.tbl, *full_table);
}

TEST_F(OrcReaderTest, SimpleTimestamps)
PointKernel marked this conversation as resolved.
Show resolved Hide resolved
{
auto int_data = random_values<int64_t>(5);
PointKernel marked this conversation as resolved.
Show resolved Hide resolved
auto validity = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return true; });

column_wrapper<int64_t> const intcol{int_data.begin(), int_data.end(), validity};
auto tscol = cudf::bit_cast(intcol, cudf::data_type{cudf::type_id::TIMESTAMP_NANOSECONDS});
table_view expected({tscol});

auto filepath = temp_env->get_temp_filepath("OrcSimpleTimestamps.orc");
cudf_io::orc_writer_options out_opts =
cudf_io::orc_writer_options::builder(cudf_io::sink_info{filepath}, expected);
cudf_io::write_orc(out_opts);

cudf_io::orc_reader_options in_opts =
cudf_io::orc_reader_options::builder(cudf_io::source_info{filepath})
.use_index(false)
.timestamp_type(cudf::data_type{cudf::type_id::TIMESTAMP_NANOSECONDS});
PointKernel marked this conversation as resolved.
Show resolved Hide resolved
auto result = cudf_io::read_orc(in_opts);

CUDF_TEST_EXPECT_TABLES_EQUAL(expected, result.tbl->view());
}

struct OrcWriterTestDecimal : public OrcWriterTest,
public ::testing::WithParamInterface<std::tuple<int, int>> {
};
Expand Down