Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Remove INT96 timestamps in cuDF Parquet writer #15901

Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
64 changes: 0 additions & 64 deletions cpp/include/cudf/io/parquet.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -596,9 +596,6 @@ class parquet_writer_options {
std::optional<table_input_metadata> _metadata;
// Optional footer key_value_metadata
std::vector<std::map<std::string, std::string>> _user_data;
// Parquet writer can write INT96 or TIMESTAMP_MICROS. Defaults to TIMESTAMP_MICROS.
// If true then overrides any per-column setting in _metadata.
bool _write_timestamps_as_int96 = false;
// Parquet writer can write timestamps as UTC
// Defaults to true because libcudf timestamps are implicitly UTC
bool _write_timestamps_as_UTC = true;
Expand Down Expand Up @@ -717,13 +714,6 @@ class parquet_writer_options {
return _user_data;
}

/**
* @brief Returns `true` if timestamps will be written as INT96
*
* @return `true` if timestamps will be written as INT96
*/
bool is_enabled_int96_timestamps() const { return _write_timestamps_as_int96; }

/**
* @brief Returns `true` if timestamps will be written as UTC
*
Expand Down Expand Up @@ -867,14 +857,6 @@ class parquet_writer_options {
*/
void set_compression(compression_type compression) { _compression = compression; }

/**
* @brief Sets timestamp writing preferences. INT96 timestamps will be written
* if `true` and TIMESTAMP_MICROS will be written if `false`.
*
* @param req Boolean value to enable/disable writing of INT96 timestamps
*/
void enable_int96_timestamps(bool req) { _write_timestamps_as_int96 = req; }

/**
* @brief Sets preference for writing timestamps as UTC. Write timestamps as UTC if set to `true`.
*
Expand Down Expand Up @@ -1191,18 +1173,6 @@ class parquet_writer_options_builder {
return *this;
}

/**
* @brief Sets whether int96 timestamps are written or not in parquet_writer_options.
*
* @param enabled Boolean value to enable/disable int96 timestamps
* @return this for chaining
*/
parquet_writer_options_builder& int96_timestamps(bool enabled)
{
options._write_timestamps_as_int96 = enabled;
return *this;
}

/**
* @brief Set to true if timestamps are to be written as UTC.
*
Expand Down Expand Up @@ -1293,9 +1263,6 @@ class chunked_parquet_writer_options {
std::optional<table_input_metadata> _metadata;
// Optional footer key_value_metadata
std::vector<std::map<std::string, std::string>> _user_data;
// Parquet writer can write INT96 or TIMESTAMP_MICROS. Defaults to TIMESTAMP_MICROS.
// If true then overrides any per-column setting in _metadata.
bool _write_timestamps_as_int96 = false;
// Parquet writer can write timestamps as UTC. Defaults to true.
bool _write_timestamps_as_UTC = true;
// Maximum size of each row group (unless smaller than a single page)
Expand Down Expand Up @@ -1376,13 +1343,6 @@ class chunked_parquet_writer_options {
return _user_data;
}

/**
* @brief Returns `true` if timestamps will be written as INT96
*
* @return `true` if timestamps will be written as INT96
*/
bool is_enabled_int96_timestamps() const { return _write_timestamps_as_int96; }

/**
* @brief Returns `true` if timestamps will be written as UTC
*
Expand Down Expand Up @@ -1509,15 +1469,6 @@ class chunked_parquet_writer_options {
*/
void set_compression(compression_type compression) { _compression = compression; }

/**
* @brief Sets timestamp writing preferences.
*
* INT96 timestamps will be written if `true` and TIMESTAMP_MICROS will be written if `false`.
*
* @param req Boolean value to enable/disable writing of INT96 timestamps
*/
void enable_int96_timestamps(bool req) { _write_timestamps_as_int96 = req; }

/**
* @brief Sets preference for writing timestamps as UTC. Write timestamps as UTC if set to `true`.
*
Expand Down Expand Up @@ -1684,21 +1635,6 @@ class chunked_parquet_writer_options_builder {
return *this;
}

/**
* @brief Set to true if timestamps should be written as
* int96 types instead of int64 types. Even though int96 is deprecated and is
* not an internal type for cudf, it needs to be written for backwards
* compatibility reasons.
*
* @param enabled Boolean value to enable/disable int96 timestamps
* @return this for chaining
*/
chunked_parquet_writer_options_builder& int96_timestamps(bool enabled)
{
options._write_timestamps_as_int96 = enabled;
return *this;
}

/**
* @brief Set to true if timestamps are to be written as UTC.
*
Expand Down
29 changes: 3 additions & 26 deletions cpp/include/cudf/io/types.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -605,10 +605,9 @@ class column_in_metadata {
friend table_input_metadata;
std::string _name = "";
std::optional<bool> _nullable;
bool _list_column_is_map = false;
bool _use_int96_timestamp = false;
bool _output_as_binary = false;
bool _skip_compression = false;
bool _list_column_is_map = false;
bool _output_as_binary = false;
bool _skip_compression = false;
std::optional<uint8_t> _decimal_precision;
std::optional<int32_t> _parquet_field_id;
std::optional<int32_t> _type_length;
Expand Down Expand Up @@ -672,20 +671,6 @@ class column_in_metadata {
return *this;
}

/**
* @brief Specifies whether this timestamp column should be encoded using the deprecated int96
* physical type. Only valid for the following column types:
* timestamp_s, timestamp_ms, timestamp_us, timestamp_ns
*
* @param req True = use int96 physical type. False = use int64 physical type
* @return this for chaining
*/
column_in_metadata& set_int96_timestamps(bool req) noexcept
{
_use_int96_timestamp = req;
return *this;
}

/**
* @brief Set the decimal precision of this column. Only valid if this column is a decimal
* (fixed-point) type
Expand Down Expand Up @@ -818,14 +803,6 @@ class column_in_metadata {
*/
[[nodiscard]] bool is_map() const noexcept { return _list_column_is_map; }

/**
* @brief Get whether to encode this timestamp column using deprecated int96 physical type
*
* @return Boolean indicating whether to encode this timestamp column using deprecated int96
* physical type
*/
[[nodiscard]] bool is_enabled_int96_timestamps() const noexcept { return _use_int96_timestamp; }

/**
* @brief Get whether precision has been set for this decimal column
*
Expand Down
1 change: 0 additions & 1 deletion cpp/src/io/parquet/chunk_dict.cu
Original file line number Diff line number Diff line change
Expand Up @@ -145,7 +145,6 @@ CUDF_KERNEL void __launch_bounds__(block_size)
switch (col->physical_type) {
case Type::INT32: return 4;
case Type::INT64: return 8;
case Type::INT96: return 12;
case Type::FLOAT: return 4;
case Type::DOUBLE: return 8;
case Type::BYTE_ARRAY: {
Expand Down
60 changes: 0 additions & 60 deletions cpp/src/io/parquet/page_enc.cu
Original file line number Diff line number Diff line change
Expand Up @@ -115,7 +115,6 @@ constexpr uint32_t physical_type_len(Type physical_type, type_id id, int type_le
return id == type_id::DECIMAL128 ? sizeof(__int128_t) : type_length;
}
switch (physical_type) {
case INT96: return 12u;
case INT64:
case DOUBLE: return sizeof(int64_t);
case BOOLEAN: return 1u;
Expand Down Expand Up @@ -545,7 +544,6 @@ __device__ size_t delta_data_len(Type physical_type,
auto const dtype_len_out = physical_type_len(physical_type, type_id, sizeof(int32_t));
auto const dtype_len = [&]() -> uint32_t {
if (physical_type == INT32) { return int32_logical_len(type_id); }
if (physical_type == INT96) { return sizeof(int64_t); }
return dtype_len_out;
}();

Expand Down Expand Up @@ -1324,27 +1322,6 @@ constexpr auto julian_calendar_epoch_diff()
return sys_days{January / 1 / 1970} - (sys_days{November / 24 / -4713} + 12h);
}

/**
* @brief Converts number `v` of periods of type `PeriodT` into a pair with nanoseconds since
* midnight and number of Julian days. Does not deal with time zones. Used by INT96 code.
*
* @tparam PeriodT a ratio representing the tick period in duration
* @param v count of ticks since epoch
* @return A pair of (nanoseconds, days) where nanoseconds is the number of nanoseconds
* elapsed in the day and days is the number of days from Julian epoch.
*/
template <typename PeriodT>
__device__ auto julian_days_with_time(int64_t v)
{
using namespace cuda::std::chrono;
auto const dur_total = duration<int64_t, PeriodT>{v};
auto const dur_days = floor<days>(dur_total);
auto const dur_time_of_day = dur_total - dur_days;
auto const dur_time_of_day_nanos = duration_cast<nanoseconds>(dur_time_of_day);
auto const julian_days = dur_days + ceil<days>(julian_calendar_epoch_diff());
return std::make_pair(dur_time_of_day_nanos, julian_days);
}

// this has been split out into its own kernel because of the amount of shared memory required
// for the state buffer. encode kernels that don't use the RLE buffer can get started while
// the level data is encoded.
Expand Down Expand Up @@ -1666,7 +1643,6 @@ CUDF_KERNEL void __launch_bounds__(block_size, 8)
auto const dtype_len_out = physical_type_len(physical_type, type_id, s->col.type_length);
auto const dtype_len_in = [&]() -> uint32_t {
if (physical_type == INT32) { return int32_logical_len(type_id); }
if (physical_type == INT96) { return sizeof(int64_t); }
return dtype_len_out;
}();

Expand Down Expand Up @@ -1770,40 +1746,6 @@ CUDF_KERNEL void __launch_bounds__(block_size, 8)
}
encode_value(dst + pos, v, stride);
} break;
case INT96: {
// only PLAIN encoding is supported
int64_t v = s->col.leaf_column->element<int64_t>(val_idx);
int32_t ts_scale = s->col.ts_scale;
if (ts_scale != 0) {
if (ts_scale < 0) {
v /= -ts_scale;
} else {
v *= ts_scale;
}
}

auto const [last_day_nanos, julian_days] = [&] {
using namespace cuda::std::chrono;
switch (s->col.leaf_column->type().id()) {
case type_id::TIMESTAMP_SECONDS:
case type_id::TIMESTAMP_MILLISECONDS: {
return julian_days_with_time<cuda::std::milli>(v);
} break;
case type_id::TIMESTAMP_MICROSECONDS:
case type_id::TIMESTAMP_NANOSECONDS: {
return julian_days_with_time<cuda::std::micro>(v);
} break;
}
return julian_days_with_time<cuda::std::nano>(0);
}();

// the 12 bytes of fixed length data.
v = last_day_nanos.count();
encode_value(dst + pos, v, 1);
uint32_t w = julian_days.count();
encode_value(dst + pos + 8, w, 1);
} break;

case BYTE_ARRAY: {
// only PLAIN encoding is supported
auto const bytes = [](cudf::type_id const type_id,
Expand Down Expand Up @@ -1901,7 +1843,6 @@ CUDF_KERNEL void __launch_bounds__(block_size, 8)
auto const dtype_len_out = physical_type_len(physical_type, type_id, s->col.type_length);
auto const dtype_len_in = [&]() -> uint32_t {
if (physical_type == INT32) { return int32_logical_len(type_id); }
if (physical_type == INT96) { return sizeof(int64_t); }
return dtype_len_out;
}();

Expand Down Expand Up @@ -2033,7 +1974,6 @@ CUDF_KERNEL void __launch_bounds__(block_size, 8)
auto const dtype_len_out = physical_type_len(physical_type, type_id, s->col.type_length);
auto const dtype_len_in = [&]() -> uint32_t {
if (physical_type == INT32) { return int32_logical_len(type_id); }
if (physical_type == INT96) { return sizeof(int64_t); }
return dtype_len_out;
}();

Expand Down
4 changes: 0 additions & 4 deletions cpp/src/io/parquet/predicate_pushdown.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -86,10 +86,6 @@ struct stats_caster {
switch (type) {
case INT32: return targetType<T>(*reinterpret_cast<int32_t const*>(stats_val));
case INT64: return targetType<T>(*reinterpret_cast<int64_t const*>(stats_val));
case INT96: // Deprecated in parquet specification
return targetType<T>(static_cast<__int128_t>(reinterpret_cast<int64_t const*>(stats_val)[0])
<< 32 |
reinterpret_cast<int32_t const*>(stats_val)[2]);
case BYTE_ARRAY: [[fallthrough]];
case FIXED_LEN_BYTE_ARRAY:
if (stats_size == sizeof(T)) {
Expand Down
Loading