Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add support for decimal types in ORC writer #8198

Merged
merged 40 commits into from
May 18, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
40 commits
Select commit Hold shift + click to select a range
cd18773
placeholder comments
vuule Apr 22, 2021
5b84898
encoded size calc first impl
vuule Apr 23, 2021
c49f497
Merge branch 'branch-0.20' of https://github.com/rapidsai/cudf into f…
vuule Apr 23, 2021
d6c0e38
add per-rowgroup scan; gather rowgroup sizes
vuule Apr 24, 2021
250557b
Merge branch 'branch-0.20' of https://github.com/rapidsai/cudf into f…
vuule Apr 26, 2021
c62ea6e
work in progress
vuule Apr 26, 2021
04ea51c
Merge branch 'branch-0.20' of https://github.com/rapidsai/cudf into f…
vuule Apr 27, 2021
7e1b88c
Merge branch 'branch-0.20' of https://github.com/rapidsai/cudf into f…
vuule Apr 27, 2021
75b0228
Merge branch 'branch-0.20' of https://github.com/rapidsai/cudf into f…
vuule Apr 27, 2021
0e93c6e
use actual rowgroup size
vuule Apr 27, 2021
6d89186
keep decimal sizes in maps
vuule Apr 27, 2021
1995ffa
aggregated column sizes; set decimal stream sizes
vuule Apr 29, 2021
d91890a
null support in size calc
vuule Apr 30, 2021
1d090b7
rename str_data_size; sink decimal sizes into encode_columns
vuule Apr 30, 2021
68321d7
add chunk streams initialization
vuule Apr 30, 2021
ee2ce7c
add decimal data to non_rle data size
vuule Apr 30, 2021
f6bd00b
stats type
vuule May 1, 2021
9e6c250
Merge branch 'branch-0.20' of https://github.com/rapidsai/cudf into f…
vuule May 1, 2021
f642a60
write scale to schema
vuule May 3, 2021
4d7d68f
Merge branch 'branch-0.20' of https://github.com/rapidsai/cudf into f…
vuule May 3, 2021
7db2239
Merge branch 'branch-0.20' of https://github.com/rapidsai/cudf into f…
vuule May 4, 2021
b08228a
set chunk/column scale for decimal
vuule May 5, 2021
15d116d
Merge branch 'branch-0.20' of https://github.com/rapidsai/cudf into f…
vuule May 6, 2021
f2069a9
scale encoding
vuule May 6, 2021
88e6643
Merge branch 'branch-0.20' of https://github.com/rapidsai/cudf into f…
vuule May 7, 2021
c9d8369
somewhat working encode
vuule May 7, 2021
ab58409
Merge branch 'branch-0.20' of https://github.com/rapidsai/cudf into f…
vuule May 7, 2021
9e438bd
scale fix
vuule May 7, 2021
9ab5bad
bug fixes
vuule May 10, 2021
c3675f5
fix index streams issue
vuule May 11, 2021
e63d29e
write precision to schema
vuule May 11, 2021
b9fc006
decimal32
vuule May 11, 2021
1e5a52e
move test to c++
vuule May 12, 2021
881fb5a
add decimal to existing tests
vuule May 12, 2021
654f208
Merge branch 'branch-0.20' of https://github.com/rapidsai/cudf into f…
vuule May 12, 2021
9b6690e
add missing import
vuule May 12, 2021
9dfc059
polish
vuule May 12, 2021
5157d12
Merge branch 'branch-0.20' of https://github.com/rapidsai/cudf into f…
vuule May 13, 2021
4aab543
code review pt. 1
vuule May 14, 2021
5053f37
reviews pt2
vuule May 14, 2021
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions cpp/src/io/orc/orc.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -227,7 +227,7 @@ void ProtobufWriter::put_row_index_entry(int32_t present_blk,
if (data_blk >= 0) { sz += put_uint(data_blk); }
if (data_ofs >= 0) {
sz += put_uint(data_ofs);
if (kind != STRING && kind != FLOAT && kind != DOUBLE) {
if (kind != STRING && kind != FLOAT && kind != DOUBLE && kind != DECIMAL) {
putb(0); // RLE run pos always zero (assumes RLE aligned with row index boundaries)
sz++;
if (kind == BOOLEAN) {
Expand Down Expand Up @@ -293,8 +293,8 @@ size_t ProtobufWriter::write(const SchemaType &s)
w.field_packed_uint(2, s.subtypes);
w.field_repeated_string(3, s.fieldNames);
// w.field_uint(4, s.maximumLength);
// w.field_uint(5, s.precision);
// w.field_uint(6, s.scale);
if (s.precision) w.field_uint(5, *s.precision);
if (s.scale) w.field_uint(6, *s.scale);
return w.value();
}

Expand Down
8 changes: 4 additions & 4 deletions cpp/src/io/orc/orc.h
Original file line number Diff line number Diff line change
Expand Up @@ -56,10 +56,10 @@ struct SchemaType {
TypeKind kind = INVALID_TYPE_KIND; // the kind of this type
std::vector<uint32_t> subtypes; // the type ids of any subcolumns for list, map, struct, or union
std::vector<std::string> fieldNames; // the list of field names for struct
uint32_t maximumLength =
0; // optional: the maximum length of the type for varchar or char in UTF-8 characters
uint32_t precision = 0; // optional: the precision and scale for decimal
uint32_t scale = 0;
std::optional<uint32_t>
maximumLength; // the maximum length of the type for varchar or char in UTF-8 characters
std::optional<uint32_t> precision; // the precision for decimal
std::optional<uint32_t> scale; // the scale for decimal
};

struct UserMetadataItem {
Expand Down
6 changes: 3 additions & 3 deletions cpp/src/io/orc/orc_gpu.h
Original file line number Diff line number Diff line change
Expand Up @@ -99,8 +99,7 @@ struct ColumnDesc {
uint8_t encoding_kind; // column encoding kind (orc::ColumnEncodingKind)
uint8_t type_kind; // column data type (orc::TypeKind)
uint8_t dtype_len; // data type length (for types that can be mapped to different sizes)
uint8_t decimal_scale; // number of fractional decimal digits for decimal type (bit 7 set if
// converting to float64)
int32_t decimal_scale; // number of fractional decimal digits for decimal type
int32_t ts_clock_rate; // output timestamp clock frequency (0=default, 1000=ms, 1000000000=ns)
};

Expand All @@ -122,9 +121,10 @@ struct EncChunk {
uint8_t encoding_kind; // column encoding kind (orc::ColumnEncodingKind)
uint8_t type_kind; // column data type (orc::TypeKind)
uint8_t dtype_len; // data type length
uint8_t scale; // scale for decimals or timestamps
int32_t scale; // scale for decimals or timestamps

uint32_t *dict_index; // dictionary index from row index
device_span<uint32_t> decimal_offsets;
column_device_view *leaf_column;
};

Expand Down
4 changes: 2 additions & 2 deletions cpp/src/io/orc/reader_impl.cu
Original file line number Diff line number Diff line change
Expand Up @@ -434,7 +434,7 @@ table_with_metadata reader::impl::read(size_type skip_rows,
// sign of the scale is changed since cuDF follows c++ libraries like CNL
// which uses negative scaling, but liborc and other libraries
// follow positive scaling.
auto const scale = -static_cast<int32_t>(_metadata->ff.types[col].scale);
auto const scale = -static_cast<int32_t>(_metadata->ff.types[col].scale.value_or(0));
column_types.emplace_back(col_type, scale);
} else {
column_types.emplace_back(col_type);
Expand Down Expand Up @@ -526,7 +526,7 @@ table_with_metadata reader::impl::read(size_type skip_rows,
chunk.num_rows = stripe_info->numberOfRows;
chunk.encoding_kind = stripe_footer->columns[_selected_columns[j]].kind;
chunk.type_kind = _metadata->ff.types[_selected_columns[j]].kind;
chunk.decimal_scale = _metadata->ff.types[_selected_columns[j]].scale;
chunk.decimal_scale = _metadata->ff.types[_selected_columns[j]].scale.value_or(0);
chunk.rowgroup_id = num_rowgroups;
chunk.dtype_len = (column_types[j].id() == type_id::STRING)
? sizeof(std::pair<const char *, size_t>)
Expand Down
21 changes: 19 additions & 2 deletions cpp/src/io/orc/stripe_enc.cu
Original file line number Diff line number Diff line change
Expand Up @@ -781,6 +781,9 @@ __global__ void __launch_bounds__(block_size)
s->lengths.u32[nz_idx] = value.size_bytes();
}
break;
// Reusing the lengths array for the scale stream
// Note: can be written in a faster manner, given that all values are equal
case DECIMAL: s->lengths.u32[nz_idx] = zigzag(s->chunk.scale); break;
default: break;
}
}
Expand Down Expand Up @@ -814,7 +817,7 @@ __global__ void __launch_bounds__(block_size)
uint32_t nz = s->buf.u32[511];
s->nnz += nz;
s->numvals += nz;
s->numlengths += (s->chunk.type_kind == TIMESTAMP ||
s->numlengths += (s->chunk.type_kind == TIMESTAMP || s->chunk.type_kind == DECIMAL ||
(s->chunk.type_kind == STRING && s->chunk.encoding_kind != DICTIONARY_V2))
? nz
: 0;
Expand Down Expand Up @@ -865,6 +868,17 @@ __global__ void __launch_bounds__(block_size)
n = s->numvals;
}
break;
case DECIMAL: {
if (valid) {
uint64_t const zz_val = (s->chunk.leaf_column->type().id() == type_id::DECIMAL32)
? zigzag(s->chunk.leaf_column->element<int32_t>(row))
: zigzag(s->chunk.leaf_column->element<int64_t>(row));
auto const offset =
(row == s->chunk.start_row) ? 0 : s->chunk.decimal_offsets[row - 1];
StoreVarint(s->stream.data_ptrs[CI_DATA] + offset, zz_val);
}
n = s->numvals;
} break;
default: n = s->numvals; break;
}
__syncthreads();
Expand All @@ -878,6 +892,7 @@ __global__ void __launch_bounds__(block_size)
n = IntegerRLE<CI_DATA2, uint64_t, false, 0x3ff, block_size>(
s, s->lengths.u64, s->nnz - s->numlengths, s->numlengths, flush, t, temp_storage.u64);
break;
case DECIMAL:
rgsl888prabhu marked this conversation as resolved.
Show resolved Hide resolved
case STRING:
n = IntegerRLE<CI_DATA2, uint32_t, false, 0x3ff, block_size>(
s, s->lengths.u32, s->nnz - s->numlengths, s->numlengths, flush, t, temp_storage.u32);
Expand All @@ -893,7 +908,9 @@ __global__ void __launch_bounds__(block_size)
__syncthreads();
if (t <= CI_PRESENT && s->stream.ids[t] >= 0) {
// Update actual compressed length
streams[col_id][group_id].lengths[t] = s->strm_pos[t];
// (not needed for decimal data, whose exact size is known before encode)
if (!(t == CI_DATA && s->chunk.type_kind == DECIMAL))
streams[col_id][group_id].lengths[t] = s->strm_pos[t];
if (!s->stream.data_ptrs[t]) {
streams[col_id][group_id].data_ptrs[t] =
static_cast<uint8_t *>(const_cast<void *>(s->chunk.leaf_column->head())) +
Expand Down
Loading