Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Read FIXED_LEN_BYTE_ARRAY as binary in parquet reader #13437

Merged
merged 34 commits into from
Aug 24, 2023
Merged
Changes from 2 commits
Commits
Show all changes
34 commits
Select commit Hold shift + click to select a range
138cf3f
Read fixed len byte array as string
PointKernel May 24, 2023
f39a02f
Merge remote-tracking branch 'upstream/branch-23.08' into parquet-fix…
PointKernel May 24, 2023
64783d2
Merge with upstream
PointKernel Jul 3, 2023
fdc8b7e
Merge remote-tracking branch 'upstream/branch-23.08' into parquet-fix…
PointKernel Jul 3, 2023
c7b32b2
TEST
PointKernel Jul 5, 2023
2c685a2
Cleanups
PointKernel Jul 5, 2023
cb6825f
Revert test changes
PointKernel Jul 5, 2023
c75675b
Cleanups
PointKernel Jul 5, 2023
fac2eff
Set up dtype_len properly
PointKernel Jul 5, 2023
86e41a7
updates
PointKernel Jul 5, 2023
7f363b1
Cleanup
PointKernel Jul 5, 2023
70ef046
Use switch instead of if conditions
PointKernel Jul 5, 2023
8eee919
Merge remote-tracking branch 'upstream/branch-23.08' into parquet-fix…
PointKernel Jul 5, 2023
de90782
Merge remote-tracking branch 'upstream/branch-23.10' into parquet-fix…
PointKernel Aug 10, 2023
b393b7b
Fix a bug in string col determination
PointKernel Aug 10, 2023
aab1ced
Fix a bug in string col determination
PointKernel Aug 10, 2023
eeafbf7
Fix a bug: break non-default switchcase
PointKernel Aug 10, 2023
7209027
Cleanups: shortcircuit for FIXED_LEN_BYTE_ARRAY
PointKernel Aug 10, 2023
ac3b7a4
Read FIXED_LEN_BYTE_ARRAY always as binary
PointKernel Aug 11, 2023
a2a622d
Add FIXED_LEN_BYTE_ARRAY pytest
PointKernel Aug 11, 2023
5d01183
Minor fix: compare the lower 3 bits only
PointKernel Aug 11, 2023
9f5ff20
Merge branch 'branch-23.10' into parquet-fixed-len-binary
PointKernel Aug 14, 2023
50ec387
Use a smaller test file
PointKernel Aug 16, 2023
f756e56
Merge remote-tracking branch 'upstream/branch-23.10' into parquet-fix…
PointKernel Aug 16, 2023
6304001
Merge branch 'parquet-fixed-len-binary' of github.com:PointKernel/cud…
PointKernel Aug 16, 2023
a969c00
Minor cleanup: init right after declaration to get rid of if branches
PointKernel Aug 17, 2023
277557c
Cleanups: descriptive name for intermediate vars
PointKernel Aug 17, 2023
b712f5f
Add comment
PointKernel Aug 17, 2023
efc495d
Merge remote-tracking branch 'upstream/branch-23.10' into parquet-fix…
PointKernel Aug 17, 2023
2427416
Update cpp/src/io/parquet/reader_impl.cpp
PointKernel Aug 17, 2023
77b3d38
Merge branch 'branch-23.10' into parquet-fixed-len-binary
PointKernel Aug 21, 2023
6453662
Merge branch 'branch-23.10' into parquet-fixed-len-binary
vuule Aug 23, 2023
dfdcb76
Merge branch 'branch-23.10' into parquet-fixed-len-binary
vuule Aug 24, 2023
ed1c81c
Update python/cudf/cudf/tests/test_parquet.py
PointKernel Aug 24, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
50 changes: 33 additions & 17 deletions cpp/src/io/parquet/page_data.cu
Original file line number Diff line number Diff line change
Expand Up @@ -531,12 +531,20 @@ __device__ size_type gpuInitStringDescriptors(volatile page_state_s* s,

while (pos < target_pos) {
int len;
if (k + 4 <= dict_size) {
len = (cur[k]) | (cur[k + 1] << 8) | (cur[k + 2] << 16) | (cur[k + 3] << 24);
k += 4;
if (k + len > dict_size) { len = 0; }
if ((s->col.data_type & 7) == FIXED_LEN_BYTE_ARRAY) {
if (k < dict_size) {
len = s->dtype_len_in;
} else {
len = 0;
}
} else {
len = 0;
if (k + 4 <= dict_size) {
len = (cur[k]) | (cur[k + 1] << 8) | (cur[k + 2] << 16) | (cur[k + 3] << 24);
k += 4;
if (k + len > dict_size) { len = 0; }
} else {
len = 0;
}
}
if constexpr (!sizes_only) {
sb->dict_idx[rolling_index(pos)] = k;
Expand Down Expand Up @@ -603,7 +611,7 @@ inline __device__ void gpuOutputString(volatile page_state_s* s,
void* dstv)
{
auto [ptr, len] = gpuGetStringData(s, sb, src_pos);
if (s->dtype_len == 4) {
if (s->dtype_len == 4 and (s->col.data_type & 7) == BYTE_ARRAY) {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should this really be a logical and? It seems in the past we would take this path if the data size was 4 bytes, but now we only do it if the data size if 4 bytes AND it is a byte array.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Before this PR, only BYTE_ARRAY invokes gpuOutputString and the input length cannot be 4 in that case. Now with function being potentially invoked by FIXED_LEN_BYTE_ARRAY where the length could be 4, this and logic is needed.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

maybe a comment along the lines of "// make sure to only hash variable length byte arrays when specified with the output type size"?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

not really related to this PR, but we have this dtype_len == 4 condition as a stand-in for "output hash" in a few places and it really requires detailed knowledge of the code to understand. Nothing actionable, I just don't like it :D

// Output hash. This hash value is used if the option to convert strings to
// categoricals is enabled. The seed value is chosen arbitrarily.
uint32_t constexpr hash_seed = 33;
Expand Down Expand Up @@ -1208,7 +1216,7 @@ static __device__ bool setupLocalPageInfo(page_state_s* const s,
case Encoding::PLAIN_DICTIONARY:
case Encoding::RLE_DICTIONARY:
// RLE-packed dictionary indices, first byte indicates index length in bits
if (((s->col.data_type & 7) == BYTE_ARRAY) && (s->col.str_dict_index)) {
if ((data_type == BYTE_ARRAY) && (s->col.str_dict_index)) {
// String dictionary: use index
s->dict_base = reinterpret_cast<const uint8_t*>(s->col.str_dict_index);
s->dict_size = s->col.page_info[0].num_input_values * sizeof(string_index_pair);
Expand All @@ -1225,7 +1233,7 @@ static __device__ bool setupLocalPageInfo(page_state_s* const s,
case Encoding::PLAIN:
s->dict_size = static_cast<int32_t>(end - cur);
s->dict_val = 0;
if ((s->col.data_type & 7) == BOOLEAN) { s->dict_run = s->dict_size * 2 + 1; }
if (data_type == BOOLEAN) { s->dict_run = s->dict_size * 2 + 1; }
break;
case Encoding::RLE: s->dict_run = 0; break;
default:
Expand Down Expand Up @@ -2027,11 +2035,14 @@ __global__ void __launch_bounds__(decode_block_size) gpuDecodePageData(
return;
}

auto const data_type = s->col.data_type & 7;
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I liked seeing this change above to use this type, but why not move this above and use this variable all around?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

data_type is not used that often in other functions. Let me have a further look.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

IMO ColumnChunkDesc should have data_type() and type_len() so the bitwise packing madness is hidden from the rest of the code.
data_type & 7 appears 11 times in the code :\

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can we use parquet::Type directly instead of uint16_t?

uint16_t data_type; // basic column data type, ((type_length << 3) |
// parquet::Type)

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

(for some reason) data_type holds the type in the last three bits and length in the rest of the number. Using Type here would make this data member even more misleading.


if (s->dict_base) {
out_thread0 = (s->dict_bits > 0) ? 64 : 32;
} else {
out_thread0 =
((s->col.data_type & 7) == BOOLEAN || (s->col.data_type & 7) == BYTE_ARRAY) ? 64 : 32;
(data_type == BOOLEAN || data_type == BYTE_ARRAY || data_type == FIXED_LEN_BYTE_ARRAY) ? 64
PointKernel marked this conversation as resolved.
Show resolved Hide resolved
: 32;
}

PageNestingDecodeInfo* nesting_info_base = s->nesting_info;
Expand Down Expand Up @@ -2066,15 +2077,14 @@ __global__ void __launch_bounds__(decode_block_size) gpuDecodePageData(
// WARP1: Decode dictionary indices, booleans or string positions
if (s->dict_base) {
src_target_pos = gpuDecodeDictionaryIndices<false>(s, sb, src_target_pos, t & 0x1f).first;
} else if ((s->col.data_type & 7) == BOOLEAN) {
} else if (data_type == BOOLEAN) {
src_target_pos = gpuDecodeRleBooleans(s, sb, src_target_pos, t & 0x1f);
} else if ((s->col.data_type & 7) == BYTE_ARRAY) {
} else if (data_type == BYTE_ARRAY or data_type == FIXED_LEN_BYTE_ARRAY) {
gpuInitStringDescriptors<false>(s, sb, src_target_pos, t & 0x1f);
}
if (t == 32) { *(volatile int32_t*)&s->dict_pos = src_target_pos; }
} else {
// WARP1..WARP3: Decode values
int dtype = s->col.data_type & 7;
src_pos += t - out_thread0;

// the position in the output column/buffer
Expand Down Expand Up @@ -2105,10 +2115,10 @@ __global__ void __launch_bounds__(decode_block_size) gpuDecodePageData(
// nesting level that is storing actual leaf values
int leaf_level_index = s->col.max_nesting_depth - 1;

uint32_t dtype_len = s->dtype_len;
auto const dtype_len = s->dtype_len;
void* dst =
nesting_info_base[leaf_level_index].data_out + static_cast<size_t>(dst_pos) * dtype_len;
if (dtype == BYTE_ARRAY) {
if (data_type == BYTE_ARRAY) {
if (s->col.converted_type == DECIMAL) {
auto const [ptr, len] = gpuGetStringData(s, sb, val_src_pos);
auto const decimal_precision = s->col.decimal_precision;
Expand All @@ -2122,10 +2132,10 @@ __global__ void __launch_bounds__(decode_block_size) gpuDecodePageData(
} else {
gpuOutputString(s, sb, val_src_pos, dst);
}
} else if (dtype == BOOLEAN) {
} else if (data_type == BOOLEAN) {
gpuOutputBoolean(sb, val_src_pos, static_cast<uint8_t*>(dst));
} else if (s->col.converted_type == DECIMAL) {
switch (dtype) {
switch (data_type) {
case INT32: gpuOutputFast(s, sb, val_src_pos, static_cast<uint32_t*>(dst)); break;
case INT64: gpuOutputFast(s, sb, val_src_pos, static_cast<uint2*>(dst)); break;
default:
Expand All @@ -2138,7 +2148,13 @@ __global__ void __launch_bounds__(decode_block_size) gpuDecodePageData(
}
break;
}
} else if (dtype == INT96) {
} else if (data_type == FIXED_LEN_BYTE_ARRAY) {
gpuOutputString(s,
sb,
val_src_pos,
nesting_info_base[leaf_level_index].data_out +
static_cast<size_t>(dst_pos) * sizeof(string_index_pair));
PointKernel marked this conversation as resolved.
Show resolved Hide resolved
} else if (data_type == INT96) {
gpuOutputInt96Timestamp(s, sb, val_src_pos, static_cast<int64_t*>(dst));
} else if (dtype_len == 8) {
if (s->dtype_len_in == 4) {
Expand Down