-
Notifications
You must be signed in to change notification settings - Fork 915
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Read FIXED_LEN_BYTE_ARRAY as binary in parquet reader #13437
Changes from 2 commits
138cf3f
f39a02f
64783d2
fdc8b7e
c7b32b2
2c685a2
cb6825f
c75675b
fac2eff
86e41a7
7f363b1
70ef046
8eee919
de90782
b393b7b
aab1ced
eeafbf7
7209027
ac3b7a4
a2a622d
5d01183
9f5ff20
50ec387
f756e56
6304001
a969c00
277557c
b712f5f
efc495d
2427416
77b3d38
6453662
dfdcb76
ed1c81c
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change | ||||
---|---|---|---|---|---|---|
|
@@ -531,12 +531,20 @@ __device__ size_type gpuInitStringDescriptors(volatile page_state_s* s, | |||||
|
||||||
while (pos < target_pos) { | ||||||
int len; | ||||||
if (k + 4 <= dict_size) { | ||||||
len = (cur[k]) | (cur[k + 1] << 8) | (cur[k + 2] << 16) | (cur[k + 3] << 24); | ||||||
k += 4; | ||||||
if (k + len > dict_size) { len = 0; } | ||||||
if ((s->col.data_type & 7) == FIXED_LEN_BYTE_ARRAY) { | ||||||
if (k < dict_size) { | ||||||
len = s->dtype_len_in; | ||||||
} else { | ||||||
len = 0; | ||||||
} | ||||||
} else { | ||||||
len = 0; | ||||||
if (k + 4 <= dict_size) { | ||||||
len = (cur[k]) | (cur[k + 1] << 8) | (cur[k + 2] << 16) | (cur[k + 3] << 24); | ||||||
k += 4; | ||||||
if (k + len > dict_size) { len = 0; } | ||||||
} else { | ||||||
len = 0; | ||||||
} | ||||||
} | ||||||
if constexpr (!sizes_only) { | ||||||
sb->dict_idx[rolling_index(pos)] = k; | ||||||
|
@@ -603,7 +611,7 @@ inline __device__ void gpuOutputString(volatile page_state_s* s, | |||||
void* dstv) | ||||||
{ | ||||||
auto [ptr, len] = gpuGetStringData(s, sb, src_pos); | ||||||
if (s->dtype_len == 4) { | ||||||
if (s->dtype_len == 4 and (s->col.data_type & 7) == BYTE_ARRAY) { | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. not really related to this PR, but we have this |
||||||
// Output hash. This hash value is used if the option to convert strings to | ||||||
// categoricals is enabled. The seed value is chosen arbitrarily. | ||||||
uint32_t constexpr hash_seed = 33; | ||||||
|
@@ -1208,7 +1216,7 @@ static __device__ bool setupLocalPageInfo(page_state_s* const s, | |||||
case Encoding::PLAIN_DICTIONARY: | ||||||
case Encoding::RLE_DICTIONARY: | ||||||
// RLE-packed dictionary indices, first byte indicates index length in bits | ||||||
if (((s->col.data_type & 7) == BYTE_ARRAY) && (s->col.str_dict_index)) { | ||||||
if ((data_type == BYTE_ARRAY) && (s->col.str_dict_index)) { | ||||||
// String dictionary: use index | ||||||
s->dict_base = reinterpret_cast<const uint8_t*>(s->col.str_dict_index); | ||||||
s->dict_size = s->col.page_info[0].num_input_values * sizeof(string_index_pair); | ||||||
|
@@ -1225,7 +1233,7 @@ static __device__ bool setupLocalPageInfo(page_state_s* const s, | |||||
case Encoding::PLAIN: | ||||||
s->dict_size = static_cast<int32_t>(end - cur); | ||||||
s->dict_val = 0; | ||||||
if ((s->col.data_type & 7) == BOOLEAN) { s->dict_run = s->dict_size * 2 + 1; } | ||||||
if (data_type == BOOLEAN) { s->dict_run = s->dict_size * 2 + 1; } | ||||||
break; | ||||||
case Encoding::RLE: s->dict_run = 0; break; | ||||||
default: | ||||||
|
@@ -2027,11 +2035,14 @@ __global__ void __launch_bounds__(decode_block_size) gpuDecodePageData( | |||||
return; | ||||||
} | ||||||
|
||||||
auto const data_type = s->col.data_type & 7; | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I liked seeing this change above to use this type, but why not move this above and use this variable all around? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. IMO ColumnChunkDesc should have There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Can we use cudf/cpp/src/io/parquet/parquet_gpu.hpp Lines 266 to 267 in 41f0caf
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. (for some reason) data_type holds the type in the last three bits and length in the rest of the number. Using Type here would make this data member even more misleading. |
||||||
|
||||||
if (s->dict_base) { | ||||||
out_thread0 = (s->dict_bits > 0) ? 64 : 32; | ||||||
} else { | ||||||
out_thread0 = | ||||||
((s->col.data_type & 7) == BOOLEAN || (s->col.data_type & 7) == BYTE_ARRAY) ? 64 : 32; | ||||||
(data_type == BOOLEAN || data_type == BYTE_ARRAY || data_type == FIXED_LEN_BYTE_ARRAY) ? 64 | ||||||
PointKernel marked this conversation as resolved.
Show resolved
Hide resolved
|
||||||
: 32; | ||||||
} | ||||||
|
||||||
PageNestingDecodeInfo* nesting_info_base = s->nesting_info; | ||||||
|
@@ -2066,15 +2077,14 @@ __global__ void __launch_bounds__(decode_block_size) gpuDecodePageData( | |||||
// WARP1: Decode dictionary indices, booleans or string positions | ||||||
if (s->dict_base) { | ||||||
src_target_pos = gpuDecodeDictionaryIndices<false>(s, sb, src_target_pos, t & 0x1f).first; | ||||||
} else if ((s->col.data_type & 7) == BOOLEAN) { | ||||||
} else if (data_type == BOOLEAN) { | ||||||
src_target_pos = gpuDecodeRleBooleans(s, sb, src_target_pos, t & 0x1f); | ||||||
} else if ((s->col.data_type & 7) == BYTE_ARRAY) { | ||||||
} else if (data_type == BYTE_ARRAY or data_type == FIXED_LEN_BYTE_ARRAY) { | ||||||
gpuInitStringDescriptors<false>(s, sb, src_target_pos, t & 0x1f); | ||||||
} | ||||||
if (t == 32) { *(volatile int32_t*)&s->dict_pos = src_target_pos; } | ||||||
} else { | ||||||
// WARP1..WARP3: Decode values | ||||||
int dtype = s->col.data_type & 7; | ||||||
src_pos += t - out_thread0; | ||||||
|
||||||
// the position in the output column/buffer | ||||||
|
@@ -2105,10 +2115,10 @@ __global__ void __launch_bounds__(decode_block_size) gpuDecodePageData( | |||||
// nesting level that is storing actual leaf values | ||||||
int leaf_level_index = s->col.max_nesting_depth - 1; | ||||||
|
||||||
uint32_t dtype_len = s->dtype_len; | ||||||
auto const dtype_len = s->dtype_len; | ||||||
void* dst = | ||||||
nesting_info_base[leaf_level_index].data_out + static_cast<size_t>(dst_pos) * dtype_len; | ||||||
if (dtype == BYTE_ARRAY) { | ||||||
if (data_type == BYTE_ARRAY) { | ||||||
if (s->col.converted_type == DECIMAL) { | ||||||
auto const [ptr, len] = gpuGetStringData(s, sb, val_src_pos); | ||||||
auto const decimal_precision = s->col.decimal_precision; | ||||||
|
@@ -2122,10 +2132,10 @@ __global__ void __launch_bounds__(decode_block_size) gpuDecodePageData( | |||||
} else { | ||||||
gpuOutputString(s, sb, val_src_pos, dst); | ||||||
} | ||||||
} else if (dtype == BOOLEAN) { | ||||||
} else if (data_type == BOOLEAN) { | ||||||
gpuOutputBoolean(sb, val_src_pos, static_cast<uint8_t*>(dst)); | ||||||
} else if (s->col.converted_type == DECIMAL) { | ||||||
switch (dtype) { | ||||||
switch (data_type) { | ||||||
case INT32: gpuOutputFast(s, sb, val_src_pos, static_cast<uint32_t*>(dst)); break; | ||||||
case INT64: gpuOutputFast(s, sb, val_src_pos, static_cast<uint2*>(dst)); break; | ||||||
default: | ||||||
|
@@ -2138,7 +2148,13 @@ __global__ void __launch_bounds__(decode_block_size) gpuDecodePageData( | |||||
} | ||||||
break; | ||||||
} | ||||||
} else if (dtype == INT96) { | ||||||
} else if (data_type == FIXED_LEN_BYTE_ARRAY) { | ||||||
gpuOutputString(s, | ||||||
sb, | ||||||
val_src_pos, | ||||||
nesting_info_base[leaf_level_index].data_out + | ||||||
static_cast<size_t>(dst_pos) * sizeof(string_index_pair)); | ||||||
PointKernel marked this conversation as resolved.
Show resolved
Hide resolved
|
||||||
} else if (data_type == INT96) { | ||||||
gpuOutputInt96Timestamp(s, sb, val_src_pos, static_cast<int64_t*>(dst)); | ||||||
} else if (dtype_len == 8) { | ||||||
if (s->dtype_len_in == 4) { | ||||||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Should this really be a logical and? It seems in the past we would take this path if the data size was 4 bytes, but now we only do it if the data size if 4 bytes AND it is a byte array.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Before this PR, only
BYTE_ARRAY
invokesgpuOutputString
and the input length cannot be 4 in that case. Now with function being potentially invoked byFIXED_LEN_BYTE_ARRAY
where the length could be 4, thisand
logic is needed.There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
maybe a comment along the lines of "// make sure to only hash variable length byte arrays when specified with the output type size"?