-
Notifications
You must be signed in to change notification settings - Fork 915
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Read FIXED_LEN_BYTE_ARRAY as binary in parquet reader #13437
Merged
rapids-bot
merged 34 commits into
rapidsai:branch-23.10
from
PointKernel:parquet-fixed-len-binary
Aug 24, 2023
Merged
Changes from 29 commits
Commits
Show all changes
34 commits
Select commit
Hold shift + click to select a range
138cf3f
Read fixed len byte array as string
PointKernel f39a02f
Merge remote-tracking branch 'upstream/branch-23.08' into parquet-fix…
PointKernel 64783d2
Merge with upstream
PointKernel fdc8b7e
Merge remote-tracking branch 'upstream/branch-23.08' into parquet-fix…
PointKernel c7b32b2
TEST
PointKernel 2c685a2
Cleanups
PointKernel cb6825f
Revert test changes
PointKernel c75675b
Cleanups
PointKernel fac2eff
Set up dtype_len properly
PointKernel 86e41a7
updates
PointKernel 7f363b1
Cleanup
PointKernel 70ef046
Use switch instead of if conditions
PointKernel 8eee919
Merge remote-tracking branch 'upstream/branch-23.08' into parquet-fix…
PointKernel de90782
Merge remote-tracking branch 'upstream/branch-23.10' into parquet-fix…
PointKernel b393b7b
Fix a bug in string col determination
PointKernel aab1ced
Fix a bug in string col determination
PointKernel eeafbf7
Fix a bug: break non-default switchcase
PointKernel 7209027
Cleanups: shortcircuit for FIXED_LEN_BYTE_ARRAY
PointKernel ac3b7a4
Read FIXED_LEN_BYTE_ARRAY always as binary
PointKernel a2a622d
Add FIXED_LEN_BYTE_ARRAY pytest
PointKernel 5d01183
Minor fix: compare the lower 3 bits only
PointKernel 9f5ff20
Merge branch 'branch-23.10' into parquet-fixed-len-binary
PointKernel 50ec387
Use a smaller test file
PointKernel f756e56
Merge remote-tracking branch 'upstream/branch-23.10' into parquet-fix…
PointKernel 6304001
Merge branch 'parquet-fixed-len-binary' of github.com:PointKernel/cud…
PointKernel a969c00
Minor cleanup: init right after declaration to get rid of if branches
PointKernel 277557c
Cleanups: descriptive name for intermediate vars
PointKernel b712f5f
Add comment
PointKernel efc495d
Merge remote-tracking branch 'upstream/branch-23.10' into parquet-fix…
PointKernel 2427416
Update cpp/src/io/parquet/reader_impl.cpp
PointKernel 77b3d38
Merge branch 'branch-23.10' into parquet-fixed-len-binary
PointKernel 6453662
Merge branch 'branch-23.10' into parquet-fixed-len-binary
vuule dfdcb76
Merge branch 'branch-23.10' into parquet-fixed-len-binary
vuule ed1c81c
Update python/cudf/cudf/tests/test_parquet.py
PointKernel File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -41,7 +41,8 @@ inline __device__ void gpuOutputString(volatile page_state_s* s, | |
void* dstv) | ||
{ | ||
auto [ptr, len] = gpuGetStringData(s, sb, src_pos); | ||
if (s->dtype_len == 4) { | ||
// make sure to only hash `BYTE_ARRAY` when specified with the output type size | ||
if (s->dtype_len == 4 and (s->col.data_type & 7) == BYTE_ARRAY) { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. not really related to this PR, but we have this |
||
// Output hash. This hash value is used if the option to convert strings to | ||
// categoricals is enabled. The seed value is chosen arbitrarily. | ||
uint32_t constexpr hash_seed = 33; | ||
|
@@ -774,8 +775,12 @@ __global__ void __launch_bounds__(decode_block_size) gpuDecodePageData( | |
if (s->dict_base) { | ||
out_thread0 = (s->dict_bits > 0) ? 64 : 32; | ||
} else { | ||
out_thread0 = | ||
((s->col.data_type & 7) == BOOLEAN || (s->col.data_type & 7) == BYTE_ARRAY) ? 64 : 32; | ||
switch (s->col.data_type & 7) { | ||
case BOOLEAN: [[fallthrough]]; | ||
case BYTE_ARRAY: [[fallthrough]]; | ||
case FIXED_LEN_BYTE_ARRAY: out_thread0 = 64; break; | ||
default: out_thread0 = 32; | ||
} | ||
} | ||
|
||
PageNestingDecodeInfo* nesting_info_base = s->nesting_info; | ||
|
@@ -812,7 +817,8 @@ __global__ void __launch_bounds__(decode_block_size) gpuDecodePageData( | |
src_target_pos = gpuDecodeDictionaryIndices<false>(s, sb, src_target_pos, t & 0x1f).first; | ||
} else if ((s->col.data_type & 7) == BOOLEAN) { | ||
src_target_pos = gpuDecodeRleBooleans(s, sb, src_target_pos, t & 0x1f); | ||
} else if ((s->col.data_type & 7) == BYTE_ARRAY) { | ||
} else if ((s->col.data_type & 7) == BYTE_ARRAY or | ||
(s->col.data_type & 7) == FIXED_LEN_BYTE_ARRAY) { | ||
gpuInitStringDescriptors<false>(s, sb, src_target_pos, t & 0x1f); | ||
} | ||
if (t == 32) { *(volatile int32_t*)&s->dict_pos = src_target_pos; } | ||
|
@@ -882,6 +888,8 @@ __global__ void __launch_bounds__(decode_block_size) gpuDecodePageData( | |
} | ||
break; | ||
} | ||
} else if (dtype == FIXED_LEN_BYTE_ARRAY) { | ||
gpuOutputString(s, sb, val_src_pos, dst); | ||
} else if (dtype == INT96) { | ||
gpuOutputInt96Timestamp(s, sb, val_src_pos, static_cast<int64_t*>(dst)); | ||
} else if (dtype_len == 8) { | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
PointKernel marked this conversation as resolved.
Show resolved
Hide resolved
|
Binary file not shown.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Should this really be a logical and? It seems in the past we would take this path if the data size was 4 bytes, but now we only do it if the data size if 4 bytes AND it is a byte array.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Before this PR, only
BYTE_ARRAY
invokesgpuOutputString
and the input length cannot be 4 in that case. Now with function being potentially invoked byFIXED_LEN_BYTE_ARRAY
where the length could be 4, thisand
logic is needed.There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
maybe a comment along the lines of "// make sure to only hash variable length byte arrays when specified with the output type size"?