Add low memory JSON reader for cudf.pandas
#16204
Merged
Commits (15)

ed86600  Implement chunked json reader (galipremsagar)
df6266d  Merge (galipremsagar)
8427e4a  add tests (galipremsagar)
8a81ad5  merge (galipremsagar)
04dca59  Merge remote-tracking branch 'upstream/branch-24.08' into 16122 (galipremsagar)
a007761  fix syntax (galipremsagar)
1bf5569  move common code together (galipremsagar)
872a1fe  move common code together (galipremsagar)
a958de1  Merge branch 'branch-24.08' into 16122 (galipremsagar)
bf2578e  update docstring (galipremsagar)
9bcc074  Merge branch 'branch-24.08' into 16122 (galipremsagar)
5dbb708  Merge branch 'branch-24.08' into 16122 (galipremsagar)
64992f8  Update python/cudf/cudf/_lib/pylibcudf/io/json.pyx (galipremsagar)
9bbe58e  Merge branch 'branch-24.08' into 16122 (galipremsagar)
a70d841  Update json.pyx (galipremsagar)
Files changed: json.pyx

@@ -27,9 +27,14 @@ from cudf._lib.pylibcudf.libcudf.io.types cimport (
 )
 from cudf._lib.pylibcudf.libcudf.types cimport data_type, size_type
 from cudf._lib.types cimport dtype_to_data_type
-from cudf._lib.utils cimport data_from_unique_ptr
+from cudf._lib.utils cimport (
+    columns_from_unique_ptr,
+    data_from_pylibcudf_table,
+    data_from_unique_ptr,
+)

 import cudf._lib.pylibcudf as plc
+from cudf._lib.concat import concat_columns


 cdef json_recovery_mode_t _get_json_recovery_mode(object on_bad_lines):

@@ -40,25 +45,17 @@ cdef json_recovery_mode_t _get_json_recovery_mode(object on_bad_lines):
     else:
         raise TypeError(f"Invalid parameter for {on_bad_lines=}")


-cpdef read_json(object filepaths_or_buffers,
-                object dtype,
-                bool lines,
-                object compression,
-                object byte_range,
-                bool keep_quotes,
-                bool mixed_types_as_string,
-                bool prune_columns,
-                object on_bad_lines):
-    """
-    Cython function to call into libcudf API, see `read_json`.
-
-    See Also
-    --------
-    cudf.io.json.read_json
-    cudf.io.json.to_json
-    """
+cdef json_reader_options _setup_json_reader_options(
+        object filepaths_or_buffers,
+        object dtype,
+        object compression,
+        bool keep_quotes,
+        bool mixed_types_as_string,
+        bool prune_columns,
+        object on_bad_lines,
+        bool lines,
+        size_type byte_range_offset,
+        size_type byte_range_size):
     # If input data is a JSON string (or StringIO), hold a reference to
     # the encoded memoryview externally to ensure the encoded buffer
     # isn't destroyed before calling libcudf `read_json()`

@@ -74,14 +71,6 @@ cpdef read_json(object filepaths_or_buffers,
     cdef vector[data_type] c_dtypes_list
     cdef map[string, schema_element] c_dtypes_schema_map
     cdef cudf_io_types.compression_type c_compression
-    # Determine byte read offsets if applicable
-    cdef size_type c_range_offset = (
-        byte_range[0] if byte_range is not None else 0
-    )
-    cdef size_type c_range_size = (
-        byte_range[1] if byte_range is not None else 0
-    )
-    cdef bool c_lines = lines

     if compression is not None:
         if compression == 'gzip':

@@ -115,9 +104,9 @@ cpdef read_json(object filepaths_or_buffers,
     cdef json_reader_options opts = move(
         json_reader_options.builder(make_source_info(filepaths_or_buffers))
         .compression(c_compression)
-        .lines(c_lines)
-        .byte_range_offset(c_range_offset)
-        .byte_range_size(c_range_size)
+        .lines(lines)
+        .byte_range_offset(byte_range_offset)
+        .byte_range_size(byte_range_size)
         .recovery_mode(_get_json_recovery_mode(on_bad_lines))
         .build()
     )

@@ -130,6 +119,38 @@ cpdef read_json(object filepaths_or_buffers,
     opts.enable_mixed_types_as_string(mixed_types_as_string)
     opts.enable_prune_columns(prune_columns)

+    return opts
+
+
+cpdef read_json(object filepaths_or_buffers,
+                object dtype,
+                bool lines,
+                object compression,
+                object byte_range,
+                bool keep_quotes,
+                bool mixed_types_as_string,
+                bool prune_columns,
+                object on_bad_lines):
+    """
+    Cython function to call into libcudf API, see `read_json`.
+
+    See Also
+    --------
+    cudf.io.json.read_json
+    cudf.io.json.to_json
+    """
+    # Determine byte read offsets if applicable
+    cdef size_type c_range_offset = (
+        byte_range[0] if byte_range is not None else 0
+    )
+    cdef size_type c_range_size = (
+        byte_range[1] if byte_range is not None else 0
+    )
+    cdef json_reader_options opts = _setup_json_reader_options(
+        filepaths_or_buffers, dtype, compression, keep_quotes,
+        mixed_types_as_string, prune_columns, on_bad_lines,
+        lines, c_range_offset, c_range_size
+    )
+
     # Read JSON
     cdef cudf_io_types.table_with_metadata c_result

@@ -146,6 +167,71 @@ cpdef read_json(object filepaths_or_buffers,

     return df

+
+cpdef chunked_read_json(object filepaths_or_buffers,
+                        object dtype,
+                        object compression,
+                        bool keep_quotes,
+                        bool mixed_types_as_string,
+                        bool prune_columns,
+                        object on_bad_lines,
+                        int chunk_size=100_000_000):
+    """
+    Cython function to call into libcudf API, see `read_json`.
+
+    See Also
+    --------
+    cudf.io.json.read_json
+    cudf.io.json.to_json
+    """
+    cdef size_type c_range_size = (
+        chunk_size if chunk_size is not None else 0
+    )
+    cdef json_reader_options opts = _setup_json_reader_options(
+        filepaths_or_buffers, dtype, compression, keep_quotes,
+        mixed_types_as_string, prune_columns, on_bad_lines,
+        True, 0, c_range_size
+    )
+
+    # Read JSON
+    cdef cudf_io_types.table_with_metadata c_result
+    final_columns = []
+    meta_names = None
+    i = 0
+    while True:
+        opts.set_byte_range_offset(c_range_size * i)
+        opts.set_byte_range_size(c_range_size)
+
+        try:
+            with nogil:
+                c_result = move(libcudf_read_json(opts))
+        except (ValueError, OverflowError):
+            break
+        if meta_names is None:
+            meta_names = [info.name.decode() for info in c_result.metadata.schema_info]
+        new_chunk = columns_from_unique_ptr(move(c_result.tbl))
+        if len(final_columns) == 0:
+            final_columns = new_chunk
+        else:
+            for col_idx in range(len(meta_names)):

[Inline review comment on the loop above]
Comment: Just confirming that the concatenation technique here is generally the same as done in the parquet reader?
Reply: yup

+                final_columns[col_idx] = concat_columns(
+                    [final_columns[col_idx], new_chunk[col_idx]]
+                )
+                # Must drop any residual GPU columns to save memory
+                new_chunk[col_idx] = None
+        i += 1
+    df = cudf.DataFrame._from_data(
+        *data_from_pylibcudf_table(
+            plc.Table(
+                [col.to_pylibcudf(mode="read") for col in final_columns]
+            ),
+            column_names=meta_names,
+            index_names=None
+        )
+    )
+    update_struct_field_names(df, c_result.metadata.schema_info)
+
+    return df


 @acquire_spill_lock()
 def write_json(
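For orientation, here is a hedged sketch of how this low-memory path would be exercised from the public API. Nothing below is part of the diff itself: the option name `io.json.low_memory` is an assumption mirroring the existing `io.parquet.low_memory` option, and the file name is illustrative.

```python
# Hedged usage sketch, not part of this PR's diff.
# Assumption: the low-memory JSON reader is gated by an option named
# "io.json.low_memory", mirroring the existing "io.parquet.low_memory";
# check cudf.describe_option() for the actual name in your version.
import cudf

cudf.set_option("io.json.low_memory", True)  # assumed option name

# With the option set, reading a JSON Lines file would go through the
# chunked reader above, parsing the input one byte range at a time.
df = cudf.read_json("data.jsonl", lines=True)  # illustrative file name
```

The roughly 100 MB range size comes from the `chunk_size=100_000_000` default in `chunked_read_json` above.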
Review discussion:

Comment: Just curious if we could use some logic such as `_has_next()` in the PQ chunked reader to break this loop, instead of this exception?

Comment: I don't like that we're catching the exception from a datasource here. The memory mapping is very much an implementation detail. How expensive would it be to get the total source(s) size? Then we can loop until all of it is read.

Reply: It is expensive; we basically have to seek to the end of the file to get the size of all the data sources. For remote data sources it gets complicated to properly perform a seek, too.

Reply: We basically call into the libcudf layer for that; is there any such provision for the JSON reader in libcudf?

Comment: FWIW, we already need the file size(s) to read JSON input(s). With the current implementation of the low-memory JSON reader, we get the size of each input file for each byte range, so getting the sizes one more time to have a cleaner loop would not add much.
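To make the suggestion concrete, here is a minimal sketch of the size-bounded loop under discussion, written against the public `byte_range` parameter of `cudf.read_json` rather than the Cython internals. The file name and chunk size are illustrative, and a remote source would need an fsspec-style size lookup in place of `os.path.getsize`.

```python
# Sketch of the "loop until the total source size is consumed" termination
# discussed above, instead of breaking on a reader exception.
# Assumes one local JSON Lines file; names and sizes are illustrative.
import os

import cudf

path = "data.jsonl"
chunk_size = 100_000_000  # mirrors the chunked_read_json default
total_size = os.path.getsize(path)  # remote sources would need fsspec

parts = []
offset = 0
while offset < total_size:
    # A byte range parses only the records that begin inside it, so
    # concatenating consecutive ranges yields each record exactly once.
    parts.append(
        cudf.read_json(path, lines=True, byte_range=(offset, chunk_size))
    )
    offset += chunk_size

df = cudf.concat(parts, ignore_index=True)
```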