Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[REVIEW] Enable proper Index round-tripping in orc reader and writer #10170

Merged
merged 7 commits into from
Feb 24, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
83 changes: 81 additions & 2 deletions python/cudf/cudf/_lib/orc.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,12 @@ from libcpp.memory cimport make_unique, unique_ptr
from libcpp.string cimport string
from libcpp.utility cimport move
from libcpp.vector cimport vector
from collections import OrderedDict

try:
import ujson as json
except ImportError:
import json

cimport cudf._lib.cpp.io.types as cudf_io_types
from cudf._lib.column cimport Column
Expand Down Expand Up @@ -123,8 +129,22 @@ cpdef read_orc(object filepaths_or_buffers,
c_result = move(libcudf_read_orc(c_orc_reader_options))

names = [name.decode() for name in c_result.metadata.column_names]
actual_index_names, names, is_range_index, reset_index_name, range_idx = \
_get_index_from_metadata(c_result.metadata.user_data,
names,
skip_rows,
num_rows)

data, index = data_from_unique_ptr(
move(c_result.tbl),
names,
actual_index_names
)

data, index = data_from_unique_ptr(move(c_result.tbl), names)
if is_range_index:
index = range_idx
elif reset_index_name:
index.names = [None] * len(index.names)
rgsl888prabhu marked this conversation as resolved.
Show resolved Hide resolved
rgsl888prabhu marked this conversation as resolved.
Show resolved Hide resolved

data = {
name: update_column_struct_field_names(
Expand All @@ -144,6 +164,60 @@ cdef compression_type _get_comp_type(object compression):
else:
raise ValueError(f"Unsupported `compression` type {compression}")

cdef tuple _get_index_from_metadata(
map[string, string] user_data,
object names,
object skip_rows,
object num_rows):
json_str = user_data[b'pandas'].decode('utf-8')
meta = None
index_col = None
is_range_index = False
reset_index_name = False
range_idx = None
if json_str != "":
meta = json.loads(json_str)

if 'index_columns' in meta and len(meta['index_columns']) > 0:
index_col = meta['index_columns']
if isinstance(index_col[0], dict) and \
index_col[0]['kind'] == 'range':
is_range_index = True
else:
index_col_names = OrderedDict()
for idx_col in index_col:
for c in meta['columns']:
if c['field_name'] == idx_col:
index_col_names[idx_col] = \
c['name'] or c['field_name']
if c['name'] is None:
reset_index_name = True

actual_index_names = None
if index_col is not None and len(index_col) > 0:
if is_range_index:
range_index_meta = index_col[0]
range_idx = cudf.RangeIndex(
start=range_index_meta['start'],
stop=range_index_meta['stop'],
step=range_index_meta['step'],
name=range_index_meta['name']
)
if skip_rows is not None:
range_idx = range_idx[skip_rows:]
if num_rows is not None:
range_idx = range_idx[:num_rows]
else:
actual_index_names = list(index_col_names.values())
names = names[len(actual_index_names):]

return (
actual_index_names,
names,
is_range_index,
reset_index_name,
range_idx
)

cdef cudf_io_types.statistics_freq _get_orc_stat_freq(object statistics):
"""
Expand Down Expand Up @@ -180,6 +254,10 @@ cpdef write_orc(table,
cdef unique_ptr[data_sink] data_sink_c
cdef sink_info sink_info_c = make_sink_info(path_or_buf, data_sink_c)
cdef unique_ptr[table_input_metadata] tbl_meta
cdef map[string, string] user_data
user_data[str.encode("pandas")] = str.encode(generate_pandas_metadata(
table, None)
)

if not isinstance(table._index, cudf.RangeIndex):
tv = table_view_from_table(table)
Expand All @@ -204,8 +282,9 @@ cpdef write_orc(table,

cdef orc_writer_options c_orc_writer_options = move(
orc_writer_options.builder(
sink_info_c, table_view_from_table(table, ignore_index=True)
sink_info_c, tv
).metadata(tbl_meta.get())
.key_value_metadata(move(user_data))
.compression(compression_)
.enable_statistics(_get_orc_stat_freq(statistics))
.build()
Expand Down
12 changes: 9 additions & 3 deletions python/cudf/cudf/tests/test_orc.py
Original file line number Diff line number Diff line change
Expand Up @@ -552,7 +552,7 @@ def test_orc_writer_sliced(tmpdir):
df_select = df.iloc[1:3]

df_select.to_orc(cudf_path)
assert_eq(cudf.read_orc(cudf_path), df_select.reset_index(drop=True))
assert_eq(cudf.read_orc(cudf_path), df_select)


@pytest.mark.parametrize(
Expand Down Expand Up @@ -794,7 +794,8 @@ def test_orc_bool_encode_fail():

# Also validate data
pdf = pa.orc.ORCFile(buffer).read().to_pandas()
assert_eq(okay_df, pdf)

assert_eq(okay_df.to_pandas(nullable=True), pdf)


def test_nanoseconds_overflow():
Expand Down Expand Up @@ -840,7 +841,12 @@ def test_empty_string_columns(data):
got_df = cudf.read_orc(buffer)

assert_eq(expected, got_df)
assert_eq(expected_pdf, got_df)
assert_eq(
expected_pdf,
got_df.to_pandas(nullable=True)
if expected_pdf["string"].dtype == pd.StringDtype()
else got_df,
)


@pytest.mark.parametrize("scale", [-3, 0, 3])
Expand Down