[REVIEW] Enable proper Index round-tripping in orc reader and writer #10170

Merged on Feb 24, 2022 (7 commits)

Changes from 3 commits

66 changes: 64 additions & 2 deletions python/cudf/cudf/_lib/orc.pyx
@@ -8,6 +8,12 @@ from libcpp.memory cimport make_unique, unique_ptr
from libcpp.string cimport string
from libcpp.utility cimport move
from libcpp.vector cimport vector
from collections import OrderedDict

try:
    import ujson as json
except ImportError:
    import json

cimport cudf._lib.cpp.io.types as cudf_io_types
from cudf._lib.column cimport Column
@@ -123,8 +129,59 @@ cpdef read_orc(object filepaths_or_buffers,
    c_result = move(libcudf_read_orc(c_orc_reader_options))

    names = [name.decode() for name in c_result.metadata.column_names]
    cdef map[string, string] user_data = c_result.metadata.user_data

Contributor: Maybe it would be good to place this code in a separate function, not sure.

Contributor: I agree.

Contributor (Author): Apologies for the delay, moved this to a separate function.
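
For reference, the refactor suggested in this thread might look roughly like the sketch below. The helper name, signature, and return values are hypothetical, not necessarily what the follow-up commit adds; it simply lifts the parsing shown in the next hunk into one function and relies on the module's existing json, OrderedDict, and cudf imports.

    # Hypothetical helper; name, signature, and return values are illustrative only.
    def _parse_pandas_index_metadata(user_data, skip_rows=None, num_rows=None):
        # Returns (index_column_names, range_index, reset_index_name).
        json_str = user_data[b'pandas'].decode('utf-8')
        if json_str == "":
            return None, None, False

        meta = json.loads(json_str)
        index_col = meta.get('index_columns', [])
        if len(index_col) == 0:
            return None, None, False

        if isinstance(index_col[0], dict) and index_col[0]['kind'] == 'range':
            # RangeIndex case: rebuild it purely from the metadata.
            range_index_meta = index_col[0]
            range_idx = cudf.RangeIndex(
                start=range_index_meta['start'],
                stop=range_index_meta['stop'],
                step=range_index_meta['step'],
                name=range_index_meta['name'],
            )
            if skip_rows is not None:
                range_idx = range_idx[skip_rows:]
            if num_rows is not None:
                range_idx = range_idx[:num_rows]
            return None, range_idx, False

        # Materialized index case: map field names to display names.
        index_col_names = OrderedDict()
        reset_index_name = False
        for idx_col in index_col:
            for c in meta['columns']:
                if c['field_name'] == idx_col:
                    index_col_names[idx_col] = c['name'] or c['field_name']
                    if c['name'] is None:
                        reset_index_name = True
        return list(index_col_names.values()), None, reset_index_name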

    json_str = user_data[b'pandas'].decode('utf-8')
    meta = None
    index_col = None
    is_range_index = False
    reset_index_name = False
    range_idx = None
    if json_str != "":
        meta = json.loads(json_str)

        if 'index_columns' in meta and len(meta['index_columns']) > 0:
            index_col = meta['index_columns']
            if isinstance(index_col[0], dict) and \
                    index_col[0]['kind'] == 'range':
                is_range_index = True
            else:
                index_col_names = OrderedDict()
                for idx_col in index_col:
                    for c in meta['columns']:
                        if c['field_name'] == idx_col:
                            index_col_names[idx_col] = \
                                c['name'] or c['field_name']
                            if c['name'] is None:
                                reset_index_name = True

    actual_index_names = None
    if index_col is not None and len(index_col) > 0:
        if is_range_index:
            range_index_meta = index_col[0]
            range_idx = cudf.RangeIndex(
                start=range_index_meta['start'],
                stop=range_index_meta['stop'],
                step=range_index_meta['step'],
                name=range_index_meta['name']
            )
            if skip_rows is not None:
                range_idx = range_idx[skip_rows:]
            if num_rows is not None:
                range_idx = range_idx[:num_rows]
        else:
            actual_index_names = list(index_col_names.values())
            names = names[len(actual_index_names):]

    data, index = data_from_unique_ptr(
        move(c_result.tbl),
        names,
        actual_index_names
    )

    if is_range_index:
        index = range_idx
    elif reset_index_name:
        index.names = [None] * len(index.names)

    data = {
        name: update_column_struct_field_names(
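
For orientation, the 'pandas' blob parsed above follows the same JSON convention pyarrow uses for Parquet metadata. For a frame with a plain RangeIndex it looks roughly like this (values illustrative, some keys omitted):

    {
      "index_columns": [
          {"kind": "range", "name": null, "start": 0, "stop": 3, "step": 1}
      ],
      "columns": [
          {"name": "a", "field_name": "a", "pandas_type": "int64",
           "numpy_type": "int64", "metadata": null}
      ],
      "pandas_version": "1.3.5"
    }

For a named, materialized index, "index_columns" instead holds the index column's field name as a string, with a matching entry under "columns"; that is the case handled by the OrderedDict branch above.
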
@@ -180,6 +237,10 @@ cpdef write_orc(table,
    cdef unique_ptr[data_sink] data_sink_c
    cdef sink_info sink_info_c = make_sink_info(path_or_buf, data_sink_c)
    cdef unique_ptr[table_input_metadata] tbl_meta
    cdef map[string, string] user_data
    pandas_metadata = generate_pandas_metadata(table, None)

    user_data[str.encode("pandas")] = str.encode(pandas_metadata)

    if not isinstance(table._index, cudf.RangeIndex):
        tv = table_view_from_table(table)
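
Read together with the builder change below, the writer-side decision appears to be roughly the following. Only the first branch is visible in this hunk; the fallback is inferred from the ignore_index=True view the builder used to be handed directly, so treat this as a sketch rather than the exact diff:

    if not isinstance(table._index, cudf.RangeIndex):
        # A materialized index is written out as real columns ...
        tv = table_view_from_table(table)
    else:
        # ... while a RangeIndex is recorded only in the 'pandas' key-value
        # metadata, since it can be rebuilt from start/stop/step on read.
        tv = table_view_from_table(table, ignore_index=True)
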
@@ -204,8 +265,9 @@

    cdef orc_writer_options c_orc_writer_options = move(
        orc_writer_options.builder(
            sink_info_c, tv
        ).metadata(tbl_meta.get())
        .key_value_metadata(move(user_data))
        .compression(compression_)
        .enable_statistics(_get_orc_stat_freq(statistics))
        .build()
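
Taken together, the reader and writer changes mean an index, including its name, survives a to_orc/read_orc round trip. A minimal usage sketch, assuming a cuDF build containing this PR and a hypothetical local path:

    import cudf

    df = cudf.DataFrame({"a": [1, 2, 3]}, index=[10, 20, 30])
    df.index.name = "my_idx"

    df.to_orc("with_index.orc")          # hypothetical path
    got = cudf.read_orc("with_index.orc")

    assert got.index.name == "my_idx"
    assert list(got.index.to_pandas()) == [10, 20, 30]
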
12 changes: 9 additions & 3 deletions python/cudf/cudf/tests/test_orc.py
@@ -550,7 +550,7 @@ def test_orc_writer_sliced(tmpdir):
    df_select = df.iloc[1:3]

    df_select.to_orc(cudf_path)
    assert_eq(cudf.read_orc(cudf_path), df_select)
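
This updated assertion reflects that the sliced frame's index (labels 1 and 2 here, not starting at 0) now survives the round trip, so the expected value no longer needs the reset_index(drop=True) it used before this change. A small sketch of the behaviour, with a hypothetical file path:

    import cudf

    df = cudf.DataFrame({"a": list(range(5))})
    sliced = df.iloc[1:3]                # index labels 1 and 2

    sliced.to_orc("sliced.orc")          # hypothetical path
    got = cudf.read_orc("sliced.orc")

    assert list(got.index.to_pandas()) == [1, 2]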


@pytest.mark.parametrize(
@@ -792,7 +792,8 @@ def test_orc_bool_encode_fail():

    # Also validate data
    pdf = pa.orc.ORCFile(buffer).read().to_pandas()

    assert_eq(okay_df.to_pandas(nullable=True), pdf)


def test_nanoseconds_overflow():
@@ -838,7 +839,12 @@ def test_empty_string_columns(data):
    got_df = cudf.read_orc(buffer)

    assert_eq(expected, got_df)
    assert_eq(
        expected_pdf,
        got_df.to_pandas(nullable=True)
        if expected_pdf["string"].dtype == pd.StringDtype()
        else got_df,
    )


@pytest.mark.parametrize("scale", [-3, 0, 3])