rapidsai · rapids-bot · Feb 24, 2022 · Jan 29, 2022 · Jan 31, 2022 · Jan 31, 2022
@@ -8,6 +8,12 @@ from libcpp.memory cimport make_unique, unique_ptr
 from libcpp.string cimport string
 from libcpp.utility cimport move
 from libcpp.vector cimport vector
+from collections import OrderedDict
+
+try:
+    import ujson as json
+except ImportError:
+    import json
 
 cimport cudf._lib.cpp.io.types as cudf_io_types
 from cudf._lib.column cimport Column
@@ -123,8 +129,22 @@ cpdef read_orc(object filepaths_or_buffers,
         c_result = move(libcudf_read_orc(c_orc_reader_options))
 
     names = [name.decode() for name in c_result.metadata.column_names]
+    actual_index_names, names, is_range_index, reset_index_name, range_idx = \
+        _get_index_from_metadata(c_result.metadata.user_data,
+                                 names,
+                                 skip_rows,
+                                 num_rows)
+
+    data, index = data_from_unique_ptr(
+        move(c_result.tbl),
+        names,
+        actual_index_names
+    )
 
-    data, index = data_from_unique_ptr(move(c_result.tbl), names)
+    if is_range_index:
+        index = range_idx
+    elif reset_index_name:
+        index.names = [None] * len(index.names)
 
     data = {
         name: update_column_struct_field_names(
@@ -144,6 +164,60 @@ cdef compression_type _get_comp_type(object compression):
     else:
         raise ValueError(f"Unsupported `compression` type {compression}")
 
+cdef tuple _get_index_from_metadata(
+        map[string, string] user_data,
+        object names,
+        object skip_rows,
+        object num_rows):
+    json_str = user_data[b'pandas'].decode('utf-8')
+    meta = None
+    index_col = None
+    is_range_index = False
+    reset_index_name = False
+    range_idx = None
+    if json_str != "":
+        meta = json.loads(json_str)
+
+        if 'index_columns' in meta and len(meta['index_columns']) > 0:
+            index_col = meta['index_columns']
+            if isinstance(index_col[0], dict) and \
+                    index_col[0]['kind'] == 'range':
+                is_range_index = True
+            else:
+                index_col_names = OrderedDict()
+                for idx_col in index_col:
+                    for c in meta['columns']:
+                        if c['field_name'] == idx_col:
+                            index_col_names[idx_col] = \
+                                c['name'] or c['field_name']
+                            if c['name'] is None:
+                                reset_index_name = True
+
+    actual_index_names = None
+    if index_col is not None and len(index_col) > 0:
+        if is_range_index:
+            range_index_meta = index_col[0]
+            range_idx = cudf.RangeIndex(
+                start=range_index_meta['start'],
+                stop=range_index_meta['stop'],
+                step=range_index_meta['step'],
+                name=range_index_meta['name']
+            )
+            if skip_rows is not None:
+                range_idx = range_idx[skip_rows:]
+            if num_rows is not None:
+                range_idx = range_idx[:num_rows]
+        else:
+            actual_index_names = list(index_col_names.values())
+            names = names[len(actual_index_names):]
+
+    return (
+        actual_index_names,
+        names,
+        is_range_index,
+        reset_index_name,
+        range_idx
+    )
 
 cdef cudf_io_types.statistics_freq _get_orc_stat_freq(object statistics):
     """
@@ -180,6 +254,10 @@ cpdef write_orc(table,
     cdef unique_ptr[data_sink] data_sink_c
     cdef sink_info sink_info_c = make_sink_info(path_or_buf, data_sink_c)
     cdef unique_ptr[table_input_metadata] tbl_meta
+    cdef map[string, string] user_data
+    user_data[str.encode("pandas")] = str.encode(generate_pandas_metadata(
+        table, None)
+    )
 
     if not isinstance(table._index, cudf.RangeIndex):
         tv = table_view_from_table(table)
@@ -204,8 +282,9 @@ cpdef write_orc(table,
 
     cdef orc_writer_options c_orc_writer_options = move(
         orc_writer_options.builder(
-            sink_info_c, table_view_from_table(table, ignore_index=True)
+            sink_info_c, tv
         ).metadata(tbl_meta.get())
+        .key_value_metadata(move(user_data))
         .compression(compression_)
         .enable_statistics(_get_orc_stat_freq(statistics))
         .build()

@@ -552,7 +552,7 @@ def test_orc_writer_sliced(tmpdir):
     df_select = df.iloc[1:3]
 
     df_select.to_orc(cudf_path)
-    assert_eq(cudf.read_orc(cudf_path), df_select.reset_index(drop=True))
+    assert_eq(cudf.read_orc(cudf_path), df_select)
 
 
 @pytest.mark.parametrize(
@@ -794,7 +794,8 @@ def test_orc_bool_encode_fail():
 
     # Also validate data
     pdf = pa.orc.ORCFile(buffer).read().to_pandas()
-    assert_eq(okay_df, pdf)
+
+    assert_eq(okay_df.to_pandas(nullable=True), pdf)
 
 
 def test_nanoseconds_overflow():
@@ -840,7 +841,12 @@ def test_empty_string_columns(data):
     got_df = cudf.read_orc(buffer)
 
     assert_eq(expected, got_df)
-    assert_eq(expected_pdf, got_df)
+    assert_eq(
+        expected_pdf,
+        got_df.to_pandas(nullable=True)
+        if expected_pdf["string"].dtype == pd.StringDtype()
+        else got_df,
+    )
 
 
 @pytest.mark.parametrize("scale", [-3, 0, 3])