From 2ba78fb4fa852e8e2b560de9a983b594a1bbe4fa Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Tue, 24 Oct 2023 12:31:28 -0700 Subject: [PATCH 1/9] Drop pyorc dependency --- .../all_cuda-118_arch-x86_64.yaml | 1 - .../all_cuda-120_arch-x86_64.yaml | 1 - cpp/tests/io/orc_test.cpp | 14 +- dependencies.yaml | 1 - python/cudf/cudf/_fuzz_testing/orc.py | 19 +- python/cudf/cudf/_fuzz_testing/utils.py | 160 +------- python/cudf/cudf/tests/test_orc.py | 341 ++++++++---------- python/cudf/pyproject.toml | 1 - 8 files changed, 162 insertions(+), 376 deletions(-) diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml index b5782800946..8b6b32bc026 100644 --- a/conda/environments/all_cuda-118_arch-x86_64.yaml +++ b/conda/environments/all_cuda-118_arch-x86_64.yaml @@ -70,7 +70,6 @@ dependencies: - ptxcompiler - pyarrow==12.0.1.* - pydata-sphinx-theme -- pyorc - pytest - pytest-benchmark - pytest-cases diff --git a/conda/environments/all_cuda-120_arch-x86_64.yaml b/conda/environments/all_cuda-120_arch-x86_64.yaml index 473b9d07d88..ae15a6e97ab 100644 --- a/conda/environments/all_cuda-120_arch-x86_64.yaml +++ b/conda/environments/all_cuda-120_arch-x86_64.yaml @@ -68,7 +68,6 @@ dependencies: - protobuf>=4.21,<5 - pyarrow==12.0.1.* - pydata-sphinx-theme -- pyorc - pytest - pytest-benchmark - pytest-cases diff --git a/cpp/tests/io/orc_test.cpp b/cpp/tests/io/orc_test.cpp index 890ef914713..3457c5675ad 100644 --- a/cpp/tests/io/orc_test.cpp +++ b/cpp/tests/io/orc_test.cpp @@ -1299,20 +1299,16 @@ TEST_F(OrcStatisticsTest, Overflow) TEST_F(OrcStatisticsTest, HasNull) { - // This test can now be implemented with libcudf; keeping the pyorc version to keep the test + // This test can now be implemented with libcudf; keeping the pandas version to keep the test // inputs diversified // Method to create file: - // >>> import pyorc - // >>> output = open("./temp.orc", "wb") - // >>> writer = pyorc.Writer(output, pyorc.Struct(a=pyorc.BigInt(), b=pyorc.BigInt())) - // >>> writer.write((1, 3)) - // >>> writer.write((2, 4)) - // >>> writer.write((None, 5)) - // >>> writer.close() + // >>> import pandas as pd + // >>> df = pd.DataFrame({'a':pd.Series([1, 2, None], dtype="Int64"), 'b':[3, 4, 5]}) + // >>> df.to_orc("temp.orc") // // Contents of file: // >>> import pyarrow.orc as po - // >>> po.ORCFile('new.orc').read() + // >>> po.ORCFile('temp.orc').read() // pyarrow.Table // a: int64 // b: int64 diff --git a/dependencies.yaml b/dependencies.yaml index c3223e4394d..a7716a15360 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -548,7 +548,6 @@ dependencies: - fastavro>=0.22.9 - hypothesis - mimesis>=4.1.0 - - pyorc - pytest-benchmark - pytest-cases - python-snappy>=0.6.0 diff --git a/python/cudf/cudf/_fuzz_testing/orc.py b/python/cudf/cudf/_fuzz_testing/orc.py index 65d2e09988f..8634e4eba6c 100644 --- a/python/cudf/cudf/_fuzz_testing/orc.py +++ b/python/cudf/cudf/_fuzz_testing/orc.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2022, NVIDIA CORPORATION. +# Copyright (c) 2020-2023, NVIDIA CORPORATION. 
import copy import io @@ -6,14 +6,14 @@ import random import numpy as np -import pyorc +import pyarrow.orc +import pyarrow as pa import cudf from cudf._fuzz_testing.io import IOFuzz from cudf._fuzz_testing.utils import ( ALL_POSSIBLE_VALUES, _generate_rand_meta, - pandas_to_orc, pyarrow_to_pandas, ) from cudf.testing import dataset_generator as dg @@ -82,12 +82,7 @@ def generate_input(self): logging.info(f"Shape of DataFrame generated: {table.shape}") self._df = df file_obj = io.BytesIO() - pandas_to_orc( - df, - file_io_obj=file_obj, - stripe_size=self._rand(len(df)), - arrow_table_schema=table.schema, - ) + pa.orc.write_table(table, file_obj, stripe_size=self._rand(len(df))) file_obj.seek(0) buf = file_obj.read() self._current_buffer = copy.copy(buf) @@ -109,8 +104,8 @@ def set_rand_params(self, params): ) elif param == "stripes": f = io.BytesIO(self._current_buffer) - reader = pyorc.Reader(f) - stripes = [i for i in range(reader.num_of_stripes)] + orcFile = pa.orc.ORCFile(f) + stripes = [i for i in range(orcFile.nstripes)] params_dict[param] = np.random.choice( [ None, @@ -119,7 +114,7 @@ def set_rand_params(self, params): int, np.unique( np.random.choice( - stripes, reader.num_of_stripes + stripes, orcFile.nstripes ) ), ) diff --git a/python/cudf/cudf/_fuzz_testing/utils.py b/python/cudf/cudf/_fuzz_testing/utils.py index 03418e00cde..0c88c1aeacd 100644 --- a/python/cudf/cudf/_fuzz_testing/utils.py +++ b/python/cudf/cudf/_fuzz_testing/utils.py @@ -1,13 +1,11 @@ # Copyright (c) 2020-2023, NVIDIA CORPORATION. import random -from collections import OrderedDict import fastavro import numpy as np import pandas as pd import pyarrow as pa -import pyorc import cudf from cudf.testing._utils import assert_eq @@ -41,40 +39,6 @@ cudf.dtype(" can result in incorrect dtype by pandas. 
- df = df.astype(dtypes) + orc_file = pa.orc.ORCFile(f) + records = [orc_file.read_stripe(i) for i in stripes] + pa_table = pa.Table.from_batches(records) + df = pa_table.to_pandas() return df diff --git a/python/cudf/cudf/tests/test_orc.py b/python/cudf/cudf/tests/test_orc.py index 07aa5430f4f..7ac2c2afb73 100644 --- a/python/cudf/cudf/tests/test_orc.py +++ b/python/cudf/cudf/tests/test_orc.py @@ -11,7 +11,6 @@ import pandas as pd import pyarrow as pa import pyarrow.orc -import pyorc import pytest import cudf @@ -295,28 +294,29 @@ def test_orc_read_rows(datadir, skiprows, num_rows): def test_orc_read_skiprows(): buff = BytesIO() - data = [ - True, - False, - True, - False, - None, - True, - True, - True, - False, - None, - False, - False, - True, - True, - True, - True, - ] - writer = pyorc.Writer(buff, pyorc.Struct(a=pyorc.Boolean())) - writer.writerows([(d,) for d in data]) - writer.close() - + df = pd.DataFrame( + { + "a": [ + True, + False, + True, + False, + None, + True, + True, + True, + False, + None, + False, + False, + True, + True, + True, + True, + ] + } + ) + df.to_orc(buff) # testing 10 skiprows due to a boolean specific bug fix that didn't # repro for other sizes of data skiprows = 10 @@ -978,44 +978,12 @@ def test_orc_string_stream_offset_issue(): assert_eq(df, cudf.read_orc(buffer)) -# Data is generated using pyorc module def generate_list_struct_buff(size=100_000): rd = random.Random(1) np.random.seed(seed=1) buff = BytesIO() - schema = { - "lvl3_list": pyorc.Array(pyorc.Array(pyorc.Array(pyorc.BigInt()))), - "lvl1_list": pyorc.Array(pyorc.BigInt()), - "lvl1_struct": pyorc.Struct( - **{"a": pyorc.BigInt(), "b": pyorc.BigInt()} - ), - "lvl2_struct": pyorc.Struct( - **{ - "a": pyorc.BigInt(), - "lvl1_struct": pyorc.Struct( - **{"c": pyorc.BigInt(), "d": pyorc.BigInt()} - ), - } - ), - "list_nests_struct": pyorc.Array( - pyorc.Array( - pyorc.Struct(**{"a": pyorc.BigInt(), "b": pyorc.BigInt()}) - ) - ), - "struct_nests_list": pyorc.Struct( - **{ - "struct": pyorc.Struct( - **{"a": pyorc.BigInt(), "b": pyorc.BigInt()} - ), - "list": pyorc.Array(pyorc.BigInt()), - } - ), - } - - schema = pyorc.Struct(**schema) - lvl3_list = [ rd.choice( [ @@ -1024,37 +992,44 @@ def generate_list_struct_buff(size=100_000): [ [ rd.choice([None, np.random.randint(1, 3)]) - for z in range(np.random.randint(1, 3)) + for _ in range(np.random.randint(1, 3)) ] - for z in range(np.random.randint(0, 3)) + for _ in range(np.random.randint(0, 3)) ] - for y in range(np.random.randint(0, 3)) + for _ in range(np.random.randint(0, 3)) ], ] ) - for x in range(size) + for _ in range(size) ] lvl1_list = [ [ rd.choice([None, np.random.randint(0, 3)]) - for y in range(np.random.randint(1, 4)) + for _ in range(np.random.randint(1, 4)) ] - for x in range(size) + for _ in range(size) ] lvl1_struct = [ - rd.choice([None, (np.random.randint(0, 3), np.random.randint(0, 3))]) - for x in range(size) + rd.choice( + [ + None, + {"a": np.random.randint(0, 3), "b": np.random.randint(0, 3)}, + ] + ) + for _ in range(size) ] lvl2_struct = [ rd.choice( [ None, - ( - rd.choice([None, np.random.randint(0, 3)]), - ( - rd.choice([None, np.random.randint(0, 3)]), - np.random.randint(0, 3), - ), + *( + {"a": rd.choice([None, np.random.randint(0, 3)])}, + { + "lvl1_struct": { + "c": rd.choice([None, np.random.randint(0, 3)]), + "d": np.random.randint(0, 3), + }, + }, ), ] ) @@ -1062,12 +1037,14 @@ def generate_list_struct_buff(size=100_000): ] list_nests_struct = [ [ - [rd.choice(lvl1_struct), rd.choice(lvl1_struct)] + {"a": 
rd.choice(lvl1_struct), "b": rd.choice(lvl1_struct)} for y in range(np.random.randint(1, 4)) ] - for x in range(size) + for _ in range(size) + ] + struct_nests_list = [ + {"struct": lvl1_struct[x], "list": lvl1_list[x]} for x in range(size) ] - struct_nests_list = [(lvl1_struct[x], lvl1_list[x]) for x in range(size)] df = pd.DataFrame( { @@ -1080,15 +1057,7 @@ def generate_list_struct_buff(size=100_000): } ) - writer = pyorc.Writer(buff, schema, stripe_size=1024) - tuples = list( - map( - lambda x: (None,) if x[0] is pd.NA else x, - list(df.itertuples(index=False, name=None)), - ) - ) - writer.writerows(tuples) - writer.close() + df.to_orc(buff, engine="pyarrow", engine_kwargs={"stripe_size": 1024}) return buff @@ -1160,107 +1129,89 @@ def gen_map_buff(size=10000): buff = BytesIO() - schema = { - "lvl1_map": pyorc.Map(key=pyorc.String(), value=pyorc.BigInt()), - "lvl2_map": pyorc.Map( - key=pyorc.String(), value=pyorc.Array(pyorc.BigInt()) - ), - "lvl2_struct_map": pyorc.Map( - key=pyorc.String(), - value=pyorc.Struct(**{"a": pyorc.BigInt(), "b": pyorc.BigInt()}), - ), - } - - schema = pyorc.Struct(**schema) - - lvl1_map = [ - rd.choice( - [ - None, + lvl1_map = pa.array( + [ + rd.choice( [ - ( - rd.choice(al), - rd.choice([None, np.random.randint(1, 1500)]), - ) - for y in range(2) - ], - ] - ) - for x in range(size) - ] - lvl2_map = [ - rd.choice( - [ - None, + None, + { + rd.choice(al): rd.choice( + [None, np.random.randint(1, 1500)] + ), + }, + ] + ) + for x in range(size) + ], + type=pa.map_(pa.string(), pa.int64()), + ) + lvl2_map = pa.array( + [ + rd.choice( [ - ( - rd.choice(al), - rd.choice( - [ - None, + None, + *( + { + rd.choice(al): rd.choice( [ - rd.choice( - [None, np.random.randint(1, 1500)] - ) - for z in range(5) - ], - ] - ), - ) - for y in range(2) - ], - ] - ) - for x in range(size) - ] - lvl2_struct_map = [ - rd.choice( - [ - None, + None, + [ + rd.choice( + [None, np.random.randint(1, 1500)] + ) + for z in range(5) + ], + ] + ) + } + for y in range(2) + ), + ] + ) + for x in range(size) + ], + type=pa.map_(pa.string(), pa.list_(pa.int64())), + ) + lvl2_struct_map = pa.array( + [ + rd.choice( [ - ( - rd.choice(al), - rd.choice( - [ - None, - ( - rd.choice( - [None, np.random.randint(1, 1500)] - ), - rd.choice( - [None, np.random.randint(1, 1500)] - ), - ), - ] - ), - ) - for y in range(2) - ], - ] - ) - for x in range(size) - ] - - pdf = pd.DataFrame( - { - "lvl1_map": lvl1_map, - "lvl2_map": lvl2_map, - "lvl2_struct_map": lvl2_struct_map, - } + None, + *( + { + rd.choice(al): rd.choice( + [ + None, + { + "a": rd.choice( + [None, np.random.randint(1, 1500)] + ), + "b": rd.choice( + [None, np.random.randint(1, 1500)] + ), + }, + ] + ) + } + for y in range(2) + ), + ] + ) + for x in range(size) + ], + type=pa.map_( + pa.string(), pa.struct({"a": pa.int64(), "b": pa.int64()}) + ), ) - writer = pyorc.Writer( - buff, schema, stripe_size=1024, compression=pyorc.CompressionKind.NONE + + pa_table = pa.Table.from_arrays( + [lvl1_map, lvl2_map, lvl2_struct_map], + ["lvl1_map", "lvl2_map", "lvl2_struct_map"], ) - tuples = list( - map( - lambda x: (None,) if x[0] is pd.NA else x, - list(pdf.itertuples(index=False, name=None)), - ) + pyarrow.orc.write_table( + pa_table, buff, stripe_size=1024, compression="UNCOMPRESSED" ) - writer.writerows(tuples) - writer.close() - return buff @@ -1527,12 +1478,10 @@ def test_statistics_sum_overflow(): minint64 = np.iinfo(np.int64).min buff = BytesIO() - with pyorc.Writer( - buff, - pyorc.Struct(a=pyorc.BigInt(), b=pyorc.BigInt(), 
c=pyorc.BigInt()), - ) as writer: - writer.write((maxint64, minint64, minint64)) - writer.write((1, -1, 1)) + df = pd.DataFrame( + {"a": [maxint64, 1], "b": [minint64, -1], "c": [minint64, 1]} + ) + df.to_orc(buff) file_stats, stripe_stats = cudf.io.orc.read_orc_statistics([buff]) assert file_stats[0]["a"].get("sum") is None @@ -1546,21 +1495,21 @@ def test_statistics_sum_overflow(): def test_empty_statistics(): buff = BytesIO() - orc_schema = pyorc.Struct( - a=pyorc.BigInt(), - b=pyorc.Double(), - c=pyorc.String(), - d=pyorc.Decimal(11, 2), - e=pyorc.Date(), - f=pyorc.Timestamp(), - g=pyorc.Boolean(), - h=pyorc.Binary(), - i=pyorc.BigInt(), - # One column with non null value, else cudf/pyorc readers crash + pa_table = pa.Table.from_arrays( + [ + pa.array([None], type=pa.int64()), + pa.array([None], type=pa.float64()), + pa.array([None], type=pa.string()), + pa.array([None], type=pa.decimal128(11, 2)), + pa.array([None], type=pa.timestamp("ns")), + pa.array([None], type=pa.date64()), + pa.array([None], type=pa.bool_()), + pa.array([None], type=pa.binary()), + pa.array([1], type=pa.int64()), + ], + ["a", "b", "c", "d", "e", "f", "g", "h", "i"], ) - data = tuple([None] * (len(orc_schema.fields) - 1) + [1]) - with pyorc.Writer(buff, orc_schema) as writer: - writer.write(data) + pyarrow.orc.write_table(pa_table, buff) got = cudf.io.orc.read_orc_statistics([buff]) @@ -1845,10 +1794,10 @@ def negative_timestamp_df(): @pytest.mark.parametrize("engine", ["cudf", "pyarrow"]) def test_orc_reader_negative_timestamp(negative_timestamp_df, engine): buffer = BytesIO() - pyorc_table = pa.Table.from_pandas( + orc_table = pa.Table.from_pandas( negative_timestamp_df.to_pandas(), preserve_index=False ) - pyarrow.orc.write_table(pyorc_table, buffer) + pyarrow.orc.write_table(orc_table, buffer) # We warn the user that this function will fall back to the CPU for reading # when the engine is pyarrow. 
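The rewritten gen_map_buff and test_empty_statistics above lean on pyarrow's typed constructors instead of a pyorc schema. A minimal, self-contained sketch of that pattern, building map and struct columns explicitly and round-tripping them through ORC, follows; it assumes a pyarrow build with ORC support, and the column names, values, and stripe_size here are illustrative only, not part of the patch.

import io

import pyarrow as pa
from pyarrow import orc

# map<string, int64> column; a top-level None yields a null row and a None
# value yields a null map item, matching the null handling in gen_map_buff.
lvl1_map = pa.array(
    [{"k1": 1, "k2": None}, None, {"k3": 3}],
    type=pa.map_(pa.string(), pa.int64()),
)

# map whose values are structs, as in lvl2_struct_map.
struct_map = pa.array(
    [{"k1": {"a": 1, "b": None}}, None, {"k2": {"a": 3, "b": 4}}],
    type=pa.map_(pa.string(), pa.struct({"a": pa.int64(), "b": pa.int64()})),
)

table = pa.Table.from_arrays([lvl1_map, struct_map], ["lvl1_map", "struct_map"])

buf = io.BytesIO()
orc.write_table(table, buf, stripe_size=1024, compression="UNCOMPRESSED")

buf.seek(0)
print(orc.read_table(buf))

Nullable columns of a fixed Arrow type, as in test_empty_statistics, follow the same idea: pa.array([None], type=pa.int64()) keeps the ORC column typed even when every value is null.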
diff --git a/python/cudf/pyproject.toml b/python/cudf/pyproject.toml index 39a8dca0267..90759074750 100644 --- a/python/cudf/pyproject.toml +++ b/python/cudf/pyproject.toml @@ -58,7 +58,6 @@ test = [ "hypothesis", "mimesis>=4.1.0", "msgpack", - "pyorc", "pytest", "pytest-benchmark", "pytest-cases", From 4822c245719f7bfc8d7ccfe5a889a2361e5b15ec Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Tue, 24 Oct 2023 12:42:44 -0700 Subject: [PATCH 2/9] isort --- python/cudf/cudf/_fuzz_testing/orc.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cudf/cudf/_fuzz_testing/orc.py b/python/cudf/cudf/_fuzz_testing/orc.py index 8634e4eba6c..f79227ac5b2 100644 --- a/python/cudf/cudf/_fuzz_testing/orc.py +++ b/python/cudf/cudf/_fuzz_testing/orc.py @@ -6,8 +6,8 @@ import random import numpy as np -import pyarrow.orc import pyarrow as pa +import pyarrow.orc import cudf from cudf._fuzz_testing.io import IOFuzz From b7dd9516c964d401728234762f2110799c83e1d9 Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Tue, 24 Oct 2023 14:45:11 -0500 Subject: [PATCH 3/9] Apply suggestions from code review Co-authored-by: Bradley Dice --- python/cudf/cudf/tests/test_orc.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/cudf/cudf/tests/test_orc.py b/python/cudf/cudf/tests/test_orc.py index 7ac2c2afb73..a2efa6e4b4c 100644 --- a/python/cudf/cudf/tests/test_orc.py +++ b/python/cudf/cudf/tests/test_orc.py @@ -1033,12 +1033,12 @@ def generate_list_struct_buff(size=100_000): ), ] ) - for x in range(size) + for _ in range(size) ] list_nests_struct = [ [ {"a": rd.choice(lvl1_struct), "b": rd.choice(lvl1_struct)} - for y in range(np.random.randint(1, 4)) + for _ in range(np.random.randint(1, 4)) ] for _ in range(size) ] From f44084d50b6020cdc96540c3c221e0ac7195ffd9 Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Tue, 24 Oct 2023 12:51:33 -0700 Subject: [PATCH 4/9] drop pyarrow.orc imports --- python/cudf/cudf/_fuzz_testing/orc.py | 1 - python/cudf/cudf/tests/test_orc.py | 17 ++++++++--------- 2 files changed, 8 insertions(+), 10 deletions(-) diff --git a/python/cudf/cudf/_fuzz_testing/orc.py b/python/cudf/cudf/_fuzz_testing/orc.py index f79227ac5b2..4a45dd7a1f7 100644 --- a/python/cudf/cudf/_fuzz_testing/orc.py +++ b/python/cudf/cudf/_fuzz_testing/orc.py @@ -7,7 +7,6 @@ import numpy as np import pyarrow as pa -import pyarrow.orc import cudf from cudf._fuzz_testing.io import IOFuzz diff --git a/python/cudf/cudf/tests/test_orc.py b/python/cudf/cudf/tests/test_orc.py index a2efa6e4b4c..f4dacf3c48e 100644 --- a/python/cudf/cudf/tests/test_orc.py +++ b/python/cudf/cudf/tests/test_orc.py @@ -10,7 +10,6 @@ import numpy as np import pandas as pd import pyarrow as pa -import pyarrow.orc import pytest import cudf @@ -1085,7 +1084,7 @@ def test_lists_struct_nests(columns, num_rows, use_index, list_struct_buff): use_index=use_index, ) - pyarrow_tbl = pyarrow.orc.ORCFile(list_struct_buff).read() + pyarrow_tbl = pa.orc.ORCFile(list_struct_buff).read() pyarrow_tbl = ( pyarrow_tbl[:num_rows] @@ -1208,7 +1207,7 @@ def gen_map_buff(size=10000): [lvl1_map, lvl2_map, lvl2_struct_map], ["lvl1_map", "lvl2_map", "lvl2_struct_map"], ) - pyarrow.orc.write_table( + pa.orc.write_table( pa_table, buff, stripe_size=1024, compression="UNCOMPRESSED" ) @@ -1439,7 +1438,7 @@ def test_writer_lists_structs(list_struct_buff): buff = BytesIO() df_in.to_orc(buff) - pyarrow_tbl = pyarrow.orc.ORCFile(buff).read() + pyarrow_tbl = pa.orc.ORCFile(buff).read() assert pyarrow_tbl.equals(df_in.to_arrow()) @@ -1509,7 
+1508,7 @@ def test_empty_statistics(): ], ["a", "b", "c", "d", "e", "f", "g", "h", "i"], ) - pyarrow.orc.write_table(pa_table, buff) + pa.orc.write_table(pa_table, buff) got = cudf.io.orc.read_orc_statistics([buff]) @@ -1594,8 +1593,8 @@ def test_orc_reader_zstd_compression(list_struct_buff): expected = cudf.read_orc(list_struct_buff) # save with ZSTD compression buffer = BytesIO() - pyarrow_tbl = pyarrow.orc.ORCFile(list_struct_buff).read() - writer = pyarrow.orc.ORCWriter(buffer, compression="zstd") + pyarrow_tbl = pa.orc.ORCFile(list_struct_buff).read() + writer = pa.orc.ORCWriter(buffer, compression="zstd") writer.write(pyarrow_tbl) writer.close() try: @@ -1797,7 +1796,7 @@ def test_orc_reader_negative_timestamp(negative_timestamp_df, engine): orc_table = pa.Table.from_pandas( negative_timestamp_df.to_pandas(), preserve_index=False ) - pyarrow.orc.write_table(orc_table, buffer) + pa.orc.write_table(orc_table, buffer) # We warn the user that this function will fall back to the CPU for reading # when the engine is pyarrow. @@ -1812,7 +1811,7 @@ def test_orc_writer_negative_timestamp(negative_timestamp_df): negative_timestamp_df.to_orc(buffer) assert_eq(negative_timestamp_df, pd.read_orc(buffer)) - assert_eq(negative_timestamp_df, pyarrow.orc.ORCFile(buffer).read()) + assert_eq(negative_timestamp_df, pa.orc.ORCFile(buffer).read()) def test_orc_reader_apache_negative_timestamp(datadir): From 13769716faae3e16e5ae209d0388a9320080a88e Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Tue, 24 Oct 2023 13:03:37 -0700 Subject: [PATCH 5/9] remove unpacking when unnecessary --- python/cudf/cudf/tests/test_orc.py | 26 ++++++++++++-------------- 1 file changed, 12 insertions(+), 14 deletions(-) diff --git a/python/cudf/cudf/tests/test_orc.py b/python/cudf/cudf/tests/test_orc.py index f4dacf3c48e..cea3e51ef5b 100644 --- a/python/cudf/cudf/tests/test_orc.py +++ b/python/cudf/cudf/tests/test_orc.py @@ -1021,15 +1021,13 @@ def generate_list_struct_buff(size=100_000): rd.choice( [ None, - *( - {"a": rd.choice([None, np.random.randint(0, 3)])}, - { - "lvl1_struct": { - "c": rd.choice([None, np.random.randint(0, 3)]), - "d": np.random.randint(0, 3), - }, + {"a": rd.choice([None, np.random.randint(0, 3)])}, + { + "lvl1_struct": { + "c": rd.choice([None, np.random.randint(0, 3)]), + "d": np.random.randint(0, 3), }, - ), + }, ] ) for _ in range(size) @@ -1140,7 +1138,7 @@ def gen_map_buff(size=10000): }, ] ) - for x in range(size) + for _ in range(size) ], type=pa.map_(pa.string(), pa.int64()), ) @@ -1158,16 +1156,16 @@ def gen_map_buff(size=10000): rd.choice( [None, np.random.randint(1, 1500)] ) - for z in range(5) + for _ in range(5) ], ] ) } - for y in range(2) + for _ in range(2) ), ] ) - for x in range(size) + for _ in range(size) ], type=pa.map_(pa.string(), pa.list_(pa.int64())), ) @@ -1192,11 +1190,11 @@ def gen_map_buff(size=10000): ] ) } - for y in range(2) + for _ in range(2) ), ] ) - for x in range(size) + for _ in range(size) ], type=pa.map_( pa.string(), pa.struct({"a": pa.int64(), "b": pa.int64()}) From 6b8ba5613b91be192cf96d8c86bed9017f0d50e5 Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Tue, 24 Oct 2023 20:12:01 -0500 Subject: [PATCH 6/9] Apply suggestions from code review Co-authored-by: Vyas Ramasubramani --- python/cudf/cudf/_fuzz_testing/orc.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cudf/cudf/_fuzz_testing/orc.py b/python/cudf/cudf/_fuzz_testing/orc.py index 4a45dd7a1f7..ecddc72fa85 100644 --- a/python/cudf/cudf/_fuzz_testing/orc.py +++ 
b/python/cudf/cudf/_fuzz_testing/orc.py @@ -104,7 +104,7 @@ def set_rand_params(self, params): elif param == "stripes": f = io.BytesIO(self._current_buffer) orcFile = pa.orc.ORCFile(f) - stripes = [i for i in range(orcFile.nstripes)] + stripes = list(range(orcFile.nstripes)) params_dict[param] = np.random.choice( [ None, From 9c74a1e8f638e050870db2713f3f7578ee9ea63e Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Wed, 25 Oct 2023 08:23:09 -0700 Subject: [PATCH 7/9] Fix imports --- python/cudf/cudf/tests/test_orc.py | 63 +++++++++++++++++++++--------- 1 file changed, 44 insertions(+), 19 deletions(-) diff --git a/python/cudf/cudf/tests/test_orc.py b/python/cudf/cudf/tests/test_orc.py index cea3e51ef5b..8c1b4478d7a 100644 --- a/python/cudf/cudf/tests/test_orc.py +++ b/python/cudf/cudf/tests/test_orc.py @@ -148,9 +148,11 @@ def test_orc_reader_trailing_nulls(datadir): ["TestOrcFile.testDate1900.orc", "TestOrcFile.testDate2038.orc"], ) def test_orc_reader_datetimestamp(datadir, inputfile, use_index): + from pyarrow import orc + path = datadir / inputfile try: - orcfile = pa.orc.ORCFile(path) + orcfile = orc.ORCFile(path) except pa.ArrowIOError as e: pytest.skip(".orc file is not found: %s" % e) @@ -604,6 +606,8 @@ def normalized_equals(value1, value2): @pytest.mark.parametrize("stats_freq", ["STRIPE", "ROWGROUP"]) @pytest.mark.parametrize("nrows", [1, 100, 6000000]) def test_orc_write_statistics(tmpdir, datadir, nrows, stats_freq): + from pyarrow import orc + supported_stat_types = supported_numpy_dtypes + ["str"] # Can't write random bool columns until issue #6763 is fixed if nrows == 6000000: @@ -622,7 +626,7 @@ def test_orc_write_statistics(tmpdir, datadir, nrows, stats_freq): gdf.to_orc(fname.strpath, statistics=stats_freq) # Read back written ORC's statistics - orc_file = pa.orc.ORCFile(fname) + orc_file = orc.ORCFile(fname) ( file_stats, stripes_stats, @@ -676,6 +680,8 @@ def test_orc_write_statistics(tmpdir, datadir, nrows, stats_freq): @pytest.mark.parametrize("stats_freq", ["STRIPE", "ROWGROUP"]) @pytest.mark.parametrize("nrows", [2, 100, 6000000]) def test_orc_chunked_write_statistics(tmpdir, datadir, nrows, stats_freq): + from pyarrow import orc + np.random.seed(0) supported_stat_types = supported_numpy_dtypes + ["str"] # Can't write random bool columns until issue #6763 is fixed @@ -728,7 +734,7 @@ def test_orc_chunked_write_statistics(tmpdir, datadir, nrows, stats_freq): expect = cudf.DataFrame(pd.concat([pdf1, pdf2]).reset_index(drop=True)) # Read back written ORC's statistics - orc_file = pa.orc.ORCFile(gdf_fname) + orc_file = orc.ORCFile(gdf_fname) ( file_stats, stripes_stats, @@ -781,6 +787,8 @@ def test_orc_chunked_write_statistics(tmpdir, datadir, nrows, stats_freq): @pytest.mark.parametrize("nrows", [1, 100, 6000000]) def test_orc_write_bool_statistics(tmpdir, datadir, nrows): + from pyarrow import orc + # Make a dataframe gdf = cudf.DataFrame({"col_bool": gen_rand_series("bool", nrows)}) fname = tmpdir.join("gdf.orc") @@ -789,7 +797,7 @@ def test_orc_write_bool_statistics(tmpdir, datadir, nrows): gdf.to_orc(fname.strpath) # Read back written ORC's statistics - orc_file = pa.orc.ORCFile(fname) + orc_file = orc.ORCFile(fname) ( file_stats, stripes_stats, @@ -1075,6 +1083,8 @@ def list_struct_buff(): @pytest.mark.parametrize("num_rows", [0, 15, 1005, 10561, 100_000]) @pytest.mark.parametrize("use_index", [True, False]) def test_lists_struct_nests(columns, num_rows, use_index, list_struct_buff): + from pyarrow import orc + gdf = cudf.read_orc( list_struct_buff, 
columns=columns, @@ -1082,7 +1092,7 @@ def test_lists_struct_nests(columns, num_rows, use_index, list_struct_buff): use_index=use_index, ) - pyarrow_tbl = pa.orc.ORCFile(list_struct_buff).read() + pyarrow_tbl = orc.ORCFile(list_struct_buff).read() pyarrow_tbl = ( pyarrow_tbl[:num_rows] @@ -1119,6 +1129,7 @@ def test_pyspark_struct(datadir): def gen_map_buff(size=10000): + from pyarrow import orc from string import ascii_letters as al rd = random.Random(1) @@ -1205,7 +1216,8 @@ def gen_map_buff(size=10000): [lvl1_map, lvl2_map, lvl2_struct_map], ["lvl1_map", "lvl2_map", "lvl2_struct_map"], ) - pa.orc.write_table( + + orc.write_table( pa_table, buff, stripe_size=1024, compression="UNCOMPRESSED" ) @@ -1222,7 +1234,9 @@ def gen_map_buff(size=10000): @pytest.mark.parametrize("num_rows", [0, 15, 1005, 10561, 100000]) @pytest.mark.parametrize("use_index", [True, False]) def test_map_type_read(columns, num_rows, use_index): - tbl = pa.orc.ORCFile(map_buff).read() + from pyarrow import orc + + tbl = orc.read_table(map_buff) lvl1_map = ( tbl["lvl1_map"] @@ -1408,18 +1422,22 @@ def test_writer_timestamp_stream_size(datadir, tmpdir): ], ) def test_no_row_group_index_orc_read(datadir, fname): + from pyarrow import orc + fpath = datadir / fname - expect = pa.orc.ORCFile(fpath).read() + expect = orc.ORCFile(fpath).read() got = cudf.read_orc(fpath) assert expect.equals(got.to_arrow()) def test_names_in_struct_dtype_nesting(datadir): + from pyarrow import orc + fname = datadir / "TestOrcFile.NestedStructDataFrame.orc" - expect = pa.orc.ORCFile(fname).read() + expect = orc.ORCFile(fname).read() got = cudf.read_orc(fname) # test dataframes @@ -1431,12 +1449,14 @@ def test_names_in_struct_dtype_nesting(datadir): def test_writer_lists_structs(list_struct_buff): + from pyarrow import orc + df_in = cudf.read_orc(list_struct_buff) buff = BytesIO() df_in.to_orc(buff) - pyarrow_tbl = pa.orc.ORCFile(buff).read() + pyarrow_tbl = orc.ORCFile(buff).read() assert pyarrow_tbl.equals(df_in.to_arrow()) @@ -1491,6 +1511,8 @@ def test_statistics_sum_overflow(): def test_empty_statistics(): + from pyarrow import orc + buff = BytesIO() pa_table = pa.Table.from_arrays( [ @@ -1506,7 +1528,7 @@ def test_empty_statistics(): ], ["a", "b", "c", "d", "e", "f", "g", "h", "i"], ) - pa.orc.write_table(pa_table, buff) + orc.write_table(pa_table, buff) got = cudf.io.orc.read_orc_statistics([buff]) @@ -1561,6 +1583,8 @@ def test_select_nested(list_struct_buff, equivalent_columns): def test_orc_writer_rle_stream_size(datadir, tmpdir): + from pyarrow import orc + original = datadir / "TestOrcFile.int16.rle.size.orc" reencoded = tmpdir.join("int16_map.orc") @@ -1568,7 +1592,7 @@ def test_orc_writer_rle_stream_size(datadir, tmpdir): df.to_orc(reencoded) # Segfaults when RLE stream sizes don't account for varint length - pa_out = pa.orc.ORCFile(reencoded).read() + pa_out = orc.ORCFile(reencoded).read() assert df.to_arrow().equals(pa_out) @@ -1588,11 +1612,13 @@ def test_empty_columns(): def test_orc_reader_zstd_compression(list_struct_buff): + from pyarrow import orc + expected = cudf.read_orc(list_struct_buff) # save with ZSTD compression buffer = BytesIO() - pyarrow_tbl = pa.orc.ORCFile(list_struct_buff).read() - writer = pa.orc.ORCWriter(buffer, compression="zstd") + pyarrow_tbl = orc.ORCFile(list_struct_buff).read() + writer = orc.ORCWriter(buffer, compression="zstd") writer.write(pyarrow_tbl) writer.close() try: @@ -1791,10 +1817,7 @@ def negative_timestamp_df(): @pytest.mark.parametrize("engine", ["cudf", "pyarrow"]) def 
test_orc_reader_negative_timestamp(negative_timestamp_df, engine): buffer = BytesIO() - orc_table = pa.Table.from_pandas( - negative_timestamp_df.to_pandas(), preserve_index=False - ) - pa.orc.write_table(orc_table, buffer) + negative_timestamp_df.to_orc(buffer) # We warn the user that this function will fall back to the CPU for reading # when the engine is pyarrow. @@ -1805,11 +1828,13 @@ def test_orc_reader_negative_timestamp(negative_timestamp_df, engine): def test_orc_writer_negative_timestamp(negative_timestamp_df): + from pyarrow import orc + buffer = BytesIO() negative_timestamp_df.to_orc(buffer) assert_eq(negative_timestamp_df, pd.read_orc(buffer)) - assert_eq(negative_timestamp_df, pa.orc.ORCFile(buffer).read()) + assert_eq(negative_timestamp_df, orc.ORCFile(buffer).read()) def test_orc_reader_apache_negative_timestamp(datadir): From 544baa8c497b736d66d709ce7d118ab5356aa752 Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Wed, 25 Oct 2023 08:51:46 -0700 Subject: [PATCH 8/9] isort --- python/cudf/cudf/tests/test_orc.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/python/cudf/cudf/tests/test_orc.py b/python/cudf/cudf/tests/test_orc.py index 8c1b4478d7a..7407da9c4ac 100644 --- a/python/cudf/cudf/tests/test_orc.py +++ b/python/cudf/cudf/tests/test_orc.py @@ -1129,9 +1129,10 @@ def test_pyspark_struct(datadir): def gen_map_buff(size=10000): - from pyarrow import orc from string import ascii_letters as al + from pyarrow import orc + rd = random.Random(1) np.random.seed(seed=1) From d660d670dc1fd1c5931ceb02d68829aa74ce84cd Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Wed, 25 Oct 2023 10:14:52 -0700 Subject: [PATCH 9/9] fix warning --- docs/cudf/source/conf.py | 1 + docs/dask_cudf/source/conf.py | 1 + 2 files changed, 2 insertions(+) diff --git a/docs/cudf/source/conf.py b/docs/cudf/source/conf.py index acb2a5d17f3..28e305b71cb 100644 --- a/docs/cudf/source/conf.py +++ b/docs/cudf/source/conf.py @@ -106,6 +106,7 @@ "twitter_url": "https://twitter.com/rapidsai", "show_toc_level": 1, "navbar_align": "right", + "navigation_with_keys": True, } include_pandas_compat = True diff --git a/docs/dask_cudf/source/conf.py b/docs/dask_cudf/source/conf.py index 6861a9b90f6..00568a57431 100644 --- a/docs/dask_cudf/source/conf.py +++ b/docs/dask_cudf/source/conf.py @@ -57,6 +57,7 @@ "twitter_url": "https://twitter.com/rapidsai", "show_toc_level": 1, "navbar_align": "right", + "navigation_with_keys": True, } include_pandas_compat = True
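Taken together, the series replaces every pyorc.Writer/pyorc.Reader call site with either pandas' ORC writer or pyarrow's orc module. A minimal sketch of the resulting write/read pattern is below; it assumes pandas>=1.5 (for DataFrame.to_orc) and a pyarrow build that ships pyarrow.orc, and the sample data, buffers, and stripe_size value are illustrative only.

import io

import pandas as pd
import pyarrow as pa
from pyarrow import orc

# Writing: pandas delegates to pyarrow's ORC writer and forwards writer
# options such as stripe_size through engine_kwargs.
df = pd.DataFrame({"a": pd.Series([1, 2, None], dtype="Int64"), "b": [3, 4, 5]})
buf = io.BytesIO()
df.to_orc(buf, engine="pyarrow", engine_kwargs={"stripe_size": 1024})

# Writing an Arrow table directly matches the orc.write_table calls used in
# the tests.
buf2 = io.BytesIO()
orc.write_table(pa.Table.from_pandas(df), buf2, compression="UNCOMPRESSED")

# Reading: ORCFile exposes stripe metadata and per-stripe reads, which is what
# the fuzz-testing helpers now use in place of pyorc.Reader.
buf.seek(0)
orc_file = orc.ORCFile(buf)
batches = [orc_file.read_stripe(i) for i in range(orc_file.nstripes)]
print(pa.Table.from_batches(batches).to_pandas())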