From 708e5a7b3cea439d79fa5c8dd7068c873ed2b282 Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Wed, 9 Nov 2022 05:59:45 -0800 Subject: [PATCH 1/3] First pass of read_orc changes --- python/cudf/cudf/testing/_utils.py | 10 ++ python/cudf/cudf/tests/test_gcs.py | 6 +- python/cudf/cudf/tests/test_hdfs.py | 7 +- python/cudf/cudf/tests/test_orc.py | 193 ++++++++-------------------- python/cudf/cudf/tests/test_s3.py | 10 +- 5 files changed, 70 insertions(+), 156 deletions(-) diff --git a/python/cudf/cudf/testing/_utils.py b/python/cudf/cudf/testing/_utils.py index 259257c257f..a0e25ba1f1b 100644 --- a/python/cudf/cudf/testing/_utils.py +++ b/python/cudf/cudf/testing/_utils.py @@ -10,11 +10,14 @@ import cupy import numpy as np import pandas as pd +import pyarrow as pa +import pyarrow.orc import pytest from pandas import testing as tm import cudf from cudf._lib.null_mask import bitmask_allocation_size_bytes +from cudf.core._compat import PANDAS_GE_100 from cudf.core.column.timedelta import _unit_to_nanoseconds_conversion from cudf.utils import dtypes as dtypeutils @@ -379,6 +382,13 @@ def _create_pandas_series(data=None, index=None, dtype=None, *args, **kwargs): return pd.Series(data=data, index=index, dtype=dtype, *args, **kwargs) +def _pandas_read_orc(fname, columns=None): + if PANDAS_GE_100: + return pd.read_orc(fname, columns=columns) + else: + return pa.orc.ORCFile(fname).read(columns=columns).to_pandas() + + parametrize_numeric_dtypes_pairwise = pytest.mark.parametrize( "left_dtype,right_dtype", list(itertools.combinations_with_replacement(NUMERIC_TYPES, 2)), diff --git a/python/cudf/cudf/tests/test_gcs.py b/python/cudf/cudf/tests/test_gcs.py index f15d705c4e2..bfcbf2ccee8 100644 --- a/python/cudf/cudf/tests/test_gcs.py +++ b/python/cudf/cudf/tests/test_gcs.py @@ -5,12 +5,10 @@ import numpy as np import pandas as pd -import pyarrow as pa -import pyarrow.orc import pytest import cudf -from cudf.testing._utils import assert_eq +from cudf.testing._utils import _pandas_read_orc, assert_eq gcsfs = pytest.importorskip("gcsfs") @@ -71,5 +69,5 @@ def mock_open(*args, **kwargs): monkeypatch.setattr(gcsfs.core.GCSFileSystem, "open", mock_open) gdf.to_orc(f"gcs://{gcs_fname}") - got = pa.orc.ORCFile(local_filepath).read().to_pandas() + got = _pandas_read_orc(local_filepath) assert_eq(pdf, got) diff --git a/python/cudf/cudf/tests/test_hdfs.py b/python/cudf/cudf/tests/test_hdfs.py index 8730cb187b5..709e5a0af29 100644 --- a/python/cudf/cudf/tests/test_hdfs.py +++ b/python/cudf/cudf/tests/test_hdfs.py @@ -8,10 +8,9 @@ import pandas as pd import pyarrow as pa import pytest -from pyarrow import orc import cudf -from cudf.testing._utils import assert_eq +from cudf.testing._utils import _pandas_read_orc, assert_eq if not os.environ.get("RUN_HDFS_TESTS"): pytestmark = pytest.mark.skip("Env not configured to run HDFS tests") @@ -212,7 +211,7 @@ def test_read_orc(datadir, hdfs, test_url): hd_fpath = f"hdfs://{basedir}/file.orc" got = cudf.read_orc(hd_fpath) - expect = orc.ORCFile(buffer).read().to_pandas() + expect = _pandas_read_orc(buffer) assert_eq(expect, got) @@ -232,7 +231,7 @@ def test_write_orc(pdf, hdfs, test_url): assert hdfs.exists(f"{basedir}/test_orc_writer.orc") with hdfs.open(f"{basedir}/test_orc_writer.orc", mode="rb") as f: - got = orc.ORCFile(f).read().to_pandas() + got = _pandas_read_orc(f) assert_eq(pdf, got) diff --git a/python/cudf/cudf/tests/test_orc.py b/python/cudf/cudf/tests/test_orc.py index fbd9b83330e..1f27693253c 100644 --- a/python/cudf/cudf/tests/test_orc.py +++ 
b/python/cudf/cudf/tests/test_orc.py @@ -18,6 +18,7 @@ from cudf.io.orc import ORCWriter from cudf.testing import assert_frame_equal from cudf.testing._utils import ( + _pandas_read_orc, assert_eq, gen_rand_series, supported_numpy_dtypes, @@ -84,12 +85,8 @@ def _make_path_or_buf(src): ) def test_orc_reader_basic(datadir, inputfile, columns, use_index, engine): path = datadir / inputfile - try: - orcfile = pa.orc.ORCFile(path) - except pa.ArrowIOError as e: - pytest.skip(".orc file is not found: %s" % e) - expect = orcfile.read(columns=columns).to_pandas() + expect = _pandas_read_orc(path, columns=columns) got = cudf.read_orc( path, engine=engine, columns=columns, use_index=use_index ) @@ -119,8 +116,7 @@ def test_orc_reader_local_filepath(): def test_orc_reader_filepath_or_buffer(path_or_buf, src): cols = ["int1", "long1", "float1", "double1"] - orcfile = pa.orc.ORCFile(path_or_buf("filepath")) - expect = orcfile.read(columns=cols).to_pandas() + expect = _pandas_read_orc(path_or_buf("filepath"), columns=cols) got = cudf.read_orc(path_or_buf(src), columns=cols) assert_eq(expect, got) @@ -128,12 +124,8 @@ def test_orc_reader_filepath_or_buffer(path_or_buf, src): def test_orc_reader_trailing_nulls(datadir): path = datadir / "TestOrcFile.nulls-at-end-snappy.orc" - try: - orcfile = pa.orc.ORCFile(path) - except pa.ArrowIOError as e: - pytest.skip(".orc file is not found: %s" % e) - expect = orcfile.read().to_pandas().fillna(0) + expect = _pandas_read_orc(path).fillna(0) got = cudf.read_orc(path).fillna(0) # PANDAS uses NaN to represent invalid data, which forces float dtype @@ -164,12 +156,8 @@ def test_orc_reader_datetimestamp(datadir, inputfile, use_index): def test_orc_reader_strings(datadir): path = datadir / "TestOrcFile.testStringAndBinaryStatistics.orc" - try: - orcfile = pa.orc.ORCFile(path) - except pa.ArrowIOError as e: - pytest.skip(".orc file is not found: %s" % e) - expect = orcfile.read(columns=["string1"]) + expect = _pandas_read_orc(path, columns=["string1"]) got = cudf.read_orc(path, columns=["string1"]) assert_eq(expect, got, check_categorical=False) @@ -285,12 +273,8 @@ def test_orc_read_stripes(datadir, engine): @pytest.mark.parametrize("skiprows", [0, 1, 3000]) def test_orc_read_rows(datadir, skiprows, num_rows): path = datadir / "TestOrcFile.decimal.orc" - try: - orcfile = pa.orc.ORCFile(path) - except pa.ArrowIOError as e: - pytest.skip(".orc file is not found: %s" % e) - pdf = orcfile.read().to_pandas() + pdf = _pandas_read_orc(path) gdf = cudf.read_orc(path, skiprows=skiprows, num_rows=num_rows) # Slice rows out of the whole dataframe for comparison as PyArrow doesn't @@ -329,19 +313,15 @@ def test_orc_read_skiprows(): # repro for other sizes of data skiprows = 10 - expected = cudf.read_orc(buff)[skiprows:].reset_index(drop=True) + expected = _pandas_read_orc(buff)[skiprows:].reset_index(drop=True) got = cudf.read_orc(buff, skiprows=skiprows) assert_eq(expected, got) def test_orc_reader_uncompressed_block(datadir): path = datadir / "uncompressed_snappy.orc" - try: - orcfile = pa.orc.ORCFile(path) - except pa.ArrowIOError as e: - pytest.skip(".orc file is not found: %s" % e) - expect = orcfile.read().to_pandas() + expect = _pandas_read_orc(path) got = cudf.read_orc(path) assert_eq(expect, got, check_categorical=False) @@ -349,15 +329,8 @@ def test_orc_reader_uncompressed_block(datadir): def test_orc_reader_nodata_block(datadir): path = datadir / "nodata.orc" - try: - orcfile = pa.orc.ORCFile(path) - except Exception as excpr: - if type(excpr).__name__ == "ArrowIOError": 
- pytest.skip(".orc file is not found") - else: - print(type(excpr).__name__) - expect = orcfile.read().to_pandas() + expect = _pandas_read_orc(path) got = cudf.read_orc(path, num_rows=1) assert_eq(expect, got, check_categorical=False) @@ -386,19 +359,9 @@ def test_orc_writer(datadir, tmpdir, reference_file, columns, compression): pdf_fname = datadir / reference_file gdf_fname = tmpdir.join("gdf.orc") - try: - orcfile = pa.orc.ORCFile(pdf_fname) - except Exception as excpr: - if type(excpr).__name__ == "ArrowIOError": - pytest.skip(".orc file is not found") - else: - print(type(excpr).__name__) - - expect = cudf.from_pandas(orcfile.read(columns=columns).to_pandas()) + expect = cudf.from_pandas(_pandas_read_orc(pdf_fname, columns=columns)) expect.to_orc(gdf_fname.strpath, compression=compression) - got = cudf.from_pandas( - pa.orc.ORCFile(gdf_fname).read(columns=columns).to_pandas() - ) + got = cudf.from_pandas(_pandas_read_orc(gdf_fname, columns=columns)) assert_frame_equal(expect, got) @@ -409,17 +372,9 @@ def test_orc_writer_statistics_frequency(datadir, tmpdir, stats_freq): pdf_fname = datadir / reference_file gdf_fname = tmpdir.join("gdf.orc") - try: - orcfile = pa.orc.ORCFile(pdf_fname) - except Exception as excpr: - if type(excpr).__name__ == "ArrowIOError": - pytest.skip(".orc file is not found") - else: - print(type(excpr).__name__) - - expect = cudf.from_pandas(orcfile.read().to_pandas()) + expect = cudf.from_pandas(_pandas_read_orc(pdf_fname)) expect.to_orc(gdf_fname.strpath, statistics=stats_freq) - got = cudf.from_pandas(pa.orc.ORCFile(gdf_fname).read().to_pandas()) + got = cudf.from_pandas(_pandas_read_orc(gdf_fname)) assert_frame_equal(expect, got) @@ -430,14 +385,6 @@ def test_chunked_orc_writer_statistics_frequency(datadir, tmpdir, stats_freq): pdf_fname = datadir / reference_file gdf_fname = tmpdir.join("chunked_gdf.orc") - try: - orcfile = pa.orc.ORCFile(pdf_fname) - except Exception as excpr: - if type(excpr).__name__ == "ArrowIOError": - pytest.skip(".orc file is not found") - else: - print(type(excpr).__name__) - columns = [ "boolean1", "byte1", @@ -447,7 +394,7 @@ def test_chunked_orc_writer_statistics_frequency(datadir, tmpdir, stats_freq): "float1", "double1", ] - pdf = orcfile.read(columns=columns).to_pandas() + pdf = _pandas_read_orc(pdf_fname, columns=columns) gdf = cudf.from_pandas(pdf) expect = pd.concat([pdf, pdf]).reset_index(drop=True) @@ -456,7 +403,7 @@ def test_chunked_orc_writer_statistics_frequency(datadir, tmpdir, stats_freq): writer.write_table(gdf) writer.close() - got = pa.orc.ORCFile(gdf_fname).read().to_pandas() + got = _pandas_read_orc(gdf_fname) assert_eq(expect, got) @@ -486,15 +433,7 @@ def test_chunked_orc_writer( pdf_fname = datadir / reference_file gdf_fname = tmpdir.join("chunked_gdf.orc") - try: - orcfile = pa.orc.ORCFile(pdf_fname) - except Exception as excpr: - if type(excpr).__name__ == "ArrowIOError": - pytest.skip(".orc file is not found") - else: - print(type(excpr).__name__) - - pdf = orcfile.read(columns=columns).to_pandas() + pdf = _pandas_read_orc(pdf_fname, columns=columns) gdf = cudf.from_pandas(pdf) expect = pd.concat([pdf, pdf]).reset_index(drop=True) @@ -503,7 +442,7 @@ def test_chunked_orc_writer( writer.write_table(gdf) writer.close() - got = pa.orc.ORCFile(gdf_fname).read(columns=columns).to_pandas() + got = _pandas_read_orc(gdf_fname, columns=columns) assert_frame_equal(cudf.from_pandas(expect), cudf.from_pandas(got)) @@ -521,7 +460,7 @@ def test_orc_writer_strings(tmpdir, dtypes): expect = 
cudf.datasets.randomdata(nrows=10, dtypes=dtypes, seed=1) expect.to_orc(gdf_fname) - got = pa.orc.ORCFile(gdf_fname).read().to_pandas() + got = _pandas_read_orc(gdf_fname) assert_eq(expect, got) @@ -546,7 +485,7 @@ def test_chunked_orc_writer_strings(tmpdir, dtypes): writer.write_table(gdf) writer.close() - got = pa.orc.ORCFile(gdf_fname).read().to_pandas() + got = _pandas_read_orc(gdf_fname) assert_eq(expect, got) @@ -577,13 +516,8 @@ def test_orc_writer_sliced(tmpdir): def test_orc_reader_decimal_type(datadir, orc_file): file_path = datadir / orc_file - try: - orcfile = pa.orc.ORCFile(file_path) - except pa.ArrowIOError as e: - pytest.skip(".orc file is not found: %s" % e) - - pdf = orcfile.read().to_pandas() - df = cudf.read_orc(file_path).to_pandas() + pdf = _pandas_read_orc(file_path) + df = cudf.read_orc(file_path) assert_eq(pdf, df) @@ -591,13 +525,8 @@ def test_orc_reader_decimal_type(datadir, orc_file): def test_orc_decimal_precision_fail(datadir): file_path = datadir / "TestOrcFile.int_decimal.precision_19.orc" - try: - orcfile = pa.orc.ORCFile(file_path) - except pa.ArrowIOError as e: - pytest.skip(".orc file is not found: %s" % e) - # Shouldn't cause failure if decimal column is not chosen to be read. - pdf = orcfile.read(columns=["int"]).to_pandas() + pdf = _pandas_read_orc(file_path, columns=["int"]) gdf = cudf.read_orc(file_path, columns=["int"]) assert_eq(pdf, gdf) @@ -614,7 +543,7 @@ def test_orc_decimal_precision_fail(datadir): def test_orc_reader_boolean_type(datadir, orc_file): file_path = datadir / orc_file - pdf = pd.read_orc(file_path) + pdf = _pandas_read_orc(file_path) df = cudf.read_orc(file_path).to_pandas() assert_eq(pdf, df) @@ -624,13 +553,9 @@ def test_orc_reader_tzif_timestamps(datadir): # Contains timstamps in the range covered by the TZif file # Other timedate tests only cover "future" times path = datadir / "TestOrcFile.lima_timezone.orc" - try: - orcfile = pa.orc.ORCFile(path) - except pa.ArrowIOError as e: - pytest.skip(".orc file is not found: %s" % e) - pdf = orcfile.read().to_pandas() - gdf = cudf.read_orc(path).to_pandas() + pdf = _pandas_read_orc(path) + gdf = cudf.read_orc(path) assert_eq(pdf, gdf) @@ -882,13 +807,9 @@ def test_orc_write_bool_statistics(tmpdir, datadir, nrows): def test_orc_reader_gmt_timestamps(datadir): path = datadir / "TestOrcFile.gmt.orc" - try: - orcfile = pa.orc.ORCFile(path) - except pa.ArrowIOError as e: - pytest.skip(".orc file is not found: %s" % e) - pdf = orcfile.read().to_pandas() - gdf = cudf.read_orc(path).to_pandas() + pdf = _pandas_read_orc(path) + gdf = cudf.read_orc(path) assert_eq(pdf, gdf) @@ -914,7 +835,7 @@ def test_orc_bool_encode_fail(): okay_df.to_orc(buffer) # Also validate data - pdf = pa.orc.ORCFile(buffer).read().to_pandas() + pdf = _pandas_read_orc(buffer) assert_eq(okay_df.to_pandas(nullable=True), pdf) @@ -929,8 +850,8 @@ def test_nanoseconds_overflow(): cudf_got = cudf.read_orc(buffer) assert_eq(expected, cudf_got) - pyarrow_got = pa.orc.ORCFile(buffer).read() - assert_eq(expected.to_pandas(), pyarrow_got.to_pandas()) + pandas_got = _pandas_read_orc(buffer) + assert_eq(expected, pandas_got) def test_empty_dataframe(): @@ -943,7 +864,7 @@ def test_empty_dataframe(): cudf.read_orc(buffer, columns=["a"]) got_df = cudf.read_orc(buffer) - expected_pdf = pd.read_orc(buffer) + expected_pdf = _pandas_read_orc(buffer) assert_eq(expected, got_df) assert_eq(expected_pdf, got_df) @@ -958,7 +879,7 @@ def test_empty_string_columns(data): expected = cudf.DataFrame({"string": data}, dtype="str") 
expected.to_orc(buffer) - expected_pdf = pd.read_orc(buffer) + expected_pdf = _pandas_read_orc(buffer) got_df = cudf.read_orc(buffer) assert_eq(expected, got_df) @@ -984,7 +905,7 @@ def test_orc_writer_decimal(tmpdir, scale, decimal_type): expected.to_orc(fname) - got = pd.read_orc(fname) + got = _pandas_read_orc(fname) assert_eq(expected.to_pandas()["dec_val"], got["dec_val"]) @@ -993,8 +914,8 @@ def test_orc_reader_multiple_files(datadir, num_rows): path = datadir / "TestOrcFile.testSnappy.orc" - df_1 = pd.read_orc(path) - df_2 = pd.read_orc(path) + df_1 = _pandas_read_orc(path) + df_2 = _pandas_read_orc(path) df = pd.concat([df_1, df_2], ignore_index=True) gdf = cudf.read_orc([path, path], num_rows=num_rows).to_pandas() @@ -1020,7 +941,7 @@ def test_orc_reader_multi_file_multi_stripe(datadir): path = datadir / "TestOrcFile.testStripeLevelStats.orc" gdf = cudf.read_orc([path, path], stripes=[[0, 1], [2]]) - pdf = pd.read_orc(path) + pdf = _pandas_read_orc(path) assert_eq(pdf, gdf) @@ -1207,7 +1128,7 @@ def test_skip_rows_for_nested_types(columns, list_struct_buff): def test_pyspark_struct(datadir): path = datadir / "TestOrcFile.testPySparkStruct.orc" - pdf = pa.orc.ORCFile(path).read().to_pandas() + pdf = _pandas_read_orc(path) gdf = cudf.read_orc(path) assert_eq(pdf, gdf) @@ -1391,13 +1312,9 @@ def test_map_type_read(columns, num_rows, use_index): def test_orc_reader_decimal(datadir): path = datadir / "TestOrcFile.decimal.orc" - try: - orcfile = pa.orc.ORCFile(path) - except pa.ArrowIOError as e: - pytest.skip(".orc file is not found: %s" % e) - pdf = orcfile.read().to_pandas() - gdf = cudf.read_orc(path).to_pandas() + pdf = _pandas_read_orc(path) + gdf = cudf.read_orc(path) assert_eq(pdf, gdf) @@ -1407,7 +1324,7 @@ def test_orc_reader_decimal(datadir): def test_orc_timestamp_read(datadir): path = datadir / "TestOrcFile.timestamp.issue.orc" - pdf = pd.read_orc(path) + pdf = _pandas_read_orc(path) gdf = cudf.read_orc(path) assert_eq(pdf, gdf) @@ -1478,7 +1395,7 @@ def test_orc_writer_lists(data): buffer, stripe_size_rows=2048, row_index_stride=512 ) - pdf_out = pa.orc.ORCFile(buffer).read().to_pandas() + pdf_out = _pandas_read_orc(buffer) assert_eq(pdf_out, pdf_in) @@ -1500,7 +1417,7 @@ def test_chunked_orc_writer_lists(): writer.write_table(gdf) writer.close() - got = pa.orc.ORCFile(buffer).read().to_pandas() + got = _pandas_read_orc(buffer) assert_eq(expect, got) @@ -1508,17 +1425,9 @@ def test_writer_timestamp_stream_size(datadir, tmpdir): pdf_fname = datadir / "TestOrcFile.largeTimestamps.orc" gdf_fname = tmpdir.join("gdf.orc") - try: - orcfile = pa.orc.ORCFile(pdf_fname) - except Exception as excpr: - if type(excpr).__name__ == "ArrowIOError": - pytest.skip(".orc file is not found") - else: - print(type(excpr).__name__) - - expect = orcfile.read().to_pandas() + expect = _pandas_read_orc(pdf_fname) cudf.from_pandas(expect).to_orc(gdf_fname.strpath) - got = pa.orc.ORCFile(gdf_fname).read().to_pandas() + got = _pandas_read_orc(gdf_fname) assert_eq(expect, got) @@ -1591,7 +1500,7 @@ def test_orc_writer_lists_empty_rg(data): df = cudf.read_orc(buffer) assert_eq(df, cudf_in) - pdf_out = pa.orc.ORCFile(buffer).read().to_pandas() + pdf_out = _pandas_read_orc(buffer) assert_eq(pdf_in, pdf_out) @@ -1696,7 +1605,7 @@ def test_orc_writer_rle_stream_size(datadir, tmpdir): # Segfaults when RLE stream sizes don't account for varint length pa_out = pa.orc.ORCFile(reencoded).read() - assert_eq(df.to_pandas(), pa_out) + assert df.to_arrow().equals(pa_out) def test_empty_columns(): @@ -1755,7 +1664,7 
@@ def test_orc_writer_nvcomp(compression): except RuntimeError: pytest.mark.xfail(reason="Newer nvCOMP version is required") else: - got = pd.read_orc(buff) + got = _pandas_read_orc(buff) assert_eq(expected, got) @@ -1781,7 +1690,7 @@ def test_orc_columns_and_index_param(index_obj, index, columns): ) df.to_orc(buffer, index=index) - expected = pd.read_orc(buffer, columns=columns) + expected = _pandas_read_orc(buffer, columns=columns) got = cudf.read_orc(buffer, columns=columns) if columns: @@ -1838,7 +1747,7 @@ def test_orc_writer_cols_as_map_type(df_data, cols_as_map_type, expected_data): buffer = BytesIO() df.to_orc(buffer, cols_as_map_type=cols_as_map_type) - got = pd.read_orc(buffer) + got = _pandas_read_orc(buffer) expected = pd.DataFrame(expected_data) assert_eq(got, expected) @@ -1884,14 +1793,14 @@ def test_orc_writer_negative_timestamp(negative_timestamp_df): buffer = BytesIO() negative_timestamp_df.to_orc(buffer) - assert_eq(negative_timestamp_df, pd.read_orc(buffer)) + assert_eq(negative_timestamp_df, _pandas_read_orc(buffer)) assert_eq(negative_timestamp_df, pyarrow.orc.ORCFile(buffer).read()) def test_orc_reader_apache_negative_timestamp(datadir): path = datadir / "TestOrcFile.apache_timestamp.orc" - pdf = pd.read_orc(path) + pdf = _pandas_read_orc(path) gdf = cudf.read_orc(path) assert_eq(pdf, gdf) diff --git a/python/cudf/cudf/tests/test_s3.py b/python/cudf/cudf/tests/test_s3.py index d2339930b91..7cca776ae64 100644 --- a/python/cudf/cudf/tests/test_s3.py +++ b/python/cudf/cudf/tests/test_s3.py @@ -7,14 +7,12 @@ import numpy as np import pandas as pd -import pyarrow as pa import pyarrow.fs as pa_fs -import pyarrow.orc import pytest from fsspec.core import get_fs_token_paths import cudf -from cudf.testing._utils import assert_eq +from cudf.testing._utils import _pandas_read_orc, assert_eq moto = pytest.importorskip("moto", minversion="3.1.6") boto3 = pytest.importorskip("boto3") @@ -442,7 +440,7 @@ def test_read_orc(s3_base, s3so, datadir, use_python_file_object, columns): source_file = str(datadir / "orc" / "TestOrcFile.testSnappy.orc") fname = "test_orc_reader.orc" bucket = "orc" - expect = pa.orc.ORCFile(source_file).read().to_pandas() + expect = _pandas_read_orc(source_file) with open(source_file, "rb") as f: buffer = f.read() @@ -465,7 +463,7 @@ def test_read_orc_arrow_nativefile(s3_base, s3so, datadir, columns): source_file = str(datadir / "orc" / "TestOrcFile.testSnappy.orc") fname = "test_orc_reader.orc" bucket = "orc" - expect = pa.orc.ORCFile(source_file).read().to_pandas() + expect = _pandas_read_orc(source_file) with open(source_file, "rb") as f: buffer = f.read() @@ -491,7 +489,7 @@ def test_write_orc(s3_base, s3so, pdf): assert s3fs.exists(f"s3://{bucket}/{fname}") with s3fs.open(f"s3://{bucket}/{fname}") as f: - got = pa.orc.ORCFile(f).read().to_pandas() + got = _pandas_read_orc(f) assert_eq(pdf, got) From 51837947c6a373c24f12994b20600883fd27dd36 Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Wed, 9 Nov 2022 06:15:21 -0800 Subject: [PATCH 2/3] fix --- python/cudf/cudf/tests/test_orc.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/python/cudf/cudf/tests/test_orc.py b/python/cudf/cudf/tests/test_orc.py index 1f27693253c..6dfb756eec9 100644 --- a/python/cudf/cudf/tests/test_orc.py +++ b/python/cudf/cudf/tests/test_orc.py @@ -313,7 +313,9 @@ def test_orc_read_skiprows(): # repro for other sizes of data skiprows = 10 - expected = _pandas_read_orc(buff)[skiprows:].reset_index(drop=True) + expected = ( + 
_pandas_read_orc(buff)[skiprows:].reset_index(drop=True).astype("bool") + ) got = cudf.read_orc(buff, skiprows=skiprows) assert_eq(expected, got) From ec435243a7238a43b640e884713bab05d497040e Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Thu, 10 Nov 2022 05:03:17 -0800 Subject: [PATCH 3/3] Drop less than Pandas-1.0 --- python/cudf/cudf/core/_compat.py | 1 - python/cudf/cudf/testing/_utils.py | 10 ---- python/cudf/cudf/tests/test_gcs.py | 4 +- python/cudf/cudf/tests/test_hdfs.py | 6 +- python/cudf/cudf/tests/test_orc.py | 89 ++++++++++++++--------------- python/cudf/cudf/tests/test_s3.py | 8 +-- 6 files changed, 53 insertions(+), 65 deletions(-) diff --git a/python/cudf/cudf/core/_compat.py b/python/cudf/cudf/core/_compat.py index 5534d732f53..3889fcc4cc0 100644 --- a/python/cudf/cudf/core/_compat.py +++ b/python/cudf/cudf/core/_compat.py @@ -4,7 +4,6 @@ from packaging import version PANDAS_VERSION = version.parse(pd.__version__) -PANDAS_GE_100 = PANDAS_VERSION >= version.parse("1.0") PANDAS_GE_110 = PANDAS_VERSION >= version.parse("1.1") PANDAS_GE_120 = PANDAS_VERSION >= version.parse("1.2") PANDAS_LE_122 = PANDAS_VERSION <= version.parse("1.2.2") diff --git a/python/cudf/cudf/testing/_utils.py b/python/cudf/cudf/testing/_utils.py index a0e25ba1f1b..259257c257f 100644 --- a/python/cudf/cudf/testing/_utils.py +++ b/python/cudf/cudf/testing/_utils.py @@ -10,14 +10,11 @@ import cupy import numpy as np import pandas as pd -import pyarrow as pa -import pyarrow.orc import pytest from pandas import testing as tm import cudf from cudf._lib.null_mask import bitmask_allocation_size_bytes -from cudf.core._compat import PANDAS_GE_100 from cudf.core.column.timedelta import _unit_to_nanoseconds_conversion from cudf.utils import dtypes as dtypeutils @@ -382,13 +379,6 @@ def _create_pandas_series(data=None, index=None, dtype=None, *args, **kwargs): return pd.Series(data=data, index=index, dtype=dtype, *args, **kwargs) -def _pandas_read_orc(fname, columns=None): - if PANDAS_GE_100: - return pd.read_orc(fname, columns=columns) - else: - return pa.orc.ORCFile(fname).read(columns=columns).to_pandas() - - parametrize_numeric_dtypes_pairwise = pytest.mark.parametrize( "left_dtype,right_dtype", list(itertools.combinations_with_replacement(NUMERIC_TYPES, 2)), diff --git a/python/cudf/cudf/tests/test_gcs.py b/python/cudf/cudf/tests/test_gcs.py index bfcbf2ccee8..a677ace18ec 100644 --- a/python/cudf/cudf/tests/test_gcs.py +++ b/python/cudf/cudf/tests/test_gcs.py @@ -8,7 +8,7 @@ import pytest import cudf -from cudf.testing._utils import _pandas_read_orc, assert_eq +from cudf.testing._utils import assert_eq gcsfs = pytest.importorskip("gcsfs") @@ -69,5 +69,5 @@ def mock_open(*args, **kwargs): monkeypatch.setattr(gcsfs.core.GCSFileSystem, "open", mock_open) gdf.to_orc(f"gcs://{gcs_fname}") - got = _pandas_read_orc(local_filepath) + got = pd.read_orc(local_filepath) assert_eq(pdf, got) diff --git a/python/cudf/cudf/tests/test_hdfs.py b/python/cudf/cudf/tests/test_hdfs.py index 709e5a0af29..f8de16f8609 100644 --- a/python/cudf/cudf/tests/test_hdfs.py +++ b/python/cudf/cudf/tests/test_hdfs.py @@ -10,7 +10,7 @@ import pytest import cudf -from cudf.testing._utils import _pandas_read_orc, assert_eq +from cudf.testing._utils import assert_eq if not os.environ.get("RUN_HDFS_TESTS"): pytestmark = pytest.mark.skip("Env not configured to run HDFS tests") @@ -211,7 +211,7 @@ def test_read_orc(datadir, hdfs, test_url): hd_fpath = f"hdfs://{basedir}/file.orc" got = cudf.read_orc(hd_fpath) - expect = _pandas_read_orc(buffer) + 
expect = pd.read_orc(buffer) assert_eq(expect, got) @@ -231,7 +231,7 @@ def test_write_orc(pdf, hdfs, test_url): assert hdfs.exists(f"{basedir}/test_orc_writer.orc") with hdfs.open(f"{basedir}/test_orc_writer.orc", mode="rb") as f: - got = _pandas_read_orc(f) + got = pd.read_orc(f) assert_eq(pdf, got) diff --git a/python/cudf/cudf/tests/test_orc.py b/python/cudf/cudf/tests/test_orc.py index 6dfb756eec9..1699c11617a 100644 --- a/python/cudf/cudf/tests/test_orc.py +++ b/python/cudf/cudf/tests/test_orc.py @@ -18,7 +18,6 @@ from cudf.io.orc import ORCWriter from cudf.testing import assert_frame_equal from cudf.testing._utils import ( - _pandas_read_orc, assert_eq, gen_rand_series, supported_numpy_dtypes, @@ -86,7 +85,7 @@ def _make_path_or_buf(src): def test_orc_reader_basic(datadir, inputfile, columns, use_index, engine): path = datadir / inputfile - expect = _pandas_read_orc(path, columns=columns) + expect = pd.read_orc(path, columns=columns) got = cudf.read_orc( path, engine=engine, columns=columns, use_index=use_index ) @@ -116,7 +115,7 @@ def test_orc_reader_local_filepath(): def test_orc_reader_filepath_or_buffer(path_or_buf, src): cols = ["int1", "long1", "float1", "double1"] - expect = _pandas_read_orc(path_or_buf("filepath"), columns=cols) + expect = pd.read_orc(path_or_buf("filepath"), columns=cols) got = cudf.read_orc(path_or_buf(src), columns=cols) assert_eq(expect, got) @@ -125,7 +124,7 @@ def test_orc_reader_filepath_or_buffer(path_or_buf, src): def test_orc_reader_trailing_nulls(datadir): path = datadir / "TestOrcFile.nulls-at-end-snappy.orc" - expect = _pandas_read_orc(path).fillna(0) + expect = pd.read_orc(path).fillna(0) got = cudf.read_orc(path).fillna(0) # PANDAS uses NaN to represent invalid data, which forces float dtype @@ -157,7 +156,7 @@ def test_orc_reader_datetimestamp(datadir, inputfile, use_index): def test_orc_reader_strings(datadir): path = datadir / "TestOrcFile.testStringAndBinaryStatistics.orc" - expect = _pandas_read_orc(path, columns=["string1"]) + expect = pd.read_orc(path, columns=["string1"]) got = cudf.read_orc(path, columns=["string1"]) assert_eq(expect, got, check_categorical=False) @@ -274,7 +273,7 @@ def test_orc_read_stripes(datadir, engine): def test_orc_read_rows(datadir, skiprows, num_rows): path = datadir / "TestOrcFile.decimal.orc" - pdf = _pandas_read_orc(path) + pdf = pd.read_orc(path) gdf = cudf.read_orc(path, skiprows=skiprows, num_rows=num_rows) # Slice rows out of the whole dataframe for comparison as PyArrow doesn't @@ -314,7 +313,7 @@ def test_orc_read_skiprows(): skiprows = 10 expected = ( - _pandas_read_orc(buff)[skiprows:].reset_index(drop=True).astype("bool") + pd.read_orc(buff)[skiprows:].reset_index(drop=True).astype("bool") ) got = cudf.read_orc(buff, skiprows=skiprows) assert_eq(expected, got) @@ -323,7 +322,7 @@ def test_orc_read_skiprows(): def test_orc_reader_uncompressed_block(datadir): path = datadir / "uncompressed_snappy.orc" - expect = _pandas_read_orc(path) + expect = pd.read_orc(path) got = cudf.read_orc(path) assert_eq(expect, got, check_categorical=False) @@ -332,7 +331,7 @@ def test_orc_reader_uncompressed_block(datadir): def test_orc_reader_nodata_block(datadir): path = datadir / "nodata.orc" - expect = _pandas_read_orc(path) + expect = pd.read_orc(path) got = cudf.read_orc(path, num_rows=1) assert_eq(expect, got, check_categorical=False) @@ -361,9 +360,9 @@ def test_orc_writer(datadir, tmpdir, reference_file, columns, compression): pdf_fname = datadir / reference_file gdf_fname = tmpdir.join("gdf.orc") - expect = 
cudf.from_pandas(_pandas_read_orc(pdf_fname, columns=columns)) + expect = cudf.from_pandas(pd.read_orc(pdf_fname, columns=columns)) expect.to_orc(gdf_fname.strpath, compression=compression) - got = cudf.from_pandas(_pandas_read_orc(gdf_fname, columns=columns)) + got = cudf.from_pandas(pd.read_orc(gdf_fname, columns=columns)) assert_frame_equal(expect, got) @@ -374,9 +373,9 @@ def test_orc_writer_statistics_frequency(datadir, tmpdir, stats_freq): pdf_fname = datadir / reference_file gdf_fname = tmpdir.join("gdf.orc") - expect = cudf.from_pandas(_pandas_read_orc(pdf_fname)) + expect = cudf.from_pandas(pd.read_orc(pdf_fname)) expect.to_orc(gdf_fname.strpath, statistics=stats_freq) - got = cudf.from_pandas(_pandas_read_orc(gdf_fname)) + got = cudf.from_pandas(pd.read_orc(gdf_fname)) assert_frame_equal(expect, got) @@ -396,7 +395,7 @@ def test_chunked_orc_writer_statistics_frequency(datadir, tmpdir, stats_freq): "float1", "double1", ] - pdf = _pandas_read_orc(pdf_fname, columns=columns) + pdf = pd.read_orc(pdf_fname, columns=columns) gdf = cudf.from_pandas(pdf) expect = pd.concat([pdf, pdf]).reset_index(drop=True) @@ -405,7 +404,7 @@ def test_chunked_orc_writer_statistics_frequency(datadir, tmpdir, stats_freq): writer.write_table(gdf) writer.close() - got = _pandas_read_orc(gdf_fname) + got = pd.read_orc(gdf_fname) assert_eq(expect, got) @@ -435,7 +434,7 @@ def test_chunked_orc_writer( pdf_fname = datadir / reference_file gdf_fname = tmpdir.join("chunked_gdf.orc") - pdf = _pandas_read_orc(pdf_fname, columns=columns) + pdf = pd.read_orc(pdf_fname, columns=columns) gdf = cudf.from_pandas(pdf) expect = pd.concat([pdf, pdf]).reset_index(drop=True) @@ -444,7 +443,7 @@ def test_chunked_orc_writer( writer.write_table(gdf) writer.close() - got = _pandas_read_orc(gdf_fname, columns=columns) + got = pd.read_orc(gdf_fname, columns=columns) assert_frame_equal(cudf.from_pandas(expect), cudf.from_pandas(got)) @@ -462,7 +461,7 @@ def test_orc_writer_strings(tmpdir, dtypes): expect = cudf.datasets.randomdata(nrows=10, dtypes=dtypes, seed=1) expect.to_orc(gdf_fname) - got = _pandas_read_orc(gdf_fname) + got = pd.read_orc(gdf_fname) assert_eq(expect, got) @@ -487,7 +486,7 @@ def test_chunked_orc_writer_strings(tmpdir, dtypes): writer.write_table(gdf) writer.close() - got = _pandas_read_orc(gdf_fname) + got = pd.read_orc(gdf_fname) assert_eq(expect, got) @@ -518,7 +517,7 @@ def test_orc_writer_sliced(tmpdir): def test_orc_reader_decimal_type(datadir, orc_file): file_path = datadir / orc_file - pdf = _pandas_read_orc(file_path) + pdf = pd.read_orc(file_path) df = cudf.read_orc(file_path) assert_eq(pdf, df) @@ -528,7 +527,7 @@ def test_orc_decimal_precision_fail(datadir): file_path = datadir / "TestOrcFile.int_decimal.precision_19.orc" # Shouldn't cause failure if decimal column is not chosen to be read. 
- pdf = _pandas_read_orc(file_path, columns=["int"]) + pdf = pd.read_orc(file_path, columns=["int"]) gdf = cudf.read_orc(file_path, columns=["int"]) assert_eq(pdf, gdf) @@ -545,7 +544,7 @@ def test_orc_decimal_precision_fail(datadir): def test_orc_reader_boolean_type(datadir, orc_file): file_path = datadir / orc_file - pdf = _pandas_read_orc(file_path) + pdf = pd.read_orc(file_path) df = cudf.read_orc(file_path).to_pandas() assert_eq(pdf, df) @@ -556,7 +555,7 @@ def test_orc_reader_tzif_timestamps(datadir): # Other timedate tests only cover "future" times path = datadir / "TestOrcFile.lima_timezone.orc" - pdf = _pandas_read_orc(path) + pdf = pd.read_orc(path) gdf = cudf.read_orc(path) assert_eq(pdf, gdf) @@ -810,7 +809,7 @@ def test_orc_write_bool_statistics(tmpdir, datadir, nrows): def test_orc_reader_gmt_timestamps(datadir): path = datadir / "TestOrcFile.gmt.orc" - pdf = _pandas_read_orc(path) + pdf = pd.read_orc(path) gdf = cudf.read_orc(path) assert_eq(pdf, gdf) @@ -837,7 +836,7 @@ def test_orc_bool_encode_fail(): okay_df.to_orc(buffer) # Also validate data - pdf = _pandas_read_orc(buffer) + pdf = pd.read_orc(buffer) assert_eq(okay_df.to_pandas(nullable=True), pdf) @@ -852,7 +851,7 @@ def test_nanoseconds_overflow(): cudf_got = cudf.read_orc(buffer) assert_eq(expected, cudf_got) - pandas_got = _pandas_read_orc(buffer) + pandas_got = pd.read_orc(buffer) assert_eq(expected, pandas_got) @@ -866,7 +865,7 @@ def test_empty_dataframe(): cudf.read_orc(buffer, columns=["a"]) got_df = cudf.read_orc(buffer) - expected_pdf = _pandas_read_orc(buffer) + expected_pdf = pd.read_orc(buffer) assert_eq(expected, got_df) assert_eq(expected_pdf, got_df) @@ -881,7 +880,7 @@ def test_empty_string_columns(data): expected = cudf.DataFrame({"string": data}, dtype="str") expected.to_orc(buffer) - expected_pdf = _pandas_read_orc(buffer) + expected_pdf = pd.read_orc(buffer) got_df = cudf.read_orc(buffer) assert_eq(expected, got_df) @@ -907,7 +906,7 @@ def test_orc_writer_decimal(tmpdir, scale, decimal_type): expected.to_orc(fname) - got = _pandas_read_orc(fname) + got = pd.read_orc(fname) assert_eq(expected.to_pandas()["dec_val"], got["dec_val"]) @@ -916,8 +915,8 @@ def test_orc_reader_multiple_files(datadir, num_rows): path = datadir / "TestOrcFile.testSnappy.orc" - df_1 = _pandas_read_orc(path) - df_2 = _pandas_read_orc(path) + df_1 = pd.read_orc(path) + df_2 = pd.read_orc(path) df = pd.concat([df_1, df_2], ignore_index=True) gdf = cudf.read_orc([path, path], num_rows=num_rows).to_pandas() @@ -943,7 +942,7 @@ def test_orc_reader_multi_file_multi_stripe(datadir): path = datadir / "TestOrcFile.testStripeLevelStats.orc" gdf = cudf.read_orc([path, path], stripes=[[0, 1], [2]]) - pdf = _pandas_read_orc(path) + pdf = pd.read_orc(path) assert_eq(pdf, gdf) @@ -1130,7 +1129,7 @@ def test_skip_rows_for_nested_types(columns, list_struct_buff): def test_pyspark_struct(datadir): path = datadir / "TestOrcFile.testPySparkStruct.orc" - pdf = _pandas_read_orc(path) + pdf = pd.read_orc(path) gdf = cudf.read_orc(path) assert_eq(pdf, gdf) @@ -1315,7 +1314,7 @@ def test_map_type_read(columns, num_rows, use_index): def test_orc_reader_decimal(datadir): path = datadir / "TestOrcFile.decimal.orc" - pdf = _pandas_read_orc(path) + pdf = pd.read_orc(path) gdf = cudf.read_orc(path) assert_eq(pdf, gdf) @@ -1326,7 +1325,7 @@ def test_orc_reader_decimal(datadir): def test_orc_timestamp_read(datadir): path = datadir / "TestOrcFile.timestamp.issue.orc" - pdf = _pandas_read_orc(path) + pdf = pd.read_orc(path) gdf = cudf.read_orc(path) 
assert_eq(pdf, gdf) @@ -1397,7 +1396,7 @@ def test_orc_writer_lists(data): buffer, stripe_size_rows=2048, row_index_stride=512 ) - pdf_out = _pandas_read_orc(buffer) + pdf_out = pd.read_orc(buffer) assert_eq(pdf_out, pdf_in) @@ -1419,7 +1418,7 @@ def test_chunked_orc_writer_lists(): writer.write_table(gdf) writer.close() - got = _pandas_read_orc(buffer) + got = pd.read_orc(buffer) assert_eq(expect, got) @@ -1427,9 +1426,9 @@ def test_writer_timestamp_stream_size(datadir, tmpdir): pdf_fname = datadir / "TestOrcFile.largeTimestamps.orc" gdf_fname = tmpdir.join("gdf.orc") - expect = _pandas_read_orc(pdf_fname) + expect = pd.read_orc(pdf_fname) cudf.from_pandas(expect).to_orc(gdf_fname.strpath) - got = _pandas_read_orc(gdf_fname) + got = pd.read_orc(gdf_fname) assert_eq(expect, got) @@ -1502,7 +1501,7 @@ def test_orc_writer_lists_empty_rg(data): df = cudf.read_orc(buffer) assert_eq(df, cudf_in) - pdf_out = _pandas_read_orc(buffer) + pdf_out = pd.read_orc(buffer) assert_eq(pdf_in, pdf_out) @@ -1666,7 +1665,7 @@ def test_orc_writer_nvcomp(compression): except RuntimeError: pytest.mark.xfail(reason="Newer nvCOMP version is required") else: - got = _pandas_read_orc(buff) + got = pd.read_orc(buff) assert_eq(expected, got) @@ -1692,7 +1691,7 @@ def test_orc_columns_and_index_param(index_obj, index, columns): ) df.to_orc(buffer, index=index) - expected = _pandas_read_orc(buffer, columns=columns) + expected = pd.read_orc(buffer, columns=columns) got = cudf.read_orc(buffer, columns=columns) if columns: @@ -1749,7 +1748,7 @@ def test_orc_writer_cols_as_map_type(df_data, cols_as_map_type, expected_data): buffer = BytesIO() df.to_orc(buffer, cols_as_map_type=cols_as_map_type) - got = _pandas_read_orc(buffer) + got = pd.read_orc(buffer) expected = pd.DataFrame(expected_data) assert_eq(got, expected) @@ -1795,14 +1794,14 @@ def test_orc_writer_negative_timestamp(negative_timestamp_df): buffer = BytesIO() negative_timestamp_df.to_orc(buffer) - assert_eq(negative_timestamp_df, _pandas_read_orc(buffer)) + assert_eq(negative_timestamp_df, pd.read_orc(buffer)) assert_eq(negative_timestamp_df, pyarrow.orc.ORCFile(buffer).read()) def test_orc_reader_apache_negative_timestamp(datadir): path = datadir / "TestOrcFile.apache_timestamp.orc" - pdf = _pandas_read_orc(path) + pdf = pd.read_orc(path) gdf = cudf.read_orc(path) assert_eq(pdf, gdf) diff --git a/python/cudf/cudf/tests/test_s3.py b/python/cudf/cudf/tests/test_s3.py index 7cca776ae64..de3bba25223 100644 --- a/python/cudf/cudf/tests/test_s3.py +++ b/python/cudf/cudf/tests/test_s3.py @@ -12,7 +12,7 @@ from fsspec.core import get_fs_token_paths import cudf -from cudf.testing._utils import _pandas_read_orc, assert_eq +from cudf.testing._utils import assert_eq moto = pytest.importorskip("moto", minversion="3.1.6") boto3 = pytest.importorskip("boto3") @@ -440,7 +440,7 @@ def test_read_orc(s3_base, s3so, datadir, use_python_file_object, columns): source_file = str(datadir / "orc" / "TestOrcFile.testSnappy.orc") fname = "test_orc_reader.orc" bucket = "orc" - expect = _pandas_read_orc(source_file) + expect = pd.read_orc(source_file) with open(source_file, "rb") as f: buffer = f.read() @@ -463,7 +463,7 @@ def test_read_orc_arrow_nativefile(s3_base, s3so, datadir, columns): source_file = str(datadir / "orc" / "TestOrcFile.testSnappy.orc") fname = "test_orc_reader.orc" bucket = "orc" - expect = _pandas_read_orc(source_file) + expect = pd.read_orc(source_file) with open(source_file, "rb") as f: buffer = f.read() @@ -489,7 +489,7 @@ def test_write_orc(s3_base, s3so, pdf): 
assert s3fs.exists(f"s3://{bucket}/{fname}") with s3fs.open(f"s3://{bucket}/{fname}") as f: - got = _pandas_read_orc(f) + got = pd.read_orc(f) assert_eq(pdf, got)
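
Note (not part of the patches above): a minimal standalone sketch of the compatibility shim that PATCH 1/3 adds to cudf/testing/_utils.py and PATCH 3/3 removes again once pandas < 1.0 support is dropped. It only restates the helper so the version-gated fallback can be read in isolation; the round-trip at the bottom is hypothetical, with the file name "example.orc" and the tiny DataFrame chosen purely for illustration.

    import cudf
    import pandas as pd
    import pyarrow as pa
    import pyarrow.orc  # noqa: F401  (exposes pa.orc on older pyarrow releases)
    from packaging import version

    from cudf.testing._utils import assert_eq

    # pandas gained pd.read_orc (backed by pyarrow) in 1.0.
    PANDAS_GE_100 = version.parse(pd.__version__) >= version.parse("1.0")


    def _pandas_read_orc(fname, columns=None):
        """Read an ORC file into a pandas DataFrame, with a pyarrow fallback."""
        if PANDAS_GE_100:
            return pd.read_orc(fname, columns=columns)
        # Older pandas: go through pyarrow's ORC reader and convert.
        return pa.orc.ORCFile(fname).read(columns=columns).to_pandas()


    # Hypothetical round-trip in the style of the tests above: write with cudf,
    # read back on the pandas side, and compare.
    gdf = cudf.DataFrame({"a": [1, 2, 3]})
    gdf.to_orc("example.orc")  # file name is illustrative only
    assert_eq(gdf, _pandas_read_orc("example.orc"))

Once PATCH 3/3 raises the floor to pandas >= 1.0, the branch collapses and the tests call pd.read_orc directly, which is exactly the follow-up change in that commit.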