diff --git a/python/cudf/cudf/_lib/cpp/io/parquet.pxd b/python/cudf/cudf/_lib/cpp/io/parquet.pxd
index d152503e82a..88850ff6687 100644
--- a/python/cudf/cudf/_lib/cpp/io/parquet.pxd
+++ b/python/cudf/cudf/_lib/cpp/io/parquet.pxd
@@ -20,8 +20,6 @@ cdef extern from "cudf/io/parquet.hpp" namespace "cudf::io" nogil:
         data_type get_timestamp_type() except +
         bool is_enabled_convert_strings_to_categories() except +
         bool is_enabled_use_pandas_metadata() except +
-        size_type get_skip_rows() except +
-        size_type get_num_rows() except +

         # setter

@@ -29,8 +27,6 @@ cdef extern from "cudf/io/parquet.hpp" namespace "cudf::io" nogil:
         void set_row_groups(vector[vector[size_type]] row_grp) except +
         void enable_convert_strings_to_categories(bool val) except +
         void enable_use_pandas_metadata(bool val) except +
-        void set_skip_rows(size_type val) except +
-        void set_num_rows(size_type val) except +
         void set_timestamp_type(data_type type) except +

         @staticmethod
@@ -55,8 +51,6 @@ cdef extern from "cudf/io/parquet.hpp" namespace "cudf::io" nogil:
         parquet_reader_options_builder& use_pandas_metadata(
             bool val
         ) except +
-        parquet_reader_options_builder& skip_rows(size_type val) except +
-        parquet_reader_options_builder& num_rows(size_type val) except +
         parquet_reader_options_builder& timestamp_type(
             data_type type
         ) except +
diff --git a/python/cudf/cudf/_lib/parquet.pyx b/python/cudf/cudf/_lib/parquet.pyx
index c25360b307d..1be3b953687 100644
--- a/python/cudf/cudf/_lib/parquet.pyx
+++ b/python/cudf/cudf/_lib/parquet.pyx
@@ -125,7 +125,7 @@ def _parse_metadata(meta):


 cpdef read_parquet(filepaths_or_buffers, columns=None, row_groups=None,
-                   skiprows=None, num_rows=None, strings_to_categorical=False,
+                   strings_to_categorical=False,
                    use_pandas_metadata=True):
     """
     Cython function to call into libcudf API, see `read_parquet`.
@@ -151,8 +151,7 @@ cpdef read_parquet(filepaths_or_buffers, columns=None, row_groups=None,

     cdef bool cpp_strings_to_categorical = strings_to_categorical
     cdef bool cpp_use_pandas_metadata = use_pandas_metadata
-    cdef size_type cpp_skiprows = skiprows if skiprows is not None else 0
-    cdef size_type cpp_num_rows = num_rows if num_rows is not None else -1
+
     cdef vector[vector[size_type]] cpp_row_groups
     cdef data_type cpp_timestamp_type = cudf_types.data_type(
         cudf_types.type_id.EMPTY
@@ -168,8 +167,6 @@ cpdef read_parquet(filepaths_or_buffers, columns=None, row_groups=None,
         .row_groups(cpp_row_groups)
         .convert_strings_to_categories(cpp_strings_to_categorical)
         .use_pandas_metadata(cpp_use_pandas_metadata)
-        .skip_rows(cpp_skiprows)
-        .num_rows(cpp_num_rows)
         .timestamp_type(cpp_timestamp_type)
         .build()
     )
@@ -291,10 +288,7 @@ cpdef read_parquet(filepaths_or_buffers, columns=None, row_groups=None,
                 step=range_index_meta['step'],
                 name=range_index_meta['name']
             )
-            if skiprows is not None:
-                idx = idx[skiprows:]
-            if num_rows is not None:
-                idx = idx[:num_rows]
+
             df._index = idx
         elif set(index_col).issubset(column_names):
             index_data = df[index_col]
diff --git a/python/cudf/cudf/io/parquet.py b/python/cudf/cudf/io/parquet.py
index 5a181dc076c..1812155d894 100644
--- a/python/cudf/cudf/io/parquet.py
+++ b/python/cudf/cudf/io/parquet.py
@@ -359,8 +359,6 @@ def read_parquet(
     columns=None,
     filters=None,
     row_groups=None,
-    skiprows=None,
-    num_rows=None,
     strings_to_categorical=False,
     use_pandas_metadata=True,
     use_python_file_object=True,
@@ -371,18 +369,6 @@
 ):
     """{docstring}"""

-    if skiprows is not None:
-        warnings.warn(
-            "skiprows is deprecated and will be removed.",
-            FutureWarning,
-        )
-
-    if num_rows is not None:
-        warnings.warn(
-            "num_rows is deprecated and will be removed.",
-            FutureWarning,
-        )
-
     # Do not allow the user to set file-opening options
     # when `use_python_file_object=False` is specified
     if use_python_file_object is False:
@@ -485,8 +471,6 @@
         *args,
         columns=columns,
         row_groups=row_groups,
-        skiprows=skiprows,
-        num_rows=num_rows,
         strings_to_categorical=strings_to_categorical,
         use_pandas_metadata=use_pandas_metadata,
         partition_keys=partition_keys,
@@ -575,8 +559,6 @@
     engine,
     columns=None,
     row_groups=None,
-    skiprows=None,
-    num_rows=None,
     strings_to_categorical=None,
     use_pandas_metadata=None,
     *args,
@@ -589,8 +571,6 @@
             filepaths_or_buffers,
             columns=columns,
             row_groups=row_groups,
-            skiprows=skiprows,
-            num_rows=num_rows,
             strings_to_categorical=strings_to_categorical,
             use_pandas_metadata=use_pandas_metadata,
         )
diff --git a/python/cudf/cudf/tests/test_parquet.py b/python/cudf/cudf/tests/test_parquet.py
index 973f8c75553..326c117585b 100644
--- a/python/cudf/cudf/tests/test_parquet.py
+++ b/python/cudf/cudf/tests/test_parquet.py
@@ -618,30 +618,6 @@ def test_parquet_read_row_groups_non_contiguous(tmpdir, pdf, row_group_size):
     assert_eq(ref_df, gdf)


-@pytest.mark.filterwarnings(
-    "ignore:skiprows is deprecated and will be removed."
-)
-@pytest.mark.filterwarnings(
-    "ignore:num_rows is deprecated and will be removed."
-)
-@pytest.mark.parametrize("row_group_size", [1, 4, 33])
-def test_parquet_read_rows(tmpdir, pdf, row_group_size):
-    if len(pdf) > 100:
-        pytest.skip("Skipping long setup test")
-
-    fname = tmpdir.join("row_group.parquet")
-    pdf.to_parquet(fname, compression="None", row_group_size=row_group_size)
-
-    total_rows, row_groups, col_names = cudf.io.read_parquet_metadata(fname)
-
-    num_rows = total_rows // 4
-    skiprows = (total_rows - num_rows) // 2
-    gdf = cudf.read_parquet(fname, skiprows=skiprows, num_rows=num_rows)
-
-    for row in range(num_rows):
-        assert gdf["col_int32"].iloc[row] == row + skiprows
-
-
 def test_parquet_reader_spark_timestamps(datadir):
     fname = datadir / "spark_timestamp.snappy.parquet"

@@ -708,36 +684,6 @@ def test_parquet_reader_invalids(tmpdir):
     assert_eq(expect, got)


-@pytest.mark.filterwarnings(
-    "ignore:skiprows is deprecated and will be removed."
-)
-@pytest.mark.filterwarnings(
-    "ignore:num_rows is deprecated and will be removed."
-)
-def test_parquet_chunked_skiprows(tmpdir):
-    processed = 0
-    batch = 10000
-    n = 100000
-    out_df = cudf.DataFrame(
-        {
-            "y": np.arange(n),
-            "z": np.random.choice(range(1000000, 2000000), n, replace=False),
-            "s": np.random.choice(range(20), n, replace=True),
-            "a": np.round(np.random.uniform(1, 5000, n), 2),
-        }
-    )
-
-    fname = tmpdir.join("skiprows.parquet")
-    out_df.to_pandas().to_parquet(fname)
-
-    for i in range(10):
-        chunk = cudf.read_parquet(fname, skiprows=processed, num_rows=batch)
-        expect = out_df[processed : processed + batch].reset_index(drop=True)
-        assert_eq(chunk.reset_index(drop=True), expect)
-        processed += batch
-        del chunk
-
-
 def test_parquet_reader_filenotfound(tmpdir):
     with pytest.raises(FileNotFoundError):
         cudf.read_parquet("TestMissingFile.parquet")
@@ -987,20 +933,14 @@ def L(list_size, first_val):
     ]


-def list_gen(
-    gen, skiprows, num_rows, lists_per_row, list_size, include_validity=False
-):
+def list_gen(gen, num_rows, lists_per_row, list_size, include_validity=False):
     """
     Generate a list column based on input parameters.

     Args:
         gen: A callable which generates an individual leaf element
             based on an absolute index.
-        skiprows : Generate the column as if it had started at 'skiprows'
-            instead of 0. The intent here is to emulate the skiprows
-            parameter of the parquet reader.
-        num_rows : Number of rows to generate. Again, this is to emulate the
-            'num_rows' parameter of the parquet reader.
+        num_rows : Number of rows to generate.
         lists_per_row : Number of lists to generate per row.
         list_size : Size of each generated list.
         include_validity : Whether or not to include nulls as part of the
@@ -1028,16 +968,16 @@ def R(first_val, lists_per_row, list_size):
     return [
         (
             R(
-                lists_per_row * list_size * (i + skiprows),
+                lists_per_row * list_size * i,
                 lists_per_row,
                 list_size,
             )
-            if (i + skiprows) % 2 == 0
+            if i % 2 == 0
             else None
         )
         if include_validity
         else R(
-            lists_per_row * list_size * (i + skiprows),
+            lists_per_row * list_size * i,
             lists_per_row,
             list_size,
         )
@@ -1046,7 +986,7 @@


 def test_parquet_reader_list_large(tmpdir):
-    expect = pd.DataFrame({"a": list_gen(int_gen, 0, 256, 80, 50)})
+    expect = pd.DataFrame({"a": list_gen(int_gen, 256, 80, 50)})
     fname = tmpdir.join("test_parquet_reader_list_large.parquet")
     expect.to_parquet(fname)
     assert os.path.exists(fname)
@@ -1056,7 +996,7 @@ def test_parquet_reader_list_large(tmpdir):

 def test_parquet_reader_list_validity(tmpdir):
     expect = pd.DataFrame(
-        {"a": list_gen(int_gen, 0, 256, 80, 50, include_validity=True)}
+        {"a": list_gen(int_gen, 256, 80, 50, include_validity=True)}
     )
     fname = tmpdir.join("test_parquet_reader_list_validity.parquet")
     expect.to_parquet(fname)
@@ -1068,10 +1008,10 @@ def test_parquet_reader_list_validity(tmpdir):
 def test_parquet_reader_list_large_mixed(tmpdir):
     expect = pd.DataFrame(
         {
-            "a": list_gen(string_gen, 0, 128, 80, 50),
-            "b": list_gen(int_gen, 0, 128, 80, 50),
-            "c": list_gen(int_gen, 0, 128, 80, 50, include_validity=True),
-            "d": list_gen(string_gen, 0, 128, 80, 50, include_validity=True),
+            "a": list_gen(string_gen, 128, 80, 50),
+            "b": list_gen(int_gen, 128, 80, 50),
+            "c": list_gen(int_gen, 128, 80, 50, include_validity=True),
+            "d": list_gen(string_gen, 128, 80, 50, include_validity=True),
         }
     )
     fname = tmpdir.join("test_parquet_reader_list_large_mixed.parquet")
@@ -1119,7 +1059,7 @@ def test_parquet_reader_list_large_multi_rowgroup_nulls(tmpdir):
     row_group_size = 1000

     expect = cudf.DataFrame(
-        {"a": list_gen(int_gen, 0, num_rows, 3, 2, include_validity=True)}
+        {"a": list_gen(int_gen, num_rows, 3, 2, include_validity=True)}
     )

     # round trip the dataframe to/from parquet
@@ -1132,61 +1072,6 @@ def test_parquet_reader_list_large_multi_rowgroup_nulls(tmpdir):
     assert_eq(expect, got)


-@pytest.mark.filterwarnings(
-    "ignore:skiprows is deprecated and will be removed."
-)
-@pytest.mark.parametrize("skip", [0, 1, 5, 10])
-def test_parquet_reader_list_skiprows(skip, tmpdir):
-    num_rows = 10
-    src = pd.DataFrame(
-        {
-            "a": list_gen(int_gen, 0, num_rows, 80, 50),
-            "b": list_gen(string_gen, 0, num_rows, 80, 50),
-            "c": list_gen(int_gen, 0, num_rows, 80, 50, include_validity=True),
-        }
-    )
-    fname = tmpdir.join("test_parquet_reader_list_skiprows.parquet")
-    src.to_parquet(fname)
-    assert os.path.exists(fname)
-
-    expect = src.iloc[skip:]
-    got = cudf.read_parquet(fname, skiprows=skip)
-    if expect.empty:
-        assert_eq(expect, got)
-    else:
-        assert pa.Table.from_pandas(expect).equals(got.to_arrow())
-
-
-@pytest.mark.filterwarnings(
-    "ignore:skiprows is deprecated and will be removed."
-)
-@pytest.mark.filterwarnings(
-    "ignore:num_rows is deprecated and will be removed."
-)
-@pytest.mark.parametrize("skip", [0, 1, 5, 10])
-def test_parquet_reader_list_num_rows(skip, tmpdir):
-    num_rows = 20
-    src = pd.DataFrame(
-        {
-            "a": list_gen(int_gen, 0, num_rows, 80, 50),
-            "b": list_gen(string_gen, 0, num_rows, 80, 50),
-            "c": list_gen(int_gen, 0, num_rows, 80, 50, include_validity=True),
-            "d": list_gen(
-                string_gen, 0, num_rows, 80, 50, include_validity=True
-            ),
-        }
-    )
-    fname = tmpdir.join("test_parquet_reader_list_num_rows.parquet")
-    src.to_parquet(fname)
-    assert os.path.exists(fname)
-
-    # make sure to leave a few rows at the end that we don't read
-    rows_to_read = min(3, (num_rows - skip) - 5)
-    expect = src.iloc[skip:].head(rows_to_read)
-    got = cudf.read_parquet(fname, skiprows=skip, num_rows=rows_to_read)
-    assert pa.Table.from_pandas(expect).equals(got.to_arrow())
-
-
 def struct_gen(gen, skip_rows, num_rows, include_validity=False):
     """
     Generate a struct column based on input parameters.
@@ -2069,7 +1954,7 @@ def test_parquet_writer_list_basic(tmpdir):


 def test_parquet_writer_list_large(tmpdir):
-    expect = pd.DataFrame({"a": list_gen(int_gen, 0, 256, 80, 50)})
+    expect = pd.DataFrame({"a": list_gen(int_gen, 256, 80, 50)})
     fname = tmpdir.join("test_parquet_writer_list_large.parquet")
     gdf = cudf.from_pandas(expect)

@@ -2084,10 +1969,10 @@ def test_parquet_writer_list_large(tmpdir):
 def test_parquet_writer_list_large_mixed(tmpdir):
     expect = pd.DataFrame(
         {
-            "a": list_gen(string_gen, 0, 128, 80, 50),
-            "b": list_gen(int_gen, 0, 128, 80, 50),
-            "c": list_gen(int_gen, 0, 128, 80, 50, include_validity=True),
-            "d": list_gen(string_gen, 0, 128, 80, 50, include_validity=True),
+            "a": list_gen(string_gen, 128, 80, 50),
+            "b": list_gen(int_gen, 128, 80, 50),
+            "c": list_gen(int_gen, 128, 80, 50, include_validity=True),
+            "d": list_gen(string_gen, 128, 80, 50, include_validity=True),
         }
     )
     fname = tmpdir.join("test_parquet_writer_list_large_mixed.parquet")
@@ -2103,18 +1988,18 @@ def test_parquet_writer_list_large_mixed(tmpdir):
 def test_parquet_writer_list_chunked(tmpdir):
     table1 = cudf.DataFrame(
         {
-            "a": list_gen(string_gen, 0, 128, 80, 50),
-            "b": list_gen(int_gen, 0, 128, 80, 50),
-            "c": list_gen(int_gen, 0, 128, 80, 50, include_validity=True),
-            "d": list_gen(string_gen, 0, 128, 80, 50, include_validity=True),
+            "a": list_gen(string_gen, 128, 80, 50),
+            "b": list_gen(int_gen, 128, 80, 50),
+            "c": list_gen(int_gen, 128, 80, 50, include_validity=True),
+            "d": list_gen(string_gen, 128, 80, 50, include_validity=True),
         }
     )
     table2 = cudf.DataFrame(
         {
-            "a": list_gen(string_gen, 0, 128, 80, 50),
-            "b": list_gen(int_gen, 0, 128, 80, 50),
-            "c": list_gen(int_gen, 0, 128, 80, 50, include_validity=True),
-            "d": list_gen(string_gen, 0, 128, 80, 50, include_validity=True),
+            "a": list_gen(string_gen, 128, 80, 50),
+            "b": list_gen(int_gen, 128, 80, 50),
+            "c": list_gen(int_gen, 128, 80, 50, include_validity=True),
+            "d": list_gen(string_gen, 128, 80, 50, include_validity=True),
         }
     )
     fname = tmpdir.join("test_parquet_writer_list_chunked.parquet")
@@ -2295,10 +2180,10 @@ def test_parquet_writer_statistics(tmpdir, pdf, add_nulls):
 def test_parquet_writer_list_statistics(tmpdir):
     df = pd.DataFrame(
         {
-            "a": list_gen(string_gen, 0, 128, 80, 50),
-            "b": list_gen(int_gen, 0, 128, 80, 50),
-            "c": list_gen(int_gen, 0, 128, 80, 50, include_validity=True),
-            "d": list_gen(string_gen, 0, 128, 80, 50, include_validity=True),
+            "a": list_gen(string_gen, 128, 80, 50),
+            "b": list_gen(int_gen, 128, 80, 50),
+            "c": list_gen(int_gen, 128, 80, 50, include_validity=True),
+            "d": list_gen(string_gen, 128, 80, 50, include_validity=True),
         }
     )
     fname = tmpdir.join("test_parquet_writer_list_statistics.parquet")
diff --git a/python/cudf/cudf/utils/ioutils.py b/python/cudf/cudf/utils/ioutils.py
index d3c41de842a..f915da5fe69 100644
--- a/python/cudf/cudf/utils/ioutils.py
+++ b/python/cudf/cudf/utils/ioutils.py
@@ -150,10 +150,6 @@
     If not None, specifies, for each input file, which row groups to read.
     If reading multiple inputs, a list of lists should be passed, one list
     for each input.
-skiprows : int, default None
-    If not None, the number of rows to skip from the start of the file.
-num_rows : int, default None
-    If not None, the total number of rows to read.
 strings_to_categorical : boolean, default False
     If True, return string columns as GDF_CATEGORY dtype; if False, return
     a as GDF_STRING dtype.
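
Migration note: with skiprows/num_rows removed, the same row window can be selected after the read. A minimal sketch, assuming a hypothetical file name and window bounds (neither is part of this patch); for large files, the row_groups argument can limit how much is read before slicing:

    import cudf

    # Before this change: cudf.read_parquet("example.parquet", skiprows=1000, num_rows=500)
    # After: read the file (optionally restricted via row_groups), then slice.
    df = cudf.read_parquet("example.parquet")  # "example.parquet" is illustrative
    window = df.iloc[1000:1500].reset_index(drop=True)  # rows 1000..1499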