diff --git a/tests/io_test.py b/tests/io_test.py
index c1c9cfb7ed..30d0ef9ad2 100644
--- a/tests/io_test.py
+++ b/tests/io_test.py
@@ -317,6 +317,31 @@ def test_edge_case_read_write(self, par_test_base_tmp, dtype, comp):
         else:
             assert (np_edge_case == pq_arr.to_ndarray()).all()
 
+    @pytest.mark.parametrize("prob_size", pytest.prob_size)
+    def test_large_parquet_io(self, par_test_base_tmp, prob_size):
+        with tempfile.TemporaryDirectory(dir=par_test_base_tmp) as tmp_dirname:
+            filename = f"{tmp_dirname}/pq_test_large_parquet"
+            # A problem had been detected with parquet files of > 2**21 entries
+            size = max(prob_size, 2**21 + 8)
+            bool_array = np.array((size // 2) * [True, False]).tolist()
+            flt_array = np.arange(size).astype(np.float64).tolist()
+            int_array = np.arange(size).astype(np.int64).tolist()
+            str_array = np.array(["a" + str(i) for i in np.arange(size)]).tolist()
+            arrays = [bool_array, int_array, flt_array, str_array]
+            tuples = list(zip(*arrays))
+            names = ['first', 'second', 'third', 'fourth']
+            index = pd.MultiIndex.from_tuples(tuples, names=names)
+            s = pd.Series(np.random.randn(size), index=index)
+            df = s.to_frame()
+            df.to_parquet(filename)
+            ak_df = ak.DataFrame(ak.read_parquet(filename))
+            # This check is on all of the random numbers generated in s
+            assert np.all(ak_df.to_pandas().values[:, 0] == s.values)
+            # This check is on all of the elements of the MultiIndex
+            for name in names:
+                assert np.all(df.index.get_level_values(name).to_numpy() == ak_df[name].to_ndarray())
+
+
     @pytest.mark.parametrize("dtype", NUMERIC_AND_STR_TYPES)
     def test_get_datasets(self, par_test_base_tmp, dtype):
         ak_arr = make_ak_arrays(10, dtype)