TEST-#2294: Cover by tests Iteration parameters of read_csv #2477

Merged: 1 commit, Dec 2, 2020
modin/pandas/test/test_io.py: 63 changes (32 additions, 31 deletions)
@@ -759,9 +759,9 @@ def test_read_csv_nans_handling(
         skip_blank_lines,
     ):
         eval_io(
-            filepath_or_buffer=pytest.csvs_names["test_read_csv_nans"],
             fn_name="read_csv",
             # read_csv kwargs
+            filepath_or_buffer=pytest.csvs_names["test_read_csv_nans"],
             na_values=na_values,
             keep_default_na=keep_default_na,
             na_filter=na_filter,
@@ -829,6 +829,37 @@ def test_read_csv_datetime(
             cache_dates=cache_dates,
         )
 
+    # Iteration tests
+    @pytest.mark.parametrize("iterator", [True, False])
+    def test_read_csv_iteration(self, make_csv_file, iterator):
+        filename = pytest.csvs_names["test_read_csv_regular"]
+
+        # Tests __next__ and correctness of reader as an iterator
+        # Use larger chunksize to read through file quicker
+        rdf_reader = pd.read_csv(filename, chunksize=500, iterator=iterator)
+        pd_reader = pandas.read_csv(filename, chunksize=500, iterator=iterator)
+
+        for modin_df, pd_df in zip(rdf_reader, pd_reader):
+            df_equals(modin_df, pd_df)
+
+        # Tests that get_chunk works correctly
+        rdf_reader = pd.read_csv(filename, chunksize=1, iterator=iterator)
+        pd_reader = pandas.read_csv(filename, chunksize=1, iterator=iterator)
+
+        modin_df = rdf_reader.get_chunk(1)
+        pd_df = pd_reader.get_chunk(1)
+
+        df_equals(modin_df, pd_df)
+
+        # Tests that read works correctly
+        rdf_reader = pd.read_csv(filename, chunksize=1, iterator=iterator)
+        pd_reader = pandas.read_csv(filename, chunksize=1, iterator=iterator)
+
+        modin_df = rdf_reader.read()
+        pd_df = pd_reader.read()
+
+        df_equals(modin_df, pd_df)
+
 
 def test_from_parquet(make_parquet_file):
     make_parquet_file(NROWS)
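For context: passing chunksize=N to read_csv makes pandas return a TextFileReader instead of a DataFrame, and iterator=True does the same without requiring a chunk size; the reader supports iteration via __next__, get_chunk(n), and read(). These are the three entry points the new test compares between Modin and pandas. A minimal pandas-only sketch of that API, using a hypothetical sample.csv for illustration:

import pandas

# chunksize turns read_csv into a lazy reader (TextFileReader).
reader = pandas.read_csv("sample.csv", chunksize=500)
first_rows = reader.get_chunk(1)   # get_chunk(n): read the next n rows
next_chunk = next(reader)          # __next__: read the next default-sized chunk
remainder = reader.read()          # read(): read everything that is left

# iterator=True alone also returns a reader rather than a DataFrame.
reader = pandas.read_csv("sample.csv", iterator=True)
whole_frame = reader.read()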
@@ -1459,36 +1490,6 @@ def test_from_csv_default(make_csv_file):
     df_equals(modin_df, pandas_df)
 
 
-def test_from_csv_chunksize(make_csv_file):
-    make_csv_file()
-
-    # Tests __next__ and correctness of reader as an iterator
-    # Use larger chunksize to read through file quicker
-    rdf_reader = pd.read_csv(TEST_CSV_FILENAME, chunksize=500)
-    pd_reader = pandas.read_csv(TEST_CSV_FILENAME, chunksize=500)
-
-    for modin_df, pd_df in zip(rdf_reader, pd_reader):
-        df_equals(modin_df, pd_df)
-
-    # Tests that get_chunk works correctly
-    rdf_reader = pd.read_csv(TEST_CSV_FILENAME, chunksize=1)
-    pd_reader = pandas.read_csv(TEST_CSV_FILENAME, chunksize=1)
-
-    modin_df = rdf_reader.get_chunk(1)
-    pd_df = pd_reader.get_chunk(1)
-
-    df_equals(modin_df, pd_df)
-
-    # Tests that read works correctly
-    rdf_reader = pd.read_csv(TEST_CSV_FILENAME, chunksize=1)
-    pd_reader = pandas.read_csv(TEST_CSV_FILENAME, chunksize=1)
-
-    modin_df = rdf_reader.read()
-    pd_df = pd_reader.read()
-
-    df_equals(modin_df, pd_df)
-
-
 @pytest.mark.parametrize("names", [list("XYZ"), None])
 @pytest.mark.parametrize("skiprows", [1, 2, 3, 4, None])
 def test_from_csv_skiprows_names(names, skiprows):
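The deleted module-level test_from_csv_chunksize is superseded by the class-based test_read_csv_iteration above, which exercises the same reader methods and additionally parametrizes iterator. A standalone sketch of the same Modin-vs-pandas comparison, assuming an existing data.csv; the suite's df_equals helper is approximated here with Modin's internal _to_pandas() plus assert_frame_equal:

import modin.pandas as pd
import pandas

modin_reader = pd.read_csv("data.csv", chunksize=500)
pandas_reader = pandas.read_csv("data.csv", chunksize=500)

# Compare the two readers chunk by chunk; _to_pandas() materializes a
# Modin DataFrame as a plain pandas DataFrame for the comparison.
for modin_chunk, pandas_chunk in zip(modin_reader, pandas_reader):
    pandas.testing.assert_frame_equal(modin_chunk._to_pandas(), pandas_chunk)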