Skip to content

Commit

Permalink
TEST-modin-project#2290: add usage of TestDatasetSize
Browse files: browse the repository at this point in the history
Signed-off-by: Alexander Myskov <[email protected]>
  • Loading branch information
amyskov committed Dec 1, 2020
1 parent 2899ff1 commit e76b657
Showing 1 changed file with 33 additions and 29 deletions.
62 changes: 33 additions & 29 deletions modin/pandas/test/test_io.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -16,6 +16,7 @@
import pandas
from pandas.errors import ParserWarning
from collections import OrderedDict
from modin.config import TestDatasetSize
from modin.utils import to_pandas
from modin.pandas.utils import from_arrow
from pathlib import Path
Expand Down Expand Up @@ -65,9 +66,15 @@
TEST_SAS_FILENAME = os.getcwd() + "/data/test1.sas7bdat"
TEST_FWF_FILENAME = "test_fwf.txt"
TEST_GBQ_FILENAME = "test_gbq."
SMALL_ROW_SIZE = 64
READ_CSV_SHARED_DATA_FILE = "read_csv_shared_data.txt"

DATASET_SIZE_DICT = {
"Small": 64,
"Normal": 2000,
"Big": 20000,
}

# Number of rows in the test file
NROWS = DATASET_SIZE_DICT.get(TestDatasetSize.get(), DATASET_SIZE_DICT["Small"])

if not os.path.exists(IO_OPS_DATA_DIR):
os.mkdir(IO_OPS_DATA_DIR)
Expand All @@ -82,7 +89,7 @@ def make_parquet_file():
"""

def _make_parquet_file(
row_size=SMALL_ROW_SIZE, force=False, directory=False, partitioned_columns=[]
row_size=NROWS, force=False, directory=False, partitioned_columns=[]
):
"""Helper function to generate parquet files/directories.
Expand Down Expand Up @@ -168,7 +175,7 @@ def teardown_test_file(test_path):
def _make_csv_file(filenames):
def _csv_file_maker(
filename=TEST_CSV_FILENAME,
row_size=SMALL_ROW_SIZE,
row_size=NROWS,
force=True,
delimiter=",",
encoding=None,
Expand Down Expand Up @@ -300,18 +307,15 @@ def make_csv_file():


@pytest.fixture(scope="class")
def TestReadCSVFixture(worker_id):
def TestReadCSVFixture():
filenames = []
files_ids = [
"test_read_csv_regular",
"test_read_csv_blank_lines",
"test_read_csv_yes_no",
]
# each xdist worker spawned in separate process with separate namespace and dataset
pytest.csvs_names = {
file_id: get_unique_filename(file_id, debug_mode=True, suffix=worker_id)
for file_id in files_ids
}
pytest.csvs_names = {file_id: get_unique_filename() for file_id in files_ids}
# test_read_csv_col_handling, test_read_csv_parsing
_make_csv_file(filenames)(
filename=pytest.csvs_names["test_read_csv_regular"],
Expand Down Expand Up @@ -807,47 +811,47 @@ def test_read_csv_datetime(


def test_from_parquet(make_parquet_file):
make_parquet_file(SMALL_ROW_SIZE)
make_parquet_file(NROWS)

pandas_df = pandas.read_parquet(TEST_PARQUET_FILENAME)
modin_df = pd.read_parquet(TEST_PARQUET_FILENAME)
df_equals(modin_df, pandas_df)


def test_from_parquet_with_columns(make_parquet_file):
make_parquet_file(SMALL_ROW_SIZE)
make_parquet_file(NROWS)

pandas_df = pandas.read_parquet(TEST_PARQUET_FILENAME, columns=["col1"])
modin_df = pd.read_parquet(TEST_PARQUET_FILENAME, columns=["col1"])
df_equals(modin_df, pandas_df)


def test_from_parquet_partition(make_parquet_file):
make_parquet_file(SMALL_ROW_SIZE, directory=True)
make_parquet_file(NROWS, directory=True)

pandas_df = pandas.read_parquet(TEST_PARQUET_FILENAME)
modin_df = pd.read_parquet(TEST_PARQUET_FILENAME)
df_equals(modin_df, pandas_df)


def test_from_parquet_partition_with_columns(make_parquet_file):
make_parquet_file(SMALL_ROW_SIZE, directory=True)
make_parquet_file(NROWS, directory=True)

pandas_df = pandas.read_parquet(TEST_PARQUET_FILENAME, columns=["col1"])
modin_df = pd.read_parquet(TEST_PARQUET_FILENAME, columns=["col1"])
df_equals(modin_df, pandas_df)


def test_from_parquet_partitioned_columns(make_parquet_file):
make_parquet_file(SMALL_ROW_SIZE, partitioned_columns=["col1"])
make_parquet_file(NROWS, partitioned_columns=["col1"])

pandas_df = pandas.read_parquet(TEST_PARQUET_FILENAME)
modin_df = pd.read_parquet(TEST_PARQUET_FILENAME)
df_equals(modin_df, pandas_df)


def test_from_parquet_partitioned_columns_with_columns(make_parquet_file):
make_parquet_file(SMALL_ROW_SIZE, partitioned_columns=["col1"])
make_parquet_file(NROWS, partitioned_columns=["col1"])

pandas_df = pandas.read_parquet(TEST_PARQUET_FILENAME, columns=["col1"])
modin_df = pd.read_parquet(TEST_PARQUET_FILENAME, columns=["col1"])
Expand Down Expand Up @@ -899,7 +903,7 @@ def test_from_parquet_hdfs():


def test_from_json():
setup_json_file(SMALL_ROW_SIZE)
setup_json_file(NROWS)

pandas_df = pandas.read_json(TEST_JSON_FILENAME)
modin_df = pd.read_json(TEST_JSON_FILENAME)
Expand All @@ -922,7 +926,7 @@ def test_from_json_categories():


def test_from_json_lines():
setup_json_lines_file(SMALL_ROW_SIZE)
setup_json_lines_file(NROWS)

pandas_df = pandas.read_json(TEST_JSON_FILENAME, lines=True)
modin_df = pd.read_json(TEST_JSON_FILENAME, lines=True)
Expand All @@ -945,7 +949,7 @@ def test_read_json_string_bytes(data):


def test_from_html():
setup_html_file(SMALL_ROW_SIZE)
setup_html_file(NROWS)

pandas_df = pandas.read_html(TEST_HTML_FILENAME)[0]
modin_df = pd.read_html(TEST_HTML_FILENAME)
Expand All @@ -957,7 +961,7 @@ def test_from_html():

@pytest.mark.skip(reason="No clipboard on Travis")
def test_from_clipboard():
setup_clipboard(SMALL_ROW_SIZE)
setup_clipboard(NROWS)

pandas_df = pandas.read_clipboard()
modin_df = pd.read_clipboard()
Expand All @@ -967,7 +971,7 @@ def test_from_clipboard():

@pytest.mark.xfail(reason="read_excel is broken for now, see #1733 for details")
def test_from_excel():
setup_excel_file(SMALL_ROW_SIZE)
setup_excel_file(NROWS)

pandas_df = pandas.read_excel(TEST_EXCEL_FILENAME)
modin_df = pd.read_excel(TEST_EXCEL_FILENAME)
Expand All @@ -978,7 +982,7 @@ def test_from_excel():


def test_from_excel_engine():
setup_excel_file(SMALL_ROW_SIZE)
setup_excel_file(NROWS)

pandas_df = pandas.read_excel(TEST_EXCEL_FILENAME, engine="xlrd")
with pytest.warns(UserWarning):
Expand All @@ -990,7 +994,7 @@ def test_from_excel_engine():


def test_from_excel_index_col():
setup_excel_file(SMALL_ROW_SIZE)
setup_excel_file(NROWS)

pandas_df = pandas.read_excel(TEST_EXCEL_FILENAME, index_col=0)
with pytest.warns(UserWarning):
Expand All @@ -1002,7 +1006,7 @@ def test_from_excel_index_col():


def test_from_excel_all_sheets():
setup_excel_file(SMALL_ROW_SIZE)
setup_excel_file(NROWS)

pandas_df = pandas.read_excel(TEST_EXCEL_FILENAME, sheet_name=None)
modin_df = pd.read_excel(TEST_EXCEL_FILENAME, sheet_name=None)
Expand Down Expand Up @@ -1038,7 +1042,7 @@ def test_from_excel_sheet_name(sheet_name):

# @pytest.mark.skip(reason="Arrow version mismatch between Pandas and Feather")
def test_from_feather():
setup_feather_file(SMALL_ROW_SIZE)
setup_feather_file(NROWS)

pandas_df = pandas.read_feather(TEST_FEATHER_FILENAME)
modin_df = pd.read_feather(TEST_FEATHER_FILENAME)
Expand All @@ -1050,7 +1054,7 @@ def test_from_feather():

@pytest.mark.skipif(os.name == "nt", reason="Windows not supported")
def test_from_hdf():
setup_hdf_file(SMALL_ROW_SIZE, format=None)
setup_hdf_file(NROWS, format=None)

pandas_df = pandas.read_hdf(TEST_READ_HDF_FILENAME, key="df")
modin_df = pd.read_hdf(TEST_READ_HDF_FILENAME, key="df")
Expand All @@ -1062,7 +1066,7 @@ def test_from_hdf():

@pytest.mark.skipif(os.name == "nt", reason="Windows not supported")
def test_from_hdf_format():
setup_hdf_file(SMALL_ROW_SIZE, format="table")
setup_hdf_file(NROWS, format="table")

pandas_df = pandas.read_hdf(TEST_READ_HDF_FILENAME, key="df")
modin_df = pd.read_hdf(TEST_READ_HDF_FILENAME, key="df")
Expand All @@ -1073,7 +1077,7 @@ def test_from_hdf_format():


def test_from_stata():
setup_stata_file(SMALL_ROW_SIZE)
setup_stata_file(NROWS)

pandas_df = pandas.read_stata(TEST_STATA_FILENAME)
modin_df = pd.read_stata(TEST_STATA_FILENAME)
Expand All @@ -1084,7 +1088,7 @@ def test_from_stata():


def test_from_pickle():
setup_pickle_file(SMALL_ROW_SIZE)
setup_pickle_file(NROWS)

pandas_df = pandas.read_pickle(TEST_PICKLE_FILENAME)
modin_df = pd.read_pickle(TEST_PICKLE_FILENAME)
Expand Down Expand Up @@ -1811,7 +1815,7 @@ def test_HDFStore():


def test_ExcelFile():
setup_excel_file(SMALL_ROW_SIZE)
setup_excel_file(NROWS)

modin_excel_file = pd.ExcelFile(TEST_EXCEL_FILENAME)
pandas_excel_file = pandas.ExcelFile(TEST_EXCEL_FILENAME)
Expand Down

0 comments on commit e76b657

Please sign in to comment.