diff --git a/modin/core/io/text/text_file_dispatcher.py b/modin/core/io/text/text_file_dispatcher.py
index 36039fbcea7..abb89de5ff0 100644
--- a/modin/core/io/text/text_file_dispatcher.py
+++ b/modin/core/io/text/text_file_dispatcher.py
@@ -32,7 +32,10 @@
 from modin.config import NPartitions
 from modin.core.io.file_dispatcher import FileDispatcher, OpenFile
 from modin.core.io.text.utils import CustomNewlineIterator
-from modin.core.storage_formats.pandas.utils import compute_chunksize
+from modin.core.storage_formats.pandas.utils import (
+    compute_chunksize,
+    compute_num_partitions,
+)
 from modin.utils import _inherit_docstrings
 
 ColumnNamesTypes = Tuple[Union[pandas.Index, pandas.MultiIndex]]
@@ -1112,8 +1115,7 @@ def _read(cls, filepath_or_buffer, **kwargs):
         else:
             avg_line_len = lines_len // i
             appprox_nlines = cls.file_size(f) // avg_line_len
-            num_partitions = max(1, (appprox_nlines * len(column_names)) // 64_000)
-            num_partitions = min(NPartitions.get(), num_partitions)
+            num_partitions = compute_num_partitions(len(column_names), appprox_nlines)
 
         f.seek(old_pos)
 
diff --git a/modin/core/storage_formats/pandas/utils.py b/modin/core/storage_formats/pandas/utils.py
index b8f267b3150..4837edd35f1 100644
--- a/modin/core/storage_formats/pandas/utils.py
+++ b/modin/core/storage_formats/pandas/utils.py
@@ -57,6 +57,26 @@ def compute_chunksize(axis_len, num_splits, min_block_size=None):
     return max(chunksize, min_block_size)
 
 
+def compute_num_partitions(ncols, nrows):
+    """
+    Compute the number of partitions for a frame with the specified dimension.
+
+    Parameters
+    ----------
+    ncols : int
+        The number of frame columns.
+    nrows : int
+        The number of frame rows.
+
+    Returns
+    -------
+    int
+        The number of partitions.
+    """
+    num_partitions = max(1, (ncols * nrows) // 64_000)
+    return min(NPartitions.get(), num_partitions)
+
+
 def split_result_of_axis_func_pandas(axis, num_splits, result, length_list=None):
     """
     Split pandas DataFrame evenly based on the provided number of splits.