diff --git a/modin/core/dataframe/pandas/dataframe/dataframe.py b/modin/core/dataframe/pandas/dataframe/dataframe.py index e08929ff08..048aae331c 100644 --- a/modin/core/dataframe/pandas/dataframe/dataframe.py +++ b/modin/core/dataframe/pandas/dataframe/dataframe.py @@ -28,7 +28,7 @@ from pandas.core.dtypes.common import is_dtype_equal, is_list_like, is_numeric_dtype from pandas.core.indexes.api import Index, RangeIndex -from modin.config import Engine, IsRayCluster, NPartitions +from modin.config import Engine, IsRayCluster, MinPartitionSize, NPartitions from modin.core.dataframe.base.dataframe.dataframe import ModinDataframe from modin.core.dataframe.base.dataframe.utils import Axis, JoinType, is_trivial_index from modin.core.dataframe.pandas.dataframe.utils import ( @@ -1549,7 +1549,9 @@ def _reorder_labels(self, row_positions=None, col_positions=None): # the "standard" partitioning. Knowing the standard partitioning scheme # we are able to compute new row lengths. new_lengths = get_length_list( - axis_len=len(row_idx), num_splits=ordered_rows.shape[0] + axis_len=len(row_idx), + num_splits=ordered_rows.shape[0], + min_block_size=MinPartitionSize.get(), ) else: # If the frame's partitioning was preserved then @@ -1585,7 +1587,9 @@ def _reorder_labels(self, row_positions=None, col_positions=None): # the "standard" partitioning. Knowing the standard partitioning scheme # we are able to compute new column widths. new_widths = get_length_list( - axis_len=len(col_idx), num_splits=ordered_cols.shape[1] + axis_len=len(col_idx), + num_splits=ordered_cols.shape[1], + min_block_size=MinPartitionSize.get(), ) else: # If the frame's partitioning was preserved then @@ -3500,7 +3504,9 @@ def broadcast_apply_full_axis( if kw["row_lengths"] is None and is_index_materialized: if axis == 0: kw["row_lengths"] = get_length_list( - axis_len=len(new_index), num_splits=new_partitions.shape[0] + axis_len=len(new_index), + num_splits=new_partitions.shape[0], + min_block_size=MinPartitionSize.get(), ) elif axis == 1: if self._row_lengths_cache is not None and len(new_index) == sum( @@ -3512,6 +3518,7 @@ def broadcast_apply_full_axis( kw["column_widths"] = get_length_list( axis_len=len(new_columns), num_splits=new_partitions.shape[1], + min_block_size=MinPartitionSize.get(), ) elif axis == 0: if self._column_widths_cache is not None and len( diff --git a/modin/core/dataframe/pandas/partitioning/axis_partition.py b/modin/core/dataframe/pandas/partitioning/axis_partition.py index 4b4b7646a7..c89b012b1c 100644 --- a/modin/core/dataframe/pandas/partitioning/axis_partition.py +++ b/modin/core/dataframe/pandas/partitioning/axis_partition.py @@ -18,6 +18,7 @@ import numpy as np import pandas +from modin.config import MinPartitionSize from modin.core.dataframe.base.partitioning.axis_partition import ( BaseDataframeAxisPartition, ) @@ -276,6 +277,7 @@ def apply( for part in axis_partition.list_of_blocks ] ), + min_block_size=MinPartitionSize.get(), ) ) result = self._wrap_partitions( @@ -287,8 +289,9 @@ def apply( num_splits, maintain_partitioning, *self.list_of_blocks, - manual_partition=manual_partition, + min_block_size=MinPartitionSize.get(), lengths=lengths, + manual_partition=manual_partition, ) ) if self.full_axis: @@ -391,6 +394,7 @@ def deploy_axis_func( num_splits, maintain_partitioning, *partitions, + min_block_size, lengths=None, manual_partition=False, return_generator=False, @@ -415,6 +419,8 @@ def deploy_axis_func( If False, create a new partition layout. *partitions : iterable All partitions that make up the full axis (row or column). + min_block_size : int + Minimum number of rows/columns in a single split. lengths : list, optional The list of lengths to shuffle the object. manual_partition : bool, default: False @@ -473,10 +479,16 @@ def deploy_axis_func( lengths = None if return_generator: return generate_result_of_axis_func_pandas( - axis, num_splits, result, lengths + axis, + num_splits, + result, + min_block_size, + lengths, ) else: - return split_result_of_axis_func_pandas(axis, num_splits, result, lengths) + return split_result_of_axis_func_pandas( + axis, num_splits, result, min_block_size, lengths + ) @classmethod def deploy_func_between_two_axis_partitions( @@ -489,6 +501,7 @@ def deploy_func_between_two_axis_partitions( len_of_left, other_shape, *partitions, + min_block_size, return_generator=False, ): """ @@ -513,6 +526,8 @@ def deploy_func_between_two_axis_partitions( (other_shape[i-1], other_shape[i]) will indicate slice to restore i-1 axis partition. *partitions : iterable All partitions that make up the full axis (row or column) for both data sets. + min_block_size : int + Minimum number of rows/columns in a single split. return_generator : bool, default: False Return a generator from the function, set to `True` for Ray backend as Ray remote functions can return Generators. @@ -559,12 +574,14 @@ def deploy_func_between_two_axis_partitions( axis, num_splits, result, + min_block_size, ) else: return split_result_of_axis_func_pandas( axis, num_splits, result, + min_block_size, ) @classmethod diff --git a/modin/core/dataframe/pandas/partitioning/partition_manager.py b/modin/core/dataframe/pandas/partitioning/partition_manager.py index 500818421b..a8ba574462 100644 --- a/modin/core/dataframe/pandas/partitioning/partition_manager.py +++ b/modin/core/dataframe/pandas/partitioning/partition_manager.py @@ -30,6 +30,7 @@ from modin.config import ( BenchmarkMode, Engine, + MinPartitionSize, NPartitions, PersistentPickle, ProgressBar, @@ -890,8 +891,9 @@ def from_pandas(cls, df, return_dims=False): A NumPy array with partitions (with dimensions or not). """ num_splits = NPartitions.get() - row_chunksize = compute_chunksize(df.shape[0], num_splits) - col_chunksize = compute_chunksize(df.shape[1], num_splits) + min_block_size = MinPartitionSize.get() + row_chunksize = compute_chunksize(df.shape[0], num_splits, min_block_size) + col_chunksize = compute_chunksize(df.shape[1], num_splits, min_block_size) bar_format = ( "{l_bar}{bar}{r_bar}" diff --git a/modin/core/execution/dask/implementations/pandas_on_dask/partitioning/virtual_partition.py b/modin/core/execution/dask/implementations/pandas_on_dask/partitioning/virtual_partition.py index a78870205f..59c7bb4ccd 100644 --- a/modin/core/execution/dask/implementations/pandas_on_dask/partitioning/virtual_partition.py +++ b/modin/core/execution/dask/implementations/pandas_on_dask/partitioning/virtual_partition.py @@ -112,6 +112,7 @@ def deploy_axis_func( num_splits, maintain_partitioning, *partitions, + min_block_size, lengths=None, manual_partition=False, ): @@ -135,6 +136,8 @@ def deploy_axis_func( If False, create a new partition layout. *partitions : iterable All partitions that make up the full axis (row or column). + min_block_size : int + Minimum number of rows/columns in a single split. lengths : iterable, default: None The list of lengths to shuffle the partition into. manual_partition : bool, default: False @@ -159,6 +162,7 @@ def deploy_axis_func( *partitions, ), f_kwargs={ + "min_block_size": min_block_size, "lengths": lengths, "manual_partition": manual_partition, }, @@ -177,6 +181,7 @@ def deploy_func_between_two_axis_partitions( len_of_left, other_shape, *partitions, + min_block_size, ): """ Deploy a function along a full axis between two data sets. @@ -200,6 +205,8 @@ def deploy_func_between_two_axis_partitions( (other_shape[i-1], other_shape[i]) will indicate slice to restore i-1 axis partition. *partitions : iterable All partitions that make up the full axis (row or column) for both data sets. + min_block_size : int + Minimum number of rows/columns in a single split. Returns ------- @@ -219,6 +226,9 @@ def deploy_func_between_two_axis_partitions( other_shape, *partitions, ), + f_kwargs={ + "min_block_size": min_block_size, + }, num_returns=num_splits * (1 + cls._PARTITIONS_METADATA_LEN), pure=False, ) diff --git a/modin/core/execution/ray/implementations/cudf_on_ray/partitioning/partition_manager.py b/modin/core/execution/ray/implementations/cudf_on_ray/partitioning/partition_manager.py index 038ff0c96c..4fc5e84a18 100644 --- a/modin/core/execution/ray/implementations/cudf_on_ray/partitioning/partition_manager.py +++ b/modin/core/execution/ray/implementations/cudf_on_ray/partitioning/partition_manager.py @@ -16,7 +16,7 @@ import numpy as np import ray -from modin.config import GpuCount +from modin.config import GpuCount, MinPartitionSize from modin.core.execution.ray.common import RayWrapper from modin.core.execution.ray.generic.partitioning import ( GenericRayDataframePartitionManager, @@ -125,7 +125,9 @@ def from_pandas(cls, df, return_dims=False): num_splits = GpuCount.get() put_func = cls._partition_class.put # For now, we default to row partitioning - pandas_dfs = split_result_of_axis_func_pandas(0, num_splits, df) + pandas_dfs = split_result_of_axis_func_pandas( + 0, num_splits, df, min_block_size=MinPartitionSize.get() + ) keys = [ put_func(cls._get_gpu_managers()[i], pandas_dfs[i]) for i in range(num_splits) diff --git a/modin/core/execution/ray/implementations/pandas_on_ray/partitioning/virtual_partition.py b/modin/core/execution/ray/implementations/pandas_on_ray/partitioning/virtual_partition.py index f30b474305..91499e2ee8 100644 --- a/modin/core/execution/ray/implementations/pandas_on_ray/partitioning/virtual_partition.py +++ b/modin/core/execution/ray/implementations/pandas_on_ray/partitioning/virtual_partition.py @@ -137,6 +137,7 @@ def deploy_axis_func( num_splits, maintain_partitioning, *partitions, + min_block_size, lengths=None, manual_partition=False, max_retries=None, @@ -161,6 +162,8 @@ def deploy_axis_func( If False, create a new partition layout. *partitions : iterable All partitions that make up the full axis (row or column). + min_block_size : int + Minimum number of rows/columns in a single split. lengths : list, optional The list of lengths to shuffle the object. manual_partition : bool, default: False @@ -188,6 +191,7 @@ def deploy_axis_func( f_len_args=len(f_args), f_kwargs=f_kwargs, manual_partition=manual_partition, + min_block_size=min_block_size, lengths=lengths, return_generator=True, ) @@ -203,6 +207,7 @@ def deploy_func_between_two_axis_partitions( len_of_left, other_shape, *partitions, + min_block_size, ): """ Deploy a function along a full axis between two data sets. @@ -226,6 +231,8 @@ def deploy_func_between_two_axis_partitions( (other_shape[i-1], other_shape[i]) will indicate slice to restore i-1 axis partition. *partitions : iterable All partitions that make up the full axis (row or column) for both data sets. + min_block_size : int + Minimum number of rows/columns in a single split. Returns ------- @@ -245,6 +252,7 @@ def deploy_func_between_two_axis_partitions( f_to_deploy=func, f_len_args=len(f_args), f_kwargs=f_kwargs, + min_block_size=min_block_size, return_generator=True, ) diff --git a/modin/core/execution/unidist/implementations/pandas_on_unidist/partitioning/virtual_partition.py b/modin/core/execution/unidist/implementations/pandas_on_unidist/partitioning/virtual_partition.py index 002193f188..f4e8f456e5 100644 --- a/modin/core/execution/unidist/implementations/pandas_on_unidist/partitioning/virtual_partition.py +++ b/modin/core/execution/unidist/implementations/pandas_on_unidist/partitioning/virtual_partition.py @@ -138,6 +138,7 @@ def deploy_axis_func( num_splits, maintain_partitioning, *partitions, + min_block_size, lengths=None, manual_partition=False, max_retries=None, @@ -162,6 +163,8 @@ def deploy_axis_func( If False, create a new partition layout. *partitions : iterable All partitions that make up the full axis (row or column). + min_block_size : int + Minimum number of rows/columns in a single split. lengths : list, optional The list of lengths to shuffle the object. manual_partition : bool, default: False @@ -188,6 +191,7 @@ def deploy_axis_func( maintain_partitioning, *partitions, manual_partition=manual_partition, + min_block_size=min_block_size, lengths=lengths, ) @@ -202,6 +206,7 @@ def deploy_func_between_two_axis_partitions( len_of_left, other_shape, *partitions, + min_block_size, ): """ Deploy a function along a full axis between two data sets. @@ -225,6 +230,8 @@ def deploy_func_between_two_axis_partitions( (other_shape[i-1], other_shape[i]) will indicate slice to restore i-1 axis partition. *partitions : iterable All partitions that make up the full axis (row or column) for both data sets. + min_block_size : int + Minimum number of rows/columns in a single split. Returns ------- @@ -243,6 +250,7 @@ def deploy_func_between_two_axis_partitions( len_of_left, other_shape, *partitions, + min_block_size=min_block_size, ) def wait(self): diff --git a/modin/core/io/column_stores/column_store_dispatcher.py b/modin/core/io/column_stores/column_store_dispatcher.py index a35e146900..94d801ac80 100644 --- a/modin/core/io/column_stores/column_store_dispatcher.py +++ b/modin/core/io/column_stores/column_store_dispatcher.py @@ -121,7 +121,6 @@ def build_index(cls, partition_ids): row_lengths : list List with lengths of index chunks. """ - num_partitions = NPartitions.get() index_len = ( 0 if len(partition_ids) == 0 else cls.materialize(partition_ids[-2][0]) ) @@ -130,7 +129,9 @@ def build_index(cls, partition_ids): else: index = index_len index_len = len(index) - index_chunksize = compute_chunksize(index_len, num_partitions) + num_partitions = NPartitions.get() + min_block_size = MinPartitionSize.get() + index_chunksize = compute_chunksize(index_len, num_partitions, min_block_size) if index_chunksize > index_len: row_lengths = [index_len] + [0 for _ in range(num_partitions - 1)] else: diff --git a/modin/core/io/column_stores/parquet_dispatcher.py b/modin/core/io/column_stores/parquet_dispatcher.py index e88a772377..62245d846b 100644 --- a/modin/core/io/column_stores/parquet_dispatcher.py +++ b/modin/core/io/column_stores/parquet_dispatcher.py @@ -722,7 +722,9 @@ def _normalize_partitioning(cls, remote_parts, row_lengths, column_widths): for i in range(num_splits): new_parts[offset + i].append(split[i]) - new_row_lengths.extend(get_length_list(part_len, num_splits)) + new_row_lengths.extend( + get_length_list(part_len, num_splits, MinPartitionSize.get()) + ) remote_parts = np.array(new_parts) row_lengths = new_row_lengths @@ -746,7 +748,9 @@ def _normalize_partitioning(cls, remote_parts, row_lengths, column_widths): for row_parts in remote_parts ] ) - column_widths = get_length_list(sum(column_widths), desired_col_nparts) + column_widths = get_length_list( + sum(column_widths), desired_col_nparts, MinPartitionSize.get() + ) return remote_parts, row_lengths, column_widths diff --git a/modin/core/io/text/text_file_dispatcher.py b/modin/core/io/text/text_file_dispatcher.py index d80e5ecf6d..db583700fe 100644 --- a/modin/core/io/text/text_file_dispatcher.py +++ b/modin/core/io/text/text_file_dispatcher.py @@ -30,7 +30,7 @@ from pandas.core.dtypes.common import is_list_like from pandas.io.common import stringify_path -from modin.config import NPartitions +from modin.config import MinPartitionSize, NPartitions from modin.core.io.file_dispatcher import FileDispatcher, OpenFile from modin.core.io.text.utils import CustomNewlineIterator from modin.core.storage_formats.pandas.utils import compute_chunksize @@ -571,7 +571,8 @@ def _define_metadata( """ # This is the number of splits for the columns num_splits = min(len(column_names) or 1, NPartitions.get()) - column_chunksize = compute_chunksize(df.shape[1], num_splits) + min_block_size = MinPartitionSize.get() + column_chunksize = compute_chunksize(df.shape[1], num_splits, min_block_size) if column_chunksize > len(column_names): column_widths = [len(column_names)] # This prevents us from unnecessarily serializing a bunch of empty diff --git a/modin/core/storage_formats/cudf/parser.py b/modin/core/storage_formats/cudf/parser.py index ac206cfacb..dc428079a3 100644 --- a/modin/core/storage_formats/cudf/parser.py +++ b/modin/core/storage_formats/cudf/parser.py @@ -20,6 +20,7 @@ from pandas.core.dtypes.concat import union_categoricals from pandas.io.common import infer_compression +from modin.config import MinPartitionSize from modin.core.execution.ray.implementations.cudf_on_ray.partitioning.partition_manager import ( GPU_MANAGERS, ) @@ -39,7 +40,9 @@ def _split_result_for_readers(axis, num_splits, df): # pragma: no cover Returns: A list of pandas DataFrames. """ - splits = split_result_of_axis_func_pandas(axis, num_splits, df) + splits = split_result_of_axis_func_pandas( + axis, num_splits, df, min_block_size=MinPartitionSize.get() + ) if not isinstance(splits, list): splits = [splits] return splits diff --git a/modin/core/storage_formats/pandas/parsers.py b/modin/core/storage_formats/pandas/parsers.py index a396bf3e17..b6ca16dafb 100644 --- a/modin/core/storage_formats/pandas/parsers.py +++ b/modin/core/storage_formats/pandas/parsers.py @@ -54,6 +54,7 @@ from pandas.io.common import infer_compression from pandas.util._decorators import doc +from modin.config import MinPartitionSize from modin.core.io.file_dispatcher import OpenFile from modin.core.storage_formats.pandas.utils import split_result_of_axis_func_pandas from modin.db_conn import ModinDatabaseConnection @@ -113,7 +114,9 @@ def _split_result_for_readers(axis, num_splits, df): # pragma: no cover list A list of pandas DataFrames. """ - splits = split_result_of_axis_func_pandas(axis, num_splits, df) + splits = split_result_of_axis_func_pandas( + axis, num_splits, df, min_block_size=MinPartitionSize.get() + ) if not isinstance(splits, list): splits = [splits] return splits diff --git a/modin/core/storage_formats/pandas/utils.py b/modin/core/storage_formats/pandas/utils.py index 346982fc55..ac3297cb5a 100644 --- a/modin/core/storage_formats/pandas/utils.py +++ b/modin/core/storage_formats/pandas/utils.py @@ -13,9 +13,11 @@ """Contains utility functions for frame partitioning.""" +from __future__ import annotations + import re from math import ceil -from typing import Hashable, List +from typing import Generator, Hashable, List, Optional import numpy as np import pandas @@ -23,7 +25,7 @@ from modin.config import MinPartitionSize, NPartitions -def compute_chunksize(axis_len, num_splits, min_block_size=None): +def compute_chunksize(axis_len: int, num_splits: int, min_block_size: int) -> int: """ Compute the number of elements (rows/columns) to include in each partition. @@ -35,19 +37,18 @@ def compute_chunksize(axis_len, num_splits, min_block_size=None): Element count in an axis. num_splits : int The number of splits. - min_block_size : int, optional + min_block_size : int Minimum number of rows/columns in a single split. - If not specified, the value is assumed equal to ``MinPartitionSize``. Returns ------- int Integer number of rows/columns to split the DataFrame will be returned. """ - if min_block_size is None: - min_block_size = MinPartitionSize.get() - - assert min_block_size > 0, "`min_block_size` should be > 0" + if not isinstance(min_block_size, int) or min_block_size <= 0: + raise ValueError( + f"'min_block_size' should be int > 0, passed: {min_block_size=}" + ) chunksize = axis_len // num_splits if axis_len % num_splits: @@ -58,8 +59,12 @@ def compute_chunksize(axis_len, num_splits, min_block_size=None): def split_result_of_axis_func_pandas( - axis, num_splits, result, length_list=None, min_block_size=None -): + axis: int, + num_splits: int, + result: pandas.DataFrame, + min_block_size: int, + length_list: Optional[list] = None, +) -> list[pandas.DataFrame]: """ Split pandas DataFrame evenly based on the provided number of splits. @@ -72,12 +77,11 @@ def split_result_of_axis_func_pandas( This parameter is ignored if `length_list` is specified. result : pandas.DataFrame DataFrame to split. + min_block_size : int + Minimum number of rows/columns in a single split. length_list : list of ints, optional List of slice lengths to split DataFrame into. This is used to return the DataFrame to its original partitioning schema. - min_block_size : int, optional - Minimum number of rows/columns in a single split. - If not specified, the value is assumed equal to ``MinPartitionSize``. Returns ------- @@ -86,14 +90,18 @@ def split_result_of_axis_func_pandas( """ return list( generate_result_of_axis_func_pandas( - axis, num_splits, result, length_list, min_block_size + axis, num_splits, result, min_block_size, length_list ) ) def generate_result_of_axis_func_pandas( - axis, num_splits, result, length_list=None, min_block_size=None -): + axis: int, + num_splits: int, + result: pandas.DataFrame, + min_block_size: int, + length_list: Optional[list] = None, +) -> Generator: """ Generate pandas DataFrame evenly based on the provided number of splits. @@ -106,12 +114,11 @@ def generate_result_of_axis_func_pandas( This parameter is ignored if `length_list` is specified. result : pandas.DataFrame DataFrame to split. + min_block_size : int + Minimum number of rows/columns in a single split. length_list : list of ints, optional List of slice lengths to split DataFrame into. This is used to return the DataFrame to its original partitioning schema. - min_block_size : int, optional - Minimum number of rows/columns in a single split. - If not specified, the value is assumed equal to ``MinPartitionSize``. Yields ------ @@ -146,7 +153,7 @@ def generate_result_of_axis_func_pandas( yield chunk -def get_length_list(axis_len: int, num_splits: int, min_block_size=None) -> list: +def get_length_list(axis_len: int, num_splits: int, min_block_size: int) -> list: """ Compute partitions lengths along the axis with the specified number of splits. @@ -156,9 +163,8 @@ def get_length_list(axis_len: int, num_splits: int, min_block_size=None) -> list Element count in an axis. num_splits : int Number of splits along the axis. - min_block_size : int, optional + min_block_size : int Minimum number of rows/columns in a single split. - If not specified, the value is assumed equal to ``MinPartitionSize``. Returns ------- @@ -245,7 +251,11 @@ def merge_partitioning(left, right, axis=1): if lshape is not None and rshape is not None: res_shape = sum(lshape) + sum(rshape) - chunk_size = compute_chunksize(axis_len=res_shape, num_splits=NPartitions.get()) + chunk_size = compute_chunksize( + axis_len=res_shape, + num_splits=NPartitions.get(), + min_block_size=MinPartitionSize.get(), + ) return ceil(res_shape / chunk_size) else: lsplits = left._partitions.shape[axis] diff --git a/modin/tests/core/storage_formats/pandas/test_internals.py b/modin/tests/core/storage_formats/pandas/test_internals.py index 811e77dc44..56c1563ba2 100644 --- a/modin/tests/core/storage_formats/pandas/test_internals.py +++ b/modin/tests/core/storage_formats/pandas/test_internals.py @@ -141,6 +141,7 @@ def construct_modin_df_by_scheme(pandas_df, partitioning_scheme): axis=0, num_splits=len(row_lengths), result=pandas_df, + min_block_size=MinPartitionSize.get(), length_list=row_lengths, ) partitions = [ @@ -148,6 +149,7 @@ def construct_modin_df_by_scheme(pandas_df, partitioning_scheme): axis=1, num_splits=len(column_widths), result=row_part, + min_block_size=MinPartitionSize.get(), length_list=column_widths, ) for row_part in row_partitions diff --git a/modin/tests/pandas/dataframe/test_map_metadata.py b/modin/tests/pandas/dataframe/test_map_metadata.py index f3ff13a3e0..b6dc1686ff 100644 --- a/modin/tests/pandas/dataframe/test_map_metadata.py +++ b/modin/tests/pandas/dataframe/test_map_metadata.py @@ -17,7 +17,7 @@ import pytest import modin.pandas as pd -from modin.config import NPartitions, StorageFormat +from modin.config import MinPartitionSize, NPartitions, StorageFormat from modin.core.dataframe.pandas.metadata import LazyProxyCategoricalDtype from modin.core.storage_formats.pandas.utils import split_result_of_axis_func_pandas from modin.pandas.testing import assert_index_equal, assert_series_equal @@ -598,7 +598,11 @@ def _get_lazy_proxy(): original_dtype = pandas_df.astype({"a": "category"}).dtypes["a"] chunks = split_result_of_axis_func_pandas( - axis=0, num_splits=nchunks, result=pandas_df, length_list=[2, 2, 2] + axis=0, + num_splits=nchunks, + result=pandas_df, + min_block_size=MinPartitionSize.get(), + length_list=[2, 2, 2], ) if StorageFormat.get() == "Pandas": diff --git a/modin/tests/pandas/internals/test_repartition.py b/modin/tests/pandas/internals/test_repartition.py index 389541a9d0..cfbd7128a3 100644 --- a/modin/tests/pandas/internals/test_repartition.py +++ b/modin/tests/pandas/internals/test_repartition.py @@ -11,10 +11,11 @@ # ANY KIND, either express or implied. See the License for the specific language # governing permissions and limitations under the License. +import numpy as np import pytest import modin.pandas as pd -from modin.config import NPartitions +from modin.config import NPartitions, context NPartitions.put(4) @@ -58,3 +59,9 @@ def test_repartition(axis, dtype): } assert obj._query_compiler._modin_frame._partitions.shape == results[axis] + + +def test_repartition_7170(): + with context(MinPartitionSize=102, NPartitions=5): + df = pd.DataFrame(np.random.rand(10000, 100)) + _ = df._repartition(axis=1).to_numpy()