diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 0f20f4a1fcb..c301c2d3f54 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -187,6 +187,8 @@ jobs: run: python -m pytest modin/test/backends/base/test_internals.py - shell: bash -l {0} run: python -m pytest modin/test/backends/pandas/test_internals.py + - shell: bash -l {0} + run: python -m pytest modin/test/test_envvar_npartitions.py test-defaults: needs: [lint-commit, lint-flake8, lint-black, test-api, test-headers] diff --git a/.github/workflows/push.yml b/.github/workflows/push.yml index ed5f506bd75..607c337230e 100644 --- a/.github/workflows/push.yml +++ b/.github/workflows/push.yml @@ -29,6 +29,8 @@ jobs: run: python -m pytest modin/test/test_envvar_catcher.py - shell: bash -l {0} run: python -m pytest modin/test/backends/pandas/test_internals.py + - shell: bash -l {0} + run: python -m pytest modin/test/test_envvar_npartitions.py test-defaults: runs-on: ubuntu-latest diff --git a/asv_bench/benchmarks/benchmarks.py b/asv_bench/benchmarks/benchmarks.py index f548f4c4b28..f1f7b7a7740 100644 --- a/asv_bench/benchmarks/benchmarks.py +++ b/asv_bench/benchmarks/benchmarks.py @@ -22,6 +22,7 @@ import pandas from .utils import generate_dataframe, RAND_LOW, RAND_HIGH, random_string +from modin.config import NPartitions try: from modin.config import TestDatasetSize, AsvImplementation @@ -32,7 +33,7 @@ # The same benchmarking code can be run for different versions of Modin, so in # case of an error importing important variables, we'll just use predefined values ASV_USE_IMPL = "modin" - ASV_DATASET_SIZE = "Big" if pd.DEFAULT_NPARTITIONS >= 32 else "Small" + ASV_DATASET_SIZE = "Big" if NPartitions.get() >= 32 else "Small" if ASV_DATASET_SIZE == "Big": BINARY_OP_DATA_SIZE = [ diff --git a/examples/tutorial/tutorial_notebooks/cluster/exercise_4.ipynb b/examples/tutorial/tutorial_notebooks/cluster/exercise_4.ipynb index 341d2a23010..af43065f849 100644 --- a/examples/tutorial/tutorial_notebooks/cluster/exercise_4.ipynb +++ b/examples/tutorial/tutorial_notebooks/cluster/exercise_4.ipynb @@ -109,8 +109,8 @@ "time.sleep(600) # We need to give ray enough time to start up all the workers\n", "import ray\n", "ray.init(address=\"auto\")\n", - "import modin.pandas as pd\n", - "assert pd.DEFAULT_NPARTITIONS == 768, \"Not all Ray nodes are started up yet\"\n", + "from modin.config import NPartitions\n", + "assert NPartitions.get() == 768, \"Not all Ray nodes are started up yet\"\n", "ray.shutdown()" ] }, diff --git a/examples/tutorial/tutorial_notebooks/cluster/exercise_5.ipynb b/examples/tutorial/tutorial_notebooks/cluster/exercise_5.ipynb index f4e4e82dcca..06a2de08675 100644 --- a/examples/tutorial/tutorial_notebooks/cluster/exercise_5.ipynb +++ b/examples/tutorial/tutorial_notebooks/cluster/exercise_5.ipynb @@ -39,7 +39,8 @@ "import ray\n", "ray.init(address=\"auto\")\n", "import modin.pandas as pd\n", - "if pd.DEFAULT_NPARTITIONS != 768:\n", + "from modin.config import NPartitions\n", + "if NPartitions.get() != 768:\n", " print(\"This notebook was designed and tested for an 8 node Ray cluster. \"\n", " \"Proceed at your own risk!\")" ] diff --git a/modin/config/envvars.py b/modin/config/envvars.py index 76ab79e4190..245619273ef 100644 --- a/modin/config/envvars.py +++ b/modin/config/envvars.py @@ -143,6 +143,18 @@ class Memory(EnvironmentVariable, type=int): varname = "MODIN_MEMORY" +class NPartitions(EnvironmentVariable, type=int): + """ + How many partitions to use by default + """ + + varname = "MODIN_NPARTITIONS" + + @classmethod + def _get_default(cls): + return CpuCount.get() + + class RayPlasmaDir(EnvironmentVariable, type=ExactStr): """ Path to Plasma storage for Ray diff --git a/modin/engines/base/frame/partition_manager.py b/modin/engines/base/frame/partition_manager.py index 3b2adc4e533..068eaf022b1 100644 --- a/modin/engines/base/frame/partition_manager.py +++ b/modin/engines/base/frame/partition_manager.py @@ -17,6 +17,8 @@ from modin.error_message import ErrorMessage from modin.data_management.utils import compute_chunksize +from modin.config import NPartitions + from pandas.api.types import union_categoricals @@ -299,7 +301,7 @@ def broadcast_axis_partitions( elif lengths: num_splits = len(lengths) else: - num_splits = cls._compute_num_partitions() + num_splits = NPartitions.get() preprocessed_map_func = cls.preprocess_func(apply_func) left_partitions = cls.axis_partition(left, axis) right_partitions = None if right is None else cls.axis_partition(right, axis) @@ -567,7 +569,7 @@ def to_numpy(cls, partitions, **kwargs): @classmethod def from_pandas(cls, df, return_dims=False): """Return the partitions from Pandas DataFrame.""" - num_splits = cls._compute_num_partitions() + num_splits = NPartitions.get() put_func = cls._partition_class.put row_chunksize, col_chunksize = compute_chunksize(df, num_splits) parts = [ @@ -632,18 +634,6 @@ def get_indices(cls, axis, partitions, index_func=None): # TODO FIX INFORMATION LEAK!!!!1!!1!! return new_idx[0].append(new_idx[1:]) if len(new_idx) else new_idx - @classmethod - def _compute_num_partitions(cls): - """Retrieve the default number of partitions currently. Will estimate the optimal no. of partitions in future. - - Returns - ------- - Number of partitions. - """ - from modin.pandas import DEFAULT_NPARTITIONS - - return DEFAULT_NPARTITIONS - @classmethod def _apply_func_to_list_of_partitions_broadcast( cls, func, partitions, other, **kwargs @@ -960,7 +950,7 @@ def binary_operation(cls, axis, left, func, right): [ left_partitions[i].apply( func, - num_splits=cls._compute_num_partitions(), + num_splits=NPartitions.get(), other_axis_partition=right_partitions[i], ) for i in range(len(left_partitions)) diff --git a/modin/engines/base/io/column_stores/column_store_dispatcher.py b/modin/engines/base/io/column_stores/column_store_dispatcher.py index 04cd05679d8..e166ea82ad5 100644 --- a/modin/engines/base/io/column_stores/column_store_dispatcher.py +++ b/modin/engines/base/io/column_stores/column_store_dispatcher.py @@ -16,22 +16,21 @@ from modin.data_management.utils import compute_chunksize from modin.engines.base.io.file_dispatcher import FileDispatcher +from modin.config import NPartitions class ColumnStoreDispatcher(FileDispatcher): @classmethod def call_deploy(cls, fname, col_partitions, **kwargs): - from modin.pandas import DEFAULT_NPARTITIONS - return np.array( [ cls.deploy( cls.parse, - DEFAULT_NPARTITIONS + 2, + NPartitions.get() + 2, dict( fname=fname, columns=cols, - num_splits=DEFAULT_NPARTITIONS, + num_splits=NPartitions.get(), **kwargs, ), ) @@ -57,8 +56,7 @@ def build_partition(cls, partition_ids, row_lengths, column_widths): @classmethod def build_index(cls, partition_ids): - from modin.pandas import DEFAULT_NPARTITIONS - + num_partitions = NPartitions.get() index_len = cls.materialize(partition_ids[-2][0]) if isinstance(index_len, int): index = pandas.RangeIndex(index_len) @@ -66,27 +64,26 @@ def build_index(cls, partition_ids): index = index_len index_len = len(index) index_chunksize = compute_chunksize( - pandas.DataFrame(index=index), DEFAULT_NPARTITIONS, axis=0 + pandas.DataFrame(index=index), num_partitions, axis=0 ) if index_chunksize > index_len: - row_lengths = [index_len] + [0 for _ in range(DEFAULT_NPARTITIONS - 1)] + row_lengths = [index_len] + [0 for _ in range(num_partitions - 1)] else: row_lengths = [ index_chunksize - if i != DEFAULT_NPARTITIONS - 1 - else index_len - (index_chunksize * (DEFAULT_NPARTITIONS - 1)) - for i in range(DEFAULT_NPARTITIONS) + if i != num_partitions - 1 + else index_len - (index_chunksize * (num_partitions - 1)) + for i in range(num_partitions) ] return index, row_lengths @classmethod def build_columns(cls, columns): - from modin.pandas import DEFAULT_NPARTITIONS - + num_partitions = NPartitions.get() column_splits = ( - len(columns) // DEFAULT_NPARTITIONS - if len(columns) % DEFAULT_NPARTITIONS == 0 - else len(columns) // DEFAULT_NPARTITIONS + 1 + len(columns) // num_partitions + if len(columns) % num_partitions == 0 + else len(columns) // num_partitions + 1 ) col_partitions = [ columns[i : i + column_splits] diff --git a/modin/engines/base/io/sql/sql_dispatcher.py b/modin/engines/base/io/sql/sql_dispatcher.py index db046a8e5fc..67eb5c78638 100644 --- a/modin/engines/base/io/sql/sql_dispatcher.py +++ b/modin/engines/base/io/sql/sql_dispatcher.py @@ -17,6 +17,7 @@ import warnings from modin.engines.base.io.file_dispatcher import FileDispatcher +from modin.config import NPartitions class SQLDispatcher(FileDispatcher): @@ -62,9 +63,7 @@ def _read(cls, sql, con, index_col=None, **kwargs): "SELECT * FROM ({}) as foo LIMIT 0".format(sql), con, index_col=index_col ) cols_names = cols_names_df.columns - from modin.pandas import DEFAULT_NPARTITIONS - - num_partitions = DEFAULT_NPARTITIONS + num_partitions = NPartitions.get() partition_ids = [] index_ids = [] dtype_ids = [] diff --git a/modin/engines/base/io/text/csv_dispatcher.py b/modin/engines/base/io/text/csv_dispatcher.py index 2d69c9b15bb..3c6d161d8b4 100644 --- a/modin/engines/base/io/text/csv_dispatcher.py +++ b/modin/engines/base/io/text/csv_dispatcher.py @@ -18,6 +18,8 @@ import csv import sys +from modin.config import NPartitions + class CSVDispatcher(TextFileDispatcher): @classmethod @@ -131,9 +133,7 @@ def _read(cls, filepath_or_buffer, **kwargs): index_ids = [] dtypes_ids = [] # Max number of partitions available - from modin.pandas import DEFAULT_NPARTITIONS - - num_partitions = DEFAULT_NPARTITIONS + num_partitions = NPartitions.get() # This is the number of splits for the columns num_splits = min(len(column_names), num_partitions) # Metadata diff --git a/modin/engines/base/io/text/excel_dispatcher.py b/modin/engines/base/io/text/excel_dispatcher.py index 5bf23dd85e3..64ef376c76e 100644 --- a/modin/engines/base/io/text/excel_dispatcher.py +++ b/modin/engines/base/io/text/excel_dispatcher.py @@ -18,7 +18,7 @@ from modin.data_management.utils import compute_chunksize from modin.engines.base.io.text.text_file_dispatcher import TextFileDispatcher - +from modin.config import NPartitions EXCEL_READ_BLOCK_SIZE = 4096 @@ -98,9 +98,7 @@ def _read(cls, io, **kwargs): f = BytesIO(f.read()) total_bytes = cls.file_size(f) - from modin.pandas import DEFAULT_NPARTITIONS - - num_partitions = DEFAULT_NPARTITIONS + num_partitions = NPartitions.get() # Read some bytes from the sheet so we can extract the XML header and first # line. We need to make sure we get the first line of the data as well # because that is where the column names are. The header information will diff --git a/modin/engines/base/io/text/fwf_dispatcher.py b/modin/engines/base/io/text/fwf_dispatcher.py index 548028390ce..630c776d6fd 100644 --- a/modin/engines/base/io/text/fwf_dispatcher.py +++ b/modin/engines/base/io/text/fwf_dispatcher.py @@ -18,6 +18,8 @@ from csv import QUOTE_NONE import sys +from modin.config import NPartitions + class FWFDispatcher(TextFileDispatcher): @classmethod @@ -124,9 +126,7 @@ def read(cls, filepath_or_buffer, **kwargs): index_ids = [] dtypes_ids = [] # Max number of partitions available - from modin.pandas import DEFAULT_NPARTITIONS - - num_partitions = DEFAULT_NPARTITIONS + num_partitions = NPartitions.get() # This is the number of splits for the columns num_splits = min(len(column_names), num_partitions) # Metadata diff --git a/modin/engines/base/io/text/json_dispatcher.py b/modin/engines/base/io/text/json_dispatcher.py index 1e130b550e5..75c04340310 100644 --- a/modin/engines/base/io/text/json_dispatcher.py +++ b/modin/engines/base/io/text/json_dispatcher.py @@ -18,6 +18,8 @@ import numpy as np from csv import QUOTE_NONE +from modin.config import NPartitions + class JSONDispatcher(TextFileDispatcher): @classmethod @@ -38,9 +40,7 @@ def _read(cls, path_or_buf, **kwargs): empty_pd_df = pandas.DataFrame(columns=columns) with cls.file_open(path_or_buf, "rb", kwargs.get("compression", "infer")) as f: - from modin.pandas import DEFAULT_NPARTITIONS - - num_partitions = DEFAULT_NPARTITIONS + num_partitions = NPartitions.get() num_splits = min(len(columns), num_partitions) partition_ids = [] diff --git a/modin/engines/base/io/text/text_file_dispatcher.py b/modin/engines/base/io/text/text_file_dispatcher.py index c719a9462a0..81ba48521d6 100644 --- a/modin/engines/base/io/text/text_file_dispatcher.py +++ b/modin/engines/base/io/text/text_file_dispatcher.py @@ -17,6 +17,8 @@ import io import os +from modin.config import NPartitions + class TextFileDispatcher(FileDispatcher): @classmethod @@ -137,7 +139,7 @@ def partitioned_file( f: file to be partitioned num_partitions: int, optional For what number of partitions split a file. - If not specified grabs the value from `modin.pandas.DEFAULT_NPARTITIONS` + If not specified grabs the value from `modin.config.NPartitions.get()` nrows: int, optional Number of rows of file to read. skiprows: array or callable, optional @@ -153,9 +155,7 @@ def partitioned_file( beginning and the end offsets of the current chunk. """ if num_partitions is None: - from modin.pandas import DEFAULT_NPARTITIONS - - num_partitions = DEFAULT_NPARTITIONS + num_partitions = NPartitions.get() result = [] file_size = cls.file_size(f) diff --git a/modin/experimental/engines/pandas_on_ray/io_exp.py b/modin/experimental/engines/pandas_on_ray/io_exp.py index 38b8170445f..38bdb88933f 100644 --- a/modin/experimental/engines/pandas_on_ray/io_exp.py +++ b/modin/experimental/engines/pandas_on_ray/io_exp.py @@ -18,6 +18,7 @@ from modin.engines.ray.pandas_on_ray.io import PandasOnRayIO from modin.backends.pandas.parsers import _split_result_for_readers from modin.engines.ray.pandas_on_ray.frame.partition import PandasOnRayFramePartition +from modin.config import NPartitions import ray @@ -117,7 +118,7 @@ def read_sql( ) # starts the distributed alternative cols_names, query = get_query_info(sql, con, partition_column) - num_parts = min(cls.frame_mgr_cls._compute_num_partitions(), max_sessions) + num_parts = min(NPartitions.get(), max_sessions) num_splits = min(len(cols_names), num_parts) diff = (upper_bound - lower_bound) + 1 min_size = diff // num_parts diff --git a/modin/pandas/__init__.py b/modin/pandas/__init__.py index 1b5a18f7654..cc8433928bb 100644 --- a/modin/pandas/__init__.py +++ b/modin/pandas/__init__.py @@ -90,9 +90,6 @@ # Set this so that Pandas doesn't try to multithread by itself os.environ["OMP_NUM_THREADS"] = "1" -DEFAULT_NPARTITIONS = 4 -num_cpus = 1 - _is_first_update = {} dask_client = None @@ -102,11 +99,10 @@ def _update_engine(publisher: Parameter): - global DEFAULT_NPARTITIONS, dask_client, num_cpus + global dask_client from modin.config import Backend, CpuCount if publisher.get() == "Ray": - import ray from modin.engines.ray.utils import initialize_ray # With OmniSci backend there is only a single worker per node @@ -116,21 +112,15 @@ def _update_engine(publisher: Parameter): os.environ["OMP_NUM_THREADS"] = str(multiprocessing.cpu_count()) if _is_first_update.get("Ray", True): initialize_ray() - num_cpus = ray.cluster_resources()["CPU"] elif publisher.get() == "Dask": - from distributed.client import get_client - if _is_first_update.get("Dask", True): from modin.engines.dask.utils import initialize_dask initialize_dask() - num_cpus = len(get_client().ncores()) - elif publisher.get() == "Cloudray": from modin.experimental.cloud import get_connection conn = get_connection() - remote_ray = conn.modules["ray"] if _is_first_update.get("Cloudray", True): @conn.teleport @@ -152,8 +142,6 @@ def init_remote_ray(partition): import modin.data_management.factories.dispatcher # noqa: F401 else: get_connection().modules["modin"].set_backends("Ray", Backend.get()) - - num_cpus = remote_ray.cluster_resources()["CPU"] elif publisher.get() == "Cloudpython": from modin.experimental.cloud import get_connection @@ -163,7 +151,6 @@ def init_remote_ray(partition): raise ImportError("Unrecognized execution engine: {}.".format(publisher.get())) _is_first_update[publisher.get()] = False - DEFAULT_NPARTITIONS = max(4, int(num_cpus)) from .. import __version__ @@ -321,7 +308,6 @@ def init_remote_ray(partition): "value_counts", "datetime", "NamedAgg", - "DEFAULT_NPARTITIONS", ] del pandas, Engine, Parameter diff --git a/modin/pandas/test/dataframe/test_binary.py b/modin/pandas/test/dataframe/test_binary.py index a4449798c12..b5bd07cb92c 100644 --- a/modin/pandas/test/dataframe/test_binary.py +++ b/modin/pandas/test/dataframe/test_binary.py @@ -24,8 +24,9 @@ test_data, create_test_dfs, ) +from modin.config import NPartitions -pd.DEFAULT_NPARTITIONS = 4 +NPartitions.put(4) # Force matplotlib to not use any Xwindows backend. matplotlib.use("Agg") diff --git a/modin/pandas/test/dataframe/test_default.py b/modin/pandas/test/dataframe/test_default.py index 552a9fa7480..42a89b07894 100644 --- a/modin/pandas/test/dataframe/test_default.py +++ b/modin/pandas/test/dataframe/test_default.py @@ -38,8 +38,9 @@ test_data_diff_dtype, modin_df_almost_equals_pandas, ) +from modin.config import NPartitions -pd.DEFAULT_NPARTITIONS = 4 +NPartitions.put(4) # Force matplotlib to not use any Xwindows backend. matplotlib.use("Agg") diff --git a/modin/pandas/test/dataframe/test_indexing.py b/modin/pandas/test/dataframe/test_indexing.py index db81111de73..36a318ab25d 100644 --- a/modin/pandas/test/dataframe/test_indexing.py +++ b/modin/pandas/test/dataframe/test_indexing.py @@ -36,8 +36,9 @@ create_test_dfs, eval_general, ) +from modin.config import NPartitions -pd.DEFAULT_NPARTITIONS = 4 +NPartitions.put(4) # Force matplotlib to not use any Xwindows backend. matplotlib.use("Agg") diff --git a/modin/pandas/test/dataframe/test_iter.py b/modin/pandas/test/dataframe/test_iter.py index d35687a8123..3b9bfce6cf7 100644 --- a/modin/pandas/test/dataframe/test_iter.py +++ b/modin/pandas/test/dataframe/test_iter.py @@ -29,8 +29,9 @@ create_test_dfs, test_data, ) +from modin.config import NPartitions -pd.DEFAULT_NPARTITIONS = 4 +NPartitions.put(4) # Force matplotlib to not use any Xwindows backend. matplotlib.use("Agg") diff --git a/modin/pandas/test/dataframe/test_join_sort.py b/modin/pandas/test/dataframe/test_join_sort.py index c281f2b9db9..d2ecad0e4b8 100644 --- a/modin/pandas/test/dataframe/test_join_sort.py +++ b/modin/pandas/test/dataframe/test_join_sort.py @@ -34,8 +34,9 @@ generate_multiindex, eval_general, ) +from modin.config import NPartitions -pd.DEFAULT_NPARTITIONS = 4 +NPartitions.put(4) # Force matplotlib to not use any Xwindows backend. matplotlib.use("Agg") diff --git a/modin/pandas/test/dataframe/test_map_metadata.py b/modin/pandas/test/dataframe/test_map_metadata.py index d326b7adb34..b7866ee18df 100644 --- a/modin/pandas/test/dataframe/test_map_metadata.py +++ b/modin/pandas/test/dataframe/test_map_metadata.py @@ -46,8 +46,9 @@ eval_general, create_test_dfs, ) +from modin.config import NPartitions -pd.DEFAULT_NPARTITIONS = 4 +NPartitions.put(4) # Force matplotlib to not use any Xwindows backend. matplotlib.use("Agg") diff --git a/modin/pandas/test/dataframe/test_reduction.py b/modin/pandas/test/dataframe/test_reduction.py index dae74e07b52..70338267ff9 100644 --- a/modin/pandas/test/dataframe/test_reduction.py +++ b/modin/pandas/test/dataframe/test_reduction.py @@ -35,8 +35,9 @@ generate_multiindex, test_data_diff_dtype, ) +from modin.config import NPartitions -pd.DEFAULT_NPARTITIONS = 4 +NPartitions.put(4) # Force matplotlib to not use any Xwindows backend. matplotlib.use("Agg") diff --git a/modin/pandas/test/dataframe/test_udf.py b/modin/pandas/test/dataframe/test_udf.py index 651feab1e40..23567dc1345 100644 --- a/modin/pandas/test/dataframe/test_udf.py +++ b/modin/pandas/test/dataframe/test_udf.py @@ -34,8 +34,9 @@ udf_func_keys, test_data, ) +from modin.config import NPartitions -pd.DEFAULT_NPARTITIONS = 4 +NPartitions.put(4) # Force matplotlib to not use any Xwindows backend. matplotlib.use("Agg") diff --git a/modin/pandas/test/dataframe/test_window.py b/modin/pandas/test/dataframe/test_window.py index ff8ce9c5117..e64202c18dc 100644 --- a/modin/pandas/test/dataframe/test_window.py +++ b/modin/pandas/test/dataframe/test_window.py @@ -40,8 +40,9 @@ create_test_dfs, test_data_diff_dtype, ) +from modin.config import NPartitions -pd.DEFAULT_NPARTITIONS = 4 +NPartitions.put(4) # Force matplotlib to not use any Xwindows backend. matplotlib.use("Agg") diff --git a/modin/pandas/test/test_api.py b/modin/pandas/test/test_api.py index abb907f639b..09a5dced43a 100644 --- a/modin/pandas/test/test_api.py +++ b/modin/pandas/test/test_api.py @@ -45,7 +45,6 @@ def test_top_level_api_equality(): ] ignore_modin = [ - "DEFAULT_NPARTITIONS", "iterator", "series", "accessor", diff --git a/modin/pandas/test/test_concat.py b/modin/pandas/test/test_concat.py index 3bd6a27dd76..061629e8fef 100644 --- a/modin/pandas/test/test_concat.py +++ b/modin/pandas/test/test_concat.py @@ -24,8 +24,9 @@ generate_none_dfs, create_test_dfs, ) +from modin.config import NPartitions -pd.DEFAULT_NPARTITIONS = 4 +NPartitions.put(4) def test_df_concat(): diff --git a/modin/pandas/test/test_groupby.py b/modin/pandas/test/test_groupby.py index ab500c24a09..f29d6a2cd44 100644 --- a/modin/pandas/test/test_groupby.py +++ b/modin/pandas/test/test_groupby.py @@ -27,8 +27,9 @@ modin_df_almost_equals_pandas, generate_multiindex, ) +from modin.config import NPartitions -pd.DEFAULT_NPARTITIONS = 4 +NPartitions.put(4) def modin_groupby_equals_pandas(modin_groupby, pandas_groupby): diff --git a/modin/pandas/test/test_io.py b/modin/pandas/test/test_io.py index b2c07704537..f89af50c910 100644 --- a/modin/pandas/test/test_io.py +++ b/modin/pandas/test/test_io.py @@ -52,8 +52,9 @@ import modin.pandas as pd else: import modin.experimental.pandas as pd +from modin.config import NPartitions -pd.DEFAULT_NPARTITIONS = 4 +NPartitions.put(4) DATASET_SIZE_DICT = { "Small": 64, diff --git a/modin/pandas/test/test_rolling.py b/modin/pandas/test/test_rolling.py index 2dd935f8993..32881d4cf7c 100644 --- a/modin/pandas/test/test_rolling.py +++ b/modin/pandas/test/test_rolling.py @@ -17,8 +17,9 @@ import modin.pandas as pd from .utils import df_equals, test_data_values, test_data_keys +from modin.config import NPartitions -pd.DEFAULT_NPARTITIONS = 4 +NPartitions.put(4) def create_test_series(vals): diff --git a/modin/pandas/test/test_series.py b/modin/pandas/test/test_series.py index c566b4cd0ca..5fead9eb0de 100644 --- a/modin/pandas/test/test_series.py +++ b/modin/pandas/test/test_series.py @@ -67,8 +67,9 @@ generate_multiindex, test_data_diff_dtype, ) +from modin.config import NPartitions -pd.DEFAULT_NPARTITIONS = 4 +NPartitions.put(4) # Force matplotlib to not use any Xwindows backend. matplotlib.use("Agg") diff --git a/modin/test/__init__.py b/modin/test/__init__.py new file mode 100644 index 00000000000..cae6413e559 --- /dev/null +++ b/modin/test/__init__.py @@ -0,0 +1,12 @@ +# Licensed to Modin Development Team under one or more contributor license agreements. +# See the NOTICE file distributed with this work for additional information regarding +# copyright ownership. The Modin Development Team licenses this file to you under the +# Apache License, Version 2.0 (the "License"); you may not use this file except in +# compliance with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under +# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF +# ANY KIND, either express or implied. See the License for the specific language +# governing permissions and limitations under the License. diff --git a/modin/test/backends/base/test_internals.py b/modin/test/backends/base/test_internals.py index 011b64564b7..be26075ae72 100644 --- a/modin/test/backends/base/test_internals.py +++ b/modin/test/backends/base/test_internals.py @@ -11,8 +11,6 @@ # ANY KIND, either express or implied. See the License for the specific language # governing permissions and limitations under the License. -import modin.pandas as pd - import pandas import pytest @@ -21,8 +19,9 @@ create_test_dfs, df_equals, ) +from modin.config import NPartitions -pd.DEFAULT_NPARTITIONS = 4 +NPartitions.put(4) @pytest.mark.parametrize("axis", [0, 1]) diff --git a/modin/test/backends/pandas/test_internals.py b/modin/test/backends/pandas/test_internals.py index cb544aed471..19b843ca33c 100644 --- a/modin/test/backends/pandas/test_internals.py +++ b/modin/test/backends/pandas/test_internals.py @@ -13,8 +13,9 @@ import modin.pandas as pd from modin.pandas.test.utils import create_test_dfs +from modin.config import NPartitions -pd.DEFAULT_NPARTITIONS = 4 +NPartitions.put(4) def test_aligning_blocks(): diff --git a/modin/test/test_envvar_npartitions.py b/modin/test/test_envvar_npartitions.py new file mode 100644 index 00000000000..e2d0db8570a --- /dev/null +++ b/modin/test/test_envvar_npartitions.py @@ -0,0 +1,46 @@ +# Licensed to Modin Development Team under one or more contributor license agreements. +# See the NOTICE file distributed with this work for additional information regarding +# copyright ownership. The Modin Development Team licenses this file to you under the +# Apache License, Version 2.0 (the "License"); you may not use this file except in +# compliance with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under +# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF +# ANY KIND, either express or implied. See the License for the specific language +# governing permissions and limitations under the License. + +import modin.pandas as pd +import numpy as np +import pytest + +from modin.config import NPartitions + + +@pytest.mark.parametrize("num_partitions", [2, 4, 6, 8, 10]) +def test_set_npartitions(num_partitions): + NPartitions.put(num_partitions) + data = np.random.randint(0, 100, size=(2 ** 16, 2 ** 8)) + df = pd.DataFrame(data) + part_shape = df._query_compiler._modin_frame._partitions.shape + assert part_shape[0] == num_partitions and part_shape[1] == min(num_partitions, 8) + + +@pytest.mark.parametrize("left_num_partitions", [2, 4, 6, 8, 10]) +@pytest.mark.parametrize("right_num_partitions", [2, 4, 6, 8, 10]) +def test_runtime_change_npartitions(left_num_partitions, right_num_partitions): + NPartitions.put(left_num_partitions) + data = np.random.randint(0, 100, size=(2 ** 16, 2 ** 8)) + left_df = pd.DataFrame(data) + part_shape = left_df._query_compiler._modin_frame._partitions.shape + assert part_shape[0] == left_num_partitions and part_shape[1] == min( + left_num_partitions, 8 + ) + + NPartitions.put(right_num_partitions) + right_df = pd.DataFrame(data) + part_shape = right_df._query_compiler._modin_frame._partitions.shape + assert part_shape[0] == right_num_partitions and part_shape[1] == min( + right_num_partitions, 8 + )