modin-project · anmyachev · Apr 30, 2024 · Apr 24, 2024 · Apr 24, 2024 · Apr 24, 2024
@@ -13,6 +13,8 @@
 
 """Module houses class that implements ``BaseIO`` using Dask as an execution engine."""
 
+import numpy as np
+import pandas
 from distributed.client import default_client
 
 from modin.core.execution.dask.common import DaskWrapper
@@ -68,6 +70,7 @@ class PandasOnDaskIO(BaseIO):
     """The class implements interface in ``BaseIO`` using Dask as an execution engine."""
 
     frame_cls = PandasOnDaskDataframe
+    frame_partition_cls = PandasOnDaskDataframePartition
     query_compiler_cls = PandasQueryCompiler
     build_args = dict(
         frame_cls=PandasOnDaskDataframe,
@@ -188,3 +191,66 @@ def df_to_series(df):
             partitions = [client.submit(df_to_series, part) for part in partitions]
 
         return from_delayed(partitions)
+
+    @classmethod
+    def from_map(cls, func, iterable, *args, **kwargs):
+        """
+        Create a Modin `query_compiler` from a map function.
+
+        This method will construct a Modin `query_compiler` split by row partitions.
+        The number of row partitions matches the number of elements in the iterable object.
+
+        Parameters
+        ----------
+        func : callable
+            Function to map across the iterable object.
+        iterable : Iterable
+            An iterable object.
+        *args : tuple
+            Positional arguments to pass in `func`.
+        **kwargs : dict
+            Keyword arguments to pass in `func`.
+
+        Returns
+        -------
+        BaseQueryCompiler
+            QueryCompiler containing data returned by map function.
+        """
+        func = cls.frame_cls._partition_mgr_cls.preprocess_func(func)
+        client = default_client()
+        partitions = np.array(
+            [
+                [
+                    cls.frame_partition_cls(
+                        client.submit(deploy_map_func, func, obj, *args, **kwargs)
+                    )
+                ]
+                for obj in iterable
+            ]
+        )
+        return cls.query_compiler_cls(cls.frame_cls(partitions))
+
+
+def deploy_map_func(func, obj, *args, **kwargs):  # pragma: no cover
+    """
+    Deploy a func to apply to an object.
+
+    Parameters
+    ----------
+    func : callable
+        Function to map across the iterable object.
+    obj : object
+        An object to apply a function to.
+    *args : tuple
+        Positional arguments to pass in `func`.
+    **kwargs : dict
+        Keyword arguments to pass in `func`.
+
+    Returns
+    -------
+    pandas.DataFrame
+    """
+    result = func(obj, *args, **kwargs)
+    if not isinstance(result, pandas.DataFrame):
+        result = pandas.DataFrame(result)
+    return result
@@ -191,6 +191,11 @@ def from_ray(cls, ray_obj):
     def from_dask(cls, dask_obj):
         return cls.get_factory()._from_dask(dask_obj)
 
+    @classmethod
+    @_inherit_docstrings(factories.BaseFactory._from_map)
+    def from_map(cls, func, iterable, *args, **kwargs):
+        return cls.get_factory()._from_map(func, iterable, *args, **kwargs)
+
     @classmethod
     @_inherit_docstrings(factories.BaseFactory._read_parquet)
     def read_parquet(cls, **kwargs):

@@ -221,6 +221,32 @@ def _from_ray(cls, ray_obj):
     def _from_dask(cls, dask_obj):
         return cls.io_cls.from_dask(dask_obj)
 
+    @classmethod
+    def _from_map(cls, func, iterable, *args, **kwargs):
+        """
+        Create a Modin `query_compiler` from a map function.
+
+        This method will construct a Modin `query_compiler` split by row partitions.
+        The number of row partitions matches the number of elements in the iterable object.
+
+        Parameters
+        ----------
+        func : callable
+            Function to map across the iterable object.
+        iterable : Iterable
+            An iterable object.
+        *args : tuple
+            Positional arguments to pass in `func`.
+        **kwargs : dict
+            Keyword arguments to pass in `func`.
+
+        Returns
+        -------
+        BaseQueryCompiler
+            QueryCompiler containing data returned by map function.
+        """
+        return cls.io_cls.from_map(func, iterable, *args, **kwargs)
+
     @classmethod
     @doc(
         _doc_io_method_template,

@@ -15,7 +15,9 @@
 
 import io
 
+import numpy as np
 import pandas
+import ray
 from pandas.io.common import get_handle, stringify_path
 from ray.data import from_pandas_refs
 
@@ -68,6 +70,7 @@ class PandasOnRayIO(RayIO):
     """Factory providing methods for performing I/O operations using pandas as storage format on Ray as engine."""
 
     frame_cls = PandasOnRayDataframe
+    frame_partition_cls = PandasOnRayDataframePartition
     query_compiler_cls = PandasQueryCompiler
     build_args = dict(
         frame_partition_cls=PandasOnRayDataframePartition,
@@ -302,3 +305,66 @@ def to_ray(cls, modin_obj):
         """
         parts = unwrap_partitions(modin_obj, axis=0)
         return from_pandas_refs(parts)
+
+    @classmethod
+    def from_map(cls, func, iterable, *args, **kwargs):
+        """
+        Create a Modin `query_compiler` from a map function.
+
+        This method will construct a Modin `query_compiler` split by row partitions.
+        The number of row partitions matches the number of elements in the iterable object.
+
+        Parameters
+        ----------
+        func : callable
+            Function to map across the iterable object.
+        iterable : Iterable
+            An iterable object.
+        *args : tuple
+            Positional arguments to pass in `func`.
+        **kwargs : dict
+            Keyword arguments to pass in `func`.
+
+        Returns
+        -------
+        BaseQueryCompiler
+            QueryCompiler containing data returned by map function.
+        """
+        func = cls.frame_cls._partition_mgr_cls.preprocess_func(func)
+        partitions = np.array(
+            [
+                [
+                    cls.frame_partition_cls(
+                        deploy_map_func.remote(func, obj, *args, **kwargs)
+                    )
+                ]
+                for obj in iterable
+            ]
+        )
+        return cls.query_compiler_cls(cls.frame_cls(partitions))
+
+
+@ray.remote
+def deploy_map_func(func, obj, *args, **kwargs):  # pragma: no cover
+    """
+    Deploy a func to apply to an object.
+
+    Parameters
+    ----------
+    func : callable
+        Function to map across the iterable object.
+    obj : object
+        An object to apply a function to.
+    *args : tuple
+        Positional arguments to pass in `func`.
+    **kwargs : dict
+        Keyword arguments to pass in `func`.
+
+    Returns
+    -------
+    pandas.DataFrame
+    """
+    result = func(obj, *args, **kwargs)
+    if not isinstance(result, pandas.DataFrame):
+        result = pandas.DataFrame(result)
+    return result
@@ -15,7 +15,9 @@
 
 import io
 
+import numpy as np
 import pandas
+import unidist
 from pandas.io.common import get_handle, stringify_path
 
 from modin.core.execution.unidist.common import SignalActor, UnidistWrapper
@@ -62,6 +64,7 @@ class PandasOnUnidistIO(UnidistIO):
     """Factory providing methods for performing I/O operations using pandas as storage format on unidist as engine."""
 
     frame_cls = PandasOnUnidistDataframe
+    frame_partition_cls = PandasOnUnidistDataframePartition
     query_compiler_cls = PandasQueryCompiler
     build_args = dict(
         frame_partition_cls=PandasOnUnidistDataframePartition,
@@ -258,3 +261,66 @@ def func(df, **kw):  # pragma: no cover
         UnidistWrapper.materialize(
             [part.list_of_blocks[0] for row in result for part in row]
         )
+
+    @classmethod
+    def from_map(cls, func, iterable, *args, **kwargs):
+        """
+        Create a Modin `query_compiler` from a map function.
+
+        This method will construct a Modin `query_compiler` split by row partitions.
+        The number of row partitions matches the number of elements in the iterable object.
+
+        Parameters
+        ----------
+        func : callable
+            Function to map across the iterable object.
+        iterable : Iterable
+            An iterable object.
+        *args : tuple
+            Positional arguments to pass in `func`.
+        **kwargs : dict
+            Keyword arguments to pass in `func`.
+
+        Returns
+        -------
+        BaseQueryCompiler
+            QueryCompiler containing data returned by map function.
+        """
+        func = cls.frame_cls._partition_mgr_cls.preprocess_func(func)
+        partitions = np.array(
+            [
+                [
+                    cls.frame_partition_cls(
+                        deploy_map_func.remote(func, obj, *args, **kwargs)
+                    )
+                ]
+                for obj in iterable
+            ]
+        )
+        return cls.query_compiler_cls(cls.frame_cls(partitions))
+
+
+@unidist.remote
+def deploy_map_func(func, obj, *args, **kwargs):  # pragma: no cover
+    """
+    Deploy a func to apply to an object.
+
+    Parameters
+    ----------
+    func : callable
+        Function to map across the iterable object.
+    obj : object
+        An object to apply a function to.
+    *args : tuple
+        Positional arguments to pass in `func`.
+    **kwargs : dict
+        Keyword arguments to pass in `func`.
+
+    Returns
+    -------
+    pandas.DataFrame
+    """
+    result = func(obj, *args, **kwargs)
+    if not isinstance(result, pandas.DataFrame):
+        result = pandas.DataFrame(result)
+    return result
@@ -164,6 +164,34 @@
             "Modin DataFrame can only be converted to a Dask DataFrame if Modin uses a Dask engine."
         )
 
+    @classmethod
+    def from_map(cls, func, iterable, *args, **kwargs):
+        """
+        Create a Modin `query_compiler` from a map function.
+
+        This method will construct a Modin `query_compiler` split by row partitions.
+        The number of row partitions matches the number of elements in the iterable object.
+
+        Parameters
+        ----------
+        func : callable
+            Function to map across the iterable object.
+        iterable : Iterable
+            An iterable object.
+        *args : tuple
+            Positional arguments to pass in `func`.
+        **kwargs : dict
+            Keyword arguments to pass in `func`.
+
+        Returns
+        -------
+        BaseQueryCompiler
+            QueryCompiler containing data returned by map function.
+        """
+        raise RuntimeError(
+            "Modin DataFrame can only be created if Modin uses Ray, Dask or MPI engine."
+        )
+
     @classmethod
     @_inherit_docstrings(pandas.read_parquet, apilink="pandas.read_parquet")
     @doc(

@@ -1109,6 +1109,36 @@ def from_dask(dask_obj) -> DataFrame:
     return ModinObjects.DataFrame(query_compiler=FactoryDispatcher.from_dask(dask_obj))
 
 
+def from_map(func, iterable, *args, **kwargs) -> DataFrame:
+    """
+    Create a Modin DataFrame from map function applied to an iterable object.
+
+    This method will construct a Modin DataFrame split by row partitions.
+    The number of row partitions matches the number of elements in the iterable object.
+
+    Parameters
+    ----------
+    func : callable
+        Function to map across the iterable object.
+    iterable : Iterable
+        An iterable object.
+    *args : tuple
+        Positional arguments to pass in `func`.
+    **kwargs : dict
+        Keyword arguments to pass in `func`.
+
+    Returns
+    -------
+    DataFrame
+        A new Modin DataFrame object.
+    """
+    from modin.core.execution.dispatching.factories.dispatcher import FactoryDispatcher
+
+    return ModinObjects.DataFrame(
+        query_compiler=FactoryDispatcher.from_map(func, iterable, *args, *kwargs)
+    )
+
+
 def to_pandas(modin_obj: SupportsPublicToPandas) -> DataFrame | Series:
     """
     Convert a Modin DataFrame/Series to a pandas DataFrame/Series.