Azure · wangchao1230 · Oct 26, 2022 · Oct 17, 2022 · Oct 17, 2022 · Oct 17, 2022
@@ -35,6 +35,11 @@ class ParallelComponentSchema(ComponentSchema):
     mini_batch_size = fields.Str(
         metadata={"description": "The The batch size of current job."},
     )
+    partition_keys = fields.List(
+        fields.Str(),
+        metadata={"description": "The keys used to partition input data into mini-batches"}
+    )
+
     input_data = fields.Str()
     retry_settings = NestedField(RetrySettingsSchema, unknown=INCLUDE)
     max_concurrency_per_instance = fields.Integer(

@@ -28,6 +28,10 @@ class ParameterizedParallelSchema(PathAwareSchema):
     mini_batch_size = fields.Str(
         metadata={"description": "The batch size of current job."},
     )
+    partition_keys = fields.List(
+        fields.Str(),
+        metadata={"description": "The keys used to partition input data into mini-batches"}
+    )
     input_data = fields.Str()
     resources = NestedField(JobResourceConfigurationSchema)
     retry_settings = NestedField(RetrySettingsSchema, unknown=INCLUDE)

@@ -2,7 +2,7 @@
 # Copyright (c) Microsoft Corporation. All rights reserved.
 # ---------------------------------------------------------
 import os
-from typing import Dict, Union
+from typing import Dict, Union, List
 
 from azure.ai.ml._restclient.v2022_02_01_preview.models import AmlToken, ManagedIdentity
 from azure.ai.ml.constants._component import ComponentSource
@@ -31,6 +31,7 @@ def parallel_run_function(
     mini_batch_error_threshold: int = None,
     task: RunFunction = None,
     mini_batch_size: str = None,
+    partition_keys: List = None,
     input_data: str = None,
     inputs: Dict = None,
     outputs: Dict = None,
@@ -136,6 +137,12 @@ def parallel_run_function(
         (optional, default value is 10 files for FileDataset and 1MB for TabularDataset.) This value could be set
         through PipelineParameter.
     :type mini_batch_size: str
+    :param partition_keys: The keys used to partition dataset into mini-batches.
+        If specified, the data with the same key will be partitioned into the same mini-batch.
+        If both partition_keys and mini_batch_size are specified, an error will be raised.
+        The input(s) must be partitioned dataset(s),
+        and the partition_keys must be a subset of the keys of every input dataset for this to work.
+    :type partition_keys: list
     :param input_data: The input data.
     :type input_data: str
     :param inputs: a dict of inputs used by this parallel.
@@ -190,6 +197,7 @@ def parallel_run_function(
             mini_batch_error_threshold=mini_batch_error_threshold,
             task=task,
             mini_batch_size=mini_batch_size,
+            partition_keys=partition_keys,
             input_data=input_data,
             _source=ComponentSource.BUILDER,
             is_deterministic=is_deterministic,

@@ -4,7 +4,7 @@
 
 import os
 import re
-from typing import Any, Dict, Union
+from typing import Any, Dict, Union, List
 
 from marshmallow import Schema
 
@@ -53,6 +53,12 @@ class ParallelComponent(Component, ParameterizedParallel):  # pylint: disable=to
         (optional, default value is 10 files for FileDataset and 1MB for TabularDataset.) This value could be set
         through PipelineParameter.
     :type mini_batch_size: str
+    :param partition_keys: The keys used to partition dataset into mini-batches.
+        If specified, the data with the same key will be partitioned into the same mini-batch.
+        If both partition_keys and mini_batch_size are specified, an error will be raised.
+        The input(s) must be partitioned dataset(s),
+        and the partition_keys must be a subset of the keys of every input dataset for this to work.
+    :type partition_keys: list
     :param input_data: The input data.
     :type input_data: str
     :param resources: Compute Resource configuration for the component.
@@ -86,6 +92,7 @@ def __init__(
         mini_batch_error_threshold: int = None,
         task: ParallelTask = None,
         mini_batch_size: str = None,
+        partition_keys: List = None,
         input_data: str = None,
         resources: JobResourceConfiguration = None,
         inputs: Dict = None,
@@ -116,6 +123,7 @@ def __init__(
         # and fill in later with job defaults.
         self.task = task
         self.mini_batch_size = mini_batch_size
+        self.partition_keys = partition_keys
         self.input_data = input_data
         self.retry_settings = retry_settings
         self.logging_level = logging_level
@@ -136,6 +144,14 @@ def __init__(
         self.instance_count = instance_count
         self.code = code
 
+        if self.mini_batch_size is not None and self.partition_keys is not None:
+            msg = "mini_batch_size and partition_keys are mutually exclusive"
+            raise ValidationException(
+                message=msg,
+                target=ErrorTarget.COMPONENT,
+                no_personal_data_message=msg,
+                error_category=ErrorCategory.USER_ERROR,
+            )
         if self.mini_batch_size is not None:
             # Convert str to int.
             pattern = re.compile(r"^\d+([kKmMgG][bB])*$")

@@ -54,6 +54,8 @@ class ParallelJob(Job, ParameterizedParallel, JobIOMixin):
     :type task: ParallelTask
     :param mini_batch_size: The mini batch size.
     :type mini_batch_size: str
+    :param partition_keys: The partition keys.
+    :type partition_keys: list
     :param input_data: The input data.
     :type input_data: str
     :param inputs: Inputs of the job.

@@ -3,7 +3,7 @@
 # ---------------------------------------------------------
 
 import logging
-from typing import Dict, Union
+from typing import Dict, Union, List
 
 from ..job_resource_configuration import JobResourceConfiguration
 from .parallel_task import ParallelTask
@@ -47,10 +47,12 @@ def __init__(
         input_data: str = None,
         task: ParallelTask = None,
         mini_batch_size: int = None,
+        partition_keys: List = None,
         resources: Union[dict, JobResourceConfiguration] = None,
         environment_variables: Dict = None,
     ):
         self.mini_batch_size = mini_batch_size
+        self.partition_keys = partition_keys
         self.task = task
         self.retry_settings = retry_settings
         self.input_data = input_data