diff --git a/connector/connect/common/src/main/protobuf/spark/connect/relations.proto b/connector/connect/common/src/main/protobuf/spark/connect/relations.proto index 3882b2e853967..0b3c9d4253e8c 100644 --- a/connector/connect/common/src/main/protobuf/spark/connect/relations.proto +++ b/connector/connect/common/src/main/protobuf/spark/connect/relations.proto @@ -467,7 +467,9 @@ message Sample { // (Optional) Whether to sample with replacement. optional bool with_replacement = 4; - // (Optional) The random seed. + // (Required) The random seed. + // This field is required to avoid generating mutable dataframes (see SPARK-48184 for details), + // however, still keep it 'optional' here for backward compatibility. optional int64 seed = 5; // (Required) Explicitly sort the underlying plan to make the ordering deterministic or cache it. @@ -687,7 +689,9 @@ message StatSampleBy { // If a stratum is not specified, we treat its fraction as zero. repeated Fraction fractions = 3; - // (Optional) The random seed. + // (Required) The random seed. + // This field is required to avoid generating mutable dataframes (see SPARK-48184 for details), + // however, still keep it 'optional' here for backward compatibility. 
optional int64 seed = 5; message Fraction { diff --git a/python/pyspark/sql/connect/plan.py b/python/pyspark/sql/connect/plan.py index 4ac4946745f5e..3d3303fb15c57 100644 --- a/python/pyspark/sql/connect/plan.py +++ b/python/pyspark/sql/connect/plan.py @@ -717,7 +717,7 @@ def __init__( lower_bound: float, upper_bound: float, with_replacement: bool, - seed: Optional[int], + seed: int, deterministic_order: bool = False, ) -> None: super().__init__(child) @@ -734,8 +734,7 @@ def plan(self, session: "SparkConnectClient") -> proto.Relation: plan.sample.lower_bound = self.lower_bound plan.sample.upper_bound = self.upper_bound plan.sample.with_replacement = self.with_replacement - if self.seed is not None: - plan.sample.seed = self.seed + plan.sample.seed = self.seed plan.sample.deterministic_order = self.deterministic_order return plan @@ -1526,7 +1525,7 @@ def __init__( child: Optional["LogicalPlan"], col: Column, fractions: Sequence[Tuple[Column, float]], - seed: Optional[int], + seed: int, ) -> None: super().__init__(child) @@ -1554,8 +1553,7 @@ def plan(self, session: "SparkConnectClient") -> proto.Relation: fraction.stratum.CopyFrom(k.to_plan(session).literal) fraction.fraction = float(v) plan.sample_by.fractions.append(fraction) - if self._seed is not None: - plan.sample_by.seed = self._seed + plan.sample_by.seed = self._seed return plan diff --git a/python/pyspark/sql/connect/proto/relations_pb2.pyi b/python/pyspark/sql/connect/proto/relations_pb2.pyi index 5dfb47da67a97..9b6f4b43544f2 100644 --- a/python/pyspark/sql/connect/proto/relations_pb2.pyi +++ b/python/pyspark/sql/connect/proto/relations_pb2.pyi @@ -1865,7 +1865,10 @@ class Sample(google.protobuf.message.Message): with_replacement: builtins.bool """(Optional) Whether to sample with replacement.""" seed: builtins.int - """(Optional) The random seed.""" + """(Required) The random seed. 
+ This field is required to avoid generating mutable dataframes (see SPARK-48184 for details), + however, still keep it 'optional' here for backward compatibility. + """ deterministic_order: builtins.bool """(Required) Explicitly sort the underlying plan to make the ordering deterministic or cache it. This flag is true when invoking `dataframe.randomSplit` to randomly splits DataFrame with the @@ -2545,7 +2548,10 @@ class StatSampleBy(google.protobuf.message.Message): If a stratum is not specified, we treat its fraction as zero. """ seed: builtins.int - """(Optional) The random seed.""" + """(Required) The random seed. + This field is required to avoid generating mutable dataframes (see SPARK-48184 for details), + however, still keep it 'optional' here for backward compatibility. + """ def __init__( self, *,