apache · zhengruifeng · May 8, 2024 · dongjoon-hyun · May 9, 2024
diff --git a/connector/connect/common/src/main/protobuf/spark/connect/relations.proto b/connector/connect/common/src/main/protobuf/spark/connect/relations.proto
@@ -467,7 +467,9 @@ message Sample {
   // (Optional) Whether to sample with replacement.
   optional bool with_replacement = 4;
 
-  // (Optional) The random seed.
+  // (Required) The random seed.
+  // This field is required to avoid generating mutable dataframes (see SPARK-48184 for details);
+  // however, it is still declared 'optional' here for backward compatibility.
   optional int64 seed = 5;
 
   // (Required) Explicitly sort the underlying plan to make the ordering deterministic or cache it.
@@ -687,7 +689,9 @@ message StatSampleBy {
   // If a stratum is not specified, we treat its fraction as zero.
   repeated Fraction fractions = 3;
 
-  // (Optional) The random seed.
+  // (Required) The random seed.
+  // This field is required to avoid generating mutable dataframes (see SPARK-48184 for details);
+  // however, it is still declared 'optional' here for backward compatibility.
   optional int64 seed = 5;
 
   message Fraction {

diff --git a/python/pyspark/sql/connect/plan.py b/python/pyspark/sql/connect/plan.py
@@ -717,7 +717,7 @@ def __init__(
         lower_bound: float,
         upper_bound: float,
         with_replacement: bool,
-        seed: Optional[int],
+        seed: int,
         deterministic_order: bool = False,
     ) -> None:
         super().__init__(child)
@@ -734,8 +734,7 @@ def plan(self, session: "SparkConnectClient") -> proto.Relation:
         plan.sample.lower_bound = self.lower_bound
         plan.sample.upper_bound = self.upper_bound
         plan.sample.with_replacement = self.with_replacement
-        if self.seed is not None:
-            plan.sample.seed = self.seed
+        plan.sample.seed = self.seed
         plan.sample.deterministic_order = self.deterministic_order
         return plan
 
@@ -1526,7 +1525,7 @@ def __init__(
         child: Optional["LogicalPlan"],
         col: Column,
         fractions: Sequence[Tuple[Column, float]],
-        seed: Optional[int],
+        seed: int,
     ) -> None:
         super().__init__(child)
 
@@ -1554,8 +1553,7 @@ def plan(self, session: "SparkConnectClient") -> proto.Relation:
                 fraction.stratum.CopyFrom(k.to_plan(session).literal)
                 fraction.fraction = float(v)
                 plan.sample_by.fractions.append(fraction)
-        if self._seed is not None:
-            plan.sample_by.seed = self._seed
+        plan.sample_by.seed = self._seed
         return plan
 
 

diff --git a/python/pyspark/sql/connect/proto/relations_pb2.pyi b/python/pyspark/sql/connect/proto/relations_pb2.pyi
@@ -1865,7 +1865,10 @@ class Sample(google.protobuf.message.Message):
     with_replacement: builtins.bool
     """(Optional) Whether to sample with replacement."""
     seed: builtins.int
-    """(Optional) The random seed."""
+    """(Required) The random seed.
+    This field is required to avoid generating mutable dataframes (see SPARK-48184 for details);
+    however, it is still declared 'optional' here for backward compatibility.
+    """
     deterministic_order: builtins.bool
     """(Required) Explicitly sort the underlying plan to make the ordering deterministic or cache it.
     This flag is true when invoking `dataframe.randomSplit` to randomly splits DataFrame with the
@@ -2545,7 +2548,10 @@ class StatSampleBy(google.protobuf.message.Message):
         If a stratum is not specified, we treat its fraction as zero.
         """
     seed: builtins.int
-    """(Optional) The random seed."""
+    """(Required) The random seed.
+    This field is required to avoid generating mutable dataframes (see SPARK-48184 for details);
+    however, it is still declared 'optional' here for backward compatibility.
+    """
     def __init__(
         self,
         *,