Skip to content

Commit

Permalink
[SPARK-48227][PYTHON][DOC] Document the requirement of seed in protos
Browse files Browse the repository at this point in the history
### What changes were proposed in this pull request?
Document the requirement of seed in protos

### Why are the changes needed?
the seed should be set at client side

document it to avoid cases like #46456

### Does this PR introduce _any_ user-facing change?
no

### How was this patch tested?
ci

### Was this patch authored or co-authored using generative AI tooling?
no

Closes #46518 from zhengruifeng/doc_random.

Authored-by: Ruifeng Zheng <[email protected]>
Signed-off-by: Dongjoon Hyun <[email protected]>
  • Loading branch information
zhengruifeng authored and dongjoon-hyun committed May 9, 2024
1 parent 71f0eda commit 012d19d
Show file tree
Hide file tree
Showing 3 changed files with 18 additions and 10 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -467,7 +467,9 @@ message Sample {
// (Optional) Whether to sample with replacement.
optional bool with_replacement = 4;

// (Optional) The random seed.
// (Required) The random seed.
// This filed is required to avoid generate mutable dataframes (see SPARK-48184 for details),
// however, still keep it 'optional' here for backward compatibility.
optional int64 seed = 5;

// (Required) Explicitly sort the underlying plan to make the ordering deterministic or cache it.
Expand Down Expand Up @@ -687,7 +689,9 @@ message StatSampleBy {
// If a stratum is not specified, we treat its fraction as zero.
repeated Fraction fractions = 3;

// (Optional) The random seed.
// (Required) The random seed.
// This filed is required to avoid generate mutable dataframes (see SPARK-48184 for details),
// however, still keep it 'optional' here for backward compatibility.
optional int64 seed = 5;

message Fraction {
Expand Down
10 changes: 4 additions & 6 deletions python/pyspark/sql/connect/plan.py
Original file line number Diff line number Diff line change
Expand Up @@ -717,7 +717,7 @@ def __init__(
lower_bound: float,
upper_bound: float,
with_replacement: bool,
seed: Optional[int],
seed: int,
deterministic_order: bool = False,
) -> None:
super().__init__(child)
Expand All @@ -734,8 +734,7 @@ def plan(self, session: "SparkConnectClient") -> proto.Relation:
plan.sample.lower_bound = self.lower_bound
plan.sample.upper_bound = self.upper_bound
plan.sample.with_replacement = self.with_replacement
if self.seed is not None:
plan.sample.seed = self.seed
plan.sample.seed = self.seed
plan.sample.deterministic_order = self.deterministic_order
return plan

Expand Down Expand Up @@ -1526,7 +1525,7 @@ def __init__(
child: Optional["LogicalPlan"],
col: Column,
fractions: Sequence[Tuple[Column, float]],
seed: Optional[int],
seed: int,
) -> None:
super().__init__(child)

Expand Down Expand Up @@ -1554,8 +1553,7 @@ def plan(self, session: "SparkConnectClient") -> proto.Relation:
fraction.stratum.CopyFrom(k.to_plan(session).literal)
fraction.fraction = float(v)
plan.sample_by.fractions.append(fraction)
if self._seed is not None:
plan.sample_by.seed = self._seed
plan.sample_by.seed = self._seed
return plan


Expand Down
10 changes: 8 additions & 2 deletions python/pyspark/sql/connect/proto/relations_pb2.pyi
Original file line number Diff line number Diff line change
Expand Up @@ -1865,7 +1865,10 @@ class Sample(google.protobuf.message.Message):
with_replacement: builtins.bool
"""(Optional) Whether to sample with replacement."""
seed: builtins.int
"""(Optional) The random seed."""
"""(Required) The random seed.
This filed is required to avoid generate mutable dataframes (see SPARK-48184 for details),
however, still keep it 'optional' here for backward compatibility.
"""
deterministic_order: builtins.bool
"""(Required) Explicitly sort the underlying plan to make the ordering deterministic or cache it.
This flag is true when invoking `dataframe.randomSplit` to randomly splits DataFrame with the
Expand Down Expand Up @@ -2545,7 +2548,10 @@ class StatSampleBy(google.protobuf.message.Message):
If a stratum is not specified, we treat its fraction as zero.
"""
seed: builtins.int
"""(Optional) The random seed."""
"""(Required) The random seed.
This filed is required to avoid generate mutable dataframes (see SPARK-48184 for details),
however, still keep it 'optional' here for backward compatibility.
"""
def __init__(
self,
*,
Expand Down

0 comments on commit 012d19d

Please sign in to comment.