-
-
Notifications
You must be signed in to change notification settings - Fork 77
/
run_without_frameworks.py
127 lines (107 loc) · 4.78 KB
/
run_without_frameworks.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
# Copyright 2022 OpenMined.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" Demo of running PipelineDP locally, without any external data processing framework"""
from absl import app
from absl import flags
import pipeline_dp
from common_utils import parse_file, write_to_file
FLAGS = flags.FLAGS

# Command-line flags of the demo. The two file flags default to None and are
# marked as required in the __main__ guard at the bottom of the file.
flags.DEFINE_string(name='input_file',
                    default=None,
                    help='The file with the movie view data')
flags.DEFINE_string(name='output_file', default=None, help='Output file')
# Selects which budget accountant main() instantiates.
flags.DEFINE_bool(
    name='pld_accounting',
    default=False,
    help='If false Naive budget accounting is used, if true PLD accounting')
# Optional: when set to a truthy value, main() copies it into the aggregation
# parameters.
flags.DEFINE_integer(
    name='pre_threshold',
    default=None,
    help='Pre threshold which is used in the DP aggregation')
def main(unused_argv):
    """Run a differentially private aggregation of movie views locally.

    Reads the input file named by --input_file, aggregates it with
    PipelineDP's local backend, prints the Explain Computation report and
    writes the aggregated result to the file named by --output_file.

    Args:
        unused_argv: positional command-line arguments; not used.

    Returns:
        Always 0.
    """
    # The local backend is PipelineDP's own pure-Python execution engine, so no
    # pipeline framework is needed. It holds all data in memory and is not
    # optimized for large data; for datasets smaller than ~tens of megabytes it
    # is faster than the local mode of Beam or Spark.
    local_backend = pipeline_dp.LocalBackend()

    # The privacy budget available to the computation. The flag only chooses
    # the accounting scheme; the budget itself is the same in both cases.
    use_pld = FLAGS.pld_accounting
    accountant_cls = (pipeline_dp.PLDBudgetAccountant
                      if use_pld else pipeline_dp.NaiveBudgetAccountant)
    accountant = accountant_cls(total_epsilon=1, total_delta=1e-6)

    # Load and parse the input data.
    views = parse_file(FLAGS.input_file)

    engine = pipeline_dp.DPEngine(accountant, local_backend)

    # Several metrics can be computed in a single aggregation.
    requested_metrics = [
        pipeline_dp.Metrics.COUNT,
        pipeline_dp.Metrics.SUM,
        pipeline_dp.Metrics.PRIVACY_ID_COUNT,
    ]
    if not use_pld:
        # PLD accounting does not yet support PERCENTILE computations, so the
        # percentiles are requested only with naive accounting.
        requested_metrics += [
            pipeline_dp.Metrics.PERCENTILE(percent)
            for percent in (50, 90, 99)
        ]

    aggregate_params = pipeline_dp.AggregateParams(
        metrics=requested_metrics,
        # Gaussian noise is added to anonymize values.
        noise_kind=pipeline_dp.NoiseKind.GAUSSIAN,
        # Contribution bounds for a single user: at most two movies rated ...
        max_partitions_contributed=2,
        # ... and at most one rating for each movie.
        max_contributions_per_partition=1,
        # Ratings lie between "1" and "5".
        min_value=1,
        max_value=5)
    pre_threshold = FLAGS.pre_threshold
    if pre_threshold:
        aggregate_params.pre_threshold = pre_threshold

    # How to pull the partition key, privacy id and value out of one parsed
    # movie view.
    extractors = pipeline_dp.DataExtractors(
        partition_extractor=lambda view: view.movie_id,
        privacy_id_extractor=lambda view: view.user_id,
        value_extractor=lambda view: view.rating)

    # The report object is handed to DPEngine.aggregate(), which fills it in.
    report = pipeline_dp.ExplainComputationReport()

    # Build the computational graph for the aggregation. Everything is lazy:
    # the result is iterable, but iterating it would fail until the budget has
    # been computed below. DPEngine.aggregate may be called several times with
    # different metrics.
    public_partitions = list(range(1, 100))
    lazy_result = engine.aggregate(views,
                                   aggregate_params,
                                   extractors,
                                   public_partitions=public_partitions,
                                   out_explain_computation_report=report)

    accountant.compute_budgets()

    # The report text must be generated only after compute_budgets().
    print(report.text())

    # Materializing the lazy iterator is what actually runs the computation.
    rows = list(lazy_result)

    # Save the results.
    write_to_file(rows, FLAGS.output_file)

    return 0
if __name__ == '__main__':
    # Both file flags are declared with a None default, so they have to be
    # marked as required before absl parses the command line and calls main().
    for required_flag in ("input_file", "output_file"):
        flags.mark_flag_as_required(required_flag)
    app.run(main)