Skip to content

Commit

Permalink
Misc. fixes for plots and notebooks (#190)
Browse files Browse the repository at this point in the history
* Add a title to the plots

* code paragraphs for notebook... contaminates script

* New methods to only render paragraphs for notebook

* script and notebook get different outputs

* str -> finish

* use black for formatting

* add black as package dep

* column names from user could start with numbers

* better confidence note in script
  • Loading branch information
mccalluc authored Dec 3, 2024
1 parent f17d606 commit 5e82695
Show file tree
Hide file tree
Showing 11 changed files with 191 additions and 111 deletions.
1 change: 1 addition & 0 deletions dp_wizard/app/components/column_module.py
Original file line number Diff line number Diff line change
Expand Up @@ -189,4 +189,5 @@ def column_plot():
histogram,
error=accuracy,
cutoff=0, # TODO
title=f"Simulated {name}, assuming normal distribution",
)
180 changes: 121 additions & 59 deletions dp_wizard/utils/code_generators/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,9 @@
from abc import ABC, abstractmethod
from pathlib import Path
import re

import black

from dp_wizard.utils.csv_helper import name_to_identifier
from dp_wizard.utils.code_generators._template import Template
from dp_wizard.utils.dp_helper import confidence
Expand Down Expand Up @@ -37,35 +40,24 @@ def _make_extra_blocks(self):
return {}

def make_py(self):
return str(
Template(self.root_template).fill_blocks(
code = (
Template(self.root_template)
.fill_blocks(
IMPORTS_BLOCK=_make_imports(),
COLUMNS_BLOCK=self._make_columns(),
CONTEXT_BLOCK=self._make_context(),
QUERIES_BLOCK=self._make_queries(),
**self._make_extra_blocks(),
)
.finish()
)
return black.format_str(code, mode=black.Mode())

def _make_margins_dict(self, bin_names: Iterable[str]):
# TODO: Don't worry too much about the formatting here.
# Plan to run the output through black for consistency.
# https://github.com/opendp/dp-creator-ii/issues/50
margins = (
[
"""
(): dp.polars.Margin(
public_info="lengths",
),"""
]
+ [
f"""
("{bin_name}",): dp.polars.Margin(
public_info="keys",
),"""
for bin_name in bin_names
]
)
margins = ["(): dp.polars.Margin(public_info='lengths',),"] + [
f"('{bin_name}',): dp.polars.Margin(public_info='keys',),"
for bin_name in bin_names
]

margins_dict = "{" + "".join(margins) + "\n }"
return margins_dict
Expand All @@ -81,14 +73,79 @@ def _make_columns(self):
for name, col in self.columns.items()
)

def _make_pre(self) -> str:
"""
If generating a notebook, this will open a new code paragraph.
"""
return ""

def _make_post(self) -> str:
"""
If generating a notebook, this will close a new code paragraph.
"""
return ""

def _make_confidence_note(self):
    """
    Return a short, unquoted note describing the confidence level.

    The module-level `confidence` is scaled to a percentage and
    truncated to an integer by `int` before being formatted.
    """
    percent = int(confidence * 100)
    return f"{percent}% confidence interval"

def _make_queries(self):
confidence_note = (
"The actual value is within the shown range "
f"with {int(confidence * 100)}% confidence."
)
pre = self._make_pre()
post = self._make_post()
column_names = self.columns.keys()
return f"confidence = {confidence} # {confidence_note}\n\n" + "\n".join(
_make_query(column_name) for column_name in column_names
return (
f"{pre}confidence = {confidence} # {self._make_confidence_note()}\n{post}"
+ "\n".join(
f"{pre}{self._make_query(column_name)}{post}"
for column_name in column_names
)
)

# def _make_queries(self):
# confidence_note = (
# "The actual value is within the shown range "
# f"with {int(confidence * 100)}% confidence."
# )
# column_names = self.columns.keys()
# return f"confidence = {confidence} # {confidence_note}\n\n" + "\n".join(
# _make_query(column_name) for column_name in column_names

def _make_query(self, column_name):
    """
    Render the "query" template for a single column.

    The column name is converted to an identifier, which prefixes the
    bin, query, accuracy, and histogram names filled into the template.
    The OUTPUT_BLOCK slot is filled with the result of `_make_output`.
    """
    identifier = name_to_identifier(column_name)
    title = f"DP counts for {column_name}"
    accuracy_name = f"{identifier}_accuracy"
    histogram_name = f"{identifier}_histogram"

    # BIN_NAME is filled as a value (so it is repr'd);
    # the remaining names are filled as bare expressions.
    query_template = Template("query").fill_values(BIN_NAME=f"{identifier}_bin")
    query_template = query_template.fill_expressions(
        QUERY_NAME=f"{identifier}_query",
        ACCURACY_NAME=accuracy_name,
        HISTOGRAM_NAME=histogram_name,
    )

    # Rendered after the fills above, matching the order in which
    # a single chained expression would evaluate its arguments.
    output_block = self._make_output(
        title=title,
        accuracy_name=accuracy_name,
        histogram_name=histogram_name,
    )
    return query_template.fill_blocks(OUTPUT_BLOCK=output_block).finish()

def _make_output(self, title: str, accuracy_name: str, histogram_name: str):
    """
    Render the output template that belongs to this generator.

    The template name is `root_template` with an "_output" suffix.
    TITLE is filled as a value; ACCURACY_NAME, HISTOGRAM_NAME, and
    CONFIDENCE_NOTE (from `_make_confidence_note`) are filled as expressions.
    """
    output_template = Template(f"{self.root_template}_output")
    output_template = output_template.fill_values(TITLE=title)
    output_template = output_template.fill_expressions(
        ACCURACY_NAME=accuracy_name,
        HISTOGRAM_NAME=histogram_name,
        CONFIDENCE_NOTE=self._make_confidence_note(),
    )
    return output_template.finish()

def _make_partial_context(self):
Expand Down Expand Up @@ -118,29 +175,34 @@ class NotebookGenerator(_CodeGenerator):
root_template = "notebook"

def _make_context(self):
return str(self._make_partial_context().fill_values(CSV_PATH=self.csv_path))
return self._make_partial_context().fill_values(CSV_PATH=self.csv_path).finish()

def _make_pre(self):
return "# +\n"

def _make_post(self):
return "# -\n"

def _make_extra_blocks(self):
outputs_expression = (
"{"
+ ",".join(
str(
Template("report_kv")
.fill_values(
NAME=name,
CONFIDENCE=confidence,
)
.fill_expressions(
IDENTIFIER_HISTOGRAM=f"{name_to_identifier(name)}_histogram",
IDENTIFIER_ACCURACY=f"{name_to_identifier(name)}_accuracy",
)
Template("report_kv")
.fill_values(
NAME=name,
CONFIDENCE=confidence,
)
.fill_expressions(
IDENTIFIER_HISTOGRAM=f"{name_to_identifier(name)}_histogram",
IDENTIFIER_ACCURACY=f"{name_to_identifier(name)}_accuracy",
)
.finish()
for name in self.columns.keys()
)
+ "}"
)
tmp_path = Path(__file__).parent.parent.parent / "tmp"
reports_block = str(
reports_block = (
Template("reports")
.fill_expressions(
OUTPUTS=outputs_expression,
Expand All @@ -152,6 +214,7 @@ def _make_extra_blocks(self):
TXT_REPORT_PATH=str(tmp_path / "report.txt"),
CSV_REPORT_PATH=str(tmp_path / "report.csv"),
)
.finish()
)
return {"REPORTS_BLOCK": reports_block}

Expand All @@ -160,19 +223,26 @@ class ScriptGenerator(_CodeGenerator):
root_template = "script"

def _make_context(self):
return str(self._make_partial_context().fill_expressions(CSV_PATH="csv_path"))
return (
self._make_partial_context().fill_expressions(CSV_PATH="csv_path").finish()
)

def _make_confidence_note(self):
    """
    Return the superclass's confidence note, quoted with `repr`.

    In the superclass, the string is unquoted so it can be used
    in comments: It needs to be wrapped here.
    """
    unquoted_note = super()._make_confidence_note()
    return repr(unquoted_note)


# Public functions used to generate code snippets in the UI;
# These do not require an entire analysis plan, so they stand on their own.


def make_privacy_unit_block(contributions: int):
return str(Template("privacy_unit").fill_values(CONTRIBUTIONS=contributions))
return Template("privacy_unit").fill_values(CONTRIBUTIONS=contributions).finish()


def make_privacy_loss_block(epsilon: float):
return str(Template("privacy_loss").fill_values(EPSILON=epsilon))
return Template("privacy_loss").fill_values(EPSILON=epsilon).finish()


def make_column_config_block(
Expand Down Expand Up @@ -202,7 +272,7 @@ def make_column_config_block(
<BLANKLINE>
"""
snake_name = _snake_case(name)
return str(
return (
Template("column_config")
.fill_expressions(
CUT_LIST_NAME=f"{snake_name}_cut_points",
Expand All @@ -215,6 +285,7 @@ def make_column_config_block(
COLUMN_NAME=name,
BIN_COLUMN_NAME=f"{snake_name}_bin",
)
.finish()
)


Expand All @@ -223,31 +294,22 @@ def make_column_config_block(
# so it's better to keep them out of the class.


def _make_query(column_name):
indentifier = name_to_identifier(column_name)
return str(
Template("query")
.fill_values(
BIN_NAME=f"{indentifier}_bin",
)
.fill_expressions(
QUERY_NAME=f"{indentifier}_query",
ACCURACY_NAME=f"{indentifier}_accuracy",
HISTOGRAM_NAME=f"{indentifier}_histogram",
)
)


def _snake_case(name: str):
"""
>>> _snake_case("HW GRADE")
'hw_grade'
>>> _snake_case("123")
'_123'
"""
return re.sub(r"\W+", "_", name.lower())
snake = re.sub(r"\W+", "_", name.lower())
# TODO: More validation in UI so we don't get zero-length strings.
if snake == "" or not re.match(r"[a-z]", snake[0]):
snake = f"_{snake}"
return snake


def _make_imports():
return (
str(Template("imports").fill_values())
Template("imports").fill_values().finish()
+ (Path(__file__).parent.parent / "shared.py").read_text()
)
11 changes: 10 additions & 1 deletion dp_wizard/utils/code_generators/_template.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,9 @@ def _find_slots(self):
return set(re.findall(slot_re, self._template))

def fill_expressions(self, **kwargs):
"""
Fill in variable names, or dicts or lists represented as strings.
"""
for k, v in kwargs.items():
k_re = re.escape(k)
self._template, count = re.subn(rf"\b{k_re}\b", str(v), self._template)
Expand All @@ -37,6 +40,9 @@ def fill_expressions(self, **kwargs):
return self

def fill_values(self, **kwargs):
"""
Fill in string or numeric values. `repr` is called before filling.
"""
for k, v in kwargs.items():
k_re = re.escape(k)
self._template, count = re.subn(rf"\b{k_re}\b", repr(v), self._template)
Expand All @@ -48,6 +54,9 @@ def fill_values(self, **kwargs):
return self

def fill_blocks(self, **kwargs):
"""
Fill in code blocks. Slot must be alone on line.
"""
for k, v in kwargs.items():

def match_indent(match):
Expand Down Expand Up @@ -76,7 +85,7 @@ def match_indent(match):
raise Exception(base_message)
return self

def __str__(self):
def finish(self):
unfilled_slots = self._initial_slots & self._find_slots()
if unfilled_slots:
slots_str = ", ".join(sorted(f"'{slot}'" for slot in unfilled_slots))
Expand Down
1 change: 0 additions & 1 deletion dp_wizard/utils/code_generators/no-tests/_notebook.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,6 @@
#
# Finally, we run the queries and plot the results.

# +
QUERIES_BLOCK
# -

Expand Down
2 changes: 2 additions & 0 deletions dp_wizard/utils/code_generators/no-tests/_notebook_output.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
# CONFIDENCE_NOTE
plot_histogram(HISTOGRAM_NAME, error=ACCURACY_NAME, cutoff=0, title=TITLE)
2 changes: 1 addition & 1 deletion dp_wizard/utils/code_generators/no-tests/_query.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
QUERY_NAME = context.query().group_by(BIN_NAME).agg(pl.len().dp.noise())
ACCURACY_NAME = QUERY_NAME.summarize(alpha=1 - confidence)["accuracy"].item()
HISTOGRAM_NAME = QUERY_NAME.release().collect().sort(BIN_NAME)
plot_histogram(HISTOGRAM_NAME, ACCURACY_NAME, 0)
OUTPUT_BLOCK
4 changes: 3 additions & 1 deletion dp_wizard/utils/code_generators/no-tests/_script.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,9 @@ def get_context(csv_path):
parser = ArgumentParser(
description="Creates a differentially private release from a csv"
)
parser.add_argument("--csv", help="Path to csv containing private data")
parser.add_argument(
"--csv", required=True, help="Path to csv containing private data"
)
args = parser.parse_args()
context = get_context(csv_path=args.csv)

Expand Down
3 changes: 3 additions & 0 deletions dp_wizard/utils/code_generators/no-tests/_script_output.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
print(TITLE)
print(CONFIDENCE_NOTE, ACCURACY_NAME)
print(HISTOGRAM_NAME)
3 changes: 2 additions & 1 deletion dp_wizard/utils/shared.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ def df_to_columns(df: DataFrame):


def plot_histogram(
histogram_df: DataFrame, error: float, cutoff: float
histogram_df: DataFrame, error: float, cutoff: float, title: str
): # pragma: no cover
"""
Given a Dataframe for a histogram, plot the data.
Expand All @@ -58,3 +58,4 @@ def plot_histogram(
axes.set_xticks(minors, ["" for _ in minors], minor=True)
axes.axhline(cutoff, color="lightgrey", zorder=-1)
axes.set_ylim(bottom=0)
axes.set_title(title)
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ dependencies = [
"jupyter-client",
"nbconvert",
"ipykernel",
"black",
"pyyaml",
]

Expand Down
Loading

0 comments on commit 5e82695

Please sign in to comment.