Skip to content

Commit

Permalink
Handle empty column headers (#147)
Browse files Browse the repository at this point in the history
* tests pass: do not really like ui

* demo that we can handle spaces

* [blank] for empty headers

* Show labels in UI instead of names or IDs

* What if two columns are only distinguished by a non-word character?

* duplicated columns: another CSV edge case
  • Loading branch information
mccalluc authored Nov 7, 2024
1 parent 87b6f8e commit ec70686
Show file tree
Hide file tree
Showing 6 changed files with 79 additions and 31 deletions.
26 changes: 20 additions & 6 deletions WHAT-WE-LEARNED.md
Original file line number Diff line number Diff line change
Expand Up @@ -21,19 +21,19 @@ Typing might help here. I've also wondered about naming conventions, but I haven
It seems like the returned value would be the same, so I would like to compress something like this:
```python
@reactive.calc
def csv_fields_calc():
return read_field_names(req(csv_path()))
def csv_labels_calc():
return read_labels(req(csv_path()))

@render.text
def csv_fields():
return csv_fields_calc()
def csv_labels():
return csv_labels_calc()
```
into:
```python
@reactive.calc
@render.text
def csv_fields():
return read_field_names(req(csv_path()))
def csv_labels():
return read_labels(req(csv_path()))
```
but that returns an error:
```
Expand All @@ -44,6 +44,20 @@ Renderer.__call__() missing 1 required positional argument: '_fn'

It feels like a gap in the library that there is no component testing. The only advice is to pull out testable logic from the server functions, and for the rest, use end-to-end tests: There's not a recommended way to test the ui+server interaction for just one component.

## Unstated requirements for module IDs

The [docs](https://shiny.posit.co/py/docs/modules.html#how-to-use-modules) only say:

> This id has two requirements. First, it must be unique in a single scope, and can’t be duplicated in a given application or module definition. ... Second, the UI and server ids must match.
But it's fussier than that:

```
ValueError: The string 'Will this work?' is not a valid id; only letters, numbers, and underscore are permitted
```

Was planning to just use the CSV column headers as IDs, but that's not going to work. Could Shiny just hash whatever we provide, so we wouldn't have to worry about this?

## Normal tooling doesn't work inside of app?

There are several bits of tooling that don't seem to work inside end-to-end app tests. My guess is that this isn't related to Shiny per se, but rather the ASGI framework: It's not running in the same process as pytest, so it's not surprising that the pytest process can't instrument this.
Expand Down
20 changes: 11 additions & 9 deletions dp_creator_ii/app/analysis_panel.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@

from dp_creator_ii.app.components.inputs import log_slider
from dp_creator_ii.app.components.column_module import column_ui, column_server
from dp_creator_ii.utils.csv_helper import read_field_names
from dp_creator_ii.utils.csv_helper import read_csv_ids_labels, read_csv_ids_names
from dp_creator_ii.app.components.outputs import output_code_sample, demo_tooltip
from dp_creator_ii.utils.templates import make_privacy_loss_block

Expand Down Expand Up @@ -68,7 +68,7 @@ def _update_checkbox_group():
ui.update_checkbox_group(
"columns_checkbox_group",
label=None,
choices=csv_fields_calc(),
choices=csv_ids_labels_calc(),
)

@reactive.effect
Expand All @@ -91,10 +91,12 @@ def columns_checkbox_group_tooltip_ui():
@render.ui
def columns_ui():
column_ids = input.columns_checkbox_group()
column_ids_to_names = csv_ids_names_calc()
column_ids_to_labels = csv_ids_labels_calc()
for column_id in column_ids:
column_server(
column_id,
name=column_id,
name=column_ids_to_names[column_id],
contributions=contributions(),
epsilon=epsilon_calc(),
set_column_weight=set_column_weight,
Expand All @@ -103,19 +105,19 @@ def columns_ui():
)
return [
[
ui.h3(column_id),
ui.h3(column_ids_to_labels[column_id]),
column_ui(column_id),
]
for column_id in column_ids
]

@reactive.calc
def csv_fields_calc():
return read_field_names(req(csv_path()))
def csv_ids_names_calc():
return read_csv_ids_names(req(csv_path()))

@render.text
def csv_fields():
return csv_fields_calc()
@reactive.calc
def csv_ids_labels_calc():
return read_csv_ids_labels(req(csv_path()))

@render.ui
def epsilon_tooltip_ui():
Expand Down
26 changes: 25 additions & 1 deletion dp_creator_ii/utils/csv_helper.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,14 @@
"""
We'll use the following terms consistently throughout the application:
- name: This is the exact column header in the CSV.
- label: This is the string we'll display.
- id: This is the string we'll pass as a module ID.
"""

import polars as pl


def read_field_names(csv_path):
def read_csv_names(csv_path):
# Polars is overkill, but it is more robust against
# variations in encoding than Python stdlib csv.
# However, it could be slow:
Expand All @@ -10,3 +17,20 @@ def read_field_names(csv_path):
# > resolving its schema, which is a potentially expensive operation.
lf = pl.scan_csv(csv_path)
return lf.collect_schema().names()


def read_csv_ids_labels(csv_path):
return {
name_to_id(name): f"{i+1}: {name or '[blank]'}"
for i, name in enumerate(read_csv_names(csv_path))
}


def read_csv_ids_names(csv_path):
return {name_to_id(name): name for name in read_csv_names(csv_path)}


def name_to_id(name):
# Shiny is fussy about module IDs,
# but we don't need them to be human readable.
return str(hash(name)).replace("-", "_")
14 changes: 7 additions & 7 deletions tests/fixtures/fake.csv
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
student_id,class_year,hw_number,grade
1234,1,1,90
1234,1,2,95
1234,1,3,85
6789,2,1,70
6789,2,2,100
6789,2,3,90
,class year,class year,hw number,hw-number,grade
1234,1,1,1,4,90
1234,1,1,2,5,95
1234,1,1,3,6,85
6789,2,2,1,4,70
6789,2,2,2,5,100
6789,2,2,3,6,90
10 changes: 6 additions & 4 deletions tests/test_app.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,10 +62,12 @@ def expect_no_error():
expect_visible(perform_analysis_text)
expect_not_visible(download_results_text)
# Columns:
expect_visible("student_id")
expect_visible("class_year")
expect_visible("hw_number")
expect_visible("grade")
expect_visible("1: [blank]") # Empty column!
expect_visible("2: class year")
expect_visible("3: class year_duplicated_0") # Duplicated column!
expect_visible("4: hw number")
expect_visible("5: hw-number") # Distinguished by non-word character!
expect_visible("6: grade")
# Epsilon slider:
# (Note: Slider tests failed on CI when run after column details,
# although it worked locally. This works in either environment.
Expand Down
14 changes: 10 additions & 4 deletions tests/utils/test_csv_helper.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
import tempfile
import pytest

from dp_creator_ii.utils.csv_helper import read_field_names
from dp_creator_ii.utils.csv_helper import read_csv_ids_labels, read_csv_ids_names


# We will not reference the encoding when reading:
Expand Down Expand Up @@ -48,6 +48,12 @@ def test_csv_loading(write_encoding):
else:
pl_testing.assert_frame_equal(write_lf, read_lf)

# Preceding lines are reading the whole DF via Polars.
field_names_read = read_field_names(fp.name)
assert field_names_read == list(data.keys())
# Preceding lines are reading the whole DF via Polars:
# Now test how we read just the headers.
# Keys are hashes, and won't be stable across platforms,
# so let's just look at the values.
ids_labels = read_csv_ids_labels(fp.name)
assert set(ids_labels.values()) == {"2: AGE", "1: NAME"}

ids_names = read_csv_ids_names(fp.name)
assert set(ids_names.values()) == {"AGE", "NAME"}

0 comments on commit ec70686

Please sign in to comment.