Handle empty column headers (#147)

* tests pass: do not really like ui * demo that we can handle spaces * [blank] for empty headers * Show labels in UI instead of names or IDs * What if two columns are only distinguished by a non-word character? * duplicated columns: another CSV edge case
opendp · Nov 7, 2024 · ec70686 · ec70686
1 parent 87b6f8e
commit ec70686
Show file tree

Hide file tree

Showing 6 changed files with 79 additions and 31 deletions.
diff --git a/WHAT-WE-LEARNED.md b/WHAT-WE-LEARNED.md
@@ -21,19 +21,19 @@ Typing might help here. I've also wondered about naming conventions, but I haven
 It seems like the returned value would be the same, so I would like to compress something like this:
 ```python
 @reactive.calc
-def csv_fields_calc():
-    return read_field_names(req(csv_path()))
+def csv_labels_calc():
+    return read_labels(req(csv_path()))
 
 @render.text
-def csv_fields():
-    return csv_fields_calc()
+def csv_labels():
+    return csv_labels_calc()
 ```
 into:
 ```python
 @reactive.calc
 @render.text
-def csv_fields():
-    return read_field_names(req(csv_path()))
+def csv_labels():
+    return read_labels(req(csv_path()))
 ```
 but that returns an error:
 ```
@@ -44,6 +44,20 @@ Renderer.__call__() missing 1 required positional argument: '_fn'
 
 It feels like a gap in the library that there is no component testing. The only advice is to pull out testable logic from the server functions, and for the rest, use end-to-end tests: There's not a recommended way to test the ui+server interaction for just one component.
 
+## Unstated requirements for module IDs
+
+The [docs](https://shiny.posit.co/py/docs/modules.html#how-to-use-modules) only say:
+
+> This id has two requirements. First, it must be unique in a single scope, and can’t be duplicated in a given application or module definition. ... Second, the UI and server ids must match.
+
+But it's fussier than that:
+
+```
+ValueError: The string 'Will this work?' is not a valid id; only letters, numbers, and underscore are permitted
+```
+
+Was planning to just use the CSV column headers as IDs, but that's not going to work. Could Shiny just hash whatever we provide, so we wouldn't have to worry about this?
+
 ## Normal tooling doesn't work inside of app?
 
 There are several bits of tooling that don't seem to work inside end-to-end app tests. My guess is that this isn't related to Shiny per se, but rather the ASGI framework: It's not running in the same process as pytest, so it's not surprising that the pytest process can't instrument this.

diff --git a/dp_creator_ii/app/analysis_panel.py b/dp_creator_ii/app/analysis_panel.py
@@ -4,7 +4,7 @@
 
 from dp_creator_ii.app.components.inputs import log_slider
 from dp_creator_ii.app.components.column_module import column_ui, column_server
-from dp_creator_ii.utils.csv_helper import read_field_names
+from dp_creator_ii.utils.csv_helper import read_csv_ids_labels, read_csv_ids_names
 from dp_creator_ii.app.components.outputs import output_code_sample, demo_tooltip
 from dp_creator_ii.utils.templates import make_privacy_loss_block
 
@@ -68,7 +68,7 @@ def _update_checkbox_group():
         ui.update_checkbox_group(
             "columns_checkbox_group",
             label=None,
-            choices=csv_fields_calc(),
+            choices=csv_ids_labels_calc(),
         )
 
     @reactive.effect
@@ -91,10 +91,12 @@ def columns_checkbox_group_tooltip_ui():
     @render.ui
     def columns_ui():
         column_ids = input.columns_checkbox_group()
+        column_ids_to_names = csv_ids_names_calc()
+        column_ids_to_labels = csv_ids_labels_calc()
         for column_id in column_ids:
             column_server(
                 column_id,
-                name=column_id,
+                name=column_ids_to_names[column_id],
                 contributions=contributions(),
                 epsilon=epsilon_calc(),
                 set_column_weight=set_column_weight,
@@ -103,19 +105,19 @@ def columns_ui():
             )
         return [
             [
-                ui.h3(column_id),
+                ui.h3(column_ids_to_labels[column_id]),
                 column_ui(column_id),
             ]
             for column_id in column_ids
         ]
 
     @reactive.calc
-    def csv_fields_calc():
-        return read_field_names(req(csv_path()))
+    def csv_ids_names_calc():
+        return read_csv_ids_names(req(csv_path()))
 
-    @render.text
-    def csv_fields():
-        return csv_fields_calc()
+    @reactive.calc
+    def csv_ids_labels_calc():
+        return read_csv_ids_labels(req(csv_path()))
 
     @render.ui
     def epsilon_tooltip_ui():

diff --git a/dp_creator_ii/utils/csv_helper.py b/dp_creator_ii/utils/csv_helper.py
@@ -1,7 +1,14 @@
+"""
+We'll use the following terms consistently throughout the application:
+- name: This is the exact column header in the CSV.
+- label: This is the string we'll display.
+- id: This is the string we'll pass as a module ID.
+"""
+
 import polars as pl
 
 
-def read_field_names(csv_path):
+def read_csv_names(csv_path):
     # Polars is overkill, but it is more robust against
     # variations in encoding than Python stdlib csv.
     # However, it could be slow:
@@ -10,3 +17,20 @@ def read_field_names(csv_path):
     # > resolving its schema, which is a potentially expensive operation.
     lf = pl.scan_csv(csv_path)
     return lf.collect_schema().names()
+
+
+def read_csv_ids_labels(csv_path):
+    return {
+        name_to_id(name): f"{i+1}: {name or '[blank]'}"
+        for i, name in enumerate(read_csv_names(csv_path))
+    }
+
+
+def read_csv_ids_names(csv_path):
+    return {name_to_id(name): name for name in read_csv_names(csv_path)}
+
+
+def name_to_id(name):
+    # Shiny is fussy about module IDs,
+    # but we don't need them to be human readable.
+    return str(hash(name)).replace("-", "_")
diff --git a/tests/fixtures/fake.csv b/tests/fixtures/fake.csv
@@ -1,7 +1,7 @@
-student_id,class_year,hw_number,grade
-1234,1,1,90
-1234,1,2,95
-1234,1,3,85
-6789,2,1,70
-6789,2,2,100
-6789,2,3,90
+,class year,class year,hw number,hw-number,grade
+1234,1,1,1,4,90
+1234,1,1,2,5,95
+1234,1,1,3,6,85
+6789,2,2,1,4,70
+6789,2,2,2,5,100
+6789,2,2,3,6,90
diff --git a/tests/test_app.py b/tests/test_app.py
@@ -62,10 +62,12 @@ def expect_no_error():
     expect_visible(perform_analysis_text)
     expect_not_visible(download_results_text)
     # Columns:
-    expect_visible("student_id")
-    expect_visible("class_year")
-    expect_visible("hw_number")
-    expect_visible("grade")
+    expect_visible("1: [blank]")  # Empty column!
+    expect_visible("2: class year")
+    expect_visible("3: class year_duplicated_0")  # Duplicated column!
+    expect_visible("4: hw number")
+    expect_visible("5: hw-number")  # Distinguished by non-word character!
+    expect_visible("6: grade")
     # Epsilon slider:
     # (Note: Slider tests failed on CI when run after column details,
     # although it worked locally. This works in either environment.

diff --git a/tests/utils/test_csv_helper.py b/tests/utils/test_csv_helper.py
@@ -4,7 +4,7 @@
 import tempfile
 import pytest
 
-from dp_creator_ii.utils.csv_helper import read_field_names
+from dp_creator_ii.utils.csv_helper import read_csv_ids_labels, read_csv_ids_names
 
 
 # We will not reference the encoding when reading:
@@ -48,6 +48,12 @@ def test_csv_loading(write_encoding):
         else:
             pl_testing.assert_frame_equal(write_lf, read_lf)
 
-        # Preceding lines are reading the whole DF via Polars.
-        field_names_read = read_field_names(fp.name)
-        assert field_names_read == list(data.keys())
+        # Preceding lines are reading the whole DF via Polars:
+        # Now test how we read just the headers.
+        # Keys are hashes, and won't be stable across platforms,
+        # so let's just look at the values.
+        ids_labels = read_csv_ids_labels(fp.name)
+        assert set(ids_labels.values()) == {"2: AGE", "1: NAME"}
+
+        ids_names = read_csv_ids_names(fp.name)
+        assert set(ids_names.values()) == {"AGE", "NAME"}