From 266d1d2dd1c429e56ff3fdcd0e46546151b2f72d Mon Sep 17 00:00:00 2001
From: lindsay stevens <lstevens@getodk.org>
Date: Wed, 30 Oct 2024 22:02:28 +1100
Subject: [PATCH 1/3] dev: update dependencies

- no formatting changes, just keeping pace with ruff really
---
 pyproject.toml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index 85d1aa4f..f0b7476d 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -18,8 +18,8 @@ dependencies = [
 dev = [
   "formencode==2.1.0",  # Compare XML
   "lxml==5.3.0",        # XPath test expressions
-  "psutil==6.0.0",      # Process info for performance tests
-  "ruff==0.6.4",        # Format and lint
+  "psutil==6.1.0",      # Process info for performance tests
+  "ruff==0.7.1",        # Format and lint
 ]
 
 [project.urls]

From b81e816a90b5a7b3b0e90194cfed46b342585e0e Mon Sep 17 00:00:00 2001
From: lindsay stevens <lstevens@getodk.org>
Date: Wed, 30 Oct 2024 22:13:33 +1100
Subject: [PATCH 2/3] chg: add all choices in the XLSForm to the XForm even if
 not referenced

- allows users to include reference data for expressions via choices
  without needing a hidden dummy question. Includes all choices rather
  than parsing expressions to find references, because it seems unlikely
  that both a XLSForm would include completely irrelevant choices and
  have so many such choices that the XForm document size is a problem.
- test_xls2json_xls.py
  - moved loop test into test_loop.py and modified to use the same style
    as the existing test
- test_loop.py
  - use xls2xform.convert rather than internals directly
---
 pyxform/xls2json.py                           |  15 +--
 tests/test_choices_sheet.py                   |  21 ++++
 .../test_expected_output/repeat_date_test.xml | 100 +++++++++++++++++-
 tests/test_expected_output/simple_loop.json   |  57 ----------
 tests/test_expected_output/xml_escaping.xml   | 100 +++++++++++++++++-
 tests/test_loop.py                            |  76 +++++++++++--
 tests/test_xls2json_xls.py                    |  12 ---
 7 files changed, 289 insertions(+), 92 deletions(-)
 delete mode 100644 tests/test_expected_output/simple_loop.json

diff --git a/pyxform/xls2json.py b/pyxform/xls2json.py
index 1e340b35..3b260e22 100644
--- a/pyxform/xls2json.py
+++ b/pyxform/xls2json.py
@@ -520,13 +520,14 @@ def workbook_to_json(
         use_double_colons=use_double_colons,
         default_language=default_language,
     )
-    combined_lists = group_dictionaries_by_key(
+    choices = group_dictionaries_by_key(
         list_of_dicts=choices_sheet.data, key=constants.LIST_NAME_S
     )
+    if 0 < len(choices):
+        json_dict[constants.CHOICES] = choices
     # To combine the warning into one message, the check for missing choices translation
     # columns is run with Survey sheet below.
 
-    choices = combined_lists
     # Make sure all the options have the required properties:
     warnedabout = set()
     for list_name, options in choices.items():
@@ -1242,16 +1243,6 @@ def workbook_to_json(
                     choice_filter=row.get(constants.CHOICE_FILTER),
                     file_extension=file_extension,
                 )
-                # Add the choice to the survey choices if it's being used in an itemset.
-                if (
-                    constants.ITEMSET in new_json_dict
-                    and new_json_dict[constants.ITEMSET] == list_name
-                    and list_name in choices
-                ):
-                    # Initialise choices output if none added already.
-                    if constants.CHOICES not in json_dict:
-                        json_dict[constants.CHOICES] = {}
-                    json_dict[constants.CHOICES][list_name] = choices[list_name]
 
                 # Code to deal with table_list appearance flags
                 # (for groups of selects)
diff --git a/tests/test_choices_sheet.py b/tests/test_choices_sheet.py
index 114ccfc3..084b9b56 100644
--- a/tests/test_choices_sheet.py
+++ b/tests/test_choices_sheet.py
@@ -133,3 +133,24 @@ def test_choices_extra_columns_output_order_matches_xlsform(self):
                 """,
             ],
         )
+
+    def test_unreferenced_lists_included_in_output(self):
+        """Should find all specified choice lists in the output, even if unreferenced."""
+        md = """
+        | survey  |
+        |         | type               | name | label |
+        |         | select_one choices | a    | A     |
+        | choices |
+        |         | list_name | name | label |
+        |         | choices   | 1    | Y     |
+        |         | choices   | 2    | N     |
+        |         | choices2  | 1    | Y     |
+        |         | choices2  | 2    | N     |
+        """
+        self.assertPyxformXform(
+            md=md,
+            xml__xpath_match=[
+                xpc.model_instance_choices_label("choices", (("1", "Y"), ("2", "N"))),
+                xpc.model_instance_choices_label("choices2", (("1", "Y"), ("2", "N"))),
+            ],
+        )
diff --git a/tests/test_expected_output/repeat_date_test.xml b/tests/test_expected_output/repeat_date_test.xml
index 2212f014..0840bb55 100644
--- a/tests/test_expected_output/repeat_date_test.xml
+++ b/tests/test_expected_output/repeat_date_test.xml
@@ -3,6 +3,22 @@
   <h:head>
     <h:title>repeat_date_test</h:title>
     <model odk:xforms-version="1.0.0">
+      <itext>
+        <translation lang="default" default="true()">
+          <text id="a_b-0">
+            <value form="image">jr://images/a.jpg</value>
+          </text>
+          <text id="a_b-1">
+            <value form="image">jr://images/b.jpg</value>
+          </text>
+          <text id="happy_sad-0">
+            <value form="image">jr://images/happy.jpg</value>
+          </text>
+          <text id="happy_sad-1">
+            <value form="image">jr://images/sad.jpg</value>
+          </text>
+        </translation>
+      </itext>
       <instance>
         <repeat_date_test id="repeat_date_test">
           <generated_note_name_2/>
@@ -21,15 +37,95 @@
           </meta>
         </repeat_date_test>
       </instance>
+      <instance id="list">
+        <root>
+          <item>
+            <name>a</name>
+            <label>option a</label>
+          </item>
+          <item>
+            <name>b</name>
+            <label>option b</label>
+          </item>
+          <item>
+            <name>c</name>
+            <label>option c</label>
+          </item>
+          <item>
+            <name>d</name>
+            <label>option d</label>
+          </item>
+        </root>
+      </instance>
+      <instance id="list2">
+        <root>
+          <item>
+            <name>1</name>
+            <label>option 1</label>
+          </item>
+          <item>
+            <name>2</name>
+            <label>option 2</label>
+          </item>
+          <item>
+            <name>3</name>
+            <label>option 3</label>
+          </item>
+          <item>
+            <name>4</name>
+            <label>option 4</label>
+          </item>
+          <item>
+            <name>5</name>
+            <label>option 5</label>
+          </item>
+          <item>
+            <name>6</name>
+            <label>option 6</label>
+          </item>
+          <item>
+            <name>7</name>
+            <label>option 7</label>
+          </item>
+          <item>
+            <name>8</name>
+            <label>option 8</label>
+          </item>
+        </root>
+      </instance>
       <instance id="yes_no">
         <root>
           <item>
-            <label>Yes</label>
             <name>yes</name>
+            <label>Yes</label>
           </item>
           <item>
-            <label>No</label>
             <name>no</name>
+            <label>No</label>
+          </item>
+        </root>
+      </instance>
+      <instance id="a_b">
+        <root>
+          <item>
+            <itextId>a_b-0</itextId>
+            <name>a</name>
+          </item>
+          <item>
+            <itextId>a_b-1</itextId>
+            <name>b</name>
+          </item>
+        </root>
+      </instance>
+      <instance id="happy_sad">
+        <root>
+          <item>
+            <itextId>happy_sad-0</itextId>
+            <name>happy</name>
+          </item>
+          <item>
+            <itextId>happy_sad-1</itextId>
+            <name>sad</name>
           </item>
         </root>
       </instance>
diff --git a/tests/test_expected_output/simple_loop.json b/tests/test_expected_output/simple_loop.json
deleted file mode 100644
index 6f2bba94..00000000
--- a/tests/test_expected_output/simple_loop.json
+++ /dev/null
@@ -1,57 +0,0 @@
-{
-    "name": "simple_loop", 
-    "title": "simple_loop", 
-    "sms_keyword": "simple_loop", 
-    "default_language": "default", 
-    "id_string": "simple_loop", 
-    "type": "survey", 
-    "children": [
-        {
-            "children": [
-                {
-                    "type": "integer", 
-                    "name": "count", 
-                    "label": {
-                        "English": "How many are there in this group?"
-                    }
-                }
-            ], 
-            "type": "loop", 
-            "name": "my_table", 
-            "columns": [
-                {
-                    "name": "col1", 
-                    "label": {
-                        "English": "Column 1"
-                    }
-                }, 
-                {
-                    "name": "col2", 
-                    "label": {
-                        "English": "Column 2"
-                    }
-                }
-            ], 
-            "label": {
-                "English": "My Table"
-            }
-        }, 
-        {
-            "control": {
-                "bodyless": true
-            }, 
-            "type": "group", 
-            "name": "meta", 
-            "children": [
-                {
-                    "bind": {
-                        "readonly": "true()", 
-                        "jr:preload": "uid"
-                    }, 
-                    "type": "calculate", 
-                    "name": "instanceID"
-                }
-            ]
-        }
-    ]
-}
\ No newline at end of file
diff --git a/tests/test_expected_output/xml_escaping.xml b/tests/test_expected_output/xml_escaping.xml
index dd6b849d..f74eb588 100644
--- a/tests/test_expected_output/xml_escaping.xml
+++ b/tests/test_expected_output/xml_escaping.xml
@@ -3,6 +3,22 @@
   <h:head>
     <h:title>xml_escaping</h:title>
     <model odk:xforms-version="1.0.0">
+      <itext>
+        <translation lang="default" default="true()">
+          <text id="a_b-0">
+            <value form="image">jr://images/a.jpg</value>
+          </text>
+          <text id="a_b-1">
+            <value form="image">jr://images/b.jpg</value>
+          </text>
+          <text id="happy_sad-0">
+            <value form="image">jr://images/happy.jpg</value>
+          </text>
+          <text id="happy_sad-1">
+            <value form="image">jr://images/sad.jpg</value>
+          </text>
+        </translation>
+      </itext>
       <instance>
         <xml_escaping id="xml_escaping">
           <a/>
@@ -13,15 +29,95 @@
           </meta>
         </xml_escaping>
       </instance>
+      <instance id="list">
+        <root>
+          <item>
+            <name>a</name>
+            <label>option a</label>
+          </item>
+          <item>
+            <name>b</name>
+            <label>option b</label>
+          </item>
+          <item>
+            <name>c</name>
+            <label>option c</label>
+          </item>
+          <item>
+            <name>d</name>
+            <label>option d</label>
+          </item>
+        </root>
+      </instance>
+      <instance id="list2">
+        <root>
+          <item>
+            <name>1</name>
+            <label>option 1</label>
+          </item>
+          <item>
+            <name>2</name>
+            <label>option 2</label>
+          </item>
+          <item>
+            <name>3</name>
+            <label>option 3</label>
+          </item>
+          <item>
+            <name>4</name>
+            <label>option 4</label>
+          </item>
+          <item>
+            <name>5</name>
+            <label>option 5</label>
+          </item>
+          <item>
+            <name>6</name>
+            <label>option 6</label>
+          </item>
+          <item>
+            <name>7</name>
+            <label>option 7</label>
+          </item>
+          <item>
+            <name>8</name>
+            <label>option 8</label>
+          </item>
+        </root>
+      </instance>
       <instance id="yes_no">
         <root>
           <item>
-            <label>Yes</label>
             <name>yes</name>
+            <label>Yes</label>
           </item>
           <item>
-            <label>No</label>
             <name>no</name>
+            <label>No</label>
+          </item>
+        </root>
+      </instance>
+      <instance id="a_b">
+        <root>
+          <item>
+            <itextId>a_b-0</itextId>
+            <name>a</name>
+          </item>
+          <item>
+            <itextId>a_b-1</itextId>
+            <name>b</name>
+          </item>
+        </root>
+      </instance>
+      <instance id="happy_sad">
+        <root>
+          <item>
+            <itextId>happy_sad-0</itextId>
+            <name>happy</name>
+          </item>
+          <item>
+            <itextId>happy_sad-1</itextId>
+            <name>sad</name>
           </item>
         </root>
       </instance>
diff --git a/tests/test_loop.py b/tests/test_loop.py
index d4ca0d61..0e8689f4 100644
--- a/tests/test_loop.py
+++ b/tests/test_loop.py
@@ -4,18 +4,71 @@
 
 from unittest import TestCase
 
-from pyxform.builder import create_survey_from_xls
+from pyxform.xls2xform import convert
 
 from tests import utils
 
 
-class LoopTests(TestCase):
+class TestLoop(TestCase):
+    maxDiff = None
+
+    def test_table(self):
+        path = utils.path_to_text_fixture("simple_loop.xls")
+        observed = convert(xlsform=path)._pyxform
+
+        expected = {
+            "name": "data",
+            "title": "simple_loop",
+            "sms_keyword": "simple_loop",
+            "default_language": "default",
+            "id_string": "simple_loop",
+            "type": "survey",
+            "children": [
+                {
+                    "children": [
+                        {
+                            "type": "integer",
+                            "name": "count",
+                            "label": {"English": "How many are there in this group?"},
+                        }
+                    ],
+                    "type": "loop",
+                    "name": "my_table",
+                    "columns": [
+                        {"name": "col1", "label": {"English": "Column 1"}},
+                        {"name": "col2", "label": {"English": "Column 2"}},
+                    ],
+                    "label": {"English": "My Table"},
+                },
+                {
+                    "control": {"bodyless": True},
+                    "type": "group",
+                    "name": "meta",
+                    "children": [
+                        {
+                            "bind": {"readonly": "true()", "jr:preload": "uid"},
+                            "type": "calculate",
+                            "name": "instanceID",
+                        }
+                    ],
+                },
+            ],
+            "choices": {
+                "my_columns": [
+                    {"label": {"English": "Column 1"}, "name": "col1"},
+                    {"label": {"English": "Column 2"}, "name": "col2"},
+                ]
+            },
+        }
+        self.assertEqual(expected, observed)
+
     def test_loop(self):
         path = utils.path_to_text_fixture("another_loop.xls")
-        survey = create_survey_from_xls(path, "another_loop")
-        self.maxDiff = None
-        expected_dict = {
-            "name": "another_loop",
+        observed = convert(xlsform=path)._survey.to_json_dict()
+        observed.pop("_translations", None)
+        observed.pop("_xpath", None)
+        expected = {
+            "name": "data",
             "id_string": "another_loop",
             "sms_keyword": "another_loop",
             "default_language": "default",
@@ -89,5 +142,14 @@ def test_loop(self):
                     "type": "group",
                 },
             ],
+            "choices": {
+                "types": [
+                    {"label": {"English": "Car", "French": "Voiture"}, "name": "car"},
+                    {
+                        "label": {"English": "Motorcycle", "French": "Moto"},
+                        "name": "motor_cycle",
+                    },
+                ]
+            },
         }
-        self.assertEqual(survey.to_json_dict(), expected_dict)
+        self.assertEqual(expected, observed)
diff --git a/tests/test_xls2json_xls.py b/tests/test_xls2json_xls.py
index 349bc3ab..b362a6a6 100644
--- a/tests/test_xls2json_xls.py
+++ b/tests/test_xls2json_xls.py
@@ -103,18 +103,6 @@ def test_text_and_integer(self):
 
         self.assertEqual(x.to_json_dict()["children"], expected_dict)
 
-    def test_table(self):
-        filename = "simple_loop.xls"
-        path_to_excel_file = Path(example_xls.PATH) / filename
-        expected_output_path = Path(test_expected_output.PATH) / (
-            path_to_excel_file.stem + ".json"
-        )
-        result = convert(
-            xlsform=path_to_excel_file, warnings=[], form_name=path_to_excel_file.stem
-        )
-        with open(expected_output_path, encoding="utf-8") as expected:
-            self.assertEqual(json.load(expected), result._pyxform)
-
     def test_choice_filter_choice_fields(self):
         """
         Test that the choice filter fields appear on children field of json

From 4f1116e6b2876ecb123db25ae4e9bee6518414fc Mon Sep 17 00:00:00 2001
From: lindsay stevens <lstevens@getodk.org>
Date: Fri, 1 Nov 2024 01:57:16 +1100
Subject: [PATCH 3/3] chg: simplify validation of choices and tag names

- xls2json.py:
  - move choices checks to new validator module
  - include row reference in all errors/warnings. To have a consistent
    error message structure, each duplicate choice gets a copy of the
    message (in one error) rather than being smushed into one message.
- xlsparseutils.py
  - replace separate regex for xml tags with expression parser and use
    that instead, for consistent parsing rules and to use cache
  - move the remaining sheet misspellings check to new validator module
- expression.py
  - fix issue where a name containing "or" or "and" was parsed as an
    operator rather than a name. These tokens are only valid when
    surrounded by spaces so the regex is updated accordingly.
  - add tests for positive and negative match cases for tag names.
---
 pyxform/entities/entities_parsing.py          |   7 +-
 pyxform/parsing/expression.py                 |  18 ++-
 pyxform/survey_element.py                     |   4 +-
 pyxform/validators/pyxform/choices.py         |  59 ++++++++++
 pyxform/validators/pyxform/question_types.py  |   6 +-
 .../pyxform/sheet_misspellings.py}            |  13 ---
 pyxform/xls2json.py                           | 108 ++++++------------
 tests/parsing/__init__.py                     |   0
 tests/parsing/test_expression.py              |  49 ++++++++
 tests/test_fields.py                          |  14 +--
 tests/test_sheet_columns.py                   |   4 +-
 tests/test_xlsform_spec.py                    |  10 +-
 tests/xform_test_case/test_bugs.py            |   7 +-
 13 files changed, 189 insertions(+), 110 deletions(-)
 create mode 100644 pyxform/validators/pyxform/choices.py
 rename pyxform/{xlsparseutils.py => validators/pyxform/sheet_misspellings.py} (76%)
 create mode 100644 tests/parsing/__init__.py
 create mode 100644 tests/parsing/test_expression.py

diff --git a/pyxform/entities/entities_parsing.py b/pyxform/entities/entities_parsing.py
index 017b6751..34f44e7c 100644
--- a/pyxform/entities/entities_parsing.py
+++ b/pyxform/entities/entities_parsing.py
@@ -3,7 +3,8 @@
 from pyxform import constants as const
 from pyxform.aliases import yes_no
 from pyxform.errors import PyXFormError
-from pyxform.xlsparseutils import find_sheet_misspellings, is_valid_xml_tag
+from pyxform.parsing.expression import is_xml_tag
+from pyxform.validators.pyxform.sheet_misspellings import find_sheet_misspellings
 
 EC = const.EntityColumns
 
@@ -73,7 +74,7 @@ def get_validated_dataset_name(entity):
             f"Invalid entity list name: '{dataset}'. Names may not include periods."
         )
 
-    if not is_valid_xml_tag(dataset):
+    if not is_xml_tag(dataset):
         if isinstance(dataset, bytes):
             dataset = dataset.decode("utf-8")
 
@@ -118,7 +119,7 @@ def validate_entity_saveto(
             f"{error_start} the entity property name '{save_to}' starts with reserved prefix {const.ENTITIES_RESERVED_PREFIX}."
         )
 
-    if not is_valid_xml_tag(save_to):
+    if not is_xml_tag(save_to):
         if isinstance(save_to, bytes):
             save_to = save_to.decode("utf-8")
 
diff --git a/pyxform/parsing/expression.py b/pyxform/parsing/expression.py
index af919859..29df4fa1 100644
--- a/pyxform/parsing/expression.py
+++ b/pyxform/parsing/expression.py
@@ -37,9 +37,9 @@ def get_expression_lexer() -> re.Scanner:
         "TIME": time_regex,
         "NUMBER": r"-?\d+\.\d*|-?\.\d+|-?\d+",
         # https://www.w3.org/TR/1999/REC-xpath-19991116/#exprlex
-        "OPS_MATH": r"[\*\+\-]|mod|div",
+        "OPS_MATH": r"[\*\+\-]| mod | div ",
         "OPS_COMP": r"\=|\!\=|\<|\>|\<=|>=",
-        "OPS_BOOL": r"and|or",
+        "OPS_BOOL": r" and | or ",
         "OPS_UNION": r"\|",
         "OPEN_PAREN": r"\(",
         "CLOSE_PAREN": r"\)",
@@ -107,3 +107,17 @@ def is_single_token_expression(expression: str, token_types: Iterable[str]) -> b
         return True
     else:
         return False
+
+
+def is_pyxform_reference(value: str) -> bool:
+    """
+    Does the input string contain only a valid Pyxform reference? e.g. ${my_question}
+    """
+    return is_single_token_expression(expression=value, token_types=("PYXFORM_REF",))
+
+
+def is_xml_tag(value: str) -> bool:
+    """
+    Does the input string contain only a valid XML tag / element name?
+    """
+    return is_single_token_expression(expression=value, token_types=("NAME",))
diff --git a/pyxform/survey_element.py b/pyxform/survey_element.py
index 504e83d0..3468f92f 100644
--- a/pyxform/survey_element.py
+++ b/pyxform/survey_element.py
@@ -11,6 +11,7 @@
 from pyxform import aliases as alias
 from pyxform import constants as const
 from pyxform.errors import PyXFormError
+from pyxform.parsing.expression import is_xml_tag
 from pyxform.question_type_dictionary import QUESTION_TYPE_DICT
 from pyxform.utils import (
     BRACKETED_TAG_REGEX,
@@ -19,7 +20,6 @@
     node,
 )
 from pyxform.xls2json import print_pyobj_to_json
-from pyxform.xlsparseutils import is_valid_xml_tag
 
 if TYPE_CHECKING:
     from pyxform.utils import DetachableElement
@@ -140,7 +140,7 @@ def add_children(self, children):
     SUPPORTED_MEDIA = ("image", "big-image", "audio", "video")
 
     def validate(self):
-        if not is_valid_xml_tag(self.name):
+        if not is_xml_tag(self.name):
             invalid_char = re.search(INVALID_XFORM_TAG_REGEXP, self.name)
             raise PyXFormError(
                 f"The name '{self.name}' contains an invalid character '{invalid_char.group(0)}'. Names {const.XML_IDENTIFIER_ERROR_MESSAGE}"
diff --git a/pyxform/validators/pyxform/choices.py b/pyxform/validators/pyxform/choices.py
new file mode 100644
index 00000000..c97298e7
--- /dev/null
+++ b/pyxform/validators/pyxform/choices.py
@@ -0,0 +1,59 @@
+from pyxform import constants
+from pyxform.errors import PyXFormError
+
+INVALID_NAME = (
+    "[row : {row}] On the 'choices' sheet, the 'name' value is invalid. "
+    "Choices must have a name. "
+    "Learn more: https://xlsform.org/en/#setting-up-your-worksheets"
+)
+INVALID_LABEL = (
+    "[row : {row}] On the 'choices' sheet, the 'label' value is invalid. "
+    "Choices should have a label. "
+    "Learn more: https://xlsform.org/en/#setting-up-your-worksheets"
+)
+INVALID_HEADER = (
+    "[row : 1] On the 'choices' sheet, the '{column}' value is invalid. "
+    "Column headers must not be empty and must not contain spaces. "
+    "Learn more: https://xlsform.org/en/#setting-up-your-worksheets"
+)
+INVALID_DUPLICATE = (
+    "[row : {row}] On the 'choices' sheet, the 'name' value is invalid. "
+    "Choice names must be unique for each choice list. "
+    "If this is intentional, use the setting 'allow_choice_duplicates'. "
+    "Learn more: https://xlsform.org/#choice-names."
+)
+
+
+def validate_headers(
+    headers: tuple[tuple[str, ...], ...], warnings: list[str]
+) -> list[str]:
+    def check():
+        for header in headers:
+            header = header[0]
+            if header != constants.LIST_NAME_S and (" " in header or header == ""):
+                warnings.append(INVALID_HEADER.format(column=header))
+                yield header
+
+    return list(check())
+
+
+def validate_choices(
+    options: list[dict], warnings: list[str], allow_duplicates: bool = False
+) -> None:
+    seen_options = set()
+    duplicate_errors = []
+    for option in options:
+        if "name" not in option:
+            raise PyXFormError(INVALID_NAME.format(row=option["__row"]))
+        elif "label" not in option:
+            warnings.append(INVALID_LABEL.format(row=option["__row"]))
+
+        if not allow_duplicates:
+            name = option["name"]
+            if name in seen_options:
+                duplicate_errors.append(INVALID_DUPLICATE.format(row=option["__row"]))
+            else:
+                seen_options.add(name)
+
+    if 0 < len(duplicate_errors):
+        raise PyXFormError("\n".join(duplicate_errors))
diff --git a/pyxform/validators/pyxform/question_types.py b/pyxform/validators/pyxform/question_types.py
index 7ec18edd..3c62673f 100644
--- a/pyxform/validators/pyxform/question_types.py
+++ b/pyxform/validators/pyxform/question_types.py
@@ -3,7 +3,7 @@
 """
 
 from pyxform.errors import PyXFormError
-from pyxform.parsing.expression import is_single_token_expression
+from pyxform.parsing.expression import is_pyxform_reference
 from pyxform.utils import PYXFORM_REFERENCE_REGEX
 
 BACKGROUND_GEOPOINT_CALCULATION = "[row : {r}] For 'background-geopoint' questions, the 'calculation' column must be empty."
@@ -25,9 +25,7 @@ def validate_background_geopoint_calculation(row: dict, row_num: int) -> bool:
 
 def validate_background_geopoint_trigger(row: dict, row_num: int) -> bool:
     """A background-geopoint must have a trigger."""
-    if not row.get("trigger", False) or not is_single_token_expression(
-        expression=row["trigger"], token_types=["PYXFORM_REF"]
-    ):
+    if not row.get("trigger", False) or not is_pyxform_reference(value=row["trigger"]):
         raise PyXFormError(TRIGGER_INVALID.format(r=row_num, t=row["type"]))
     return True
 
diff --git a/pyxform/xlsparseutils.py b/pyxform/validators/pyxform/sheet_misspellings.py
similarity index 76%
rename from pyxform/xlsparseutils.py
rename to pyxform/validators/pyxform/sheet_misspellings.py
index 280f706a..c83fef31 100644
--- a/pyxform/xlsparseutils.py
+++ b/pyxform/validators/pyxform/sheet_misspellings.py
@@ -1,14 +1,8 @@
-import re
 from collections.abc import KeysView
 
 from pyxform import constants
 from pyxform.utils import levenshtein_distance
 
-# http://www.w3.org/TR/REC-xml/
-TAG_START_CHAR = r"[a-zA-Z:_]"
-TAG_CHAR = r"[a-zA-Z:_0-9\-.]"
-XFORM_TAG_REGEXP = re.compile(rf"^{TAG_START_CHAR}{TAG_CHAR}*$")
-
 
 def find_sheet_misspellings(key: str, keys: "KeysView") -> "str | None":
     """
@@ -36,10 +30,3 @@ def find_sheet_misspellings(key: str, keys: "KeysView") -> "str | None":
         return msg
     else:
         return None
-
-
-def is_valid_xml_tag(tag):
-    """
-    Use a regex to see if there are any invalid characters (i.e. spaces).
-    """
-    return re.search(XFORM_TAG_REGEXP, tag)
diff --git a/pyxform/xls2json.py b/pyxform/xls2json.py
index 3b260e22..7e4e6ebe 100644
--- a/pyxform/xls2json.py
+++ b/pyxform/xls2json.py
@@ -6,7 +6,6 @@
 import os
 import re
 import sys
-from collections import Counter
 from typing import IO, Any
 
 from pyxform import aliases, constants
@@ -21,15 +20,16 @@
     validate_entity_saveto,
 )
 from pyxform.errors import PyXFormError
-from pyxform.parsing.expression import is_single_token_expression
+from pyxform.parsing.expression import is_pyxform_reference, is_xml_tag
 from pyxform.utils import PYXFORM_REFERENCE_REGEX, coalesce, default_is_dynamic
+from pyxform.validators.pyxform import choices as vc
 from pyxform.validators.pyxform import parameters_generic, select_from_file
 from pyxform.validators.pyxform import question_types as qt
 from pyxform.validators.pyxform.android_package_name import validate_android_package_name
 from pyxform.validators.pyxform.pyxform_reference import validate_pyxform_reference_syntax
+from pyxform.validators.pyxform.sheet_misspellings import find_sheet_misspellings
 from pyxform.validators.pyxform.translations_checks import SheetTranslations
 from pyxform.xls2json_backends import csv_to_dict, xls_to_dict, xlsx_to_dict
-from pyxform.xlsparseutils import find_sheet_misspellings, is_valid_xml_tag
 
 SMART_QUOTES = {"\u2018": "'", "\u2019": "'", "\u201c": '"', "\u201d": '"'}
 RE_SMART_QUOTES = re.compile(r"|".join(re.escape(old) for old in SMART_QUOTES))
@@ -175,7 +175,12 @@ def dealias_types(dict_array):
     return dict_array
 
 
-def clean_text_values(sheet_name: str, data: list[dict], strip_whitespace: bool = False):
+def clean_text_values(
+    sheet_name: str,
+    data: list[dict],
+    strip_whitespace: bool = False,
+    add_row_number: bool = False,
+) -> list[dict]:
     """
     Go though the dict array and strips all text values.
     Also replaces multiple spaces with single spaces.
@@ -192,6 +197,8 @@ def clean_text_values(sheet_name: str, data: list[dict], strip_whitespace: bool
                 validate_pyxform_reference_syntax(
                     value=value, sheet_name=sheet_name, row_number=row_number, key=key
                 )
+        if add_row_number:
+            row["__row"] = row_number
     return data
 
 
@@ -513,7 +520,11 @@ def workbook_to_json(
 
     # ########## Choices sheet ##########
     choices_sheet = workbook_dict.get(constants.CHOICES, [])
-    choices_sheet = clean_text_values(sheet_name=constants.CHOICES, data=choices_sheet)
+    choices_sheet = clean_text_values(
+        sheet_name=constants.CHOICES,
+        data=choices_sheet,
+        add_row_number=True,
+    )
     choices_sheet = dealias_and_group_headers(
         dict_array=choices_sheet,
         header_aliases=aliases.list_header,
@@ -523,73 +534,27 @@ def workbook_to_json(
     choices = group_dictionaries_by_key(
         list_of_dicts=choices_sheet.data, key=constants.LIST_NAME_S
     )
-    if 0 < len(choices):
-        json_dict[constants.CHOICES] = choices
     # To combine the warning into one message, the check for missing choices translation
     # columns is run with Survey sheet below.
 
-    # Make sure all the options have the required properties:
-    warnedabout = set()
-    for list_name, options in choices.items():
+    # Warn and remove invalid headers in case the form uses headers for notes.
+    invalid_headers = vc.validate_headers(choices_sheet.headers, warnings)
+    allow_duplicates = aliases.yes_no.get(
+        settings.get("allow_choice_duplicates", False), False
+    )
+    for options in choices.values():
+        vc.validate_choices(
+            options=options,
+            warnings=warnings,
+            allow_duplicates=allow_duplicates,
+        )
         for option in options:
-            if "name" not in option:
-                info = "[list_name : " + list_name + "]"
-                raise PyXFormError(
-                    "On the choices sheet there is a option with no name. " + info
-                )
-            if "label" not in option:
-                info = "[list_name : " + list_name + "]"
-                warnings.append(
-                    "On the choices sheet there is a option with no label. " + info
-                )
-            # chrislrobert's fix for a cryptic error message:
-            # see: https://code.google.com/p/opendatakit/issues/detail?id=833&start=200
-            option_keys = list(option.keys())
-            for headername in option_keys:
-                # Using warnings and removing the bad columns
-                # instead of throwing errors because some forms
-                # use choices column headers for notes.
-                if " " in headername:
-                    if headername not in warnedabout:
-                        warnedabout.add(headername)
-                        warnings.append(
-                            "On the choices sheet there is "
-                            + 'a column ("'
-                            + headername
-                            + '") with an illegal header. '
-                            + "Headers cannot include spaces."
-                        )
-                    del option[headername]
-                elif headername == "":
-                    warnings.append(
-                        "On the choices sheet there is a value"
-                        + " in a column with no header."
-                    )
-                    del option[headername]
-        list_name_choices = [option.get("name") for option in options]
-        if len(list_name_choices) != len(set(list_name_choices)):
-            duplicate_setting = settings.get("allow_choice_duplicates")
-            for v in Counter(list_name_choices).values():
-                if v > 1:
-                    if not duplicate_setting or duplicate_setting.capitalize() != "Yes":
-                        choice_duplicates = [
-                            item
-                            for item, count in Counter(list_name_choices).items()
-                            if count > 1
-                        ]
-
-                        if choice_duplicates:
-                            raise PyXFormError(
-                                "The name column for the '{}' choice list contains these duplicates: {}. Duplicate names "
-                                "will be impossible to identify in analysis unless a previous value in a cascading "
-                                "select differentiates them. If this is intentional, you can set the "
-                                "allow_choice_duplicates setting to 'yes'. Learn more: https://xlsform.org/#choice-names.".format(
-                                    list_name,
-                                    ", ".join(
-                                        [f"'{dupe}'" for dupe in choice_duplicates]
-                                    ),
-                                )
-                            )
+            for invalid_header in invalid_headers:
+                option.pop(invalid_header, None)
+            del option["__row"]
+
+    if 0 < len(choices):
+        json_dict[constants.CHOICES] = choices
 
     # ########## Entities sheet ###########
     entities_sheet = workbook_dict.get(constants.ENTITIES, [])
@@ -945,7 +910,7 @@ def workbook_to_json(
                     ROW_FORMAT_STRING % row_number + " Question or group with no name."
                 )
         question_name = str(row[constants.NAME])
-        if not is_valid_xml_tag(question_name):
+        if not is_xml_tag(question_name):
             if isinstance(question_name, bytes):
                 question_name = question_name.decode("utf-8")
 
@@ -1022,10 +987,7 @@ def workbook_to_json(
                 repeat_count_expression = new_json_dict.get("control", {}).get("jr:count")
                 if repeat_count_expression:
                     # Simple expressions don't require a new node, they can reference directly.
-                    simple_expression = is_single_token_expression(
-                        expression=repeat_count_expression, token_types=["PYXFORM_REF"]
-                    )
-                    if not simple_expression:
+                    if not is_pyxform_reference(value=repeat_count_expression):
                         generated_node_name = new_json_dict["name"] + "_count"
                         parent_children_array.append(
                             {
diff --git a/tests/parsing/__init__.py b/tests/parsing/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/tests/parsing/test_expression.py b/tests/parsing/test_expression.py
new file mode 100644
index 00000000..1fe3ad42
--- /dev/null
+++ b/tests/parsing/test_expression.py
@@ -0,0 +1,49 @@
+from pyxform.parsing.expression import is_xml_tag
+
+from tests.pyxform_test_case import PyxformTestCase
+
+positive = [
+    ("A", "Single uppercase letter"),
+    ("ab", "Lowercase letters"),
+    ("_u", "Leading underscore"),
+    ("A12", "Leading uppercase letter with digit"),
+    ("A-1.23", "Leading uppercase letter with hyphen, period, and digit"),
+    ("Name123-456", "Mixed case, digits, hyphen"),
+    ("𐐀n", "Leading unicode"),
+    ("Αλ", "Following unicode"),
+    ("name:name", "NCName, colon, NCName"),
+    ("name_with_colon:_and_extras", "NCName, colon, NCName (non-letter characters)"),
+    # Other special character tokens are excluded by ncname_regex.
+    ("nameor", "Contains another parser token (or)"),
+    ("nameand", "Contains another parser token (and)"),
+    ("namemod", "Contains another parser token (mod)"),
+    ("namediv", "Contains another parser token (div)"),
+]
+
+negative = [
+    ("", "Empty string"),
+    (" ", "Space"),
+    ("123name", "Leading digit"),
+    ("-name", "Leading hyphen"),
+    (".name", "Leading period"),
+    (":name", "Leading colon"),
+    ("name$", "Invalid character ($)"),
+    ("name with space", "Invalid character (space)"),
+    ("na@me", "Invalid character (@)"),
+    ("na#me", "Invalid character (#)"),
+    ("name:name:name", "Invalid structure (multiple colons)"),
+]
+
+
+class TestExpression(PyxformTestCase):
+    def test_is_xml_tag__positive(self):
+        """Should accept positive match cases i.e. valid xml tag names."""
+        for case, description in positive:
+            with self.subTest(case=case, description=description):
+                self.assertTrue(is_xml_tag(case))
+
+    def test_is_xml_tag__negative(self):
+        """Should reject negative match cases i.e. invalid xml tag names."""
+        for case, description in negative:
+            with self.subTest(case=case, description=description):
+                self.assertFalse(is_xml_tag(case))
diff --git a/tests/test_fields.py b/tests/test_fields.py
index 6c44fc9e..55591535 100644
--- a/tests/test_fields.py
+++ b/tests/test_fields.py
@@ -2,6 +2,8 @@
 Test duplicate survey question field name.
 """
 
+from pyxform.validators.pyxform import choices as vc
+
 from tests.pyxform_test_case import PyxformTestCase
 from tests.xpath_helpers.choices import xpc
 from tests.xpath_helpers.questions import xpq
@@ -57,9 +59,7 @@ def test_duplicate_choices_without_setting(self):
             |         | list            | b        | option c |
             """,
             errored=True,
-            error__contains=[
-                "The name column for the 'list' choice list contains these duplicates: 'b'"
-            ],
+            error__contains=[vc.INVALID_DUPLICATE.format(row=4)],
         )
 
     def test_multiple_duplicate_choices_without_setting(self):
@@ -77,8 +77,10 @@ def test_multiple_duplicate_choices_without_setting(self):
             """,
             errored=True,
             error__contains=[
-                "The name column for the 'list' choice list contains these duplicates: 'a', 'b'"
+                vc.INVALID_DUPLICATE.format(row=3),
+                vc.INVALID_DUPLICATE.format(row=5),
             ],
+            debug=True,
         )
 
     def test_duplicate_choices_with_setting_not_set_to_yes(self):
@@ -97,9 +99,7 @@ def test_duplicate_choices_with_setting_not_set_to_yes(self):
             |          | Duplicates   | Bob                       |
             """,
             errored=True,
-            error__contains=[
-                "The name column for the 'list' choice list contains these duplicates: 'b'"
-            ],
+            error__contains=[vc.INVALID_DUPLICATE.format(row=4)],
         )
 
     def test_duplicate_choices_with_allow_choice_duplicates_setting(self):
diff --git a/tests/test_sheet_columns.py b/tests/test_sheet_columns.py
index a5b85a42..3a7e708d 100644
--- a/tests/test_sheet_columns.py
+++ b/tests/test_sheet_columns.py
@@ -2,6 +2,8 @@
 Test XLSForm sheet names.
 """
 
+from pyxform.validators.pyxform import choices as vc
+
 from tests.pyxform_test_case import PyxformTestCase
 from tests.utils import prep_for_xml_contains
 from tests.xpath_helpers.choices import xpc
@@ -156,7 +158,7 @@ def test_invalid_choices_sheet_fails(self):
                 ]
             ),
             errored=True,
-            error__contains=["option with no name"],
+            error__contains=[vc.INVALID_NAME.format(row=2)],
         )
 
     def test_missing_list_name(self):
diff --git a/tests/test_xlsform_spec.py b/tests/test_xlsform_spec.py
index c3442fae..891c591b 100644
--- a/tests/test_xlsform_spec.py
+++ b/tests/test_xlsform_spec.py
@@ -1,3 +1,5 @@
+from pyxform.validators.pyxform import choices as vc
+
 from tests.pyxform_test_case import PyxformTestCase
 
 
@@ -61,10 +63,10 @@ def test_warnings__count(self):
         )
         self.maxDiff = 2000
         expected = [
-            "On the choices sheet there is a option with no label. [list_name : a_b]",
-            "On the choices sheet there is a option with no label. [list_name : a_b]",
-            "On the choices sheet there is a option with no label. [list_name : animals]",
-            "On the choices sheet there is a option with no label. [list_name : animals]",
+            vc.INVALID_LABEL.format(row=4),
+            vc.INVALID_LABEL.format(row=5),
+            vc.INVALID_LABEL.format(row=6),
+            vc.INVALID_LABEL.format(row=7),
             "[row : 9] Repeat has no label: {'name': 'repeat_test', 'type': 'begin repeat'}",
             "[row : 10] Group has no label: {'name': 'group_test', 'type': 'begin group'}",
             "[row : 17] Group has no label: {'name': 'name', 'type': 'begin group'}",
diff --git a/tests/xform_test_case/test_bugs.py b/tests/xform_test_case/test_bugs.py
index c022a812..c5db20d1 100644
--- a/tests/xform_test_case/test_bugs.py
+++ b/tests/xform_test_case/test_bugs.py
@@ -10,6 +10,7 @@
 from pyxform.errors import PyXFormError
 from pyxform.utils import has_external_choices
 from pyxform.validators.odk_validate import ODKValidateError, check_xform
+from pyxform.validators.pyxform import choices as vc
 from pyxform.xls2json import SurveyReader, parse_file_to_workbook_dict
 from pyxform.xls2json_backends import xlsx_to_dict
 from pyxform.xls2xform import convert
@@ -79,7 +80,11 @@ def test_conversion(self):
         )
         # The "column with no header" warning is probably not reachable since XLS/X
         # pre-processing ignores any columns without a header.
-        observed = [w for w in warnings if "Headers cannot include spaces" in w]
+        observed = [
+            w
+            for w in warnings
+            if w == vc.INVALID_HEADER.format(column="header with spaces")
+        ]
         self.assertEqual(1, len(observed), warnings)
 
     def test_values_with_spaces_are_cleaned(self):