Restructure evaluations to be self-contained
allenporter committed May 27, 2024
1 parent 96afa27 commit 33b4f7f
Showing 99 changed files with 2,879 additions and 417 deletions.
21 changes: 20 additions & 1 deletion .pre-commit-config.yaml
@@ -8,6 +8,11 @@ repos:
- id: check-yaml
args:
- --allow-multiple-documents
exclude: >
(?x)^(
.*/output/.*|
model_outputs/.*
)$
- id: check-added-large-files
- repo: https://github.com/psf/black
rev: 24.4.2
@@ -35,18 +40,32 @@ repos:
- id: codespell
args:
- --ignore-words-list=hass
exclude: >
(?x)^(
.*/output/.*|
model_outputs/.*
)$
- repo: https://github.com/adrienverge/yamllint.git
rev: v1.35.1
hooks:
- id: yamllint
exclude: '^tests/tool/testdata/.*\.yaml$'
exclude: >
(?x)^(
model_outputs/.*|
model_evals/.*/output/.*
)$
args:
- -c
- ".yaml-lint.yaml"
- repo: https://github.com/pre-commit/mirrors-prettier
rev: v3.1.0
hooks:
- id: prettier
exclude: >
(?x)^(
.*/output/.*|
model_outputs/.*
)$
- repo: https://github.com/asottile/setup-cfg-fmt
rev: v2.5.0
hooks:
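Editor's sketch (not part of the commit): the exclude values added above are Python verbose-mode regular expressions that pre-commit matches against each file path. A minimal illustration of how one of these patterns filters paths; the example paths are assumptions chosen for illustration only.

import re

# Same alternation added to the check-yaml, codespell, and prettier hooks above.
EXCLUDE = re.compile(
    r"""(?x)^(
        .*/output/.*|
        model_outputs/.*
    )$"""
)

# Hypothetical paths, for illustration only.
for path in (
    "model_outputs/anomaly/gemma.yaml",
    "model_evals/area_summary/output/area_summary_agent/gemma/home.yaml",
    "datasets/anomaly/anomaly.yaml",
):
    print(path, "-> skipped" if EXCLUDE.match(path) else "-> checked")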
4 changes: 2 additions & 2 deletions README.md
@@ -131,8 +131,8 @@ Human rater (me) scores the result quality:
- 2: Medium: Solid, not incorrect, though perhaps a missed opportunity
- 3: High: Good

See the [annotations/](annotations/) directory for the human eval procedure
using Doccano.
See the [script/](script/) directory for more details on preparing the data for
the human eval procedure using Doccano.

## Visualization (WIP)

67 changes: 0 additions & 67 deletions annotations/README.md

This file was deleted.

2 changes: 1 addition & 1 deletion datasets/device-actions/desert-retreat-us.yaml
@@ -3,7 +3,7 @@ home: desert-retreat-us
actions:
- sentences:
- Set thermostat to 73
- Set thermostat to 73 Farenheight
- Set thermostat to 73 Fahrenheit
device_states:
- name: Bedroom 1 Thermostat
area: Bedroom 1
52 changes: 0 additions & 52 deletions metrics/README.md

This file was deleted.

File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
@@ -3,7 +3,6 @@
from collections.abc import Generator, Callable
import logging
import pathlib
import asyncio
from dataclasses import dataclass
import hashlib
import uuid
@@ -17,7 +16,6 @@
from homeassistant.core import HomeAssistant



from .conftest import ConversationAgent, EvalRecordWriter
from .common import ModelConfig

@@ -26,7 +24,7 @@

MODEL_EVAL_OUTPUT = "model_outputs/anomaly"
DATASET_FILE = "datasets/anomaly/anomaly.yaml"
VALID_LABELS = ['normal', 'anomaly']
VALID_LABELS = ["normal", "anomaly"]

BASE_PROMPT = """
You are a Home Automation Agent that will classify the state of an area of a
@@ -52,11 +50,11 @@
@pytest.fixture(
name="model_id",
params=[
#"llama3",
# "llama3",
"gemma",
#"mistral-7b-instruct",
#"gemini-pro",
#"gpt-3.5",
# "mistral-7b-instruct",
# "gemini-pro",
# "gpt-3.5",
],
)
def model_id_fixture(request: pytest.FixtureRequest) -> str:
@@ -127,13 +125,13 @@ def mock_dataset_records() -> list[dict[str, str]]:
@pytest.fixture(name="eval_split")
def mock_eval_records(dataset_records: list[dict[str, str]]) -> list[dict[str, str]]:
"""Fixture to read the dataset yaml contennts."""
return dataset_records[:int(EVAL_RECORDS_SPLIT * len(dataset_records))]
return dataset_records[: int(EVAL_RECORDS_SPLIT * len(dataset_records))]


@pytest.fixture(name="fewshot_split")
def mock_fewshot_records(dataset_records: list[dict[str, str]]) -> list[dict[str, str]]:
"""Fixture to read the dataset yaml contennts."""
return dataset_records[int(EVAL_RECORDS_SPLIT * len(dataset_records)):]
return dataset_records[int(EVAL_RECORDS_SPLIT * len(dataset_records)) :]


@pytest.fixture(name="tasks_provider")
@@ -155,9 +153,10 @@ def func() -> Generator[LabelTask, None, None]:
return func



@pytest.fixture(name="system_prompt")
def mock_system_prompt(prompt_label: str, fewshot_split: list[dict[str, str]]) -> list[dict[str, str]]:
def mock_system_prompt(
prompt_label: str, fewshot_split: list[dict[str, str]]
) -> list[dict[str, str]]:
"""Fixture to read the dataset yaml contennts."""
records = fewshot_split
random.shuffle(records)
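Editor's sketch (not part of the commit): the eval_split and fewshot_split fixtures above carve the dataset into an evaluation slice and a few-shot slice around the same EVAL_RECORDS_SPLIT cut point. A self-contained sketch of that split; the 0.8 ratio and the example records are assumptions, since neither is visible in this diff.

EVAL_RECORDS_SPLIT = 0.8  # assumed value; the real constant is defined outside this hunk


def split_records(
    dataset_records: list[dict[str, str]],
) -> tuple[list[dict[str, str]], list[dict[str, str]]]:
    """Return (eval_split, fewshot_split) slices of the dataset records."""
    cut = int(EVAL_RECORDS_SPLIT * len(dataset_records))
    return dataset_records[:cut], dataset_records[cut:]


# Hypothetical records, for illustration only.
records = [{"label": "normal"}] * 8 + [{"label": "anomaly"}] * 2
eval_split, fewshot_split = split_records(records)
assert len(eval_split) == 8 and len(fewshot_split) == 2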
1 change: 1 addition & 0 deletions model_evals/area_summary/__init__.py
@@ -0,0 +1 @@
"""Area summary evaluations."""
5 changes: 5 additions & 0 deletions model_evals/area_summary/conftest.py
@@ -0,0 +1,5 @@
"""Configure test fixtures."""

pytest_plugins = [
"model_evals.common.conftest",
]
File renamed without changes.
@@ -1,4 +1,7 @@
"""An evaluation for the Summary Agent custom component summarizing an area with pruned context."""
"""An evaluation for the Summary Agent custom component summarizing an area with pruned context.
This generates the tasks to evaluate based on the active areas and devices in the home.
"""

from collections.abc import Generator, Callable, Awaitable
import logging
@@ -23,17 +26,15 @@

from pytest_homeassistant_custom_component.common import MockConfigEntry

from custom_components.synthetic_home.home_model.device_types import (
load_restorable_attributes,
)
from synthetic_home.device_types import load_device_type_registry

from .conftest import ConversationAgent, EvalRecordWriter
from .common import HomeAssistantContext, ModelConfig
from model_evals.common.conftest import ConversationAgent, EvalRecordWriter
from model_evals.common.common import HomeAssistantContext


_LOGGER = logging.getLogger(__name__)

MODEL_EVAL_OUTPUT = "model_outputs/area_summary_agent"
MODEL_EVAL_OUTPUT = "model_evals/area_summary/output/area_summary_agent"

STRIP_PREFIX = "Summary: "

@@ -97,31 +98,28 @@ def cleanup_response(response: str) -> str:
return response[index + len(STRIP_PREFIX) :]


@pytest.fixture(name="eval_record_writer")
def eval_record_writer_fixture(
hass: HomeAssistant, model_config: ModelConfig, synthetic_home_config: str
) -> Generator[EvalRecordWriter, None, None]:
"""Fixture that prepares the eval output writer."""
writer = EvalRecordWriter(
pathlib.Path(MODEL_EVAL_OUTPUT) / model_config.model_id,
pathlib.Path(synthetic_home_config).name,
)
writer.open()
yield writer
writer.close()
@pytest.fixture(name="eval_output_file")
def eval_output_file_fixture(model_id: str, synthetic_home_config: str) -> pathlib.Path:
"""Sets the output filename for the evaluation run.
This output file needs to be unique across the test instances to avoid overwriting. For
example, if you add a parameter based on the system prompt, then this needs to create
a separate file containing an id of the prompt.
"""
home_file = pathlib.Path(synthetic_home_config).name
return pathlib.Path(f"{MODEL_EVAL_OUTPUT}/{model_id}/{home_file}")
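Editor's sketch (hypothetical, not part of the commit): the docstring above asks that the output path stay unique across test parameterizations. One way a prompt identifier could be folded into the path; prompt_id and the resulting directory layout are assumptions.

import pathlib

MODEL_EVAL_OUTPUT = "model_evals/area_summary/output/area_summary_agent"


def eval_output_file(model_id: str, synthetic_home_config: str, prompt_id: str) -> pathlib.Path:
    """Build a per-model, per-prompt, per-home output path."""
    home_file = pathlib.Path(synthetic_home_config).name
    return pathlib.Path(MODEL_EVAL_OUTPUT) / model_id / prompt_id / home_file


# e.g. model_evals/area_summary/output/area_summary_agent/gemma/prompt-v1/home1.yaml
print(eval_output_file("gemma", "datasets/home1.yaml", "prompt-v1"))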

@dataclass
class SyntheticDeviceState:
class DeviceState:
"""Information needed to set the synthetic state for an evaluation task."""

device_name: str
restorable_attribute: str
device_state: str

@property
def state_label(self) -> str:
"""Identifier about the state of the devices under evaluation"""
return f"{slugify(self.device_name)}-{slugify(self.restorable_attribute)}"
return f"{slugify(self.device_name)}-{slugify(self.device_state)}"


@dataclass
@@ -140,7 +138,7 @@ class AreaSummaryTask:
area_name: str
"""Area name within the home that is being summarized and evaluated."""

device_state: SyntheticDeviceState
device_state: DeviceState
"""The device state details about the state of the devices under evaluation"""

@property
@@ -165,23 +163,27 @@ def tasks_provider_fixture(
area_entries = list(area_registry.async_list_areas())
_LOGGER.info("Loaded %s areas to evaluate", len(area_entries))

device_type_registry = load_device_type_registry()

def func() -> Generator[AreaSummaryTask, None, None]:
for area_entry in area_entries:
area_name = area_entry.name

if (devices := synthetic_home_config["devices"].get(area_name)) is None:
return
for device_info in devices:
attributes = load_restorable_attributes(device_info["device_type"])
for attribute in attributes:
device_state = SyntheticDeviceState(device_info["name"], attribute)

device_type = device_type_registry.device_types[
device_info["device_type"]
]
for device_state in device_type.device_states:
yield AreaSummaryTask(
home_id=home_id,
home_name=home_name,
area_id=area_entry.id,
area_name=area_name,
device_state=device_state,
device_state=DeviceState(
device_info["name"], device_state.name
),
)

return func
@@ -219,7 +221,7 @@ async def func(area_summary_task: AreaSummaryTask) -> None:
_LOGGER.info(
"Changing device state for %s to %s",
device_state.device_name,
device_state.restorable_attribute,
device_state.device_state,
)
await hass.services.async_call(
"synthetic_home",
@@ -228,7 +230,7 @@ async def func(area_summary_task: AreaSummaryTask) -> None:
"config_entry_id": synthetic_home_config_entry.entry_id,
"area": area_summary_task.area_name,
"device": device_state.device_name,
"restorable_attribute_key": device_state.restorable_attribute,
"device_state_key": device_state.device_state,
},
blocking=True,
)
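Editor's sketch (not part of the commit): the SyntheticDeviceState to DeviceState rename above also changes what state_label produces. A small illustration of the label format; it requires the homeassistant package for slugify, and the device name and state are assumptions, since in the evals they come from the synthetic home's device type registry.

from dataclasses import dataclass

from homeassistant.util import slugify


@dataclass
class DeviceState:
    """Information needed to set the synthetic state for an evaluation task."""

    device_name: str
    device_state: str

    @property
    def state_label(self) -> str:
        """Identifier for the state of the device under evaluation."""
        return f"{slugify(self.device_name)}-{slugify(self.device_state)}"


# Illustrative values only.
print(DeviceState("Bedroom 1 Thermostat", "heating").state_label)
# -> bedroom_1_thermostat-heating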