Restructure evaluations to be self-contained
allenporter committed May 27, 2024
1 parent 96afa27 commit 33b4f7f
Showing 99 changed files with 2,879 additions and 417 deletions.
21 changes: 20 additions & 1 deletion .pre-commit-config.yaml
@@ -8,6 +8,11 @@ repos:
- id: check-yaml
args:
- --allow-multiple-documents
exclude: >
(?x)^(
.*/output/.*|
model_outputs/.*
)$
- id: check-added-large-files
- repo: https://github.com/psf/black
rev: 24.4.2
@@ -35,18 +40,32 @@ repos:
- id: codespell
args:
- --ignore-words-list=hass
exclude: >
(?x)^(
.*/output/.*|
model_outputs/.*
)$
- repo: https://github.com/adrienverge/yamllint.git
rev: v1.35.1
hooks:
- id: yamllint
exclude: '^tests/tool/testdata/.*\.yaml$'
exclude: >
(?x)^(
model_outputs/.*|
model_evals/.*/output/.*
)$
args:
- -c
- ".yaml-lint.yaml"
- repo: https://github.com/pre-commit/mirrors-prettier
rev: v3.1.0
hooks:
- id: prettier
exclude: >
(?x)^(
.*/output/.*|
model_outputs/.*
)$
- repo: https://github.com/asottile/setup-cfg-fmt
rev: v2.5.0
hooks:
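Editor's sketch (not part of the commit): the exclude values added above are Python verbose-mode regular expressions that pre-commit matches against each file path. A minimal illustration of how one of these patterns filters paths; the example paths are assumptions chosen for illustration only.

import re

# Same alternation added to the check-yaml, codespell, and prettier hooks above.
EXCLUDE = re.compile(
    r"""(?x)^(
        .*/output/.*|
        model_outputs/.*
    )$"""
)

# Hypothetical paths, for illustration only.
for path in (
    "model_outputs/anomaly/gemma.yaml",
    "model_evals/area_summary/output/area_summary_agent/gemma/home.yaml",
    "datasets/anomaly/anomaly.yaml",
):
    print(path, "-> skipped" if EXCLUDE.match(path) else "-> checked")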
4 changes: 2 additions & 2 deletions README.md
@@ -131,8 +131,8 @@ Human rater (me) scores the result quality:
- 2: Medium: Solid, not incorrect, though perhaps a missed opportunity
- 3: High: Good

See the [annotations/](annotations/) directory for the human eval procedure
using Doccano.
See the [script/](script/) directory for more details on preparing the data for
the human eval procedure using Doccano.

## Visualization (WIP)

67 changes: 0 additions & 67 deletions annotations/README.md

This file was deleted.

2 changes: 1 addition & 1 deletion datasets/device-actions/desert-retreat-us.yaml
@@ -3,7 +3,7 @@ home: desert-retreat-us
actions:
- sentences:
- Set thermostat to 73
- Set thermostat to 73 Farenheight
- Set thermostat to 73 Fahrenheit
device_states:
- name: Bedroom 1 Thermostat
area: Bedroom 1
52 changes: 0 additions & 52 deletions metrics/README.md

This file was deleted.

File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
@@ -3,7 +3,6 @@
from collections.abc import Generator, Callable
import logging
import pathlib
import asyncio
from dataclasses import dataclass
import hashlib
import uuid
@@ -17,7 +16,6 @@
from homeassistant.core import HomeAssistant



from .conftest import ConversationAgent, EvalRecordWriter
from .common import ModelConfig

@@ -26,7 +24,7 @@

MODEL_EVAL_OUTPUT = "model_outputs/anomaly"
DATASET_FILE = "datasets/anomaly/anomaly.yaml"
VALID_LABELS = ['normal', 'anomaly']
VALID_LABELS = ["normal", "anomaly"]

BASE_PROMPT = """
You are a Home Automation Agent that will classify the state of an area of a
@@ -52,11 +50,11 @@
@pytest.fixture(
name="model_id",
params=[
#"llama3",
# "llama3",
"gemma",
#"mistral-7b-instruct",
#"gemini-pro",
#"gpt-3.5",
# "mistral-7b-instruct",
# "gemini-pro",
# "gpt-3.5",
],
)
def model_id_fixture(request: pytest.FixtureRequest) -> str:
@@ -127,13 +125,13 @@ def mock_dataset_records() -> list[dict[str, str]]:
@pytest.fixture(name="eval_split")
def mock_eval_records(dataset_records: list[dict[str, str]]) -> list[dict[str, str]]:
"""Fixture to read the dataset yaml contennts."""
return dataset_records[:int(EVAL_RECORDS_SPLIT * len(dataset_records))]
return dataset_records[: int(EVAL_RECORDS_SPLIT * len(dataset_records))]


@pytest.fixture(name="fewshot_split")
def mock_fewshot_records(dataset_records: list[dict[str, str]]) -> list[dict[str, str]]:
"""Fixture to read the dataset yaml contennts."""
return dataset_records[int(EVAL_RECORDS_SPLIT * len(dataset_records)):]
return dataset_records[int(EVAL_RECORDS_SPLIT * len(dataset_records)) :]


@pytest.fixture(name="tasks_provider")
@@ -155,9 +153,10 @@ def func() -> Generator[LabelTask, None, None]:
return func



@pytest.fixture(name="system_prompt")
def mock_system_prompt(prompt_label: str, fewshot_split: list[dict[str, str]]) -> list[dict[str, str]]:
def mock_system_prompt(
prompt_label: str, fewshot_split: list[dict[str, str]]
) -> list[dict[str, str]]:
"""Fixture to read the dataset yaml contennts."""
records = fewshot_split
random.shuffle(records)
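Editor's sketch (not part of the commit): the eval_split and fewshot_split fixtures above carve the dataset into an evaluation slice and a few-shot slice around the same EVAL_RECORDS_SPLIT cut point. A self-contained sketch of that split; the 0.8 ratio and the example records are assumptions, since neither is visible in this diff.

EVAL_RECORDS_SPLIT = 0.8  # assumed value; the real constant is defined outside this hunk


def split_records(
    dataset_records: list[dict[str, str]],
) -> tuple[list[dict[str, str]], list[dict[str, str]]]:
    """Return (eval_split, fewshot_split) slices of the dataset records."""
    cut = int(EVAL_RECORDS_SPLIT * len(dataset_records))
    return dataset_records[:cut], dataset_records[cut:]


# Hypothetical records, for illustration only.
records = [{"label": "normal"}] * 8 + [{"label": "anomaly"}] * 2
eval_split, fewshot_split = split_records(records)
assert len(eval_split) == 8 and len(fewshot_split) == 2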
1 change: 1 addition & 0 deletions model_evals/area_summary/__init__.py
@@ -0,0 +1 @@
"""Area summary evaluations."""
5 changes: 5 additions & 0 deletions model_evals/area_summary/conftest.py
@@ -0,0 +1,5 @@
"""Configure test fixtures."""

pytest_plugins = [
"model_evals.common.conftest",
]
File renamed without changes.
@@ -1,4 +1,7 @@
"""An evaluation for the Summary Agent custom component summarizing an area with pruned context."""
"""An evaluation for the Summary Agent custom component summarizing an area with pruned context.
This generates the tasks to evaluate based on the active areas and devices in the home.
"""

from collections.abc import Generator, Callable, Awaitable
import logging
@@ -23,17 +26,15 @@

from pytest_homeassistant_custom_component.common import MockConfigEntry

from custom_components.synthetic_home.home_model.device_types import (
load_restorable_attributes,
)
from synthetic_home.device_types import load_device_type_registry

from .conftest import ConversationAgent, EvalRecordWriter
from .common import HomeAssistantContext, ModelConfig
from model_evals.common.conftest import ConversationAgent, EvalRecordWriter
from model_evals.common.common import HomeAssistantContext


_LOGGER = logging.getLogger(__name__)

MODEL_EVAL_OUTPUT = "model_outputs/area_summary_agent"
MODEL_EVAL_OUTPUT = "model_evals/area_summary/output/area_summary_agent"

STRIP_PREFIX = "Summary: "

@@ -97,31 +98,28 @@ def cleanup_response(response: str) -> str:
return response[index + len(STRIP_PREFIX) :]


@pytest.fixture(name="eval_record_writer")
def eval_record_writer_fixture(
hass: HomeAssistant, model_config: ModelConfig, synthetic_home_config: str
) -> Generator[EvalRecordWriter, None, None]:
"""Fixture that prepares the eval output writer."""
writer = EvalRecordWriter(
pathlib.Path(MODEL_EVAL_OUTPUT) / model_config.model_id,
pathlib.Path(synthetic_home_config).name,
)
writer.open()
yield writer
writer.close()
@pytest.fixture(name="eval_output_file")
def eval_output_file_fixture(model_id: str, synthetic_home_config: str) -> pathlib.Path:
"""Sets the output filename for the evaluation run.
This output file needs to be unique across the test instances to avoid overwriting. For
example, if you add a parameter based on the system prompt, then this needs to create
a separate file containing an id of the prompt.
"""
home_file = pathlib.Path(synthetic_home_config).name
return pathlib.Path(f"{MODEL_EVAL_OUTPUT}/{model_id}/{home_file}")
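Editor's sketch (hypothetical, not part of the commit): the docstring above asks that the output path stay unique across test parameterizations. One way a prompt identifier could be folded into the path; prompt_id and the resulting directory layout are assumptions.

import pathlib

MODEL_EVAL_OUTPUT = "model_evals/area_summary/output/area_summary_agent"


def eval_output_file(model_id: str, synthetic_home_config: str, prompt_id: str) -> pathlib.Path:
    """Build a per-model, per-prompt, per-home output path."""
    home_file = pathlib.Path(synthetic_home_config).name
    return pathlib.Path(MODEL_EVAL_OUTPUT) / model_id / prompt_id / home_file


# e.g. model_evals/area_summary/output/area_summary_agent/gemma/prompt-v1/home1.yaml
print(eval_output_file("gemma", "datasets/home1.yaml", "prompt-v1"))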

@dataclass
class SyntheticDeviceState:
class DeviceState:
"""Information needed to set the synthetic state for an evaluation task."""

device_name: str
restorable_attribute: str
device_state: str

@property
def state_label(self) -> str:
"""Identifier about the state of the devices under evaluation"""
return f"{slugify(self.device_name)}-{slugify(self.restorable_attribute)}"
return f"{slugify(self.device_name)}-{slugify(self.device_state)}"


@dataclass
@@ -140,7 +138,7 @@ class AreaSummaryTask:
area_name: str
"""Area name within the home that is being summarized and evaluated."""

device_state: SyntheticDeviceState
device_state: DeviceState
"""The device state details about the state of the devices under evaluation"""

@property
@@ -165,23 +163,27 @@ def tasks_provider_fixture(
area_entries = list(area_registry.async_list_areas())
_LOGGER.info("Loaded %s areas to evaluate", len(area_entries))

device_type_registry = load_device_type_registry()

def func() -> Generator[AreaSummaryTask, None, None]:
for area_entry in area_entries:
area_name = area_entry.name

if (devices := synthetic_home_config["devices"].get(area_name)) is None:
return
for device_info in devices:
attributes = load_restorable_attributes(device_info["device_type"])
for attribute in attributes:
device_state = SyntheticDeviceState(device_info["name"], attribute)

device_type = device_type_registry.device_types[
device_info["device_type"]
]
for device_state in device_type.device_states:
yield AreaSummaryTask(
home_id=home_id,
home_name=home_name,
area_id=area_entry.id,
area_name=area_name,
device_state=device_state,
device_state=DeviceState(
device_info["name"], device_state.name
),
)

return func
@@ -219,7 +221,7 @@ async def func(area_summary_task: AreaSummaryTask) -> None:
_LOGGER.info(
"Changing device state for %s to %s",
device_state.device_name,
device_state.restorable_attribute,
device_state.device_state,
)
await hass.services.async_call(
"synthetic_home",
@@ -228,7 +230,7 @@ async def func(area_summary_task: AreaSummaryTask) -> None:
"config_entry_id": synthetic_home_config_entry.entry_id,
"area": area_summary_task.area_name,
"device": device_state.device_name,
"restorable_attribute_key": device_state.restorable_attribute,
"device_state_key": device_state.device_state,
},
blocking=True,
)
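Editor's sketch (not part of the commit): the SyntheticDeviceState to DeviceState rename above also changes what state_label produces. A small illustration of the label format; it requires the homeassistant package for slugify, and the device name and state are assumptions, since in the evals they come from the synthetic home's device type registry.

from dataclasses import dataclass

from homeassistant.util import slugify


@dataclass
class DeviceState:
    """Information needed to set the synthetic state for an evaluation task."""

    device_name: str
    device_state: str

    @property
    def state_label(self) -> str:
        """Identifier for the state of the device under evaluation."""
        return f"{slugify(self.device_name)}-{slugify(self.device_state)}"


# Illustrative values only.
print(DeviceState("Bedroom 1 Thermostat", "heating").state_label)
# -> bedroom_1_thermostat-heating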