
Commit

fix: minor eval refactor (improved type names), clarified python tool instructions
ErikBjare committed Sep 19, 2024
1 parent a837b32 commit e0c79a4
Showing 14 changed files with 108 additions and 84 deletions.
2 changes: 1 addition & 1 deletion Makefile
@@ -22,7 +22,7 @@ test:
@# if SLOW is not set, pass `-m "not slow"` to skip slow tests
poetry run pytest ${SRCDIRS} -v --log-level INFO --durations=5 \
--cov=gptme --cov-report=xml --cov-report=term-missing --cov-report=html --junitxml=junit.xml \
-n 8 \
-n 16 \
$(if $(EVAL), , -m "not eval") \
$(if $(SLOW), --timeout 60 --retries 2 --retry-delay 5, --timeout 5 -m "not slow and not eval") \
$(if $(PROFILE), --profile-svg)
38 changes: 21 additions & 17 deletions gptme/eval/main.py
@@ -20,7 +20,7 @@
from ..message import len_tokens
from .run import run_evals
from .suites import suites, tests_default, tests_map
from .types import CaseResult, ExecResult, ExecTest
from .types import CaseResult, EvalResult, EvalSpec

# Configure logging, including fully-qualified module names
logging.basicConfig(
@@ -34,7 +34,7 @@
project_dir = Path(__file__).parent.parent.parent


def print_model_results(model_results: dict[str, list[ExecResult]]):
def print_model_results(model_results: dict[str, list[EvalResult]]):
total_tests = 0
total_tokens = 0

@@ -70,7 +70,7 @@ def print_model_results(model_results: dict[str, list[ExecResult]]):
print(f"Completed {total_tests} tests in {total_tokens}tok")


def print_model_results_table(model_results: dict[str, list[ExecResult]]):
def print_model_results_table(model_results: dict[str, list[EvalResult]]):
test_names = {
result.name for results in model_results.values() for result in results
}
@@ -120,19 +120,23 @@ def main(
):
"""
Run evals for gptme.
Pass eval or suite names to run, or result files to print.
Pass test names to run, or result files to print.
Output from evals will be captured, unless a single eval is run, and saved to the results directory.
"""
# init
multiprocessing_logging.install_mp_handler()

models = _model or [
"openai/gpt-4o",
"openai/gpt-4o-mini",
"anthropic/claude-3-5-sonnet-20240620",
"anthropic/claude-3-haiku-20240307",
"openrouter/meta-llama/llama-3.1-405b-instruct",
]

results_files = [f for f in eval_names_or_result_files if f.endswith(".csv")]
eval_names = [f for f in eval_names_or_result_files if f not in results_files]
if results_files:
for results_file in results_files:
p = Path(results_file)
@@ -148,20 +152,20 @@ def main(
sys.exit(1)
sys.exit(0)

tests_to_run: list[ExecTest] = []
for test_name in eval_names_or_result_files:
if test_name in tests_map:
tests_to_run.append(tests_map[test_name])
elif test_name in suites:
tests_to_run.extend(suites[test_name])
evals_to_run: list[EvalSpec] = []
for eval_name in eval_names:
if test := tests_map.get(eval_name):
evals_to_run.append(test)
elif suite := suites.get(eval_name) or suites.get(eval_name.replace("-", "_")):
evals_to_run.extend(suite)
else:
raise ValueError(f"Test {test_name} not found")
raise ValueError(f"Test {eval_name} not found")

if not tests_to_run:
tests_to_run = tests_default
if not evals_to_run:
evals_to_run = tests_default

print("=== Running evals ===")
model_results = run_evals(tests_to_run, models, timeout, parallel)
model_results = run_evals(evals_to_run, models, timeout, parallel)
print("\n=== Finished ===\n")

print("\n=== Model Results ===")
@@ -211,7 +215,7 @@ def read_log_file(file_path: Path) -> str:
return ""


def read_results_from_csv(filename: str) -> dict[str, list[ExecResult]]:
def read_results_from_csv(filename: str) -> dict[str, list[EvalResult]]:
model_results = defaultdict(list)
results_dir = Path(filename).parent
with open(filename, newline="") as csvfile:
@@ -220,7 +224,7 @@ def read_results_from_csv(filename: str) -> dict[str, list[ExecResult]]:
model = row["Model"]
test_dir = results_dir / model / row["Test"]

result = ExecResult(
result = EvalResult(
name=row["Test"],
status="success" if row["Passed"] == "true" else "error",
results=list(_read_case_results(test_dir / "cases.csv")),
@@ -238,7 +242,7 @@ def read_results_from_csv(filename: str) -> dict[str, list[ExecResult]]:
return dict(model_results)


def write_results(model_results: dict[str, list[ExecResult]]):
def write_results(model_results: dict[str, list[EvalResult]]):
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
# get current commit hash and dirty status, like: a8b2ef0-dirty
# TODO: don't assume we are in the gptme repo, use other version identifiers if available
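For reference, a minimal stand-alone sketch of the eval-name resolution introduced above, with tiny made-up registries instead of gptme's real tests_map and suites; it illustrates why the hyphen-to-underscore fallback lets the init_projects suite also be addressed as init-projects:

tests_map = {"hello": {"name": "hello"}, "init-git": {"name": "init-git"}}
suites = {"init_projects": [{"name": "init-git"}]}


def resolve_evals(eval_names: list[str]) -> list[dict]:
    # mirrors the lookup order above: exact test name, then suite name,
    # then the suite name with hyphens normalized to underscores
    evals_to_run: list[dict] = []
    for eval_name in eval_names:
        if test := tests_map.get(eval_name):
            evals_to_run.append(test)
        elif suite := suites.get(eval_name) or suites.get(eval_name.replace("-", "_")):
            evals_to_run.extend(suite)
        else:
            raise ValueError(f"Test {eval_name} not found")
    return evals_to_run


print(resolve_evals(["hello", "init-projects"]))
# [{'name': 'hello'}, {'name': 'init-git'}]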
28 changes: 14 additions & 14 deletions gptme/eval/run.py
@@ -19,8 +19,8 @@
from .execenv import SimpleExecutionEnv
from .types import (
CaseResult,
ExecResult,
ExecTest,
EvalResult,
EvalSpec,
ResultContext,
Status,
)
@@ -52,8 +52,8 @@ class SyncedDict(TypedDict):


def run_evals(
tests: list[ExecTest], models: list[str], timeout: int, parallel: int
) -> dict[str, list[ExecResult]]:
evals: list[EvalSpec], models: list[str], timeout: int, parallel: int
) -> dict[str, list[EvalResult]]:
"""
Run evals for a list of tests.
"""
@@ -67,14 +67,14 @@ def run_evals(
else:
cleanup_on_sigterm()

n_runs = len(tests) * len(models)
model_results: dict[str, dict[str, ExecResult]] = defaultdict(dict)
n_runs = len(evals) * len(models)
model_results: dict[str, dict[str, EvalResult]] = defaultdict(dict)
parallel = min(n_runs, parallel)
with ProcessPoolExecutor(parallel) as executor:
futures = []
future_to_model_test = {}
for model in models:
for test in tests:
for test in evals:
future = executor.submit(
execute,
test,
@@ -103,7 +103,7 @@ def _handle_future(future: Future):
logger.exception(
f"Test {test_name} for model {model} generated an exception when trying to get result"
)
result = ExecResult(
result = EvalResult(
name=test_name,
status=status,
results=[],
@@ -116,7 +116,7 @@ def _handle_future(future: Future):
model_results[model][test_name] = result

# worst-case run time, with some buffer to account for overhead
max_timeout = timeout * len(tests) / parallel + 10
max_timeout = timeout * len(evals) / parallel + 10
completed = set()
try:
# TODO: can we do better than this? handle timeouts within futures instead?
@@ -147,19 +147,19 @@ def _handle_future(future: Future):
process.terminate()
process.join()

model_results_final: dict[str, list[ExecResult]] = defaultdict(list)
model_results_final: dict[str, list[EvalResult]] = defaultdict(list)
for model in model_results:
# sort results by test order
model_results_final[model] = sorted(
model_results[model].values(),
key=lambda result: [test["name"] for test in tests].index(result.name),
key=lambda result: [test["name"] for test in evals].index(result.name),
)

return model_results_final


# TODO: rewrite to run in Docker? Would help with capturing output + process management.
def execute(test: ExecTest, agent: Agent, timeout: int, parallel: bool) -> ExecResult:
def execute(test: EvalSpec, agent: Agent, timeout: int, parallel: bool) -> EvalResult:
"""
Executes the code for a specific model with a timeout.
"""
@@ -206,7 +206,7 @@ def execute(test: ExecTest, agent: Agent, timeout: int, parallel: bool) -> ExecR
gen_stderr = result.get("stderr", "")
else:
logger.error("No result in shared dictionary")
return ExecResult(
return EvalResult(
name=test["name"],
status="error",
results=[],
Expand Down Expand Up @@ -256,7 +256,7 @@ def execute(test: ExecTest, agent: Agent, timeout: int, parallel: bool) -> ExecR
results = []
stdout_run, stderr_run = "", ""

return ExecResult(
return EvalResult(
name=test["name"],
status=status,
results=results,
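For context on the signature changes above, here is a simplified, self-contained sketch of the process-pool pattern run_evals follows, including the worst-case deadline derived from the per-eval timeout; the square_task function and the numbers are placeholders, not gptme code:

from concurrent.futures import ProcessPoolExecutor, TimeoutError, as_completed


def square_task(n: int) -> int:
    # stand-in for running one eval
    return n * n


def run_all(tasks: list[int], timeout: int, parallel: int) -> dict[int, int]:
    results: dict[int, int] = {}
    parallel = min(len(tasks), parallel)
    # worst-case run time, with some buffer to account for overhead
    max_timeout = timeout * len(tasks) / parallel + 10
    with ProcessPoolExecutor(parallel) as executor:
        futures = {executor.submit(square_task, t): t for t in tasks}
        try:
            for future in as_completed(futures, timeout=max_timeout):
                results[futures[future]] = future.result()
        except TimeoutError:
            # anything still running past the deadline is simply not collected here
            pass
    return results


if __name__ == "__main__":
    print(run_all([1, 2, 3], timeout=5, parallel=2))  # {1: 1, 2: 4, 3: 9} (order may vary)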
10 changes: 5 additions & 5 deletions gptme/eval/suites/__init__.py
@@ -1,16 +1,16 @@
from ..types import ExecTest
from ..types import EvalSpec
from .basic import tests as tests_basic
from .browser import tests as tests_browser
from .init_projects import tests as tests_init_projects

suites: dict[str, list[ExecTest]] = {
suites: dict[str, list[EvalSpec]] = {
"basic": tests_basic,
"init_projects": tests_init_projects,
"browser": tests_browser,
}

tests: list[ExecTest] = [test for suite in suites.values() for test in suite]
tests_map: dict[str, ExecTest] = {test["name"]: test for test in tests}
tests: list[EvalSpec] = [test for suite in suites.values() for test in suite]
tests_map: dict[str, EvalSpec] = {test["name"]: test for test in tests}

tests_default_ids: list[str] = [
"hello",
@@ -19,4 +19,4 @@
"prime100",
"init-git",
]
tests_default: list[ExecTest] = [tests_map[test_id] for test_id in tests_default_ids]
tests_default: list[EvalSpec] = [tests_map[test_id] for test_id in tests_default_ids]
12 changes: 6 additions & 6 deletions gptme/eval/suites/basic.py
@@ -1,7 +1,7 @@
from typing import TYPE_CHECKING

if TYPE_CHECKING:
from gptme.eval.main import ExecTest
from gptme.eval.main import EvalSpec


def correct_output_hello_world(ctx):
@@ -28,30 +28,30 @@ def check_output_hello_ask(ctx):
return "Hello, Erik!" in ctx.stdout


tests: list["ExecTest"] = [
tests: list["EvalSpec"] = [
{
"name": "hello",
"files": {},
"run": "python hello.py",
"prompt": "write a script hello.py which prints 'Hello, world!'",
"prompt": 'write a script hello.py which prints "Hello, world!"',
"expect": {
"correct output": correct_output_hello_world,
"correct file": check_exists_hello,
},
},
{
"name": "hello-patch",
"files": {"hello.py": "print('Hello, world!')"},
"files": {"hello.py": 'print("Hello, world!")'},
"run": "python hello.py",
"prompt": "Patch the code in hello.py to print 'Hello, human!'",
"prompt": 'Patch the code in hello.py to print "Hello, human!"',
"expect": {
"correct output": correct_output_hello_human,
"correct file": check_exists_hello,
},
},
{
"name": "hello-ask",
"files": {"hello.py": "print('Hello, world!')"},
"files": {"hello.py": 'print("Hello, world!")'},
"run": "echo 'Erik' | python hello.py",
# TODO: work around the "don't try to execute it" part by improving gptme such that it just gives EOF to stdin in non-interactive mode
"prompt": "modify hello.py to ask the user for their name and print 'Hello, <name>!'. don't try to execute it",
4 changes: 2 additions & 2 deletions gptme/eval/suites/browser.py
@@ -1,14 +1,14 @@
from typing import TYPE_CHECKING

if TYPE_CHECKING:
from gptme.eval.main import ExecTest
from gptme.eval.main import EvalSpec


def check_output_erik(ctx):
return "Erik" in ctx.stdout


tests: list["ExecTest"] = [
tests: list["EvalSpec"] = [
{
"name": "whois-superuserlabs-ceo",
"files": {},
4 changes: 2 additions & 2 deletions gptme/eval/suites/init_projects.py
@@ -1,7 +1,7 @@
from typing import TYPE_CHECKING

if TYPE_CHECKING:
from gptme.eval.main import ExecTest
from gptme.eval.main import EvalSpec


def check_clean_exit(ctx):
@@ -41,7 +41,7 @@ def check_exists_main(ctx):
return "main.py" in ctx.files


tests: list["ExecTest"] = [
tests: list["EvalSpec"] = [
{
"name": "init-git",
"files": {},
8 changes: 4 additions & 4 deletions gptme/eval/types.py
@@ -31,9 +31,9 @@ class CaseResult:


@dataclass
class ExecResult:
class EvalResult:
"""
Result of executing a prompt.
Result of executing an eval.
"""

name: str
@@ -46,9 +46,9 @@ class ExecResult:
run_stderr: str


class ExecTest(TypedDict):
class EvalSpec(TypedDict):
"""
Test case for executing a prompt.
Specification for an eval/test case.
"""

name: str
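To make the rename concrete, here is a heavily abridged illustration of how the two types relate; the field sets are incomplete stand-ins, and the full definitions live in gptme/eval/types.py:

from dataclasses import dataclass, field
from typing import Callable, TypedDict


class EvalSpec(TypedDict, total=False):
    """Specification for an eval/test case (formerly ExecTest)."""

    name: str
    files: dict[str, str]
    run: str
    prompt: str
    expect: dict[str, Callable]


@dataclass
class EvalResult:
    """Result of executing an eval (formerly ExecResult)."""

    name: str
    status: str
    results: list = field(default_factory=list)


spec: EvalSpec = {
    "name": "hello",
    "files": {},
    "run": "python hello.py",
    "prompt": 'write a script hello.py which prints "Hello, world!"',
}
print(EvalResult(name=spec["name"], status="success"))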
5 changes: 3 additions & 2 deletions gptme/llm.py
@@ -92,10 +92,11 @@ def print_clear():
# need to flush stdout to get the print to show up
sys.stdout.flush()

# TODO: make this more robust/general, maybe with a callback that runs on each char/chunk
# pause inference on finished code-block, letting user run the command before continuing
tooluses = list(ToolUse.iter_from_content(output))
if tooluses:
logger.debug("Found tool use, breaking")
if tooluses and any(tooluse.is_runnable for tooluse in tooluses):
logger.warning("Found tool use, breaking")
break
except KeyboardInterrupt:
return Message("assistant", output + "... ^C Interrupted")
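A stand-alone sketch of the behavior the llm.py change is after, pausing generation only once a runnable code block has fully streamed in; the regex and the RUNNABLE_LANGS set are assumptions for illustration, whereas gptme itself relies on ToolUse.iter_from_content and is_runnable:

import re

# assumption for illustration; not gptme's actual list of runnable tools
RUNNABLE_LANGS = {"bash", "sh", "python", "ipython"}


def has_runnable_block(output: str) -> bool:
    # a complete fenced block looks like ```lang ... ```
    for match in re.finditer(r"```(\w+)\n.*?```", output, flags=re.DOTALL):
        if match.group(1) in RUNNABLE_LANGS:
            return True
    return False


def stream(chunks):
    output = ""
    for chunk in chunks:
        output += chunk
        # pause inference on a finished runnable code block,
        # letting the user run the command before continuing
        if has_runnable_block(output):
            break
    return output


print(stream(["Sure:\n```bash\n", "ls -la\n```", "\n(this part is never generated)"]))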