From e0c79a41a6475190998ae9b4d9c4a378c68124b0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Erik=20Bj=C3=A4reholt?= Date: Thu, 19 Sep 2024 13:02:49 +0200 Subject: [PATCH] fix: minor eval refactor (improved type names), clarified python tool instructions --- Makefile | 2 +- gptme/eval/main.py | 38 ++++++++++++----------- gptme/eval/run.py | 28 ++++++++--------- gptme/eval/suites/__init__.py | 10 +++--- gptme/eval/suites/basic.py | 12 ++++---- gptme/eval/suites/browser.py | 4 +-- gptme/eval/suites/init_projects.py | 4 +-- gptme/eval/types.py | 8 ++--- gptme/llm.py | 5 +-- gptme/tools/python.py | 49 +++++++++++++++++++----------- gptme/tools/save.py | 21 ++++++++----- gptme/tools/subagent.py | 4 +-- tests/test_cli.py | 3 +- tests/test_eval.py | 4 ++- 14 files changed, 108 insertions(+), 84 deletions(-) diff --git a/Makefile b/Makefile index f7c6c94e..a1230f2d 100644 --- a/Makefile +++ b/Makefile @@ -22,7 +22,7 @@ test: @# if SLOW is not set, pass `-m "not slow"` to skip slow tests poetry run pytest ${SRCDIRS} -v --log-level INFO --durations=5 \ --cov=gptme --cov-report=xml --cov-report=term-missing --cov-report=html --junitxml=junit.xml \ - -n 8 \ + -n 16 \ $(if $(EVAL), , -m "not eval") \ $(if $(SLOW), --timeout 60 --retries 2 --retry-delay 5, --timeout 5 -m "not slow and not eval") \ $(if $(PROFILE), --profile-svg) diff --git a/gptme/eval/main.py b/gptme/eval/main.py index 8f94b00d..10b1e1e9 100644 --- a/gptme/eval/main.py +++ b/gptme/eval/main.py @@ -20,7 +20,7 @@ from ..message import len_tokens from .run import run_evals from .suites import suites, tests_default, tests_map -from .types import CaseResult, ExecResult, ExecTest +from .types import CaseResult, EvalResult, EvalSpec # Configure logging, including fully-qualified module names logging.basicConfig( @@ -34,7 +34,7 @@ project_dir = Path(__file__).parent.parent.parent -def print_model_results(model_results: dict[str, list[ExecResult]]): +def print_model_results(model_results: dict[str, list[EvalResult]]): total_tests = 0 total_tokens = 0 @@ -70,7 +70,7 @@ def print_model_results(model_results: dict[str, list[ExecResult]]): print(f"Completed {total_tests} tests in {total_tokens}tok") -def print_model_results_table(model_results: dict[str, list[ExecResult]]): +def print_model_results_table(model_results: dict[str, list[EvalResult]]): test_names = { result.name for results in model_results.values() for result in results } @@ -120,19 +120,23 @@ def main( ): """ Run evals for gptme. + Pass eval or suite names to run, or result files to print. - Pass test names to run, or result files to print. + Output from evals will be captured, unless a single eval is run, and saved to the results directory. 
""" # init multiprocessing_logging.install_mp_handler() models = _model or [ "openai/gpt-4o", + "openai/gpt-4o-mini", "anthropic/claude-3-5-sonnet-20240620", + "anthropic/claude-3-haiku-20240307", "openrouter/meta-llama/llama-3.1-405b-instruct", ] results_files = [f for f in eval_names_or_result_files if f.endswith(".csv")] + eval_names = [f for f in eval_names_or_result_files if f not in results_files] if results_files: for results_file in results_files: p = Path(results_file) @@ -148,20 +152,20 @@ def main( sys.exit(1) sys.exit(0) - tests_to_run: list[ExecTest] = [] - for test_name in eval_names_or_result_files: - if test_name in tests_map: - tests_to_run.append(tests_map[test_name]) - elif test_name in suites: - tests_to_run.extend(suites[test_name]) + evals_to_run: list[EvalSpec] = [] + for eval_name in eval_names: + if test := tests_map.get(eval_name): + evals_to_run.append(test) + elif suite := suites.get(eval_name) or suites.get(eval_name.replace("-", "_")): + evals_to_run.extend(suite) else: - raise ValueError(f"Test {test_name} not found") + raise ValueError(f"Test {eval_name} not found") - if not tests_to_run: - tests_to_run = tests_default + if not evals_to_run: + evals_to_run = tests_default print("=== Running evals ===") - model_results = run_evals(tests_to_run, models, timeout, parallel) + model_results = run_evals(evals_to_run, models, timeout, parallel) print("\n=== Finished ===\n") print("\n=== Model Results ===") @@ -211,7 +215,7 @@ def read_log_file(file_path: Path) -> str: return "" -def read_results_from_csv(filename: str) -> dict[str, list[ExecResult]]: +def read_results_from_csv(filename: str) -> dict[str, list[EvalResult]]: model_results = defaultdict(list) results_dir = Path(filename).parent with open(filename, newline="") as csvfile: @@ -220,7 +224,7 @@ def read_results_from_csv(filename: str) -> dict[str, list[ExecResult]]: model = row["Model"] test_dir = results_dir / model / row["Test"] - result = ExecResult( + result = EvalResult( name=row["Test"], status="success" if row["Passed"] == "true" else "error", results=list(_read_case_results(test_dir / "cases.csv")), @@ -238,7 +242,7 @@ def read_results_from_csv(filename: str) -> dict[str, list[ExecResult]]: return dict(model_results) -def write_results(model_results: dict[str, list[ExecResult]]): +def write_results(model_results: dict[str, list[EvalResult]]): timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") # get current commit hash and dirty status, like: a8b2ef0-dirty # TODO: don't assume we are in the gptme repo, use other version identifiers if available diff --git a/gptme/eval/run.py b/gptme/eval/run.py index 30f8e24c..200c4961 100644 --- a/gptme/eval/run.py +++ b/gptme/eval/run.py @@ -19,8 +19,8 @@ from .execenv import SimpleExecutionEnv from .types import ( CaseResult, - ExecResult, - ExecTest, + EvalResult, + EvalSpec, ResultContext, Status, ) @@ -52,8 +52,8 @@ class SyncedDict(TypedDict): def run_evals( - tests: list[ExecTest], models: list[str], timeout: int, parallel: int -) -> dict[str, list[ExecResult]]: + evals: list[EvalSpec], models: list[str], timeout: int, parallel: int +) -> dict[str, list[EvalResult]]: """ Run evals for a list of tests. 
""" @@ -67,14 +67,14 @@ def run_evals( else: cleanup_on_sigterm() - n_runs = len(tests) * len(models) - model_results: dict[str, dict[str, ExecResult]] = defaultdict(dict) + n_runs = len(evals) * len(models) + model_results: dict[str, dict[str, EvalResult]] = defaultdict(dict) parallel = min(n_runs, parallel) with ProcessPoolExecutor(parallel) as executor: futures = [] future_to_model_test = {} for model in models: - for test in tests: + for test in evals: future = executor.submit( execute, test, @@ -103,7 +103,7 @@ def _handle_future(future: Future): logger.exception( f"Test {test_name} for model {model} generated an exception when trying to get result" ) - result = ExecResult( + result = EvalResult( name=test_name, status=status, results=[], @@ -116,7 +116,7 @@ def _handle_future(future: Future): model_results[model][test_name] = result # worse-case run time, with some buffer to account for overhead - max_timeout = timeout * len(tests) / parallel + 10 + max_timeout = timeout * len(evals) / parallel + 10 completed = set() try: # TODO: can we do better than this? handle timeouts within futures instead? @@ -147,19 +147,19 @@ def _handle_future(future: Future): process.terminate() process.join() - model_results_final: dict[str, list[ExecResult]] = defaultdict(list) + model_results_final: dict[str, list[EvalResult]] = defaultdict(list) for model in model_results: # sort results by test order model_results_final[model] = sorted( model_results[model].values(), - key=lambda result: [test["name"] for test in tests].index(result.name), + key=lambda result: [test["name"] for test in evals].index(result.name), ) return model_results_final # TODO: rewrite to run in Docker? Would help with capturing output + process management. -def execute(test: ExecTest, agent: Agent, timeout: int, parallel: bool) -> ExecResult: +def execute(test: EvalSpec, agent: Agent, timeout: int, parallel: bool) -> EvalResult: """ Executes the code for a specific model with a timeout. 
""" @@ -206,7 +206,7 @@ def execute(test: ExecTest, agent: Agent, timeout: int, parallel: bool) -> ExecR gen_stderr = result.get("stderr", "") else: logger.error("No result in shared dictionary") - return ExecResult( + return EvalResult( name=test["name"], status="error", results=[], @@ -256,7 +256,7 @@ def execute(test: ExecTest, agent: Agent, timeout: int, parallel: bool) -> ExecR results = [] stdout_run, stderr_run = "", "" - return ExecResult( + return EvalResult( name=test["name"], status=status, results=results, diff --git a/gptme/eval/suites/__init__.py b/gptme/eval/suites/__init__.py index 4b34e8a1..a3ce9896 100644 --- a/gptme/eval/suites/__init__.py +++ b/gptme/eval/suites/__init__.py @@ -1,16 +1,16 @@ -from ..types import ExecTest +from ..types import EvalSpec from .basic import tests as tests_basic from .browser import tests as tests_browser from .init_projects import tests as tests_init_projects -suites: dict[str, list[ExecTest]] = { +suites: dict[str, list[EvalSpec]] = { "basic": tests_basic, "init_projects": tests_init_projects, "browser": tests_browser, } -tests: list[ExecTest] = [test for suite in suites.values() for test in suite] -tests_map: dict[str, ExecTest] = {test["name"]: test for test in tests} +tests: list[EvalSpec] = [test for suite in suites.values() for test in suite] +tests_map: dict[str, EvalSpec] = {test["name"]: test for test in tests} tests_default_ids: list[str] = [ "hello", @@ -19,4 +19,4 @@ "prime100", "init-git", ] -tests_default: list[ExecTest] = [tests_map[test_id] for test_id in tests_default_ids] +tests_default: list[EvalSpec] = [tests_map[test_id] for test_id in tests_default_ids] diff --git a/gptme/eval/suites/basic.py b/gptme/eval/suites/basic.py index ae6c4538..5c7920d2 100644 --- a/gptme/eval/suites/basic.py +++ b/gptme/eval/suites/basic.py @@ -1,7 +1,7 @@ from typing import TYPE_CHECKING if TYPE_CHECKING: - from gptme.eval.main import ExecTest + from gptme.eval.main import EvalSpec def correct_output_hello_world(ctx): @@ -28,12 +28,12 @@ def check_output_hello_ask(ctx): return "Hello, Erik!" in ctx.stdout -tests: list["ExecTest"] = [ +tests: list["EvalSpec"] = [ { "name": "hello", "files": {}, "run": "python hello.py", - "prompt": "write a script hello.py which prints 'Hello, world!'", + "prompt": 'write a script hello.py which prints "Hello, world!"', "expect": { "correct output": correct_output_hello_world, "correct file": check_exists_hello, @@ -41,9 +41,9 @@ def check_output_hello_ask(ctx): }, { "name": "hello-patch", - "files": {"hello.py": "print('Hello, world!')"}, + "files": {"hello.py": 'print("Hello, world!")'}, "run": "python hello.py", - "prompt": "Patch the code in hello.py to print 'Hello, human!'", + "prompt": 'Patch the code in hello.py to print "Hello, human!"', "expect": { "correct output": correct_output_hello_human, "correct file": check_exists_hello, @@ -51,7 +51,7 @@ def check_output_hello_ask(ctx): }, { "name": "hello-ask", - "files": {"hello.py": "print('Hello, world!')"}, + "files": {"hello.py": 'print("Hello, world!")'}, "run": "echo 'Erik' | python hello.py", # TODO: work around the "don't try to execute it" part by improving gptme such that it just gives EOF to stdin in non-interactive mode "prompt": "modify hello.py to ask the user for their name and print 'Hello, !'. 
don't try to execute it", diff --git a/gptme/eval/suites/browser.py b/gptme/eval/suites/browser.py index e3bbb17e..8721c3cb 100644 --- a/gptme/eval/suites/browser.py +++ b/gptme/eval/suites/browser.py @@ -1,14 +1,14 @@ from typing import TYPE_CHECKING if TYPE_CHECKING: - from gptme.eval.main import ExecTest + from gptme.eval.main import EvalSpec def check_output_erik(ctx): return "Erik" in ctx.stdout -tests: list["ExecTest"] = [ +tests: list["EvalSpec"] = [ { "name": "whois-superuserlabs-ceo", "files": {}, diff --git a/gptme/eval/suites/init_projects.py b/gptme/eval/suites/init_projects.py index 80c9773a..bf7a921d 100644 --- a/gptme/eval/suites/init_projects.py +++ b/gptme/eval/suites/init_projects.py @@ -1,7 +1,7 @@ from typing import TYPE_CHECKING if TYPE_CHECKING: - from gptme.eval.main import ExecTest + from gptme.eval.main import EvalSpec def check_clean_exit(ctx): @@ -41,7 +41,7 @@ def check_exists_main(ctx): return "main.py" in ctx.files -tests: list["ExecTest"] = [ +tests: list["EvalSpec"] = [ { "name": "init-git", "files": {}, diff --git a/gptme/eval/types.py b/gptme/eval/types.py index 0ced4800..a9395537 100644 --- a/gptme/eval/types.py +++ b/gptme/eval/types.py @@ -31,9 +31,9 @@ class CaseResult: @dataclass -class ExecResult: +class EvalResult: """ - Result of executing a prompt. + Result of executing an eval. """ name: str @@ -46,9 +46,9 @@ class ExecResult: run_stderr: str -class ExecTest(TypedDict): +class EvalSpec(TypedDict): """ - Test case for executing a prompt. + Specification for an eval/test case. """ name: str diff --git a/gptme/llm.py b/gptme/llm.py index 784eb931..e0d4ef5d 100644 --- a/gptme/llm.py +++ b/gptme/llm.py @@ -92,10 +92,11 @@ def print_clear(): # need to flush stdout to get the print to show up sys.stdout.flush() + # TODO: make this more robust/general, maybe with a callback that runs on each char/chunk # pause inference on finished code-block, letting user run the command before continuing tooluses = list(ToolUse.iter_from_content(output)) - if tooluses: - logger.debug("Found tool use, breaking") + if tooluses and any(tooluse.is_runnable for tooluse in tooluses): + logger.warning("Found tool use, breaking") break except KeyboardInterrupt: return Message("assistant", output + "... ^C Interrupted") diff --git a/gptme/tools/python.py b/gptme/tools/python.py index 8236ac61..f8cdb4ea 100644 --- a/gptme/tools/python.py +++ b/gptme/tools/python.py @@ -151,32 +151,45 @@ def get_installed_python_libraries() -> set[str]: return installed +instructions = """ +To execute Python code in an interactive IPython session, send a codeblock using the `ipython` language tag. +It will respond with the output and result of the execution. +Assistant may first write the code in a normal python codeblock, then execute it in an IPython codeblock. +""" + + examples = f""" #### Results of the last expression will be displayed, IPython-style: -User: What is 2 + 2? -Assistant: -{ToolUse("python", [], "2 + 2").to_output()} -System: Executed code block. -```stdout +> User: What is 2 + 2? +> Assistant: +{ToolUse("ipython", [], "2 + 2").to_output()} +> System: Executed code block. +```result 4 ``` -#### The user can also run Python code with the /python command: - -User: /python 2 + 2 -System: Executed code block. -```stdout -4 +#### It can write an example and then execute it: +> User: compute fib 10 +> Assistant: To compute the 10th Fibonacci number, we write a recursive function: +```python +def fib(n): + ... 
+``` +Now, let's execute this code to get the 10th Fibonacci number: +{ToolUse("ipython", [], ''' +def fib(n): + if n <= 1: + return n + return fib(n - 1) + fib(n - 2) +fib(10) +''').to_output()} +> System: Executed code block. +```result +55 ``` """.strip() -instructions = """ -When you send a message containing Python code (and is not a file block), it will be executed in a stateful environment. -Python will respond with the output of the execution. -""" - - # only used for doc generation, use get_tool() in the code tool = ToolSpec( name="python", @@ -185,7 +198,7 @@ def get_installed_python_libraries() -> set[str]: examples=examples, execute=execute_python, block_types=[ - "python", + # "python", "ipython", "py", ], diff --git a/gptme/tools/save.py b/gptme/tools/save.py index e9489641..3e6a7d66 100644 --- a/gptme/tools/save.py +++ b/gptme/tools/save.py @@ -1,5 +1,5 @@ """ -Gives the assistant the ability to save code to a file. +Gives the assistant the ability to save/write code to a file. """ from collections.abc import Generator @@ -9,13 +9,18 @@ from ..util import ask_execute from .base import ToolSpec, ToolUse +# FIXME: this is markdown-specific instructions, thus will confuse the XML mode instructions = """ -To save code to a file, use a code block with the filepath as the language. +To write text to a file, use a code block with the language tag set to the path of the file. """.strip() examples = f""" -> User: write a Hello world script to hello.py -{ToolUse("save", ["hello.py"], "print('Hello world')").to_output()} +> User: write a hello world script to hello.py +{ToolUse("save", ["hello.py"], 'print("Hello world")').to_output()} +> System: Saved to `hello.py` + +> User: make it all-caps +{ToolUse("save", ["hello.py"], 'print("HELLO WORLD")').to_output()} > System: Saved to `hello.py` """.strip() @@ -119,7 +124,7 @@ def execute_append( tool_save = ToolSpec( name="save", - desc="Save code to a file", + desc="Write text to file", instructions=instructions, examples=examples, execute=execute_save, @@ -128,19 +133,19 @@ def execute_append( __doc__ = tool_save.get_doc(__doc__) instructions_append = """ -To append code to a file, use a code block with the language: append +To append text to a file, use a code block with the language: append """.strip() examples_append = f""" > User: append a print "Hello world" to hello.py > Assistant: -{ToolUse("append", ["hello.py"], "print('Hello world')").to_output()} +{ToolUse("append", ["hello.py"], 'print("Hello world")').to_output()} > System: Appended to `hello.py` """.strip() tool_append = ToolSpec( name="append", - desc="Append code to a file", + desc="Append text to file", instructions=instructions_append, examples=examples_append, execute=execute_append, diff --git a/gptme/tools/subagent.py b/gptme/tools/subagent.py index f9f2b9c4..e0d2a4cd 100644 --- a/gptme/tools/subagent.py +++ b/gptme/tools/subagent.py @@ -155,10 +155,10 @@ def subagent_wait(agent_id: str) -> dict: examples = f""" User: compute fib 69 using a subagent Assistant: Starting a subagent to compute the 69th Fibonacci number. -{ToolUse("python", [], 'subagent("compute the 69th Fibonacci number", "fib-69")').to_output()} +{ToolUse("ipython", [], 'subagent("compute the 69th Fibonacci number", "fib-69")').to_output()} System: Subagent started successfully. Assistant: Now we need to wait for the subagent to finish the task. 
-{ToolUse("python", [], 'subagent_wait("fib-69")').to_output()} +{ToolUse("ipython", [], 'subagent_wait("fib-69")').to_output()} """ diff --git a/tests/test_cli.py b/tests/test_cli.py index eccd3554..e38ac302 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -225,10 +225,9 @@ def test_block(args: list[str], lang: str, runner: CliRunner): assert result.exit_code == 0 -# TODO: these could be fast if we had a cache @pytest.mark.slow def test_generate_primes(args: list[str], runner: CliRunner): - args.append("print the first 10 prime numbers") + args.append("compute the first 10 prime numbers") result = runner.invoke(gptme.cli.main, args) # check that the 9th and 10th prime is present assert "23" in result.output diff --git a/tests/test_eval.py b/tests/test_eval.py index 1b975c79..8fd59f9b 100644 --- a/tests/test_eval.py +++ b/tests/test_eval.py @@ -49,13 +49,15 @@ def test_eval(test): provider = _detect_model() agent = GPTMe(provider) result = execute(test, agent, timeout=30, parallel=False) + assert result.results assert all(case.passed for case in result.results) # Hook to generate tests from the tests list def pytest_generate_tests(metafunc): if "test" in metafunc.fixturenames: - allowlist = ["hello"] # for now, only run the hello test + # for now, only run the hello-patch test (the "hello" test is unreliable with gpt-4o-mini) + allowlist = ["hello-patch"] test_set, test_names = zip( *[(test, test["name"]) for test in tests if test["name"] in allowlist] )
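
Note (illustrative, not part of the patch): a minimal sketch of how an eval case is declared with the renamed `EvalSpec` type, mirroring the `hello` entry from `gptme/eval/suites/basic.py` in this diff. The lambda checks are stand-ins for the named check functions used there; field names follow the suite definitions shown above.

```python
# Illustrative sketch only, not part of the patch.
# Mirrors the "hello" entry from gptme/eval/suites/basic.py, using the
# renamed EvalSpec type introduced by this change (previously ExecTest).
from gptme.eval.types import EvalSpec

hello: EvalSpec = {
    "name": "hello",
    "files": {},
    "run": "python hello.py",
    "prompt": 'write a script hello.py which prints "Hello, world!"',
    "expect": {
        # stand-ins for correct_output_hello_world / check_exists_hello
        "correct output": lambda ctx: "Hello, world!" in ctx.stdout,
        "correct file": lambda ctx: "hello.py" in ctx.files,
    },
}
```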