From 5fbc0acbcbcc20597d17b1c7a6ce91bf513c4df2 Mon Sep 17 00:00:00 2001
From: Erik Bjäreholt
Date: Sun, 5 Jan 2025 13:23:27 +0100
Subject: [PATCH] feat: add support for different tool formats in eval system
 (#380)

* feat: add support for different tool formats in eval system

- Add tool_format parameter to Agent class
- Support specifying tool format in CLI (--tool-format)
- Allow tool format to be specified per model with @format suffix
- Run evals with multiple tool formats per model

* fix(eval): fixed tool format parametrization
---
 gptme/eval/agents.py |  8 ++++++--
 gptme/eval/main.py   | 34 +++++++++++++++++++++++++++++++---
 gptme/eval/run.py    | 21 ++++++++++++++++-----
 3 files changed, 53 insertions(+), 10 deletions(-)

diff --git a/gptme/eval/agents.py b/gptme/eval/agents.py
index 22a6b6344..f6e185058 100644
--- a/gptme/eval/agents.py
+++ b/gptme/eval/agents.py
@@ -8,6 +8,7 @@
 from gptme.dirs import get_logs_dir
 from gptme.tools import init_tools
 
+from ..tools import ToolFormat
 from .filestore import FileStore
 from .types import Files
 
@@ -15,8 +16,9 @@
 
 
 class Agent:
-    def __init__(self, model: str):
+    def __init__(self, model: str, tool_format: ToolFormat = "markdown"):
         self.model = model
+        self.tool_format = tool_format
 
     @abstractmethod
     def act(self, files: Files | None, prompt: str) -> Files:
@@ -29,7 +31,8 @@ def act(self, files: Files | None, prompt: str) -> Files:
 class GPTMe(Agent):
     def act(self, files: Files | None, prompt: str):
         _id = abs(hash(prompt)) % 1000000
-        name = get_name(f"gptme-evals-{self.model.replace('/', '--')}-{_id}")
+        model_fmt = f"{self.model.replace('/', '--')}-{self.tool_format}"
+        name = get_name(f"gptme-evals-{model_fmt}-{_id}")
         log_dir = get_logs_dir() / name
         workspace_dir = log_dir / "workspace"
         if workspace_dir.exists():
@@ -60,6 +63,7 @@ def act(self, files: Files | None, prompt: str):
                 no_confirm=True,
                 interactive=False,
                 workspace=workspace_dir,
+                tool_format=self.tool_format,
             )
         # don't exit on sys.exit()
         except (SystemExit, KeyboardInterrupt):
diff --git a/gptme/eval/main.py b/gptme/eval/main.py
index 7af64f9c8..5eeabaecf 100644
--- a/gptme/eval/main.py
+++ b/gptme/eval/main.py
@@ -13,12 +13,14 @@
 from collections.abc import Generator
 from datetime import datetime, timezone
 from pathlib import Path
+from typing import cast, get_args
 
 import click
 import multiprocessing_logging
 from tabulate import tabulate
 
 from ..message import len_tokens
+from ..tools import ToolFormat
 from .run import run_evals
 from .suites import suites, tests_default, tests_map
 from .types import CaseResult, EvalResult, EvalSpec
@@ -180,15 +182,21 @@ def get_status_emoji(passed, total):
     "--model",
     "-m",
     multiple=True,
-    help="Model to use, can be passed multiple times.",
+    help="Model to use, can be passed multiple times. Can include tool format with @, e.g. 'gpt-4@tool'",
 )
 @click.option("--timeout", "-t", default=30, help="Timeout for code generation")
 @click.option("--parallel", "-p", default=10, help="Number of parallel evals to run")
+@click.option(
+    "--tool-format",
+    type=click.Choice(get_args(ToolFormat)),
+    help="Tool format to use. Can also be specified per model with @format.",
+)
 def main(
     eval_names_or_result_files: list[str],
     _model: list[str],
     timeout: int,
     parallel: int,
+    tool_format: ToolFormat | None = None,
 ):
     """
     Run evals for gptme.
@@ -199,7 +207,8 @@ def main(
     # init
     multiprocessing_logging.install_mp_handler()
 
-    models = _model or [
+    # Generate model+format combinations
+    default_models = [
         "openai/gpt-4o",
         "openai/gpt-4o-mini",
         "anthropic/claude-3-5-sonnet-20241022",
@@ -208,6 +217,25 @@
         "gemini/gemini-1.5-flash-latest",
     ]
 
+    def parse_format(fmt: str) -> ToolFormat:
+        if fmt not in get_args(ToolFormat):
+            raise ValueError(f"Invalid tool format: {fmt}")
+        return cast(ToolFormat, fmt)
+
+    # Process model specifications
+    model_configs: list[tuple[str, ToolFormat]] = []
+    for model_spec in _model or default_models:
+        if "@" in model_spec:
+            model, fmt = model_spec.split("@", 1)
+            model_configs.append((model, parse_format(fmt)))
+        else:
+            # If no format specified for model, use either provided default or test all formats
+            formats: list[ToolFormat] = (
+                [cast(ToolFormat, tool_format)] if tool_format else ["markdown", "tool"]
+            )
+            for fmt in formats:
+                model_configs.append((model_spec, fmt))
+
     results_files = []
     for f in eval_names_or_result_files:
         p = Path(f)
@@ -238,7 +266,7 @@ def main(
         evals_to_run = tests_default
 
     print("=== Running evals ===")
-    model_results = run_evals(evals_to_run, models, timeout, parallel)
+    model_results = run_evals(evals_to_run, model_configs, timeout, parallel)
     print("=== Finished ===")
 
     print("\n=== Model Results ===")
diff --git a/gptme/eval/run.py b/gptme/eval/run.py
index 6eb0c6ccd..5b1fb66f5 100644
--- a/gptme/eval/run.py
+++ b/gptme/eval/run.py
@@ -13,6 +13,7 @@
 
 from tqdm import tqdm
 
+from ..tools import ToolFormat
 from .agents import Agent, GPTMe
 from .execenv import SimpleExecutionEnv
 from .types import (
@@ -50,10 +51,19 @@ class SyncedDict(TypedDict):
 
 
 def run_evals(
-    evals: list[EvalSpec], models: list[str], timeout: int, parallel: int
+    evals: list[EvalSpec],
+    model_configs: list[tuple[str, ToolFormat]],  # (model, tool_format)
+    timeout: int,
+    parallel: int,
 ) -> dict[str, list[EvalResult]]:
     """
     Run evals for a list of tests.
+
+    Args:
+        evals: List of evaluation specifications
+        model_configs: List of (model, tool_format) tuples
+        timeout: Timeout in seconds for each eval
+        parallel: Number of parallel evaluations to run
     """
     # For coverage to work with multiprocessing
     # https://pytest-cov.readthedocs.io/en/latest/subprocess-support.html
@@ -65,23 +75,24 @@
     else:
         cleanup_on_sigterm()
 
-    n_runs = len(evals) * len(models)
+    n_runs = len(evals) * len(model_configs)
     model_results: dict[str, dict[str, EvalResult]] = defaultdict(dict)
     parallel = min(n_runs, parallel)
     with ProcessPoolExecutor(parallel) as executor:
         futures = []
         future_to_model_test = {}
-        for model in models:
+        for model, tool_format in model_configs:
+            model_id = f"{model}@{tool_format}"
             for test in evals:
                 future = executor.submit(
                     execute,
                     test,
-                    GPTMe(model=model),
+                    GPTMe(model=model, tool_format=tool_format),
                     timeout,
                     parallel > 1,
                 )
                 futures.append(future)
-                future_to_model_test[future] = (model, test)
+                future_to_model_test[future] = (model_id, test)
 
         def _handle_future(future: Future):
             model, test = future_to_model_test[future]
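
Note: example invocations of the new options, as a sketch of the intended usage; the gptme-eval entrypoint name is assumed here and not defined by this patch, and eval names are omitted so the default test set runs.

    # one tool format applied to every model
    gptme-eval --model openai/gpt-4o --tool-format tool

    # per-model formats via the @format suffix (see the --model help text above)
    gptme-eval -m openai/gpt-4o@tool -m anthropic/claude-3-5-sonnet-20241022@markdown

    # no format given anywhere: each model is evaluated with both "markdown" and "tool"
    gptme-eval -m openai/gpt-4o

In the second and third cases each (model, tool_format) pair is run as a separate eval job, keyed as "model@format" in the results (see model_id in run.py above).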