From 5fbc0acbcbcc20597d17b1c7a6ce91bf513c4df2 Mon Sep 17 00:00:00 2001
From: Erik Bjäreholt
Date: Sun, 5 Jan 2025 13:23:27 +0100
Subject: [PATCH] feat: add support for different tool formats in eval system
 (#380)

* feat: add support for different tool formats in eval system

- Add tool_format parameter to Agent class
- Support specifying tool format in CLI (--tool-format)
- Allow tool format to be specified per model with @format suffix
- Run evals with multiple tool formats per model

* fix(eval): fixed tool format parametrization
---
 gptme/eval/agents.py |  8 ++++++--
 gptme/eval/main.py   | 34 +++++++++++++++++++++++++++++++---
 gptme/eval/run.py    | 21 ++++++++++++++++-----
 3 files changed, 53 insertions(+), 10 deletions(-)

diff --git a/gptme/eval/agents.py b/gptme/eval/agents.py
index 22a6b6344..f6e185058 100644
--- a/gptme/eval/agents.py
+++ b/gptme/eval/agents.py
@@ -8,6 +8,7 @@
 from gptme.dirs import get_logs_dir
 from gptme.tools import init_tools
 
+from ..tools import ToolFormat
 from .filestore import FileStore
 from .types import Files
 
@@ -15,8 +16,9 @@
 
 
 class Agent:
-    def __init__(self, model: str):
+    def __init__(self, model: str, tool_format: ToolFormat = "markdown"):
         self.model = model
+        self.tool_format = tool_format
 
     @abstractmethod
     def act(self, files: Files | None, prompt: str) -> Files:
@@ -29,7 +31,8 @@ def act(self, files: Files | None, prompt: str) -> Files:
 class GPTMe(Agent):
     def act(self, files: Files | None, prompt: str):
         _id = abs(hash(prompt)) % 1000000
-        name = get_name(f"gptme-evals-{self.model.replace('/', '--')}-{_id}")
+        model_fmt = f"{self.model.replace('/', '--')}-{self.tool_format}"
+        name = get_name(f"gptme-evals-{model_fmt}-{_id}")
         log_dir = get_logs_dir() / name
         workspace_dir = log_dir / "workspace"
         if workspace_dir.exists():
@@ -60,6 +63,7 @@ def act(self, files: Files | None, prompt: str):
                 no_confirm=True,
                 interactive=False,
                 workspace=workspace_dir,
+                tool_format=self.tool_format,
             )
         # don't exit on sys.exit()
         except (SystemExit, KeyboardInterrupt):
diff --git a/gptme/eval/main.py b/gptme/eval/main.py
index 7af64f9c8..5eeabaecf 100644
--- a/gptme/eval/main.py
+++ b/gptme/eval/main.py
@@ -13,12 +13,14 @@
 from collections.abc import Generator
 from datetime import datetime, timezone
 from pathlib import Path
+from typing import cast, get_args
 
 import click
 import multiprocessing_logging
 from tabulate import tabulate
 
 from ..message import len_tokens
+from ..tools import ToolFormat
 from .run import run_evals
 from .suites import suites, tests_default, tests_map
 from .types import CaseResult, EvalResult, EvalSpec
@@ -180,15 +182,21 @@ def get_status_emoji(passed, total):
     "--model",
     "-m",
     multiple=True,
-    help="Model to use, can be passed multiple times.",
+    help="Model to use, can be passed multiple times. Can include tool format with @, e.g. 'gpt-4@tool'",
 )
 @click.option("--timeout", "-t", default=30, help="Timeout for code generation")
 @click.option("--parallel", "-p", default=10, help="Number of parallel evals to run")
+@click.option(
+    "--tool-format",
+    type=click.Choice(get_args(ToolFormat)),
+    help="Tool format to use. Can also be specified per model with @format.",
+)
 def main(
     eval_names_or_result_files: list[str],
     _model: list[str],
     timeout: int,
     parallel: int,
+    tool_format: ToolFormat | None = None,
 ):
     """
     Run evals for gptme.
@@ -199,7 +207,8 @@ def main(
     # init
     multiprocessing_logging.install_mp_handler()
 
-    models = _model or [
+    # Generate model+format combinations
+    default_models = [
         "openai/gpt-4o",
         "openai/gpt-4o-mini",
         "anthropic/claude-3-5-sonnet-20241022",
@@ -208,6 +217,25 @@
         "gemini/gemini-1.5-flash-latest",
     ]
 
+    def parse_format(fmt: str) -> ToolFormat:
+        if fmt not in get_args(ToolFormat):
+            raise ValueError(f"Invalid tool format: {fmt}")
+        return cast(ToolFormat, fmt)
+
+    # Process model specifications
+    model_configs: list[tuple[str, ToolFormat]] = []
+    for model_spec in _model or default_models:
+        if "@" in model_spec:
+            model, fmt = model_spec.split("@", 1)
+            model_configs.append((model, parse_format(fmt)))
+        else:
+            # If no format specified for model, use either provided default or test all formats
+            formats: list[ToolFormat] = (
+                [cast(ToolFormat, tool_format)] if tool_format else ["markdown", "tool"]
+            )
+            for fmt in formats:
+                model_configs.append((model_spec, fmt))
+
     results_files = []
     for f in eval_names_or_result_files:
         p = Path(f)
@@ -238,7 +266,7 @@ def main(
         evals_to_run = tests_default
 
     print("=== Running evals ===")
-    model_results = run_evals(evals_to_run, models, timeout, parallel)
+    model_results = run_evals(evals_to_run, model_configs, timeout, parallel)
     print("=== Finished ===")
 
     print("\n=== Model Results ===")
diff --git a/gptme/eval/run.py b/gptme/eval/run.py
index 6eb0c6ccd..5b1fb66f5 100644
--- a/gptme/eval/run.py
+++ b/gptme/eval/run.py
@@ -13,6 +13,7 @@
 
 from tqdm import tqdm
 
+from ..tools import ToolFormat
 from .agents import Agent, GPTMe
 from .execenv import SimpleExecutionEnv
 from .types import (
@@ -50,10 +51,19 @@ class SyncedDict(TypedDict):
 
 
 def run_evals(
-    evals: list[EvalSpec], models: list[str], timeout: int, parallel: int
+    evals: list[EvalSpec],
+    model_configs: list[tuple[str, ToolFormat]],  # (model, tool_format)
+    timeout: int,
+    parallel: int,
 ) -> dict[str, list[EvalResult]]:
     """
     Run evals for a list of tests.
+
+    Args:
+        evals: List of evaluation specifications
+        model_configs: List of (model, tool_format) tuples
+        timeout: Timeout in seconds for each eval
+        parallel: Number of parallel evaluations to run
     """
     # For coverage to work with multiprocessing
     # https://pytest-cov.readthedocs.io/en/latest/subprocess-support.html
@@ -65,23 +75,24 @@
     else:
         cleanup_on_sigterm()
 
-    n_runs = len(evals) * len(models)
+    n_runs = len(evals) * len(model_configs)
     model_results: dict[str, dict[str, EvalResult]] = defaultdict(dict)
     parallel = min(n_runs, parallel)
     with ProcessPoolExecutor(parallel) as executor:
         futures = []
         future_to_model_test = {}
-        for model in models:
+        for model, tool_format in model_configs:
+            model_id = f"{model}@{tool_format}"
             for test in evals:
                 future = executor.submit(
                     execute,
                     test,
-                    GPTMe(model=model),
+                    GPTMe(model=model, tool_format=tool_format),
                     timeout,
                     parallel > 1,
                 )
                 futures.append(future)
-                future_to_model_test[future] = (model, test)
+                future_to_model_test[future] = (model_id, test)
 
         def _handle_future(future: Future):
             model, test = future_to_model_test[future]
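
Note: example invocations of the new options, as a sketch of the intended usage; the gptme-eval entrypoint name is assumed here and not defined by this patch, and eval names are omitted so the default test set runs.

    # one tool format applied to every model
    gptme-eval --model openai/gpt-4o --tool-format tool

    # per-model formats via the @format suffix (see the --model help text above)
    gptme-eval -m openai/gpt-4o@tool -m anthropic/claude-3-5-sonnet-20241022@markdown

    # no format given anywhere: each model is evaluated with both "markdown" and "tool"
    gptme-eval -m openai/gpt-4o

In the second and third cases each (model, tool_format) pair is run as a separate eval job, keyed as "model@format" in the results (see model_id in run.py above).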