feat: add support for different tool formats in eval system (#380)
* feat: add support for different tool formats in eval system

- Add tool_format parameter to Agent class
- Support specifying tool format in CLI (--tool-format)
- Allow tool format to be specified per model with @format suffix (see the sketch below)
- Run evals with multiple tool formats per model

* fix(eval): fixed tool format parametrization
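For illustration, here is how a per-model suffix like the 'gpt-4@tool' example from the new --model help text gets split — a minimal sketch mirroring the split("@", 1) call added in gptme/eval/main.py; the spec string is just an example:

```python
# How an "@format" suffix on a model spec is interpreted (sketch, not the actual code):
model_spec = "openai/gpt-4o@tool"
if "@" in model_spec:
    model, fmt = model_spec.split("@", 1)  # -> ("openai/gpt-4o", "tool")
else:
    model, fmt = model_spec, "markdown"  # no suffix: the default/fan-out logic applies
print(model, fmt)
```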
ErikBjare authored Jan 5, 2025
1 parent d4407e3 commit 5fbc0ac
Showing 3 changed files with 53 additions and 10 deletions.
8 changes: 6 additions & 2 deletions gptme/eval/agents.py
@@ -8,15 +8,17 @@
 from gptme.dirs import get_logs_dir
 from gptme.tools import init_tools
 
+from ..tools import ToolFormat
 from .filestore import FileStore
 from .types import Files
 
 logger = logging.getLogger(__name__)
 
 
 class Agent:
-    def __init__(self, model: str):
+    def __init__(self, model: str, tool_format: ToolFormat = "markdown"):
         self.model = model
+        self.tool_format = tool_format
 
     @abstractmethod
     def act(self, files: Files | None, prompt: str) -> Files:
@@ -29,7 +31,8 @@ def act(self, files: Files | None, prompt: str) -> Files:
 class GPTMe(Agent):
     def act(self, files: Files | None, prompt: str):
         _id = abs(hash(prompt)) % 1000000
-        name = get_name(f"gptme-evals-{self.model.replace('/', '--')}-{_id}")
+        model_fmt = f"{self.model.replace('/', '--')}-{self.tool_format}"
+        name = get_name(f"gptme-evals-{model_fmt}-{_id}")
         log_dir = get_logs_dir() / name
         workspace_dir = log_dir / "workspace"
         if workspace_dir.exists():
@@ -60,6 +63,7 @@ def act(self, files: Files | None, prompt: str):
                 no_confirm=True,
                 interactive=False,
                 workspace=workspace_dir,
+                tool_format=self.tool_format,
             )
         # don't exit on sys.exit()
         except (SystemExit, KeyboardInterrupt):
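A quick sketch of what the new parameter means in practice (illustrative, not part of the commit; the model name and assertion are placeholders):

```python
# Constructing an eval agent with an explicit tool format; per the diff above,
# tool_format defaults to "markdown", is stored on the agent, gets passed
# through to the chat session, and is included in the eval's log-dir name.
from gptme.eval.agents import GPTMe

agent = GPTMe(model="openai/gpt-4o", tool_format="tool")
assert agent.tool_format == "tool"
# agent.act(files=None, prompt="...") would run the eval task with that format
```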
34 changes: 31 additions & 3 deletions gptme/eval/main.py
@@ -13,12 +13,14 @@
 from collections.abc import Generator
 from datetime import datetime, timezone
 from pathlib import Path
+from typing import cast, get_args
 
 import click
 import multiprocessing_logging
 from tabulate import tabulate
 
 from ..message import len_tokens
+from ..tools import ToolFormat
 from .run import run_evals
 from .suites import suites, tests_default, tests_map
 from .types import CaseResult, EvalResult, EvalSpec
@@ -180,15 +182,21 @@ def get_status_emoji(passed, total):
     "--model",
     "-m",
     multiple=True,
-    help="Model to use, can be passed multiple times.",
+    help="Model to use, can be passed multiple times. Can include tool format with @, e.g. 'gpt-4@tool'",
 )
 @click.option("--timeout", "-t", default=30, help="Timeout for code generation")
 @click.option("--parallel", "-p", default=10, help="Number of parallel evals to run")
+@click.option(
+    "--tool-format",
+    type=click.Choice(get_args(ToolFormat)),
+    help="Tool format to use. Can also be specified per model with @format.",
+)
 def main(
     eval_names_or_result_files: list[str],
     _model: list[str],
     timeout: int,
     parallel: int,
+    tool_format: ToolFormat | None = None,
 ):
     """
     Run evals for gptme.
@@ -199,7 +207,8 @@ def main(
     # init
     multiprocessing_logging.install_mp_handler()
 
-    models = _model or [
+    # Generate model+format combinations
+    default_models = [
         "openai/gpt-4o",
         "openai/gpt-4o-mini",
         "anthropic/claude-3-5-sonnet-20241022",
@@ -208,6 +217,25 @@
         "gemini/gemini-1.5-flash-latest",
     ]
 
+    def parse_format(fmt: str) -> ToolFormat:
+        if fmt not in get_args(ToolFormat):
+            raise ValueError(f"Invalid tool format: {fmt}")
+        return cast(ToolFormat, fmt)
+
+    # Process model specifications
+    model_configs: list[tuple[str, ToolFormat]] = []
+    for model_spec in _model or default_models:
+        if "@" in model_spec:
+            model, fmt = model_spec.split("@", 1)
+            model_configs.append((model, parse_format(fmt)))
+        else:
+            # If no format specified for model, use either provided default or test all formats
+            formats: list[ToolFormat] = (
+                [cast(ToolFormat, tool_format)] if tool_format else ["markdown", "tool"]
+            )
+            for fmt in formats:
+                model_configs.append((model_spec, fmt))
+
     results_files = []
     for f in eval_names_or_result_files:
         p = Path(f)
@@ -238,7 +266,7 @@ def main(
         evals_to_run = tests_default
 
     print("=== Running evals ===")
-    model_results = run_evals(evals_to_run, models, timeout, parallel)
+    model_results = run_evals(evals_to_run, model_configs, timeout, parallel)
     print("=== Finished ===")
 
     print("\n=== Model Results ===")
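The fan-out above is the core of the change. Here is a self-contained sketch of the same logic, using a hypothetical helper named expand (the behavior mirrors the loop in main(); assumes --tool-format is unset unless passed):

```python
# Mirror of the model-spec expansion in main() (a sketch, not the actual code):
# an "@" suffix pins a model to one format, a bare spec fans out to all formats.
def expand(specs: list[str], default_fmt: str | None = None) -> list[tuple[str, str]]:
    configs = []
    for spec in specs:
        if "@" in spec:
            model, fmt = spec.split("@", 1)
            configs.append((model, fmt))
        else:
            for fmt in [default_fmt] if default_fmt else ["markdown", "tool"]:
                configs.append((spec, fmt))
    return configs

print(expand(["openai/gpt-4o", "anthropic/claude-3-5-sonnet-20241022@tool"]))
# [('openai/gpt-4o', 'markdown'), ('openai/gpt-4o', 'tool'),
#  ('anthropic/claude-3-5-sonnet-20241022', 'tool')]
```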
21 changes: 16 additions & 5 deletions gptme/eval/run.py
@@ -13,6 +13,7 @@
 
 from tqdm import tqdm
 
+from ..tools import ToolFormat
 from .agents import Agent, GPTMe
 from .execenv import SimpleExecutionEnv
 from .types import (
@@ -50,10 +51,19 @@ class SyncedDict(TypedDict):
 
 
 def run_evals(
-    evals: list[EvalSpec], models: list[str], timeout: int, parallel: int
+    evals: list[EvalSpec],
+    model_configs: list[tuple[str, ToolFormat]],  # (model, tool_format)
+    timeout: int,
+    parallel: int,
 ) -> dict[str, list[EvalResult]]:
     """
     Run evals for a list of tests.
+
+    Args:
+        evals: List of evaluation specifications
+        model_configs: List of (model, tool_format) tuples
+        timeout: Timeout in seconds for each eval
+        parallel: Number of parallel evaluations to run
     """
     # For coverage to work with multiprocessing
     # https://pytest-cov.readthedocs.io/en/latest/subprocess-support.html
@@ -65,23 +75,24 @@
     else:
         cleanup_on_sigterm()
 
-    n_runs = len(evals) * len(models)
+    n_runs = len(evals) * len(model_configs)
     model_results: dict[str, dict[str, EvalResult]] = defaultdict(dict)
     parallel = min(n_runs, parallel)
    with ProcessPoolExecutor(parallel) as executor:
         futures = []
         future_to_model_test = {}
-        for model in models:
+        for model, tool_format in model_configs:
+            model_id = f"{model}@{tool_format}"
             for test in evals:
                 future = executor.submit(
                     execute,
                     test,
-                    GPTMe(model=model),
+                    GPTMe(model=model, tool_format=tool_format),
                     timeout,
                     parallel > 1,
                 )
                 futures.append(future)
-                future_to_model_test[future] = (model, test)
+                future_to_model_test[future] = (model_id, test)
 
         def _handle_future(future: Future):
             model, test = future_to_model_test[future]
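Putting it together, a hedged sketch of the new call shape (model names and counts are illustrative; the "model@format" result keys follow the model_id naming visible above, and actually running this would hit real model APIs):

```python
# Sketch of calling run_evals under the new signature (not from the commit).
from gptme.eval.run import run_evals
from gptme.eval.suites import tests_default

model_configs = [
    ("openai/gpt-4o", "markdown"),
    ("openai/gpt-4o", "tool"),
]
results = run_evals(tests_default, model_configs, timeout=30, parallel=10)
# The same model evaluated with two formats shows up as two entries,
# e.g. "openai/gpt-4o@markdown" and "openai/gpt-4o@tool".
```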
