Skip to content

Commit

Permalink
fix(eval): fixed bug when writing timeout results, improved gptme-eval defaults (only 3 models + 5 evals)
Browse files Browse the repository at this point in the history
  • Loading branch information
ErikBjare committed Aug 28, 2024
1 parent 9c13a60 commit 707d98c
Show file tree
Hide file tree
Showing 3 changed files with 39 additions and 43 deletions.
2 changes: 1 addition & 1 deletion Dockerfile.eval
Original file line number Diff line number Diff line change
Expand Up @@ -19,4 +19,4 @@ RUN mkdir ./eval_results; chown appuser:appuser ./eval_results
USER appuser

# Add an entry point for running evals
ENTRYPOINT ["poetry", "run", "gptme-eval"]
ENTRYPOINT ["poetry", "run", "python", "-m", "gptme.eval"]
49 changes: 20 additions & 29 deletions gptme/eval/evals.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,12 +4,20 @@
from main import ExecTest


def correct_output_hello(ctx):
def correct_output_hello_world(ctx):
return ctx.stdout == "Hello, world!\n"


def correct_output_hello_human(ctx) -> bool:
    """Check that the run printed exactly ``Hello, human!`` and a newline.

    Args:
        ctx: Result context; only its ``stdout`` attribute is read.

    Returns:
        True when ``ctx.stdout`` equals the expected greeting followed by a
        single trailing newline, False otherwise.
    """
    # Exact comparison on purpose: extra or missing whitespace is a failure.
    return ctx.stdout == "Hello, human!\n"


def correct_file_hello(ctx) -> bool:
    """Check that ``hello.py`` contains only the expected print statement.

    Args:
        ctx: Result context; only its ``files`` mapping is read.

    Returns:
        True when the content stored under ``"hello.py"``, stripped of
        surrounding whitespace, equals ``print('Hello, human!')``.
        False when it differs or when ``hello.py`` is absent.
    """
    # A missing file is a failed check, not a crash: indexing directly would
    # raise KeyError and abort evaluation of the remaining checks.
    if "hello.py" not in ctx.files:
        return False
    return ctx.files["hello.py"].strip() == "print('Hello, human!')"
def check_exists_hello(ctx) -> bool:
    """Check that a ``hello.py`` entry is present in ``ctx.files``.

    Args:
        ctx: Result context; only its ``files`` container is read.

    Returns:
        True when ``"hello.py"`` is a member of ``ctx.files``. The file's
        content is not inspected.
    """
    return "hello.py" in ctx.files


def check_exists_main(ctx) -> bool:
    """Check that a ``main.py`` entry is present in ``ctx.files``.

    Args:
        ctx: Result context; only its ``files`` container is read.

    Returns:
        True when ``"main.py"`` is a member of ``ctx.files``. The file's
        content is not inspected.
    """
    return "main.py" in ctx.files


def check_prime_output(ctx):
Expand All @@ -24,10 +32,6 @@ def check_clean_working_tree(ctx):
return "nothing to commit, working tree clean" in ctx.stdout


def check_main_py_exists(ctx) -> bool:
    """Check that a ``main.py`` entry is present in ``ctx.files``.

    Args:
        ctx: Result context; only its ``files`` container is read.

    Returns:
        True when ``"main.py"`` is a member of ``ctx.files``, False otherwise.
    """
    return "main.py" in ctx.files


def check_commit_exists(ctx) -> bool:
    """Check that the captured output does not report an empty history.

    Args:
        ctx: Result context; only its ``stdout`` attribute is read.

    Returns:
        True when the phrase ``No commits yet`` does not occur anywhere in
        ``ctx.stdout``, False when it does.
    """
    # Substring test, not equality: the phrase may appear among other output.
    return "No commits yet" not in ctx.stdout

Expand Down Expand Up @@ -55,12 +59,12 @@ def check_cargo_toml(ctx):
tests: list["ExecTest"] = [
{
"name": "hello",
"files": {"hello.py": "print('Hello, world!')"},
"files": {},
"run": "python hello.py",
"prompt": "Change the code in hello.py to print 'Hello, human!'",
"prompt": "write a script hello.py which prints 'Hello, world!'",
"expect": {
"correct output": correct_output_hello,
"correct file": correct_file_hello,
"correct output": correct_output_hello_world,
"correct file": check_exists_hello,
},
},
{
Expand All @@ -69,8 +73,8 @@ def check_cargo_toml(ctx):
"run": "python hello.py",
"prompt": "Patch the code in hello.py to print 'Hello, human!'",
"expect": {
"correct output": correct_output_hello,
"correct file": correct_file_hello,
"correct output": correct_output_hello_human,
"correct file": check_exists_hello,
},
},
{
Expand Down Expand Up @@ -100,7 +104,7 @@ def check_cargo_toml(ctx):
"expect": {
"clean exit": check_clean_exit,
"clean working tree": check_clean_working_tree,
"main.py exists": check_main_py_exists,
"main.py exists": check_exists_main,
"we have a commit": check_commit_exists,
},
},
Expand Down Expand Up @@ -132,21 +136,8 @@ def check_cargo_toml(ctx):
"correct output": check_output_erik,
},
},
# Fails, gets stuck on interactive stuff
# {
# "name": "init-vue-ts-tailwind",
# "files": {},
# "run": "cat package.json",
# "prompt": "initialize a vue project with typescript and tailwind, make a page that says 'Hello, world!'. avoid interactive tools to initialize the project",
# "expect": {
# "package.json exists": lambda ctx: "package.json" in ctx.files,
# "vue installed": lambda ctx: '"vue":' in ctx.files["package.json"],
# "tailwind installed": lambda ctx: '"tailwindcss":'
# in ctx.files["package.json"],
# "typescript installed": lambda ctx: '"typescript":'
# in ctx.files["package.json"],
# },
# },
]

# Names of the evals that make up the default selection, in run order.
default_test_ids = [
    "hello",
    "hello-patch",
    "hello-ask",
    "prime100",
    "init-git",
]

# Lookup table from eval name to its definition; a later entry with the same
# name replaces an earlier one.
tests_map = {spec["name"]: spec for spec in tests}

# The default selection resolved to definitions (KeyError if an id is unknown).
tests_default = [tests_map[name] for name in default_test_ids]
31 changes: 18 additions & 13 deletions gptme/eval/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@
from tabulate import tabulate

from .agents import Agent, GPTMe
from .evals import tests, tests_map
from .evals import tests_default, tests_map
from .execenv import SimpleExecutionEnv
from .types import (
CaseResult,
Expand Down Expand Up @@ -324,7 +324,7 @@ def print_model_results_table(model_results: dict[str, list[ExecResult]]):
multiple=True,
help="Model to use, can be massed multiple times.",
)
@click.option("--timeout", "-t", default=15, help="Timeout for code generation")
@click.option("--timeout", "-t", default=30, help="Timeout for code generation")
@click.option("--parallel", "-p", default=10, help="Number of parallel evals to run")
def main(
eval_names_or_result_files: list[str],
Expand All @@ -339,17 +339,17 @@ def main(
"""
models = _model or [
"openai/gpt-4o",
"openai/gpt-4o-mini",
# "openai/gpt-4o-mini",
"anthropic/claude-3-5-sonnet-20240620",
"openrouter/meta-llama/llama-3.1-8b-instruct",
"openrouter/meta-llama/llama-3.1-70b-instruct",
# "openrouter/meta-llama/llama-3.1-8b-instruct",
# "openrouter/meta-llama/llama-3.1-70b-instruct",
"openrouter/meta-llama/llama-3.1-405b-instruct",
"openrouter/nousresearch/hermes-3-llama-3.1-405b",
"openrouter/microsoft/wizardlm-2-8x22b",
"openrouter/mistralai/mistral-nemo",
"openrouter/mistralai/codestral-mamba",
"openrouter/mistralai/mixtral-8x22b-instruct",
"openrouter/deepseek/deepseek-coder",
# "openrouter/nousresearch/hermes-3-llama-3.1-405b",
# "openrouter/microsoft/wizardlm-2-8x22b",
# "openrouter/mistralai/mistral-nemo",
# "openrouter/mistralai/codestral-mamba",
# "openrouter/mistralai/mixtral-8x22b-instruct",
# "openrouter/deepseek/deepseek-coder",
]

results_files = [f for f in eval_names_or_result_files if f.endswith(".csv")]
Expand All @@ -368,7 +368,7 @@ def main(
if test_name not in results_files
]
if eval_names_or_result_files
else tests
else tests_default
)
if not tests_to_run:
sys.exit(0)
Expand Down Expand Up @@ -439,7 +439,12 @@ def write_results_to_csv(model_results: dict[str, list[ExecResult]]):
writer.writeheader()
for model, results in model_results.items():
for result in results:
passed = all(case["passed"] for case in result["results"])
# Needs to pass all checks, and needs to have results (not empty, as in case of timeout)
passed = (
all(case["passed"] for case in result["results"])
if result["results"]
else False
)
writer.writerow(
{
"Model": model,
Expand Down

0 comments on commit 707d98c

Please sign in to comment.