diff --git a/Dockerfile.eval b/Dockerfile.eval
index fcfa4d5b..97e7ebb1 100644
--- a/Dockerfile.eval
+++ b/Dockerfile.eval
@@ -19,4 +19,4 @@ RUN mkdir ./eval_results; chown appuser:appuser ./eval_results
 USER appuser
 
 # Add an entry point for running evals
-ENTRYPOINT ["poetry", "run", "gptme-eval"]
+ENTRYPOINT ["poetry", "run", "python", "-m", "gptme.eval"]
diff --git a/gptme/eval/evals.py b/gptme/eval/evals.py
index 2b57e33d..69ea9e01 100644
--- a/gptme/eval/evals.py
+++ b/gptme/eval/evals.py
@@ -4,12 +4,20 @@
     from main import ExecTest
 
 
-def correct_output_hello(ctx):
+def correct_output_hello_world(ctx):
+    return ctx.stdout == "Hello, world!\n"
+
+
+def correct_output_hello_human(ctx):
     return ctx.stdout == "Hello, human!\n"
 
 
-def correct_file_hello(ctx):
-    return ctx.files["hello.py"].strip() == "print('Hello, human!')"
+def check_exists_hello(ctx):
+    return "hello.py" in ctx.files
+
+
+def check_exists_main(ctx):
+    return "main.py" in ctx.files
 
 
 def check_prime_output(ctx):
@@ -24,10 +32,6 @@ def check_clean_working_tree(ctx):
     return "nothing to commit, working tree clean" in ctx.stdout
 
 
-def check_main_py_exists(ctx):
-    return "main.py" in ctx.files
-
-
 def check_commit_exists(ctx):
     return "No commits yet" not in ctx.stdout
 
@@ -55,12 +59,12 @@ def check_cargo_toml(ctx):
 tests: list["ExecTest"] = [
     {
         "name": "hello",
-        "files": {"hello.py": "print('Hello, world!')"},
+        "files": {},
         "run": "python hello.py",
-        "prompt": "Change the code in hello.py to print 'Hello, human!'",
+        "prompt": "write a script hello.py which prints 'Hello, world!'",
         "expect": {
-            "correct output": correct_output_hello,
-            "correct file": correct_file_hello,
+            "correct output": correct_output_hello_world,
+            "correct file": check_exists_hello,
         },
     },
     {
@@ -69,8 +73,8 @@ def check_cargo_toml(ctx):
         "run": "python hello.py",
         "prompt": "Patch the code in hello.py to print 'Hello, human!'",
         "expect": {
-            "correct output": correct_output_hello,
-            "correct file": correct_file_hello,
+            "correct output": correct_output_hello_human,
+            "correct file": check_exists_hello,
         },
     },
     {
@@ -100,7 +104,7 @@ def check_cargo_toml(ctx):
         "expect": {
             "clean exit": check_clean_exit,
             "clean working tree": check_clean_working_tree,
-            "main.py exists": check_main_py_exists,
+            "main.py exists": check_exists_main,
             "we have a commit": check_commit_exists,
         },
     },
@@ -132,21 +136,8 @@ def check_cargo_toml(ctx):
             "correct output": check_output_erik,
         },
     },
-    # Fails, gets stuck on interactive stuff
-    # {
-    #     "name": "init-vue-ts-tailwind",
-    #     "files": {},
-    #     "run": "cat package.json",
-    #     "prompt": "initialize a vue project with typescript and tailwind, make a page that says 'Hello, world!'. avoid interactive tools to initialize the project",
-    #     "expect": {
-    #         "package.json exists": lambda ctx: "package.json" in ctx.files,
-    #         "vue installed": lambda ctx: '"vue":' in ctx.files["package.json"],
-    #         "tailwind installed": lambda ctx: '"tailwindcss":'
-    #         in ctx.files["package.json"],
-    #         "typescript installed": lambda ctx: '"typescript":'
-    #         in ctx.files["package.json"],
-    #     },
-    # },
 ]
 
+default_test_ids = ["hello", "hello-patch", "hello-ask", "prime100", "init-git"]
 tests_map = {test["name"]: test for test in tests}
+tests_default = [tests_map[test_id] for test_id in default_test_ids]
diff --git a/gptme/eval/main.py b/gptme/eval/main.py
index 2339c221..3fc5f4a4 100644
--- a/gptme/eval/main.py
+++ b/gptme/eval/main.py
@@ -25,7 +25,7 @@
 from tabulate import tabulate
 
 from .agents import Agent, GPTMe
-from .evals import tests, tests_map
+from .evals import tests_default, tests_map
 from .execenv import SimpleExecutionEnv
 from .types import (
     CaseResult,
@@ -324,7 +324,7 @@ def print_model_results_table(model_results: dict[str, list[ExecResult]]):
     multiple=True,
     help="Model to use, can be massed multiple times.",
 )
-@click.option("--timeout", "-t", default=15, help="Timeout for code generation")
+@click.option("--timeout", "-t", default=30, help="Timeout for code generation")
 @click.option("--parallel", "-p", default=10, help="Number of parallel evals to run")
 def main(
     eval_names_or_result_files: list[str],
@@ -339,17 +339,17 @@ def main(
     """
     models = _model or [
         "openai/gpt-4o",
-        "openai/gpt-4o-mini",
+        # "openai/gpt-4o-mini",
         "anthropic/claude-3-5-sonnet-20240620",
-        "openrouter/meta-llama/llama-3.1-8b-instruct",
-        "openrouter/meta-llama/llama-3.1-70b-instruct",
+        # "openrouter/meta-llama/llama-3.1-8b-instruct",
+        # "openrouter/meta-llama/llama-3.1-70b-instruct",
         "openrouter/meta-llama/llama-3.1-405b-instruct",
-        "openrouter/nousresearch/hermes-3-llama-3.1-405b",
-        "openrouter/microsoft/wizardlm-2-8x22b",
-        "openrouter/mistralai/mistral-nemo",
-        "openrouter/mistralai/codestral-mamba",
-        "openrouter/mistralai/mixtral-8x22b-instruct",
-        "openrouter/deepseek/deepseek-coder",
+        # "openrouter/nousresearch/hermes-3-llama-3.1-405b",
+        # "openrouter/microsoft/wizardlm-2-8x22b",
+        # "openrouter/mistralai/mistral-nemo",
+        # "openrouter/mistralai/codestral-mamba",
+        # "openrouter/mistralai/mixtral-8x22b-instruct",
+        # "openrouter/deepseek/deepseek-coder",
     ]
 
     results_files = [f for f in eval_names_or_result_files if f.endswith(".csv")]
@@ -368,7 +368,7 @@ def main(
             if test_name not in results_files
         ]
         if eval_names_or_result_files
-        else tests
+        else tests_default
     )
     if not tests_to_run:
         sys.exit(0)
@@ -439,7 +439,12 @@ def write_results_to_csv(model_results: dict[str, list[ExecResult]]):
         writer.writeheader()
        for model, results in model_results.items():
             for result in results:
-                passed = all(case["passed"] for case in result["results"])
+                # Needs to pass all checks, and needs to have results (not empty, as in case of timeout)
+                passed = (
+                    all(case["passed"] for case in result["results"])
+                    if result["results"]
+                    else False
+                )
                 writer.writerow(
                     {
                         "Model": model,