fix(eval): fixed bug when writing timeout results, improved gptme-eval defaults (only 3 models + 5 evals)
ErikBjare · Aug 28, 2024 · 707d98c · 707d98c
1 parent 9c13a60
commit 707d98c
Show file tree

Hide file tree

Showing 3 changed files with 39 additions and 43 deletions.
diff --git a/Dockerfile.eval b/Dockerfile.eval
@@ -19,4 +19,4 @@ RUN mkdir ./eval_results; chown appuser:appuser ./eval_results
 USER appuser
 
 # Add an entry point for running evals
-ENTRYPOINT ["poetry", "run", "gptme-eval"]
+ENTRYPOINT ["poetry", "run", "python", "-m", "gptme.eval"]
diff --git a/gptme/eval/evals.py b/gptme/eval/evals.py
@@ -4,12 +4,20 @@
     from main import ExecTest
 
 
-def correct_output_hello(ctx):
+def correct_output_hello_world(ctx):
+    return ctx.stdout == "Hello, world!\n"
+
+
+def correct_output_hello_human(ctx):
     return ctx.stdout == "Hello, human!\n"
 
 
-def correct_file_hello(ctx):
-    return ctx.files["hello.py"].strip() == "print('Hello, human!')"
+def check_exists_hello(ctx):
+    return "hello.py" in ctx.files
+
+
+def check_exists_main(ctx):
+    return "main.py" in ctx.files
 
 
 def check_prime_output(ctx):
@@ -24,10 +32,6 @@ def check_clean_working_tree(ctx):
     return "nothing to commit, working tree clean" in ctx.stdout
 
 
-def check_main_py_exists(ctx):
-    return "main.py" in ctx.files
-
-
 def check_commit_exists(ctx):
     return "No commits yet" not in ctx.stdout
 
@@ -55,12 +59,12 @@ def check_cargo_toml(ctx):
 tests: list["ExecTest"] = [
     {
         "name": "hello",
-        "files": {"hello.py": "print('Hello, world!')"},
+        "files": {},
         "run": "python hello.py",
-        "prompt": "Change the code in hello.py to print 'Hello, human!'",
+        "prompt": "write a script hello.py which prints 'Hello, world!'",
         "expect": {
-            "correct output": correct_output_hello,
-            "correct file": correct_file_hello,
+            "correct output": correct_output_hello_world,
+            "correct file": check_exists_hello,
         },
     },
     {
@@ -69,8 +73,8 @@ def check_cargo_toml(ctx):
         "run": "python hello.py",
         "prompt": "Patch the code in hello.py to print 'Hello, human!'",
         "expect": {
-            "correct output": correct_output_hello,
-            "correct file": correct_file_hello,
+            "correct output": correct_output_hello_human,
+            "correct file": check_exists_hello,
         },
     },
     {
@@ -100,7 +104,7 @@ def check_cargo_toml(ctx):
         "expect": {
             "clean exit": check_clean_exit,
             "clean working tree": check_clean_working_tree,
-            "main.py exists": check_main_py_exists,
+            "main.py exists": check_exists_main,
             "we have a commit": check_commit_exists,
         },
     },
@@ -132,21 +136,8 @@ def check_cargo_toml(ctx):
             "correct output": check_output_erik,
         },
     },
-    # Fails, gets stuck on interactive stuff
-    # {
-    #     "name": "init-vue-ts-tailwind",
-    #     "files": {},
-    #     "run": "cat package.json",
-    #     "prompt": "initialize a vue project with typescript and tailwind, make a page that says 'Hello, world!'. avoid interactive tools to initialize the project",
-    #     "expect": {
-    #         "package.json exists": lambda ctx: "package.json" in ctx.files,
-    #         "vue installed": lambda ctx: '"vue":' in ctx.files["package.json"],
-    #         "tailwind installed": lambda ctx: '"tailwindcss":'
-    #         in ctx.files["package.json"],
-    #         "typescript installed": lambda ctx: '"typescript":'
-    #         in ctx.files["package.json"],
-    #     },
-    # },
 ]
 
+default_test_ids = ["hello", "hello-patch", "hello-ask", "prime100", "init-git"]
 tests_map = {test["name"]: test for test in tests}
+tests_default = [tests_map[test_id] for test_id in default_test_ids]
diff --git a/gptme/eval/main.py b/gptme/eval/main.py
@@ -25,7 +25,7 @@
 from tabulate import tabulate
 
 from .agents import Agent, GPTMe
-from .evals import tests, tests_map
+from .evals import tests_default, tests_map
 from .execenv import SimpleExecutionEnv
 from .types import (
     CaseResult,
@@ -324,7 +324,7 @@ def print_model_results_table(model_results: dict[str, list[ExecResult]]):
     multiple=True,
     help="Model to use, can be massed multiple times.",
 )
-@click.option("--timeout", "-t", default=15, help="Timeout for code generation")
+@click.option("--timeout", "-t", default=30, help="Timeout for code generation")
 @click.option("--parallel", "-p", default=10, help="Number of parallel evals to run")
 def main(
     eval_names_or_result_files: list[str],
@@ -339,17 +339,17 @@ def main(
     """
     models = _model or [
         "openai/gpt-4o",
-        "openai/gpt-4o-mini",
+        # "openai/gpt-4o-mini",
         "anthropic/claude-3-5-sonnet-20240620",
-        "openrouter/meta-llama/llama-3.1-8b-instruct",
-        "openrouter/meta-llama/llama-3.1-70b-instruct",
+        # "openrouter/meta-llama/llama-3.1-8b-instruct",
+        # "openrouter/meta-llama/llama-3.1-70b-instruct",
         "openrouter/meta-llama/llama-3.1-405b-instruct",
-        "openrouter/nousresearch/hermes-3-llama-3.1-405b",
-        "openrouter/microsoft/wizardlm-2-8x22b",
-        "openrouter/mistralai/mistral-nemo",
-        "openrouter/mistralai/codestral-mamba",
-        "openrouter/mistralai/mixtral-8x22b-instruct",
-        "openrouter/deepseek/deepseek-coder",
+        # "openrouter/nousresearch/hermes-3-llama-3.1-405b",
+        # "openrouter/microsoft/wizardlm-2-8x22b",
+        # "openrouter/mistralai/mistral-nemo",
+        # "openrouter/mistralai/codestral-mamba",
+        # "openrouter/mistralai/mixtral-8x22b-instruct",
+        # "openrouter/deepseek/deepseek-coder",
     ]
 
     results_files = [f for f in eval_names_or_result_files if f.endswith(".csv")]
@@ -368,7 +368,7 @@ def main(
             if test_name not in results_files
         ]
         if eval_names_or_result_files
-        else tests
+        else tests_default
     )
     if not tests_to_run:
         sys.exit(0)
@@ -439,7 +439,12 @@ def write_results_to_csv(model_results: dict[str, list[ExecResult]]):
         writer.writeheader()
         for model, results in model_results.items():
             for result in results:
-                passed = all(case["passed"] for case in result["results"])
+                # Needs to pass all checks, and needs to have results (not empty, as in case of timeout)
+                passed = (
+                    all(case["passed"] for case in result["results"])
+                    if result["results"]
+                    else False
+                )
                 writer.writerow(
                     {
                         "Model": model,