diff --git a/gptme/eval/evals.py b/gptme/eval/evals.py index 687fa18a..2b57e33d 100644 --- a/gptme/eval/evals.py +++ b/gptme/eval/evals.py @@ -36,6 +36,22 @@ def check_output_hello_ask(ctx): return "Hello, Erik!" in ctx.stdout +def check_package_json(ctx): + return "package.json" in ctx.files + + +def check_output_compiled_successfully(ctx): + return "Compiled successfully" in ctx.stdout + + +def check_output_erik(ctx): + return "Erik" in ctx.stdout + + +def check_cargo_toml(ctx): + return "Cargo.toml" in ctx.files + + tests: list["ExecTest"] = [ { "name": "hello", @@ -88,6 +104,34 @@ def check_output_hello_ask(ctx): "we have a commit": check_commit_exists, }, }, + { + "name": "init-react", + "files": {}, + "run": "npm run build", + "prompt": "create a react project in the current directory, try to build it, but dont start the server and dont use git", + "expect": { + "package.json exists": check_package_json, + "builds successfully": check_output_compiled_successfully, + }, + }, + { + "name": "init-rust", + "files": {}, + "run": "cargo build", + "prompt": "create a Rust project in the current directory", + "expect": { + "Cargo.toml exists": check_cargo_toml, + }, + }, + { + "name": "whois-superuserlabs-ceo", + "files": {}, + "run": "cat answer.txt", + "prompt": "who is the CEO of Superuser Labs? write the answer to answer.txt", + "expect": { + "correct output": check_output_erik, + }, + }, # Fails, gets stuck on interactive stuff # { # "name": "init-vue-ts-tailwind", diff --git a/gptme/eval/main.py b/gptme/eval/main.py index 1b553b40..4d89c701 100644 --- a/gptme/eval/main.py +++ b/gptme/eval/main.py @@ -8,6 +8,7 @@ import inspect import io import logging +import os import signal import subprocess import sys @@ -56,6 +57,8 @@ class ProcessError: def act_process(agent, files, prompt, queue: "Queue[ProcessResult]"): # Runs in a process for each eval + # each eval has a process group, so we can kill all child processes + os.setpgrp() # redirect stdout and stderr to streams stdout, stderr = io.StringIO(), io.StringIO() @@ -67,6 +70,8 @@ def error_handler(e): sys.stdout, sys.stderr = stdout_orig, stderr_orig print(f"Error: {e}") queue.put(ProcessError(str(e), stdout.getvalue(), stderr.getvalue(), duration)) + # kill child processes + os.killpg(0, signal.SIGKILL) sys.exit(1) # handle SIGTERM @@ -98,7 +103,7 @@ def execute(test: ExecTest, agent: Agent, timeout: int) -> ExecResult: Executes the code for a specific model with a timeout. """ print( - f"Running test {test['name']} with prompt: {test['prompt']} for model: {agent.model}" + f'Running "{test["name"]}" with prompt "{test["prompt"]}" for model: {agent.model}' ) queue: Queue[ProcessResult] = Queue() @@ -114,7 +119,7 @@ def execute(test: ExecTest, agent: Agent, timeout: int) -> ExecResult: if p.is_alive(): print("Timeout reached, terminating process") p.terminate() - p.join() + p.join(timeout=1) status = "timeout" time_gen = timeout @@ -129,7 +134,9 @@ def execute(test: ExecTest, agent: Agent, timeout: int) -> ExecResult: "stderr": "", } - result = queue.get() + logger.info("Getting result from queue") + result = queue.get(timeout=1) + logger.info("Got result") if status == "success": time_gen = result.duration stdout, stderr = result.stdout, result.stderr @@ -150,6 +157,7 @@ def execute(test: ExecTest, agent: Agent, timeout: int) -> ExecResult: run_start = time.time() env = SimpleExecutionEnv() env.upload(files) + logger.info(f"Running check: {test['run']}") stdout_run, stderr_run, exit_code = env.run(test["run"]) time_run = time.time() - run_start