Skip to content

Commit

Permalink
fix(eval): cleanup evals' children processes using process group, add…
Browse files Browse the repository at this point in the history
…ed more evals init-react, init-rust, whois-superuserlabs-ceo
  • Loading branch information
ErikBjare committed Aug 21, 2024
1 parent b47937a commit 56062ce
Show file tree
Hide file tree
Showing 2 changed files with 55 additions and 3 deletions.
44 changes: 44 additions & 0 deletions gptme/eval/evals.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,22 @@ def check_output_hello_ask(ctx):
return "Hello, Erik!" in ctx.stdout


def check_package_json(ctx):
    """Return True if ``"package.json"`` is found in ``ctx.files``.

    ``ctx`` is the eval result context; only its ``files`` attribute is read.
    NOTE(review): ``files`` is presumably a mapping of file name to contents,
    so this tests for a top-level ``package.json`` key -- confirm.
    """
    return "package.json" in ctx.files


def check_output_compiled_successfully(ctx):
    """Return True if ``ctx.stdout`` contains ``"Compiled successfully"``.

    The match is a case-sensitive substring test on the captured stdout.
    """
    return "Compiled successfully" in ctx.stdout


def check_output_erik(ctx):
    """Return True if ``ctx.stdout`` contains the substring ``"Erik"``.

    The match is a case-sensitive substring test on the captured stdout.
    """
    return "Erik" in ctx.stdout


def check_cargo_toml(ctx):
    """Return True if ``"Cargo.toml"`` is found in ``ctx.files``.

    ``ctx`` is the eval result context; only its ``files`` attribute is read.
    NOTE(review): ``files`` is presumably a mapping of file name to contents,
    so this tests for a top-level ``Cargo.toml`` key -- confirm.
    """
    return "Cargo.toml" in ctx.files


tests: list["ExecTest"] = [
{
"name": "hello",
Expand Down Expand Up @@ -88,6 +104,34 @@ def check_output_hello_ask(ctx):
"we have a commit": check_commit_exists,
},
},
{
"name": "init-react",
"files": {},
"run": "npm run build",
"prompt": "create a react project in the current directory, try to build it, but dont start the server and dont use git",
"expect": {
"package.json exists": check_package_json,
"builds successfully": check_output_compiled_successfully,
},
},
{
"name": "init-rust",
"files": {},
"run": "cargo build",
"prompt": "create a Rust project in the current directory",
"expect": {
"Cargo.toml exists": check_cargo_toml,
},
},
{
"name": "whois-superuserlabs-ceo",
"files": {},
"run": "cat answer.txt",
"prompt": "who is the CEO of Superuser Labs? write the answer to answer.txt",
"expect": {
"correct output": check_output_erik,
},
},
# Fails, gets stuck on interactive stuff
# {
# "name": "init-vue-ts-tailwind",
Expand Down
14 changes: 11 additions & 3 deletions gptme/eval/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
import inspect
import io
import logging
import os
import signal
import subprocess
import sys
Expand Down Expand Up @@ -56,6 +57,8 @@ class ProcessError:

def act_process(agent, files, prompt, queue: "Queue[ProcessResult]"):
# Runs in a process for each eval
# each eval has a process group, so we can kill all child processes
os.setpgrp()

# redirect stdout and stderr to streams
stdout, stderr = io.StringIO(), io.StringIO()
Expand All @@ -67,6 +70,8 @@ def error_handler(e):
sys.stdout, sys.stderr = stdout_orig, stderr_orig
print(f"Error: {e}")
queue.put(ProcessError(str(e), stdout.getvalue(), stderr.getvalue(), duration))
# kill child processes
os.killpg(0, signal.SIGKILL)
sys.exit(1)

# handle SIGTERM
Expand Down Expand Up @@ -98,7 +103,7 @@ def execute(test: ExecTest, agent: Agent, timeout: int) -> ExecResult:
Executes the code for a specific model with a timeout.
"""
print(
f"Running test {test['name']} with prompt: {test['prompt']} for model: {agent.model}"
f'Running "{test["name"]}" with prompt "{test["prompt"]}" for model: {agent.model}'
)

queue: Queue[ProcessResult] = Queue()
Expand All @@ -114,7 +119,7 @@ def execute(test: ExecTest, agent: Agent, timeout: int) -> ExecResult:
if p.is_alive():
print("Timeout reached, terminating process")
p.terminate()
p.join()
p.join(timeout=1)
status = "timeout"
time_gen = timeout

Expand All @@ -129,7 +134,9 @@ def execute(test: ExecTest, agent: Agent, timeout: int) -> ExecResult:
"stderr": "",
}

result = queue.get()
logger.info("Getting result from queue")
result = queue.get(timeout=1)
logger.info("Got result")
if status == "success":
time_gen = result.duration
stdout, stderr = result.stdout, result.stderr
Expand All @@ -150,6 +157,7 @@ def execute(test: ExecTest, agent: Agent, timeout: int) -> ExecResult:
run_start = time.time()
env = SimpleExecutionEnv()
env.upload(files)
logger.info(f"Running check: {test['run']}")
stdout_run, stderr_run, exit_code = env.run(test["run"])
time_run = time.time() - run_start

Expand Down

0 comments on commit 56062ce

Please sign in to comment.