Commit
tests: added test_eval_cli and cleaned up eval code
ErikBjare committed Aug 14, 2024
1 parent e5e2a9a commit c0e5c87
Showing 4 changed files with 43 additions and 22 deletions.
41 changes: 21 additions & 20 deletions gptme/eval/main.py
@@ -17,7 +17,7 @@
 from datetime import datetime
 from multiprocessing import Process, Queue
 from pathlib import Path
-from typing import Literal, Union
+from typing import Union
 
 import click
 from tabulate import tabulate
@@ -30,6 +30,7 @@
     ExecResult,
     ExecTest,
     ResultContext,
+    Status,
 )


@@ -49,7 +50,6 @@ class ProcessError:
     duration: float
 
 
-Status = Literal["success", "error"]
 ProcessResult = Union[ProcessSuccess, ProcessError]


@@ -93,39 +93,38 @@ def execute(test: ExecTest, agent: Agent, timeout: int) -> ExecResult:
     p.start()
     p.join(timeout)
 
+    time_gen = 0.0
+    time_run = 0.0
+    time_eval = 0.0
+
+    status: Status = "success"
     if p.is_alive():
         p.terminate()
         p.join()
-        return {
-            "name": test["name"],
-            "status": "timeout",
-            "results": [],
-            "timings": {"gen": timeout, "run": 0, "eval": 0},
-            # TODO: get stdout/stderr for timeouts somehow
-            "stdout": "",
-            "stderr": "",
-        }
+        status = "timeout"
+        time_gen = timeout
 
     if queue.empty():
         logger.error("Queue is empty, expected a result")
         return {
             "name": test["name"],
             "status": "error",
             "results": [],
-            "timings": {"gen": 0, "run": 0, "eval": 0},
+            "timings": {"gen": time_gen, "run": time_run, "eval": time_eval},
             "stdout": "",
             "stderr": "",
         }
 
     result = queue.get()
+    time_gen = result.duration
     stdout, stderr = result.stdout, result.stderr
 
     if isinstance(result, ProcessError):
         return {
             "name": test["name"],
             "status": "error",
             "results": [],
-            "timings": {"gen": result.duration, "run": 0, "eval": 0},
+            "timings": {"gen": time_gen, "run": time_run, "eval": time_eval},
             "stdout": stdout,
             "stderr": stderr,
         }
@@ -136,12 +135,12 @@ def execute(test: ExecTest, agent: Agent, timeout: int) -> ExecResult:
     run_start = time.time()
     env = SimpleExecutionEnv()
     env.upload(files)
-    stdout, stderr, exit_code = env.run(test["run"])
-    run_duration = time.time() - run_start
+    stdout_run, stderr_run, exit_code = env.run(test["run"])
+    time_run = time.time() - run_start
 
     files = env.download()
 
-    ctx = ResultContext(files, stdout, stderr, exit_code)
+    ctx = ResultContext(files, stdout_run, stderr_run, exit_code)
     results: list[CaseResult] = []
     print(f"\n--- Results for {test['name']} ---")
     for name, case in test["expect"].items():
@@ -160,14 +159,16 @@ def execute(test: ExecTest, agent: Agent, timeout: int) -> ExecResult:
         )
     print("--- End of results ---\n")
 
+    time_eval = sum(r["duration"] for r in results)
+
     return {
         "name": test["name"],
-        "status": "success",
+        "status": status,
         "results": results,
         "timings": {
-            "gen": result.duration,
-            "run": run_duration,
-            "eval": sum(r["duration"] for r in results),
+            "gen": time_gen,
+            "run": time_run,
+            "eval": time_eval,
         },
         "stdout": stdout,
         "stderr": stderr,
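The net effect of the changes above is that execute() no longer builds a separate timings dict in each early return; instead, time_gen/time_run/time_eval and a status variable are initialized up front, updated as each phase completes, and assembled into a single result at the end. Below is a rough, self-contained sketch of that pattern; the stage workloads and names are hypothetical stand-ins, not gptme's actual generate/run/eval code.

import time
from typing import Literal, TypedDict

Status = Literal["success", "error", "timeout"]


class Result(TypedDict):
    status: Status
    timings: dict[str, float]


def run_stages(timeout: float = 1.0) -> Result:
    # Initialize all timings and the status up front so every exit path
    # reports the same keys (mirroring the shape of the diff above).
    time_gen = 0.0
    time_run = 0.0
    status: Status = "success"

    # Stage 1: stand-in for the "generate" phase (hypothetical workload).
    start = time.time()
    sum(i * i for i in range(100_000))
    time_gen = time.time() - start
    if time_gen > timeout:
        status = "timeout"

    # Stage 2: only run if the first stage succeeded.
    if status == "success":
        start = time.time()
        sorted(range(1_000), reverse=True)
        time_run = time.time() - start

    # Single assembly point: status and timings stay consistent.
    return {"status": status, "timings": {"gen": time_gen, "run": time_run}}


if __name__ == "__main__":
    print(run_stages())

The payoff is that every exit path (timeout, error, success) now reports the same timing keys, which is what the reworked timeout and error branches in the diff rely on.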
3 changes: 2 additions & 1 deletion gptme/eval/types.py
@@ -3,6 +3,7 @@
 from typing import Literal, TypedDict
 
 Files = dict[str, str | bytes]
+Status = Literal["success", "error", "timeout"]
 
 
 @dataclass
@@ -34,7 +35,7 @@ class ExecResult(TypedDict):
     """
 
     name: str
-    status: Literal["success", "error", "timeout"]
+    status: Status
     results: list[CaseResult]
     timings: dict[str, float]
     stdout: str
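Moving the Status literal into types.py gives the runtime code in main.py and the ExecResult TypedDict a single source of truth for the allowed status strings, so a mismatched value is caught by the type checker rather than at runtime. A minimal sketch of the idea follows; it uses a stripped-down ExecResult with only two fields and a made-up helper, purely for illustration.

from typing import Literal, TypedDict

Status = Literal["success", "error", "timeout"]


class ExecResult(TypedDict):
    # Stripped-down version for illustration; the real TypedDict has more fields.
    name: str
    status: Status


def make_result(name: str, timed_out: bool) -> ExecResult:
    # A typo like "sucess" here would be rejected by mypy, since Status is a closed set.
    status: Status = "timeout" if timed_out else "success"
    return {"name": name, "status": status}


print(make_result("hello-world", timed_out=False))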
2 changes: 1 addition & 1 deletion gptme/tools/subagent.py
@@ -94,7 +94,7 @@ def run_subagent():
     initial_msgs = [get_prompt()]
 
     # add the return prompt
-    return_prompt = """When done with the task, please return a JSON response of this format:
+    return_prompt = """When done with the task, please return a JSON response on the format:
 ```json
 {
19 changes: 19 additions & 0 deletions tests/test_eval.py
@@ -1,9 +1,28 @@
 import pytest
+from click.testing import CliRunner
 from gptme.eval import execute, tests
 from gptme.eval.agents import GPTMe
+from gptme.eval.main import main
 
 
+@pytest.mark.slow
+def test_eval_cli():
+    runner = CliRunner()
+    result = runner.invoke(
+        main,
+        [
+            "--model",
+            "openai/gpt-4o",
+        ],
+    )
+    assert result
+    assert result.exit_code == 0
+    assert "correct file" in result.output
+    assert "correct output" in result.output
+
+
 @pytest.mark.slow
 @pytest.mark.eval
 def test_eval(test):
     """
     This test will be run for each eval in the tests list.
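For readers unfamiliar with it, click's CliRunner (used by the new test_eval_cli above) invokes a click command in-process and captures its exit code and output, so no subprocess or shell is needed. Here is a self-contained sketch of the same testing pattern using a made-up toy command; the real test instead invokes gptme's eval entry point with --model openai/gpt-4o and checks the "correct file"/"correct output" summary lines.

import click
from click.testing import CliRunner


@click.command()
@click.option("--name", default="world")
def greet(name: str):
    """Toy command standing in for the real eval CLI."""
    click.echo(f"hello {name}")


def test_greet_cli():
    runner = CliRunner()
    # invoke() runs the command in-process and returns a Result object
    # whose .exit_code and .output the assertions below inspect.
    result = runner.invoke(greet, ["--name", "gptme"])
    assert result.exit_code == 0
    assert "hello gptme" in result.output


if __name__ == "__main__":
    test_greet_cli()
    print("ok")

Since the real test drives an actual model, it is marked @pytest.mark.slow so it can be deselected from quick test runs.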
