
Commit

fix: minor eval refactor (improved type names), clarified python tool instructions
ErikBjare committed Sep 19, 2024
1 parent a837b32 commit e0c79a4
Showing 14 changed files with 108 additions and 84 deletions.
2 changes: 1 addition & 1 deletion Makefile
@@ -22,7 +22,7 @@ test:
@# if SLOW is not set, pass `-m "not slow"` to skip slow tests
poetry run pytest ${SRCDIRS} -v --log-level INFO --durations=5 \
--cov=gptme --cov-report=xml --cov-report=term-missing --cov-report=html --junitxml=junit.xml \
-n 8 \
-n 16 \
$(if $(EVAL), , -m "not eval") \
$(if $(SLOW), --timeout 60 --retries 2 --retry-delay 5, --timeout 5 -m "not slow and not eval") \
$(if $(PROFILE), --profile-svg)
38 changes: 21 additions & 17 deletions gptme/eval/main.py
@@ -20,7 +20,7 @@
from ..message import len_tokens
from .run import run_evals
from .suites import suites, tests_default, tests_map
from .types import CaseResult, ExecResult, ExecTest
from .types import CaseResult, EvalResult, EvalSpec

# Configure logging, including fully-qualified module names
logging.basicConfig(
@@ -34,7 +34,7 @@
project_dir = Path(__file__).parent.parent.parent


def print_model_results(model_results: dict[str, list[ExecResult]]):
def print_model_results(model_results: dict[str, list[EvalResult]]):
total_tests = 0
total_tokens = 0

@@ -70,7 +70,7 @@ def print_model_results(model_results: dict[str, list[ExecResult]]):
print(f"Completed {total_tests} tests in {total_tokens}tok")


def print_model_results_table(model_results: dict[str, list[ExecResult]]):
def print_model_results_table(model_results: dict[str, list[EvalResult]]):
test_names = {
result.name for results in model_results.values() for result in results
}
@@ -120,19 +120,23 @@ def main(
):
"""
Run evals for gptme.
Pass eval or suite names to run, or result files to print.
Pass test names to run, or result files to print.
Output from evals will be captured, unless a single eval is run, and saved to the results directory.
"""
# init
multiprocessing_logging.install_mp_handler()

models = _model or [
"openai/gpt-4o",
"openai/gpt-4o-mini",
"anthropic/claude-3-5-sonnet-20240620",
"anthropic/claude-3-haiku-20240307",
"openrouter/meta-llama/llama-3.1-405b-instruct",
]

results_files = [f for f in eval_names_or_result_files if f.endswith(".csv")]
eval_names = [f for f in eval_names_or_result_files if f not in results_files]
if results_files:
for results_file in results_files:
p = Path(results_file)
@@ -148,20 +152,20 @@ def main(
sys.exit(1)
sys.exit(0)

tests_to_run: list[ExecTest] = []
for test_name in eval_names_or_result_files:
if test_name in tests_map:
tests_to_run.append(tests_map[test_name])
elif test_name in suites:
tests_to_run.extend(suites[test_name])
evals_to_run: list[EvalSpec] = []
for eval_name in eval_names:
if test := tests_map.get(eval_name):
evals_to_run.append(test)
elif suite := suites.get(eval_name) or suites.get(eval_name.replace("-", "_")):
evals_to_run.extend(suite)
else:
raise ValueError(f"Test {test_name} not found")
raise ValueError(f"Test {eval_name} not found")

if not tests_to_run:
tests_to_run = tests_default
if not evals_to_run:
evals_to_run = tests_default

print("=== Running evals ===")
model_results = run_evals(tests_to_run, models, timeout, parallel)
model_results = run_evals(evals_to_run, models, timeout, parallel)
print("\n=== Finished ===\n")

print("\n=== Model Results ===")
@@ -211,7 +215,7 @@ def read_log_file(file_path: Path) -> str:
return ""


def read_results_from_csv(filename: str) -> dict[str, list[ExecResult]]:
def read_results_from_csv(filename: str) -> dict[str, list[EvalResult]]:
model_results = defaultdict(list)
results_dir = Path(filename).parent
with open(filename, newline="") as csvfile:
@@ -220,7 +224,7 @@ def read_results_from_csv(filename: str) -> dict[str, list[ExecResult]]:
model = row["Model"]
test_dir = results_dir / model / row["Test"]

result = ExecResult(
result = EvalResult(
name=row["Test"],
status="success" if row["Passed"] == "true" else "error",
results=list(_read_case_results(test_dir / "cases.csv")),
@@ -238,7 +242,7 @@ def read_results_from_csv(filename: str) -> dict[str, list[ExecResult]]:
return dict(model_results)


def write_results(model_results: dict[str, list[ExecResult]]):
def write_results(model_results: dict[str, list[EvalResult]]):
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
# get current commit hash and dirty status, like: a8b2ef0-dirty
# TODO: don't assume we are in the gptme repo, use other version identifiers if available
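For reference, a minimal stand-alone sketch of the eval-name resolution introduced above, with tiny made-up registries instead of gptme's real tests_map and suites; it illustrates why the hyphen-to-underscore fallback lets the init_projects suite also be addressed as init-projects:

tests_map = {"hello": {"name": "hello"}, "init-git": {"name": "init-git"}}
suites = {"init_projects": [{"name": "init-git"}]}


def resolve_evals(eval_names: list[str]) -> list[dict]:
    # mirrors the lookup order above: exact test name, then suite name,
    # then the suite name with hyphens normalized to underscores
    evals_to_run: list[dict] = []
    for eval_name in eval_names:
        if test := tests_map.get(eval_name):
            evals_to_run.append(test)
        elif suite := suites.get(eval_name) or suites.get(eval_name.replace("-", "_")):
            evals_to_run.extend(suite)
        else:
            raise ValueError(f"Test {eval_name} not found")
    return evals_to_run


print(resolve_evals(["hello", "init-projects"]))
# [{'name': 'hello'}, {'name': 'init-git'}]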
28 changes: 14 additions & 14 deletions gptme/eval/run.py
@@ -19,8 +19,8 @@
from .execenv import SimpleExecutionEnv
from .types import (
CaseResult,
ExecResult,
ExecTest,
EvalResult,
EvalSpec,
ResultContext,
Status,
)
@@ -52,8 +52,8 @@ class SyncedDict(TypedDict):


def run_evals(
tests: list[ExecTest], models: list[str], timeout: int, parallel: int
) -> dict[str, list[ExecResult]]:
evals: list[EvalSpec], models: list[str], timeout: int, parallel: int
) -> dict[str, list[EvalResult]]:
"""
Run evals for a list of tests.
"""
@@ -67,14 +67,14 @@ def run_evals(
else:
cleanup_on_sigterm()

n_runs = len(tests) * len(models)
model_results: dict[str, dict[str, ExecResult]] = defaultdict(dict)
n_runs = len(evals) * len(models)
model_results: dict[str, dict[str, EvalResult]] = defaultdict(dict)
parallel = min(n_runs, parallel)
with ProcessPoolExecutor(parallel) as executor:
futures = []
future_to_model_test = {}
for model in models:
for test in tests:
for test in evals:
future = executor.submit(
execute,
test,
@@ -103,7 +103,7 @@ def _handle_future(future: Future):
logger.exception(
f"Test {test_name} for model {model} generated an exception when trying to get result"
)
result = ExecResult(
result = EvalResult(
name=test_name,
status=status,
results=[],
@@ -116,7 +116,7 @@ def _handle_future(future: Future):
model_results[model][test_name] = result

# worst-case run time, with some buffer to account for overhead
max_timeout = timeout * len(tests) / parallel + 10
max_timeout = timeout * len(evals) / parallel + 10
completed = set()
try:
# TODO: can we do better than this? handle timeouts within futures instead?
@@ -147,19 +147,19 @@ def _handle_future(future: Future):
process.terminate()
process.join()

model_results_final: dict[str, list[ExecResult]] = defaultdict(list)
model_results_final: dict[str, list[EvalResult]] = defaultdict(list)
for model in model_results:
# sort results by test order
model_results_final[model] = sorted(
model_results[model].values(),
key=lambda result: [test["name"] for test in tests].index(result.name),
key=lambda result: [test["name"] for test in evals].index(result.name),
)

return model_results_final


# TODO: rewrite to run in Docker? Would help with capturing output + process management.
def execute(test: ExecTest, agent: Agent, timeout: int, parallel: bool) -> ExecResult:
def execute(test: EvalSpec, agent: Agent, timeout: int, parallel: bool) -> EvalResult:
"""
Executes the code for a specific model with a timeout.
"""
@@ -206,7 +206,7 @@ def execute(test: ExecTest, agent: Agent, timeout: int, parallel: bool) -> ExecR
gen_stderr = result.get("stderr", "")
else:
logger.error("No result in shared dictionary")
return ExecResult(
return EvalResult(
name=test["name"],
status="error",
results=[],
Expand Down Expand Up @@ -256,7 +256,7 @@ def execute(test: ExecTest, agent: Agent, timeout: int, parallel: bool) -> ExecR
results = []
stdout_run, stderr_run = "", ""

return ExecResult(
return EvalResult(
name=test["name"],
status=status,
results=results,
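For context on the signature changes above, here is a simplified, self-contained sketch of the process-pool pattern run_evals follows, including the worst-case deadline derived from the per-eval timeout; the square_task function and the numbers are placeholders, not gptme code:

from concurrent.futures import ProcessPoolExecutor, TimeoutError, as_completed


def square_task(n: int) -> int:
    # stand-in for running one eval
    return n * n


def run_all(tasks: list[int], timeout: int, parallel: int) -> dict[int, int]:
    results: dict[int, int] = {}
    parallel = min(len(tasks), parallel)
    # worst-case run time, with some buffer to account for overhead
    max_timeout = timeout * len(tasks) / parallel + 10
    with ProcessPoolExecutor(parallel) as executor:
        futures = {executor.submit(square_task, t): t for t in tasks}
        try:
            for future in as_completed(futures, timeout=max_timeout):
                results[futures[future]] = future.result()
        except TimeoutError:
            # anything still running past the deadline is simply not collected here
            pass
    return results


if __name__ == "__main__":
    print(run_all([1, 2, 3], timeout=5, parallel=2))  # {1: 1, 2: 4, 3: 9} (order may vary)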
10 changes: 5 additions & 5 deletions gptme/eval/suites/__init__.py
@@ -1,16 +1,16 @@
from ..types import ExecTest
from ..types import EvalSpec
from .basic import tests as tests_basic
from .browser import tests as tests_browser
from .init_projects import tests as tests_init_projects

suites: dict[str, list[ExecTest]] = {
suites: dict[str, list[EvalSpec]] = {
"basic": tests_basic,
"init_projects": tests_init_projects,
"browser": tests_browser,
}

tests: list[ExecTest] = [test for suite in suites.values() for test in suite]
tests_map: dict[str, ExecTest] = {test["name"]: test for test in tests}
tests: list[EvalSpec] = [test for suite in suites.values() for test in suite]
tests_map: dict[str, EvalSpec] = {test["name"]: test for test in tests}

tests_default_ids: list[str] = [
"hello",
@@ -19,4 +19,4 @@
"prime100",
"init-git",
]
tests_default: list[ExecTest] = [tests_map[test_id] for test_id in tests_default_ids]
tests_default: list[EvalSpec] = [tests_map[test_id] for test_id in tests_default_ids]
12 changes: 6 additions & 6 deletions gptme/eval/suites/basic.py
@@ -1,7 +1,7 @@
from typing import TYPE_CHECKING

if TYPE_CHECKING:
from gptme.eval.main import ExecTest
from gptme.eval.main import EvalSpec


def correct_output_hello_world(ctx):
@@ -28,30 +28,30 @@ def check_output_hello_ask(ctx):
return "Hello, Erik!" in ctx.stdout


tests: list["ExecTest"] = [
tests: list["EvalSpec"] = [
{
"name": "hello",
"files": {},
"run": "python hello.py",
"prompt": "write a script hello.py which prints 'Hello, world!'",
"prompt": 'write a script hello.py which prints "Hello, world!"',
"expect": {
"correct output": correct_output_hello_world,
"correct file": check_exists_hello,
},
},
{
"name": "hello-patch",
"files": {"hello.py": "print('Hello, world!')"},
"files": {"hello.py": 'print("Hello, world!")'},
"run": "python hello.py",
"prompt": "Patch the code in hello.py to print 'Hello, human!'",
"prompt": 'Patch the code in hello.py to print "Hello, human!"',
"expect": {
"correct output": correct_output_hello_human,
"correct file": check_exists_hello,
},
},
{
"name": "hello-ask",
"files": {"hello.py": "print('Hello, world!')"},
"files": {"hello.py": 'print("Hello, world!")'},
"run": "echo 'Erik' | python hello.py",
# TODO: work around the "don't try to execute it" part by improving gptme such that it just gives EOF to stdin in non-interactive mode
"prompt": "modify hello.py to ask the user for their name and print 'Hello, <name>!'. don't try to execute it",
4 changes: 2 additions & 2 deletions gptme/eval/suites/browser.py
@@ -1,14 +1,14 @@
from typing import TYPE_CHECKING

if TYPE_CHECKING:
from gptme.eval.main import ExecTest
from gptme.eval.main import EvalSpec


def check_output_erik(ctx):
return "Erik" in ctx.stdout


tests: list["ExecTest"] = [
tests: list["EvalSpec"] = [
{
"name": "whois-superuserlabs-ceo",
"files": {},
4 changes: 2 additions & 2 deletions gptme/eval/suites/init_projects.py
@@ -1,7 +1,7 @@
from typing import TYPE_CHECKING

if TYPE_CHECKING:
from gptme.eval.main import ExecTest
from gptme.eval.main import EvalSpec


def check_clean_exit(ctx):
@@ -41,7 +41,7 @@ def check_exists_main(ctx):
return "main.py" in ctx.files


tests: list["ExecTest"] = [
tests: list["EvalSpec"] = [
{
"name": "init-git",
"files": {},
8 changes: 4 additions & 4 deletions gptme/eval/types.py
@@ -31,9 +31,9 @@ class CaseResult:


@dataclass
class ExecResult:
class EvalResult:
"""
Result of executing a prompt.
Result of executing an eval.
"""

name: str
@@ -46,9 +46,9 @@ class ExecResult:
run_stderr: str


class ExecTest(TypedDict):
class EvalSpec(TypedDict):
"""
Test case for executing a prompt.
Specification for an eval/test case.
"""

name: str
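To make the rename concrete, here is a heavily abridged illustration of how the two types relate; the field sets are incomplete stand-ins, and the full definitions live in gptme/eval/types.py:

from dataclasses import dataclass, field
from typing import Callable, TypedDict


class EvalSpec(TypedDict, total=False):
    """Specification for an eval/test case (formerly ExecTest)."""

    name: str
    files: dict[str, str]
    run: str
    prompt: str
    expect: dict[str, Callable]


@dataclass
class EvalResult:
    """Result of executing an eval (formerly ExecResult)."""

    name: str
    status: str
    results: list = field(default_factory=list)


spec: EvalSpec = {
    "name": "hello",
    "files": {},
    "run": "python hello.py",
    "prompt": 'write a script hello.py which prints "Hello, world!"',
}
print(EvalResult(name=spec["name"], status="success"))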
5 changes: 3 additions & 2 deletions gptme/llm.py
@@ -92,10 +92,11 @@ def print_clear():
# need to flush stdout to get the print to show up
sys.stdout.flush()

# TODO: make this more robust/general, maybe with a callback that runs on each char/chunk
# pause inference on finished code-block, letting user run the command before continuing
tooluses = list(ToolUse.iter_from_content(output))
if tooluses:
logger.debug("Found tool use, breaking")
if tooluses and any(tooluse.is_runnable for tooluse in tooluses):
logger.warning("Found tool use, breaking")
break
except KeyboardInterrupt:
return Message("assistant", output + "... ^C Interrupted")
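A stand-alone sketch of the behavior the llm.py change is after, pausing generation only once a runnable code block has fully streamed in; the regex and the RUNNABLE_LANGS set are assumptions for illustration, whereas gptme itself relies on ToolUse.iter_from_content and is_runnable:

import re

# assumption for illustration; not gptme's actual list of runnable tools
RUNNABLE_LANGS = {"bash", "sh", "python", "ipython"}


def has_runnable_block(output: str) -> bool:
    # a complete fenced block looks like ```lang ... ```
    for match in re.finditer(r"```(\w+)\n.*?```", output, flags=re.DOTALL):
        if match.group(1) in RUNNABLE_LANGS:
            return True
    return False


def stream(chunks):
    output = ""
    for chunk in chunks:
        output += chunk
        # pause inference on a finished runnable code block,
        # letting the user run the command before continuing
        if has_runnable_block(output):
            break
    return output


print(stream(["Sure:\n```bash\n", "ls -la\n```", "\n(this part is never generated)"]))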