From e0c79a41a6475190998ae9b4d9c4a378c68124b0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Erik=20Bj=C3=A4reholt?= Date: Thu, 19 Sep 2024 13:02:49 +0200 Subject: [PATCH] fix: minor eval refactor (improved type names), clarified python tool instructions --- Makefile | 2 +- gptme/eval/main.py | 38 ++++++++++++----------- gptme/eval/run.py | 28 ++++++++--------- gptme/eval/suites/__init__.py | 10 +++--- gptme/eval/suites/basic.py | 12 ++++---- gptme/eval/suites/browser.py | 4 +-- gptme/eval/suites/init_projects.py | 4 +-- gptme/eval/types.py | 8 ++--- gptme/llm.py | 5 +-- gptme/tools/python.py | 49 +++++++++++++++++++----------- gptme/tools/save.py | 21 ++++++++----- gptme/tools/subagent.py | 4 +-- tests/test_cli.py | 3 +- tests/test_eval.py | 4 ++- 14 files changed, 108 insertions(+), 84 deletions(-) diff --git a/Makefile b/Makefile index f7c6c94e..a1230f2d 100644 --- a/Makefile +++ b/Makefile @@ -22,7 +22,7 @@ test: @# if SLOW is not set, pass `-m "not slow"` to skip slow tests poetry run pytest ${SRCDIRS} -v --log-level INFO --durations=5 \ --cov=gptme --cov-report=xml --cov-report=term-missing --cov-report=html --junitxml=junit.xml \ - -n 8 \ + -n 16 \ $(if $(EVAL), , -m "not eval") \ $(if $(SLOW), --timeout 60 --retries 2 --retry-delay 5, --timeout 5 -m "not slow and not eval") \ $(if $(PROFILE), --profile-svg) diff --git a/gptme/eval/main.py b/gptme/eval/main.py index 8f94b00d..10b1e1e9 100644 --- a/gptme/eval/main.py +++ b/gptme/eval/main.py @@ -20,7 +20,7 @@ from ..message import len_tokens from .run import run_evals from .suites import suites, tests_default, tests_map -from .types import CaseResult, ExecResult, ExecTest +from .types import CaseResult, EvalResult, EvalSpec # Configure logging, including fully-qualified module names logging.basicConfig( @@ -34,7 +34,7 @@ project_dir = Path(__file__).parent.parent.parent -def print_model_results(model_results: dict[str, list[ExecResult]]): +def print_model_results(model_results: dict[str, list[EvalResult]]): total_tests = 0 total_tokens = 0 @@ -70,7 +70,7 @@ def print_model_results(model_results: dict[str, list[ExecResult]]): print(f"Completed {total_tests} tests in {total_tokens}tok") -def print_model_results_table(model_results: dict[str, list[ExecResult]]): +def print_model_results_table(model_results: dict[str, list[EvalResult]]): test_names = { result.name for results in model_results.values() for result in results } @@ -120,19 +120,23 @@ def main( ): """ Run evals for gptme. + Pass eval or suite names to run, or result files to print. - Pass test names to run, or result files to print. + Output from evals will be captured, unless a single eval is run, and saved to the results directory. 
""" # init multiprocessing_logging.install_mp_handler() models = _model or [ "openai/gpt-4o", + "openai/gpt-4o-mini", "anthropic/claude-3-5-sonnet-20240620", + "anthropic/claude-3-haiku-20240307", "openrouter/meta-llama/llama-3.1-405b-instruct", ] results_files = [f for f in eval_names_or_result_files if f.endswith(".csv")] + eval_names = [f for f in eval_names_or_result_files if f not in results_files] if results_files: for results_file in results_files: p = Path(results_file) @@ -148,20 +152,20 @@ def main( sys.exit(1) sys.exit(0) - tests_to_run: list[ExecTest] = [] - for test_name in eval_names_or_result_files: - if test_name in tests_map: - tests_to_run.append(tests_map[test_name]) - elif test_name in suites: - tests_to_run.extend(suites[test_name]) + evals_to_run: list[EvalSpec] = [] + for eval_name in eval_names: + if test := tests_map.get(eval_name): + evals_to_run.append(test) + elif suite := suites.get(eval_name) or suites.get(eval_name.replace("-", "_")): + evals_to_run.extend(suite) else: - raise ValueError(f"Test {test_name} not found") + raise ValueError(f"Test {eval_name} not found") - if not tests_to_run: - tests_to_run = tests_default + if not evals_to_run: + evals_to_run = tests_default print("=== Running evals ===") - model_results = run_evals(tests_to_run, models, timeout, parallel) + model_results = run_evals(evals_to_run, models, timeout, parallel) print("\n=== Finished ===\n") print("\n=== Model Results ===") @@ -211,7 +215,7 @@ def read_log_file(file_path: Path) -> str: return "" -def read_results_from_csv(filename: str) -> dict[str, list[ExecResult]]: +def read_results_from_csv(filename: str) -> dict[str, list[EvalResult]]: model_results = defaultdict(list) results_dir = Path(filename).parent with open(filename, newline="") as csvfile: @@ -220,7 +224,7 @@ def read_results_from_csv(filename: str) -> dict[str, list[ExecResult]]: model = row["Model"] test_dir = results_dir / model / row["Test"] - result = ExecResult( + result = EvalResult( name=row["Test"], status="success" if row["Passed"] == "true" else "error", results=list(_read_case_results(test_dir / "cases.csv")), @@ -238,7 +242,7 @@ def read_results_from_csv(filename: str) -> dict[str, list[ExecResult]]: return dict(model_results) -def write_results(model_results: dict[str, list[ExecResult]]): +def write_results(model_results: dict[str, list[EvalResult]]): timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") # get current commit hash and dirty status, like: a8b2ef0-dirty # TODO: don't assume we are in the gptme repo, use other version identifiers if available diff --git a/gptme/eval/run.py b/gptme/eval/run.py index 30f8e24c..200c4961 100644 --- a/gptme/eval/run.py +++ b/gptme/eval/run.py @@ -19,8 +19,8 @@ from .execenv import SimpleExecutionEnv from .types import ( CaseResult, - ExecResult, - ExecTest, + EvalResult, + EvalSpec, ResultContext, Status, ) @@ -52,8 +52,8 @@ class SyncedDict(TypedDict): def run_evals( - tests: list[ExecTest], models: list[str], timeout: int, parallel: int -) -> dict[str, list[ExecResult]]: + evals: list[EvalSpec], models: list[str], timeout: int, parallel: int +) -> dict[str, list[EvalResult]]: """ Run evals for a list of tests. 
""" @@ -67,14 +67,14 @@ def run_evals( else: cleanup_on_sigterm() - n_runs = len(tests) * len(models) - model_results: dict[str, dict[str, ExecResult]] = defaultdict(dict) + n_runs = len(evals) * len(models) + model_results: dict[str, dict[str, EvalResult]] = defaultdict(dict) parallel = min(n_runs, parallel) with ProcessPoolExecutor(parallel) as executor: futures = [] future_to_model_test = {} for model in models: - for test in tests: + for test in evals: future = executor.submit( execute, test, @@ -103,7 +103,7 @@ def _handle_future(future: Future): logger.exception( f"Test {test_name} for model {model} generated an exception when trying to get result" ) - result = ExecResult( + result = EvalResult( name=test_name, status=status, results=[], @@ -116,7 +116,7 @@ def _handle_future(future: Future): model_results[model][test_name] = result # worse-case run time, with some buffer to account for overhead - max_timeout = timeout * len(tests) / parallel + 10 + max_timeout = timeout * len(evals) / parallel + 10 completed = set() try: # TODO: can we do better than this? handle timeouts within futures instead? @@ -147,19 +147,19 @@ def _handle_future(future: Future): process.terminate() process.join() - model_results_final: dict[str, list[ExecResult]] = defaultdict(list) + model_results_final: dict[str, list[EvalResult]] = defaultdict(list) for model in model_results: # sort results by test order model_results_final[model] = sorted( model_results[model].values(), - key=lambda result: [test["name"] for test in tests].index(result.name), + key=lambda result: [test["name"] for test in evals].index(result.name), ) return model_results_final # TODO: rewrite to run in Docker? Would help with capturing output + process management. -def execute(test: ExecTest, agent: Agent, timeout: int, parallel: bool) -> ExecResult: +def execute(test: EvalSpec, agent: Agent, timeout: int, parallel: bool) -> EvalResult: """ Executes the code for a specific model with a timeout. 
""" @@ -206,7 +206,7 @@ def execute(test: ExecTest, agent: Agent, timeout: int, parallel: bool) -> ExecR gen_stderr = result.get("stderr", "") else: logger.error("No result in shared dictionary") - return ExecResult( + return EvalResult( name=test["name"], status="error", results=[], @@ -256,7 +256,7 @@ def execute(test: ExecTest, agent: Agent, timeout: int, parallel: bool) -> ExecR results = [] stdout_run, stderr_run = "", "" - return ExecResult( + return EvalResult( name=test["name"], status=status, results=results, diff --git a/gptme/eval/suites/__init__.py b/gptme/eval/suites/__init__.py index 4b34e8a1..a3ce9896 100644 --- a/gptme/eval/suites/__init__.py +++ b/gptme/eval/suites/__init__.py @@ -1,16 +1,16 @@ -from ..types import ExecTest +from ..types import EvalSpec from .basic import tests as tests_basic from .browser import tests as tests_browser from .init_projects import tests as tests_init_projects -suites: dict[str, list[ExecTest]] = { +suites: dict[str, list[EvalSpec]] = { "basic": tests_basic, "init_projects": tests_init_projects, "browser": tests_browser, } -tests: list[ExecTest] = [test for suite in suites.values() for test in suite] -tests_map: dict[str, ExecTest] = {test["name"]: test for test in tests} +tests: list[EvalSpec] = [test for suite in suites.values() for test in suite] +tests_map: dict[str, EvalSpec] = {test["name"]: test for test in tests} tests_default_ids: list[str] = [ "hello", @@ -19,4 +19,4 @@ "prime100", "init-git", ] -tests_default: list[ExecTest] = [tests_map[test_id] for test_id in tests_default_ids] +tests_default: list[EvalSpec] = [tests_map[test_id] for test_id in tests_default_ids] diff --git a/gptme/eval/suites/basic.py b/gptme/eval/suites/basic.py index ae6c4538..5c7920d2 100644 --- a/gptme/eval/suites/basic.py +++ b/gptme/eval/suites/basic.py @@ -1,7 +1,7 @@ from typing import TYPE_CHECKING if TYPE_CHECKING: - from gptme.eval.main import ExecTest + from gptme.eval.main import EvalSpec def correct_output_hello_world(ctx): @@ -28,12 +28,12 @@ def check_output_hello_ask(ctx): return "Hello, Erik!" in ctx.stdout -tests: list["ExecTest"] = [ +tests: list["EvalSpec"] = [ { "name": "hello", "files": {}, "run": "python hello.py", - "prompt": "write a script hello.py which prints 'Hello, world!'", + "prompt": 'write a script hello.py which prints "Hello, world!"', "expect": { "correct output": correct_output_hello_world, "correct file": check_exists_hello, @@ -41,9 +41,9 @@ def check_output_hello_ask(ctx): }, { "name": "hello-patch", - "files": {"hello.py": "print('Hello, world!')"}, + "files": {"hello.py": 'print("Hello, world!")'}, "run": "python hello.py", - "prompt": "Patch the code in hello.py to print 'Hello, human!'", + "prompt": 'Patch the code in hello.py to print "Hello, human!"', "expect": { "correct output": correct_output_hello_human, "correct file": check_exists_hello, @@ -51,7 +51,7 @@ def check_output_hello_ask(ctx): }, { "name": "hello-ask", - "files": {"hello.py": "print('Hello, world!')"}, + "files": {"hello.py": 'print("Hello, world!")'}, "run": "echo 'Erik' | python hello.py", # TODO: work around the "don't try to execute it" part by improving gptme such that it just gives EOF to stdin in non-interactive mode "prompt": "modify hello.py to ask the user for their name and print 'Hello, !'. 
don't try to execute it", diff --git a/gptme/eval/suites/browser.py b/gptme/eval/suites/browser.py index e3bbb17e..8721c3cb 100644 --- a/gptme/eval/suites/browser.py +++ b/gptme/eval/suites/browser.py @@ -1,14 +1,14 @@ from typing import TYPE_CHECKING if TYPE_CHECKING: - from gptme.eval.main import ExecTest + from gptme.eval.main import EvalSpec def check_output_erik(ctx): return "Erik" in ctx.stdout -tests: list["ExecTest"] = [ +tests: list["EvalSpec"] = [ { "name": "whois-superuserlabs-ceo", "files": {}, diff --git a/gptme/eval/suites/init_projects.py b/gptme/eval/suites/init_projects.py index 80c9773a..bf7a921d 100644 --- a/gptme/eval/suites/init_projects.py +++ b/gptme/eval/suites/init_projects.py @@ -1,7 +1,7 @@ from typing import TYPE_CHECKING if TYPE_CHECKING: - from gptme.eval.main import ExecTest + from gptme.eval.main import EvalSpec def check_clean_exit(ctx): @@ -41,7 +41,7 @@ def check_exists_main(ctx): return "main.py" in ctx.files -tests: list["ExecTest"] = [ +tests: list["EvalSpec"] = [ { "name": "init-git", "files": {}, diff --git a/gptme/eval/types.py b/gptme/eval/types.py index 0ced4800..a9395537 100644 --- a/gptme/eval/types.py +++ b/gptme/eval/types.py @@ -31,9 +31,9 @@ class CaseResult: @dataclass -class ExecResult: +class EvalResult: """ - Result of executing a prompt. + Result of executing an eval. """ name: str @@ -46,9 +46,9 @@ class ExecResult: run_stderr: str -class ExecTest(TypedDict): +class EvalSpec(TypedDict): """ - Test case for executing a prompt. + Specification for an eval/test case. """ name: str diff --git a/gptme/llm.py b/gptme/llm.py index 784eb931..e0d4ef5d 100644 --- a/gptme/llm.py +++ b/gptme/llm.py @@ -92,10 +92,11 @@ def print_clear(): # need to flush stdout to get the print to show up sys.stdout.flush() + # TODO: make this more robust/general, maybe with a callback that runs on each char/chunk # pause inference on finished code-block, letting user run the command before continuing tooluses = list(ToolUse.iter_from_content(output)) - if tooluses: - logger.debug("Found tool use, breaking") + if tooluses and any(tooluse.is_runnable for tooluse in tooluses): + logger.warning("Found tool use, breaking") break except KeyboardInterrupt: return Message("assistant", output + "... ^C Interrupted") diff --git a/gptme/tools/python.py b/gptme/tools/python.py index 8236ac61..f8cdb4ea 100644 --- a/gptme/tools/python.py +++ b/gptme/tools/python.py @@ -151,32 +151,45 @@ def get_installed_python_libraries() -> set[str]: return installed +instructions = """ +To execute Python code in an interactive IPython session, send a codeblock using the `ipython` language tag. +It will respond with the output and result of the execution. +Assistant may first write the code in a normal python codeblock, then execute it in an IPython codeblock. +""" + + examples = f""" #### Results of the last expression will be displayed, IPython-style: -User: What is 2 + 2? -Assistant: -{ToolUse("python", [], "2 + 2").to_output()} -System: Executed code block. -```stdout +> User: What is 2 + 2? +> Assistant: +{ToolUse("ipython", [], "2 + 2").to_output()} +> System: Executed code block. +```result 4 ``` -#### The user can also run Python code with the /python command: - -User: /python 2 + 2 -System: Executed code block. -```stdout -4 +#### It can write an example and then execute it: +> User: compute fib 10 +> Assistant: To compute the 10th Fibonacci number, we write a recursive function: +```python +def fib(n): + ... 
+``` +Now, let's execute this code to get the 10th Fibonacci number: +{ToolUse("ipython", [], ''' +def fib(n): + if n <= 1: + return n + return fib(n - 1) + fib(n - 2) +fib(10) +''').to_output()} +> System: Executed code block. +```result +55 ``` """.strip() -instructions = """ -When you send a message containing Python code (and is not a file block), it will be executed in a stateful environment. -Python will respond with the output of the execution. -""" - - # only used for doc generation, use get_tool() in the code tool = ToolSpec( name="python", @@ -185,7 +198,7 @@ def get_installed_python_libraries() -> set[str]: examples=examples, execute=execute_python, block_types=[ - "python", + # "python", "ipython", "py", ], diff --git a/gptme/tools/save.py b/gptme/tools/save.py index e9489641..3e6a7d66 100644 --- a/gptme/tools/save.py +++ b/gptme/tools/save.py @@ -1,5 +1,5 @@ """ -Gives the assistant the ability to save code to a file. +Gives the assistant the ability to save/write code to a file. """ from collections.abc import Generator @@ -9,13 +9,18 @@ from ..util import ask_execute from .base import ToolSpec, ToolUse +# FIXME: this is markdown-specific instructions, thus will confuse the XML mode instructions = """ -To save code to a file, use a code block with the filepath as the language. +To write text to a file, use a code block with the language tag set to the path of the file. """.strip() examples = f""" -> User: write a Hello world script to hello.py -{ToolUse("save", ["hello.py"], "print('Hello world')").to_output()} +> User: write a hello world script to hello.py +{ToolUse("save", ["hello.py"], 'print("Hello world")').to_output()} +> System: Saved to `hello.py` + +> User: make it all-caps +{ToolUse("save", ["hello.py"], 'print("HELLO WORLD")').to_output()} > System: Saved to `hello.py` """.strip() @@ -119,7 +124,7 @@ def execute_append( tool_save = ToolSpec( name="save", - desc="Save code to a file", + desc="Write text to file", instructions=instructions, examples=examples, execute=execute_save, @@ -128,19 +133,19 @@ def execute_append( __doc__ = tool_save.get_doc(__doc__) instructions_append = """ -To append code to a file, use a code block with the language: append +To append text to a file, use a code block with the language: append """.strip() examples_append = f""" > User: append a print "Hello world" to hello.py > Assistant: -{ToolUse("append", ["hello.py"], "print('Hello world')").to_output()} +{ToolUse("append", ["hello.py"], 'print("Hello world")').to_output()} > System: Appended to `hello.py` """.strip() tool_append = ToolSpec( name="append", - desc="Append code to a file", + desc="Append text to file", instructions=instructions_append, examples=examples_append, execute=execute_append, diff --git a/gptme/tools/subagent.py b/gptme/tools/subagent.py index f9f2b9c4..e0d2a4cd 100644 --- a/gptme/tools/subagent.py +++ b/gptme/tools/subagent.py @@ -155,10 +155,10 @@ def subagent_wait(agent_id: str) -> dict: examples = f""" User: compute fib 69 using a subagent Assistant: Starting a subagent to compute the 69th Fibonacci number. -{ToolUse("python", [], 'subagent("compute the 69th Fibonacci number", "fib-69")').to_output()} +{ToolUse("ipython", [], 'subagent("compute the 69th Fibonacci number", "fib-69")').to_output()} System: Subagent started successfully. Assistant: Now we need to wait for the subagent to finish the task. 
-{ToolUse("python", [], 'subagent_wait("fib-69")').to_output()} +{ToolUse("ipython", [], 'subagent_wait("fib-69")').to_output()} """ diff --git a/tests/test_cli.py b/tests/test_cli.py index eccd3554..e38ac302 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -225,10 +225,9 @@ def test_block(args: list[str], lang: str, runner: CliRunner): assert result.exit_code == 0 -# TODO: these could be fast if we had a cache @pytest.mark.slow def test_generate_primes(args: list[str], runner: CliRunner): - args.append("print the first 10 prime numbers") + args.append("compute the first 10 prime numbers") result = runner.invoke(gptme.cli.main, args) # check that the 9th and 10th prime is present assert "23" in result.output diff --git a/tests/test_eval.py b/tests/test_eval.py index 1b975c79..8fd59f9b 100644 --- a/tests/test_eval.py +++ b/tests/test_eval.py @@ -49,13 +49,15 @@ def test_eval(test): provider = _detect_model() agent = GPTMe(provider) result = execute(test, agent, timeout=30, parallel=False) + assert result.results assert all(case.passed for case in result.results) # Hook to generate tests from the tests list def pytest_generate_tests(metafunc): if "test" in metafunc.fixturenames: - allowlist = ["hello"] # for now, only run the hello test + # for now, only run the hello-patch test (the "hello" test is unreliable with gpt-4o-mini) + allowlist = ["hello-patch"] test_set, test_names = zip( *[(test, test["name"]) for test in tests if test["name"] in allowlist] )
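
Note (illustrative, not part of the patch): a minimal sketch of how an eval case is declared with the renamed `EvalSpec` type, mirroring the `hello` entry from `gptme/eval/suites/basic.py` in this diff. The lambda checks are stand-ins for the named check functions used there; field names follow the suite definitions shown above.

```python
# Illustrative sketch only, not part of the patch.
# Mirrors the "hello" entry from gptme/eval/suites/basic.py, using the
# renamed EvalSpec type introduced by this change (previously ExecTest).
from gptme.eval.types import EvalSpec

hello: EvalSpec = {
    "name": "hello",
    "files": {},
    "run": "python hello.py",
    "prompt": 'write a script hello.py which prints "Hello, world!"',
    "expect": {
        # stand-ins for correct_output_hello_world / check_exists_hello
        "correct output": lambda ctx: "Hello, world!" in ctx.stdout,
        "correct file": lambda ctx: "hello.py" in ctx.files,
    },
}
```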