From bfd1b25c9d178b15b2851c44283fec4f27fb0c8f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Erik=20Bj=C3=A4reholt?= Date: Fri, 27 Sep 2024 18:42:04 +0200 Subject: [PATCH] feat: major improvements to evals, can now aggregate eval_results and run rust+browser+git in Docker (passes evals) --- Dockerfile.eval | 32 +++++++++++++---- Makefile | 3 ++ gptme/eval/main.py | 90 ++++++++++++++++++++++++++++++++++++---------- gptme/eval/run.py | 2 +- 4 files changed, 101 insertions(+), 26 deletions(-) diff --git a/Dockerfile.eval b/Dockerfile.eval index a3ec8d8a..1977cf5f 100644 --- a/Dockerfile.eval +++ b/Dockerfile.eval @@ -1,24 +1,44 @@ # Use the main Dockerfile as the base image FROM gptme:latest AS base -# Set build argument for browser -ARG BROWSER=no +SHELL ["/bin/bash", "-c"] + +# Switch back to gptme directory (not /workspace) +WORKDIR /app + +# Install rust if enabled +ARG RUST=no +USER appuser +RUN if [ "$RUST" = "yes" ]; then \ + curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y --profile minimal; \ + source $HOME/.cargo/env; rustup default stable; \ + fi +ENV PATH="/home/appuser/.cargo/bin:${PATH}" -# Switch back to root for installations +# Install necessary build tools if rust is enabled USER root +RUN if [ "$RUST" = "yes" ]; then \ + apt-get update && apt-get install build-essential -y; \ + fi # Install playwright if browser is enabled +ARG BROWSER=no +USER root RUN if [ "$BROWSER" = "yes" ]; then \ - poetry run playwright install chromium; \ + poetry install -E browser --without=dev; \ + poetry run playwright install-deps; \ + su appuser -c "poetry run playwright install chromium"; \ fi -WORKDIR /app - # Create eval_results directory RUN mkdir ./eval_results; chown appuser:appuser ./eval_results # Switch back to non-root user USER appuser +RUN git config --global user.name "gptme" +RUN git config --global user.email "gptme@superuserlabs.org" +RUN git config --global init.defaultBranch main + # Add an entry point for running evals ENTRYPOINT 
["poetry", "run", "python", "-m", "gptme.eval"] diff --git a/Makefile b/Makefile index e6e35140..9bc737b0 100644 --- a/Makefile +++ b/Makefile @@ -18,6 +18,9 @@ build-docker: docker build . -t gptme:latest -f Dockerfile docker build . -t gptme-eval:latest -f Dockerfile.eval +build-docker-full: + docker build . -t gptme-eval:latest -f Dockerfile.eval --build-arg RUST=yes --build-arg BROWSER=yes + test: @# if SLOW is not set, pass `-m "not slow"` to skip slow tests poetry run pytest ${SRCDIRS} -v --log-level INFO --durations=5 \ diff --git a/gptme/eval/main.py b/gptme/eval/main.py index 10b1e1e9..919039aa 100644 --- a/gptme/eval/main.py +++ b/gptme/eval/main.py @@ -10,7 +10,7 @@ import sys from collections import defaultdict from collections.abc import Generator -from datetime import datetime +from datetime import datetime, timezone from pathlib import Path import click @@ -101,6 +101,64 @@ def print_model_results_table(model_results: dict[str, list[EvalResult]]): print(tabulate(table_data, headers=headers)) +def aggregate_and_display_results(result_files: list[str]): + all_results: dict[str, dict[str, dict]] = {} + for file in result_files: + for model, model_results in read_results_from_csv(file).items(): + if model not in all_results: + all_results[model] = {} + for result in model_results: + if result.name not in all_results[model]: + all_results[model][result.name] = { + "total": 0, + "passed": 0, + "tokens": 0, + } + all_results[model][result.name]["total"] += 1 + all_results[model][result.name]["tokens"] += len_tokens( + result.gen_stdout + ) + len_tokens(result.run_stdout) + if result.status == "success" and all( + case.passed for case in result.results + ): + all_results[model][result.name]["passed"] += 1 + + # Prepare table data + headers = ["Model"] + list( + set( + test + for model_results in all_results.values() + for test in model_results.keys() + ) + ) + table_data = [] + + def get_status_emoji(passed, total): + percentage = (passed / total) * 100 + if 
percentage == 100: + return "✅" + elif 20 <= percentage < 80: + return "🔶" + else: + return "❌" + + for model, results in all_results.items(): + row = [model] + for test in headers[1:]: + if test in results: + passed = results[test]["passed"] + total = results[test]["total"] + tokens = results[test]["tokens"] + status_emoji = get_status_emoji(passed, total) + row.append(f"{status_emoji} {passed}/{total} {tokens}tok") + else: + row.append("❌ N/A") + table_data.append(row) + + # Print the table + print(tabulate(table_data, headers=headers)) + + @click.command() @click.argument("eval_names_or_result_files", nargs=-1) @click.option( @@ -137,19 +195,13 @@ def main( results_files = [f for f in eval_names_or_result_files if f.endswith(".csv")] eval_names = [f for f in eval_names_or_result_files if f not in results_files] - if results_files: - for results_file in results_files: - p = Path(results_file) - if p.exists(): - results = read_results_from_csv(str(p)) - print(f"\n{results_file}") - print(f"{'=' * len(results_file)}") - print_model_results(results) - print("\n=== Model Comparison ===") - print_model_results_table(results) - else: - print(f"Error: File {results_file} not found") - sys.exit(1) + if len(results_files) >= 2: + aggregate_and_display_results(results_files) + sys.exit(0) + elif results_files: + model_results = read_results_from_csv(results_files[0]) + print_model_results(model_results) + print_model_results_table(model_results) sys.exit(0) evals_to_run: list[EvalSpec] = [] @@ -243,7 +295,7 @@ def read_results_from_csv(filename: str) -> dict[str, list[EvalResult]]: def write_results(model_results: dict[str, list[EvalResult]]): - timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") + timestamp = datetime.now(tz=timezone.utc).strftime("%Y%m%d_%H%M%SZ") # get current commit hash and dirty status, like: a8b2ef0-dirty # TODO: don't assume we are in the gptme repo, use other version identifiers if available commit_hash = subprocess.run( @@ -296,10 +348,10 @@ def 
write_results(model_results: dict[str, list[EvalResult]]): "Model": model, "Test": result.name, "Passed": "true" if passed else "false", - "Total Duration": sum(result.timings.values()), - "Generation Time": result.timings["gen"], - "Run Time": result.timings["run"], - "Eval Time": result.timings["eval"], + "Total Duration": round(sum(result.timings.values()), 2), + "Generation Time": round(result.timings["gen"], 2), + "Run Time": round(result.timings["run"], 2), + "Eval Time": round(result.timings["eval"], 2), "Commit Hash": commit_hash, "Gen Stdout File": (test_dir_rel / "gen_stdout.txt"), "Gen Stderr File": (test_dir_rel / "gen_stderr.txt"), diff --git a/gptme/eval/run.py b/gptme/eval/run.py index 9a6b3ab2..1a38630e 100644 --- a/gptme/eval/run.py +++ b/gptme/eval/run.py @@ -147,7 +147,7 @@ def _handle_future(future: Future): process.join() model_results_final: dict[str, list[EvalResult]] = defaultdict(list) - for model in model_results: + for model in sorted(model_results): # sort results by test order model_results_final[model] = sorted( model_results[model].values(),