feat: major improvements to evals, can now aggregate eval_results and run rust+browser+git in Docker (passes evals)
ErikBjare committed Sep 27, 2024
1 parent b0c2ae3 commit bfd1b25
Showing 4 changed files with 101 additions and 26 deletions.
32 changes: 26 additions & 6 deletions Dockerfile.eval
@@ -1,24 +1,44 @@
# Use the main Dockerfile as the base image
FROM gptme:latest AS base

# Set build argument for browser
ARG BROWSER=no
SHELL ["/bin/bash", "-c"]

# Switch back to gptme directory (not /workspace)
WORKDIR /app

# Install rust if enabled
ARG RUST=no
USER appuser
RUN if [ "$RUST" = "yes" ]; then \
curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y --profile minimal; \
source $HOME/.cargo/env; rustup default stable; \
fi
ENV PATH="/home/appuser/.cargo/bin:${PATH}"

# Switch back to root for installations
# Install necessary build tools if rust is enabled
USER root
RUN if [ "$RUST" = "yes" ]; then \
apt-get update && apt-get install build-essential -y; \
fi

# Install playwright if browser is enabled
ARG BROWSER=no
USER root
RUN if [ "$BROWSER" = "yes" ]; then \
poetry run playwright install chromium; \
poetry install -E browser --without=dev; \
poetry run playwright install-deps; \
su appuser -c "poetry run playwright install chromium"; \
fi

WORKDIR /app

# Create eval_results directory
RUN mkdir ./eval_results; chown appuser:appuser ./eval_results

# Switch back to non-root user
USER appuser

RUN git config --global user.name "gptme"
RUN git config --global user.email "[email protected]"
RUN git config --global init.defaultBranch main

# Add an entry point for running evals
ENTRYPOINT ["poetry", "run", "python", "-m", "gptme.eval"]
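
For context, a minimal sketch of how the eval image might be built and invoked with these changes in place; the volume mount and the choice to persist results on the host are illustrative assumptions, while the build args, the /app/eval_results directory, and the ENTRYPOINT come directly from the Dockerfile above.

# Build the eval image with both optional toolchains enabled (requires the
# base gptme:latest image from the main Dockerfile).
docker build . -t gptme-eval:latest -f Dockerfile.eval \
    --build-arg RUST=yes --build-arg BROWSER=yes

# Arguments after the image name are forwarded to the ENTRYPOINT
# ("poetry run python -m gptme.eval"); mounting eval_results keeps the
# generated CSVs on the host.
docker run --rm \
    -v "$(pwd)/eval_results:/app/eval_results" \
    gptme-eval:latest
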
3 changes: 3 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
@@ -18,6 +18,9 @@ build-docker:
	docker build . -t gptme:latest -f Dockerfile
	docker build . -t gptme-eval:latest -f Dockerfile.eval

build-docker-full:
	docker build . -t gptme-eval:latest -f Dockerfile.eval --build-arg RUST=yes --build-arg BROWSER=yes

test:
	@# if SLOW is not set, pass `-m "not slow"` to skip slow tests
	poetry run pytest ${SRCDIRS} -v --log-level INFO --durations=5 \
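As a quick sanity check, one might build the full image via the new target and confirm that the optional toolchains actually landed in it. Overriding the entrypoint with bash is plain Docker usage; the specific version probes are only illustrative.

# Build with RUST=yes and BROWSER=yes via the new Makefile target,
# then spot-check the toolchains inside the image.
make build-docker-full
docker run --rm --entrypoint bash gptme-eval:latest -c "rustc --version"
docker run --rm --entrypoint bash gptme-eval:latest -c "poetry run playwright --version"
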
90 changes: 71 additions & 19 deletions gptme/eval/main.py
@@ -10,7 +10,7 @@
import sys
from collections import defaultdict
from collections.abc import Generator
from datetime import datetime
from datetime import datetime, timezone
from pathlib import Path

import click
@@ -101,6 +101,64 @@ def print_model_results_table(model_results: dict[str, list[EvalResult]]):
    print(tabulate(table_data, headers=headers))


def aggregate_and_display_results(result_files: list[str]):
    """Aggregate results across multiple result CSVs and print a comparison table."""
    all_results: dict[str, dict[str, dict]] = {}
    for file in result_files:
        for model, model_results in read_results_from_csv(file).items():
            if model not in all_results:
                all_results[model] = {}
            for result in model_results:
                if result.name not in all_results[model]:
                    all_results[model][result.name] = {
                        "total": 0,
                        "passed": 0,
                        "tokens": 0,
                    }
                all_results[model][result.name]["total"] += 1
                all_results[model][result.name]["tokens"] += len_tokens(
                    result.gen_stdout
                ) + len_tokens(result.run_stdout)
                if result.status == "success" and all(
                    case.passed for case in result.results
                ):
                    all_results[model][result.name]["passed"] += 1

    # Prepare table data
    headers = ["Model"] + list(
        set(
            test
            for model_results in all_results.values()
            for test in model_results.keys()
        )
    )
    table_data = []

    def get_status_emoji(passed, total):
        percentage = (passed / total) * 100
        if percentage == 100:
            return "✅"
        elif percentage >= 20:  # partial pass
            return "🔶"
        else:
            return "❌"

    for model, results in all_results.items():
        row = [model]
        for test in headers[1:]:
            if test in results:
                passed = results[test]["passed"]
                total = results[test]["total"]
                tokens = results[test]["tokens"]
                status_emoji = get_status_emoji(passed, total)
                row.append(f"{status_emoji} {passed}/{total} {tokens}tok")
            else:
                row.append("❌ N/A")
        table_data.append(row)

    # Print the table
    print(tabulate(table_data, headers=headers))


@click.command()
@click.argument("eval_names_or_result_files", nargs=-1)
@click.option(
@@ -137,19 +195,13 @@ def main(

    results_files = [f for f in eval_names_or_result_files if f.endswith(".csv")]
    eval_names = [f for f in eval_names_or_result_files if f not in results_files]
    if results_files:
        for results_file in results_files:
            p = Path(results_file)
            if p.exists():
                results = read_results_from_csv(str(p))
                print(f"\n{results_file}")
                print(f"{'=' * len(results_file)}")
                print_model_results(results)
                print("\n=== Model Comparison ===")
                print_model_results_table(results)
            else:
                print(f"Error: File {results_file} not found")
                sys.exit(1)
    if len(results_files) >= 2:
        aggregate_and_display_results(results_files)
        sys.exit(0)
    elif results_files:
        model_results = read_results_from_csv(results_files[0])
        print_model_results(model_results)
        print_model_results_table(model_results)
        sys.exit(0)

    evals_to_run: list[EvalSpec] = []
@@ -243,7 +295,7 @@ def read_results_from_csv(filename: str) -> dict[str, list[EvalResult]]:


def write_results(model_results: dict[str, list[EvalResult]]):
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    timestamp = datetime.now(tz=timezone.utc).strftime("%Y%m%d_%H%M%SZ")
    # get current commit hash and dirty status, like: a8b2ef0-dirty
    # TODO: don't assume we are in the gptme repo, use other version identifiers if available
    commit_hash = subprocess.run(
@@ -296,10 +348,10 @@ def write_results(model_results: dict[str, list[EvalResult]]):
"Model": model,
"Test": result.name,
"Passed": "true" if passed else "false",
"Total Duration": sum(result.timings.values()),
"Generation Time": result.timings["gen"],
"Run Time": result.timings["run"],
"Eval Time": result.timings["eval"],
"Total Duration": round(sum(result.timings.values()), 2),
"Generation Time": round(result.timings["gen"], 2),
"Run Time": round(result.timings["run"], 2),
"Eval Time": round(result.timings["eval"], 2),
"Commit Hash": commit_hash,
"Gen Stdout File": (test_dir_rel / "gen_stdout.txt"),
"Gen Stderr File": (test_dir_rel / "gen_stderr.txt"),
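Putting the new CLI behavior together: a single .csv argument prints per-model results plus the comparison table, while two or more .csv arguments now take the aggregation path above. A rough sketch, assuming results were written to eval_results/ and using hypothetical file names:

# Summarize one run:
poetry run python -m gptme.eval eval_results/eval_results_A.csv

# Aggregate two or more runs into one pass/total + token-count table per model and test:
poetry run python -m gptme.eval eval_results/eval_results_A.csv eval_results/eval_results_B.csv
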
2 changes: 1 addition & 1 deletion gptme/eval/run.py
@@ -147,7 +147,7 @@ def _handle_future(future: Future):
    process.join()

    model_results_final: dict[str, list[EvalResult]] = defaultdict(list)
    for model in model_results:
    for model in sorted(model_results):
        # sort results by test order
        model_results_final[model] = sorted(
            model_results[model].values(),
