feat: major improvements to evals, can now aggregate eval_results and run rust+browser+git in Docker (passes evals)
ErikBjare committed Sep 27, 2024
1 parent b0c2ae3 commit bfd1b25
Showing 4 changed files with 101 additions and 26 deletions.
32 changes: 26 additions & 6 deletions Dockerfile.eval
@@ -1,24 +1,44 @@
# Use the main Dockerfile as the base image
FROM gptme:latest AS base

# Set build argument for browser
ARG BROWSER=no
SHELL ["/bin/bash", "-c"]

# Switch back to gptme directory (not /workspace)
WORKDIR /app

# Install rust if enabled
ARG RUST=no
USER appuser
RUN if [ "$RUST" = "yes" ]; then \
curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y --profile minimal; \
source $HOME/.cargo/env; rustup default stable; \
fi
ENV PATH="/home/appuser/.cargo/bin:${PATH}"

# Switch back to root for installations
# Install necessary build tools if rust is enabled
USER root
RUN if [ "$RUST" = "yes" ]; then \
apt-get update && apt-get install build-essential -y; \
fi

# Install playwright if browser is enabled
ARG BROWSER=no
USER root
RUN if [ "$BROWSER" = "yes" ]; then \
poetry run playwright install chromium; \
poetry install -E browser --without=dev; \
poetry run playwright install-deps; \
su appuser -c "poetry run playwright install chromium"; \
fi

WORKDIR /app

# Create eval_results directory
RUN mkdir ./eval_results; chown appuser:appuser ./eval_results

# Switch back to non-root user
USER appuser

RUN git config --global user.name "gptme"
RUN git config --global user.email "[email protected]"
RUN git config --global init.defaultBranch main

# Add an entry point for running evals
ENTRYPOINT ["poetry", "run", "python", "-m", "gptme.eval"]
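
For context, a minimal sketch of how the eval image might be built and invoked with these changes in place; the volume mount and the choice to persist results on the host are illustrative assumptions, while the build args, the /app/eval_results directory, and the ENTRYPOINT come directly from the Dockerfile above.

# Build the eval image with both optional toolchains enabled (requires the
# base gptme:latest image from the main Dockerfile).
docker build . -t gptme-eval:latest -f Dockerfile.eval \
    --build-arg RUST=yes --build-arg BROWSER=yes

# Arguments after the image name are forwarded to the ENTRYPOINT
# ("poetry run python -m gptme.eval"); mounting eval_results keeps the
# generated CSVs on the host.
docker run --rm \
    -v "$(pwd)/eval_results:/app/eval_results" \
    gptme-eval:latest
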
3 changes: 3 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
@@ -18,6 +18,9 @@ build-docker:
	docker build . -t gptme:latest -f Dockerfile
	docker build . -t gptme-eval:latest -f Dockerfile.eval

build-docker-full:
	docker build . -t gptme-eval:latest -f Dockerfile.eval --build-arg RUST=yes --build-arg BROWSER=yes

test:
	@# if SLOW is not set, pass `-m "not slow"` to skip slow tests
	poetry run pytest ${SRCDIRS} -v --log-level INFO --durations=5 \
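As a quick sanity check, one might build the full image via the new target and confirm that the optional toolchains actually landed in it. Overriding the entrypoint with bash is plain Docker usage; the specific version probes are only illustrative.

# Build with RUST=yes and BROWSER=yes via the new Makefile target,
# then spot-check the toolchains inside the image.
make build-docker-full
docker run --rm --entrypoint bash gptme-eval:latest -c "rustc --version"
docker run --rm --entrypoint bash gptme-eval:latest -c "poetry run playwright --version"
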
90 changes: 71 additions & 19 deletions gptme/eval/main.py
@@ -10,7 +10,7 @@
import sys
from collections import defaultdict
from collections.abc import Generator
from datetime import datetime
from datetime import datetime, timezone
from pathlib import Path

import click
@@ -101,6 +101,64 @@ def print_model_results_table(model_results: dict[str, list[EvalResult]]):
    print(tabulate(table_data, headers=headers))


def aggregate_and_display_results(result_files: list[str]):
    """Aggregate results across multiple result CSVs and print a comparison table."""
    all_results: dict[str, dict[str, dict]] = {}
    for file in result_files:
        for model, model_results in read_results_from_csv(file).items():
            if model not in all_results:
                all_results[model] = {}
            for result in model_results:
                if result.name not in all_results[model]:
                    all_results[model][result.name] = {
                        "total": 0,
                        "passed": 0,
                        "tokens": 0,
                    }
                all_results[model][result.name]["total"] += 1
                all_results[model][result.name]["tokens"] += len_tokens(
                    result.gen_stdout
                ) + len_tokens(result.run_stdout)
                if result.status == "success" and all(
                    case.passed for case in result.results
                ):
                    all_results[model][result.name]["passed"] += 1

    # Prepare table data
    headers = ["Model"] + list(
        set(
            test
            for model_results in all_results.values()
            for test in model_results.keys()
        )
    )
    table_data = []

    def get_status_emoji(passed, total):
        percentage = (passed / total) * 100
        if percentage == 100:
            return "✅"
        elif percentage >= 20:  # partial pass
            return "🔶"
        else:
            return "❌"

    for model, results in all_results.items():
        row = [model]
        for test in headers[1:]:
            if test in results:
                passed = results[test]["passed"]
                total = results[test]["total"]
                tokens = results[test]["tokens"]
                status_emoji = get_status_emoji(passed, total)
                row.append(f"{status_emoji} {passed}/{total} {tokens}tok")
            else:
                row.append("❌ N/A")
        table_data.append(row)

    # Print the table
    print(tabulate(table_data, headers=headers))


@click.command()
@click.argument("eval_names_or_result_files", nargs=-1)
@click.option(
@@ -137,19 +195,13 @@ def main(

    results_files = [f for f in eval_names_or_result_files if f.endswith(".csv")]
    eval_names = [f for f in eval_names_or_result_files if f not in results_files]
    if results_files:
        for results_file in results_files:
            p = Path(results_file)
            if p.exists():
                results = read_results_from_csv(str(p))
                print(f"\n{results_file}")
                print(f"{'=' * len(results_file)}")
                print_model_results(results)
                print("\n=== Model Comparison ===")
                print_model_results_table(results)
            else:
                print(f"Error: File {results_file} not found")
                sys.exit(1)
    if len(results_files) >= 2:
        aggregate_and_display_results(results_files)
        sys.exit(0)
    elif results_files:
        model_results = read_results_from_csv(results_files[0])
        print_model_results(model_results)
        print_model_results_table(model_results)
        sys.exit(0)

    evals_to_run: list[EvalSpec] = []
@@ -243,7 +295,7 @@ def read_results_from_csv(filename: str) -> dict[str, list[EvalResult]]:


def write_results(model_results: dict[str, list[EvalResult]]):
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    timestamp = datetime.now(tz=timezone.utc).strftime("%Y%m%d_%H%M%SZ")
    # get current commit hash and dirty status, like: a8b2ef0-dirty
    # TODO: don't assume we are in the gptme repo, use other version identifiers if available
    commit_hash = subprocess.run(
@@ -296,10 +348,10 @@ def write_results(model_results: dict[str, list[EvalResult]]):
"Model": model,
"Test": result.name,
"Passed": "true" if passed else "false",
"Total Duration": sum(result.timings.values()),
"Generation Time": result.timings["gen"],
"Run Time": result.timings["run"],
"Eval Time": result.timings["eval"],
"Total Duration": round(sum(result.timings.values()), 2),
"Generation Time": round(result.timings["gen"], 2),
"Run Time": round(result.timings["run"], 2),
"Eval Time": round(result.timings["eval"], 2),
"Commit Hash": commit_hash,
"Gen Stdout File": (test_dir_rel / "gen_stdout.txt"),
"Gen Stderr File": (test_dir_rel / "gen_stderr.txt"),
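Putting the new CLI behavior together: a single .csv argument prints per-model results plus the comparison table, while two or more .csv arguments now take the aggregation path above. A rough sketch, assuming results were written to eval_results/ and using hypothetical file names:

# Summarize one run:
poetry run python -m gptme.eval eval_results/eval_results_A.csv

# Aggregate two or more runs into one pass/total + token-count table per model and test:
poetry run python -m gptme.eval eval_results/eval_results_A.csv eval_results/eval_results_B.csv
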
2 changes: 1 addition & 1 deletion gptme/eval/run.py
@@ -147,7 +147,7 @@ def _handle_future(future: Future):
    process.join()

    model_results_final: dict[str, list[EvalResult]] = defaultdict(list)
    for model in model_results:
    for model in sorted(model_results):
        # sort results by test order
        model_results_final[model] = sorted(
            model_results[model].values(),
