fix: refactored evals, fixed leaked semaphore warnings, read logs and naively compute tokens from output
ErikBjare committed Sep 18, 2024
1 parent 8887ca8 commit 3bbd88b
Showing 3 changed files with 190 additions and 88 deletions.
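The "read logs and naively compute tokens from output" part of this commit works by loading each test's saved gen/run stdout logs and running `len_tokens` (imported from `gptme.message` in the diff below) over them. A minimal sketch of that kind of naive counting, assuming a tiktoken `cl100k_base` encoding; the real `len_tokens` may use a different encoding or signature:

    # Sketch only: naive token counting over captured output logs.
    # Assumes tiktoken's cl100k_base encoding; gptme.message.len_tokens may differ.
    import tiktoken

    _enc = tiktoken.get_encoding("cl100k_base")

    def naive_len_tokens(text: str) -> int:
        """Count tokens in a raw output string."""
        return len(_enc.encode(text))

    gen_stdout = "def hello():\n    print('hello world')\n"
    run_stdout = "hello world\n"
    print(f"{naive_len_tokens(gen_stdout) + naive_len_tokens(run_stdout)}tok")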
119 changes: 70 additions & 49 deletions gptme/eval/main.py
@@ -7,6 +7,7 @@
import csv
import logging
import multiprocessing
import multiprocessing.resource_tracker
import subprocess
import sys
from collections import defaultdict
@@ -17,6 +18,7 @@
import multiprocessing_logging
from tabulate import tabulate

from ..message import len_tokens
from .run import run_evals
from .suites import suites, tests_default, tests_map
from .types import ExecResult, ExecTest
@@ -29,41 +31,61 @@
# message=r"resource_tracker:.*",
# )


# Configure logging, including fully-qualified module names
logging.basicConfig(
level=logging.INFO,
# helpful in debugging: %(processName)s
format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
)
multiprocessing_logging.install_mp_handler()
logging.getLogger("httpx").setLevel(logging.WARNING)
logger = logging.getLogger(__name__)

project_dir = Path(__file__).parent.parent.parent


def print_model_results(model_results: dict[str, list[ExecResult]]):
total_tests = 0
total_duration = 0.0
total_tokens = 0

for model, results in model_results.items():
print(f"\nResults for model: {model}")
duration_total = sum(
result.timings["gen"] + result.timings["run"] + result.timings["eval"]
for result in results
)
print(f"Completed {len(results)} tests in {duration_total:.2f}s:")
model_total_tokens = sum(
len_tokens(result.gen_stdout) + len_tokens(result.run_stdout)
for result in results
)
print(
f"Completed {len(results)} tests in {duration_total:.2f}s/{model_total_tokens}tok:"
)
for result in results:
cases = result.results
checkmark = "✅" if cases and all(case.passed for case in cases) else "❌"
duration_result = (
result.timings["gen"] + result.timings["run"] + result.timings["eval"]
)
gen_tokens = len_tokens(result.gen_stdout)
run_tokens = len_tokens(result.run_stdout)
result_total_tokens = gen_tokens + run_tokens
print(
f"{checkmark} {result.name} in {duration_result:.2f}s (gen: {result.timings['gen']:.2f}s, run: {result.timings['run']:.2f}s, eval: {result.timings['eval']:.2f}s)"
f"{checkmark} {result.name}: {duration_result:.2f}s/{result_total_tokens}tok "
f"(gen: {result.timings['gen']:.2f}s/{gen_tokens}tok, "
f"run: {result.timings['run']:.2f}s/{run_tokens}tok, "
f"eval: {result.timings['eval']:.2f}s)"
)
for case in cases:
checkmark = "✅" if case.passed else "❌"
print(f" {checkmark} {case.name}")

total_tests += len(results)
total_duration += duration_total
total_tokens += model_total_tokens

print("\nTotal across all models:")
print(f"Completed {total_tests} tests in {total_duration:.2f}s/{total_tokens}tok")


def print_model_results_table(model_results: dict[str, list[ExecResult]]):
test_names = {
@@ -80,11 +102,15 @@ def print_model_results_table(model_results: dict[str, list[ExecResult]]):
passed = all(case.passed for case in result.results)
checkmark = "✅" if result.status == "success" and passed else "❌"
duration = sum(result.timings.values())
gen_tokens = len_tokens(result.gen_stdout)
run_tokens = len_tokens(result.run_stdout)
reason = "timeout" if result.status == "timeout" else ""
if reason:
row.append(f"{checkmark} {reason}")
else:
row.append(f"{checkmark} {duration:.2f}s")
row.append(
f"{checkmark} {duration:.2f}s/{gen_tokens+run_tokens}tok"
)
except StopIteration:
row.append("❌ N/A")
table_data.append(row)
@@ -114,31 +140,34 @@ def main(
Pass test names to run, or result files to print.
"""
# init
multiprocessing_logging.install_mp_handler()

models = _model or [
"openai/gpt-4o",
"anthropic/claude-3-5-sonnet-20240620",
"openrouter/meta-llama/llama-3.1-405b-instruct",
]

results_files = [f for f in eval_names_or_result_files if f.endswith(".csv")]
for results_file in results_files:
p = Path(results_file)
if p.exists():
results = read_results_from_csv(str(p))
print(f"\n{results_file}")
print(f"{'=' * len(results_file)}")
print_model_results_table(results)
else:
print(f"Error: File {results_file} not found")
sys.exit(1)
if results_files:
for results_file in results_files:
p = Path(results_file)
if p.exists():
results = read_results_from_csv(str(p))
print(f"\n{results_file}")
print(f"{'=' * len(results_file)}")
print_model_results(results)
print("\n=== Model Comparison ===")
print_model_results_table(results)
else:
print(f"Error: File {results_file} not found")
sys.exit(1)
sys.exit(0)

tests_to_run: list[ExecTest] = []
for test_name in eval_names_or_result_files:
if test_name in results_files:
continue
elif test_name in tests_map:
if test_name in tests_map:
tests_to_run.append(tests_map[test_name])
elif test_name in suites:
tests_to_run.extend(suites[test_name])
@@ -164,12 +193,22 @@ def main(
sys.exit(0)


def read_log_file(file_path: Path) -> str:
if file_path.exists():
with open(file_path) as f:
return f.read()
return ""


def read_results_from_csv(filename: str) -> dict[str, list[ExecResult]]:
model_results = defaultdict(list)
results_dir = Path(filename).parent
with open(filename, newline="") as csvfile:
reader = csv.DictReader(csvfile)
for row in reader:
model = row["Model"]
test_dir = results_dir / model / row["Test"]

result = ExecResult(
name=row["Test"],
status="success" if row["Passed"] == "true" else "error",
@@ -179,10 +218,10 @@ def read_results_from_csv(filename: str) -> dict[str, list[ExecResult]]:
"run": float(row["Run Time"]),
"eval": float(row["Eval Time"]),
},
stdout="", # We don't have stdout in the CSV
stderr="", # We don't have stderr in the CSV
run_stdout="", # We don't have run_stdout in the CSV
run_stderr="", # We don't have run_stderr in the CSV
gen_stdout=read_log_file(test_dir / "gen_stdout.txt"),
gen_stderr=read_log_file(test_dir / "gen_stderr.txt"),
run_stdout=read_log_file(test_dir / "run_stdout.txt"),
run_stderr=read_log_file(test_dir / "run_stderr.txt"),
)
model_results[model].append(result)
return dict(model_results)
@@ -240,9 +279,9 @@ def write_results_to_csv(model_results: dict[str, list[ExecResult]]):
run_stderr_file = test_dir / "run_stderr.txt"

with open(gen_stdout_file, "w") as f:
f.write(result.stdout)
f.write(result.gen_stdout)
with open(gen_stderr_file, "w") as f:
f.write(result.stderr)
f.write(result.gen_stderr)
with open(run_stdout_file, "w") as f:
f.write(result.run_stdout)
with open(run_stderr_file, "w") as f:
@@ -290,7 +329,7 @@ def write_results_to_csv(model_results: dict[str, list[ExecResult]]):
writer.writeheader()
for model, results in model_results.items():
for result in results:
# Needs to pass all checks, and needs to have results (not empty, as in case of timeout)
# Needs to pass all checks, and needs to have results (not empty, as in case of error/timeout)
passed = (
all(case.passed for case in result.results)
if result.results
@@ -302,19 +341,11 @@ def write_results_to_csv(model_results: dict[str, list[ExecResult]]):
test_dir.mkdir(parents=True, exist_ok=True)

# Save each stream to a separate file
gen_stdout_file = test_dir / "gen_stdout.txt"
gen_stderr_file = test_dir / "gen_stderr.txt"
run_stdout_file = test_dir / "run_stdout.txt"
run_stderr_file = test_dir / "run_stderr.txt"

with open(gen_stdout_file, "w") as f:
f.write(result.stdout)
with open(gen_stderr_file, "w") as f:
f.write(result.stderr)
with open(run_stdout_file, "w") as f:
f.write(result.run_stdout)
with open(run_stderr_file, "w") as f:
f.write(result.run_stderr)
streams = ["gen_stdout", "gen_stderr", "run_stdout", "run_stderr"]
for stream in streams:
stream_file = test_dir / f"{stream}.txt"
with open(stream_file, "w") as f:
f.write(getattr(result, stream))

writer.writerow(
{
@@ -326,18 +357,8 @@ def write_results_to_csv(model_results: dict[str, list[ExecResult]]):
"Run Time": result.timings["run"],
"Eval Time": result.timings["eval"],
"Commit Hash": commit_hash,
"Gen Stdout File": gen_stdout_file.relative_to(results_dir),
"Gen Stderr File": gen_stderr_file.relative_to(results_dir),
"Run Stdout File": run_stdout_file.relative_to(results_dir),
"Run Stderr File": run_stderr_file.relative_to(results_dir),
}
)

print(f"\nResults saved to {csv_filename.resolve()}")
print(f"Output files saved in {results_dir.resolve()}")


if __name__ == "__main__":
# This ensures compatibility across platforms
multiprocessing.set_start_method("spawn")
main()
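With `read_results_from_csv` now loading the per-stream log files, a previously saved results CSV can be re-printed with token counts included. A minimal usage sketch; the CSV path is hypothetical, actual runs write to a timestamped results directory:

    # Hypothetical usage: re-print a saved eval run, including token counts
    # recovered from the per-test gen/run log files stored next to the CSV.
    from gptme.eval.main import (
        print_model_results,
        print_model_results_table,
        read_results_from_csv,
    )

    results = read_results_from_csv("eval_results/2024-09-18/results.csv")
    print_model_results(results)        # per-test durations and token counts
    print_model_results_table(results)  # model-vs-test comparison table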
