fix: refactored evals, fixed leaked semaphore warnings, read logs and naively compute tokens from output
ErikBjare committed Sep 18, 2024
1 parent 8887ca8 commit 3bbd88b
Showing 3 changed files with 190 additions and 88 deletions.
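The "read logs and naively compute tokens from output" part of this commit works by loading each test's saved gen/run stdout logs and running `len_tokens` (imported from `gptme.message` in the diff below) over them. A minimal sketch of that kind of naive counting, assuming a tiktoken `cl100k_base` encoding; the real `len_tokens` may use a different encoding or signature:

    # Sketch only: naive token counting over captured output logs.
    # Assumes tiktoken's cl100k_base encoding; gptme.message.len_tokens may differ.
    import tiktoken

    _enc = tiktoken.get_encoding("cl100k_base")

    def naive_len_tokens(text: str) -> int:
        """Count tokens in a raw output string."""
        return len(_enc.encode(text))

    gen_stdout = "def hello():\n    print('hello world')\n"
    run_stdout = "hello world\n"
    print(f"{naive_len_tokens(gen_stdout) + naive_len_tokens(run_stdout)}tok")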
119 changes: 70 additions & 49 deletions gptme/eval/main.py
@@ -7,6 +7,7 @@
import csv
import logging
import multiprocessing
import multiprocessing.resource_tracker
import subprocess
import sys
from collections import defaultdict
@@ -17,6 +18,7 @@
import multiprocessing_logging
from tabulate import tabulate

from ..message import len_tokens
from .run import run_evals
from .suites import suites, tests_default, tests_map
from .types import ExecResult, ExecTest
@@ -29,41 +31,61 @@
# message=r"resource_tracker:.*",
# )


# Configure logging, including fully-qualified module names
logging.basicConfig(
level=logging.INFO,
# helpful in debugging: %(processName)s
format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
)
multiprocessing_logging.install_mp_handler()
logging.getLogger("httpx").setLevel(logging.WARNING)
logger = logging.getLogger(__name__)

project_dir = Path(__file__).parent.parent.parent


def print_model_results(model_results: dict[str, list[ExecResult]]):
total_tests = 0
total_duration = 0.0
total_tokens = 0

for model, results in model_results.items():
print(f"\nResults for model: {model}")
duration_total = sum(
result.timings["gen"] + result.timings["run"] + result.timings["eval"]
for result in results
)
print(f"Completed {len(results)} tests in {duration_total:.2f}s:")
model_total_tokens = sum(
len_tokens(result.gen_stdout) + len_tokens(result.run_stdout)
for result in results
)
print(
f"Completed {len(results)} tests in {duration_total:.2f}s/{model_total_tokens}tok:"
)
for result in results:
cases = result.results
checkmark = "✅" if cases and all(case.passed for case in cases) else "❌"
duration_result = (
result.timings["gen"] + result.timings["run"] + result.timings["eval"]
)
gen_tokens = len_tokens(result.gen_stdout)
run_tokens = len_tokens(result.run_stdout)
result_total_tokens = gen_tokens + run_tokens
print(
f"{checkmark} {result.name} in {duration_result:.2f}s (gen: {result.timings['gen']:.2f}s, run: {result.timings['run']:.2f}s, eval: {result.timings['eval']:.2f}s)"
f"{checkmark} {result.name}: {duration_result:.2f}s/{result_total_tokens}tok "
f"(gen: {result.timings['gen']:.2f}s/{gen_tokens}tok, "
f"run: {result.timings['run']:.2f}s/{run_tokens}tok, "
f"eval: {result.timings['eval']:.2f}s)"
)
for case in cases:
checkmark = "✅" if case.passed else "❌"
print(f" {checkmark} {case.name}")

total_tests += len(results)
total_duration += duration_total
total_tokens += model_total_tokens

print("\nTotal across all models:")
print(f"Completed {total_tests} tests in {total_duration:.2f}s/{total_tokens}tok")


def print_model_results_table(model_results: dict[str, list[ExecResult]]):
test_names = {
@@ -80,11 +102,15 @@ def print_model_results_table(model_results: dict[str, list[ExecResult]]):
passed = all(case.passed for case in result.results)
checkmark = "✅" if result.status == "success" and passed else "❌"
duration = sum(result.timings.values())
gen_tokens = len_tokens(result.gen_stdout)
run_tokens = len_tokens(result.run_stdout)
reason = "timeout" if result.status == "timeout" else ""
if reason:
row.append(f"{checkmark} {reason}")
else:
row.append(f"{checkmark} {duration:.2f}s")
row.append(
f"{checkmark} {duration:.2f}s/{gen_tokens+run_tokens}tok"
)
except StopIteration:
row.append("❌ N/A")
table_data.append(row)
@@ -114,31 +140,34 @@ def main(
Pass test names to run, or result files to print.
"""
# init
multiprocessing_logging.install_mp_handler()

models = _model or [
"openai/gpt-4o",
"anthropic/claude-3-5-sonnet-20240620",
"openrouter/meta-llama/llama-3.1-405b-instruct",
]

results_files = [f for f in eval_names_or_result_files if f.endswith(".csv")]
for results_file in results_files:
p = Path(results_file)
if p.exists():
results = read_results_from_csv(str(p))
print(f"\n{results_file}")
print(f"{'=' * len(results_file)}")
print_model_results_table(results)
else:
print(f"Error: File {results_file} not found")
sys.exit(1)
if results_files:
for results_file in results_files:
p = Path(results_file)
if p.exists():
results = read_results_from_csv(str(p))
print(f"\n{results_file}")
print(f"{'=' * len(results_file)}")
print_model_results(results)
print("\n=== Model Comparison ===")
print_model_results_table(results)
else:
print(f"Error: File {results_file} not found")
sys.exit(1)
sys.exit(0)

tests_to_run: list[ExecTest] = []
for test_name in eval_names_or_result_files:
if test_name in results_files:
continue
elif test_name in tests_map:
if test_name in tests_map:
tests_to_run.append(tests_map[test_name])
elif test_name in suites:
tests_to_run.extend(suites[test_name])
@@ -164,12 +193,22 @@ def main(
sys.exit(0)


def read_log_file(file_path: Path) -> str:
if file_path.exists():
with open(file_path) as f:
return f.read()
return ""


def read_results_from_csv(filename: str) -> dict[str, list[ExecResult]]:
model_results = defaultdict(list)
results_dir = Path(filename).parent
with open(filename, newline="") as csvfile:
reader = csv.DictReader(csvfile)
for row in reader:
model = row["Model"]
test_dir = results_dir / model / row["Test"]

result = ExecResult(
name=row["Test"],
status="success" if row["Passed"] == "true" else "error",
@@ -179,10 +218,10 @@ def read_results_from_csv(filename: str) -> dict[str, list[ExecResult]]:
"run": float(row["Run Time"]),
"eval": float(row["Eval Time"]),
},
stdout="", # We don't have stdout in the CSV
stderr="", # We don't have stderr in the CSV
run_stdout="", # We don't have run_stdout in the CSV
run_stderr="", # We don't have run_stderr in the CSV
gen_stdout=read_log_file(test_dir / "gen_stdout.txt"),
gen_stderr=read_log_file(test_dir / "gen_stderr.txt"),
run_stdout=read_log_file(test_dir / "run_stdout.txt"),
run_stderr=read_log_file(test_dir / "run_stderr.txt"),
)
model_results[model].append(result)
return dict(model_results)
@@ -240,9 +279,9 @@ def write_results_to_csv(model_results: dict[str, list[ExecResult]]):
run_stderr_file = test_dir / "run_stderr.txt"

with open(gen_stdout_file, "w") as f:
f.write(result.stdout)
f.write(result.gen_stdout)
with open(gen_stderr_file, "w") as f:
f.write(result.stderr)
f.write(result.gen_stderr)
with open(run_stdout_file, "w") as f:
f.write(result.run_stdout)
with open(run_stderr_file, "w") as f:
@@ -290,7 +329,7 @@ def write_results_to_csv(model_results: dict[str, list[ExecResult]]):
writer.writeheader()
for model, results in model_results.items():
for result in results:
# Needs to pass all checks, and needs to have results (not empty, as in case of timeout)
# Needs to pass all checks, and needs to have results (not empty, as in case of error/timeout)
passed = (
all(case.passed for case in result.results)
if result.results
@@ -302,19 +341,11 @@ def write_results_to_csv(model_results: dict[str, list[ExecResult]]):
test_dir.mkdir(parents=True, exist_ok=True)

# Save each stream to a separate file
gen_stdout_file = test_dir / "gen_stdout.txt"
gen_stderr_file = test_dir / "gen_stderr.txt"
run_stdout_file = test_dir / "run_stdout.txt"
run_stderr_file = test_dir / "run_stderr.txt"

with open(gen_stdout_file, "w") as f:
f.write(result.stdout)
with open(gen_stderr_file, "w") as f:
f.write(result.stderr)
with open(run_stdout_file, "w") as f:
f.write(result.run_stdout)
with open(run_stderr_file, "w") as f:
f.write(result.run_stderr)
streams = ["gen_stdout", "gen_stderr", "run_stdout", "run_stderr"]
for stream in streams:
stream_file = test_dir / f"{stream}.txt"
with open(stream_file, "w") as f:
f.write(getattr(result, stream))

writer.writerow(
{
@@ -326,18 +357,8 @@ def write_results_to_csv(model_results: dict[str, list[ExecResult]]):
"Run Time": result.timings["run"],
"Eval Time": result.timings["eval"],
"Commit Hash": commit_hash,
"Gen Stdout File": gen_stdout_file.relative_to(results_dir),
"Gen Stderr File": gen_stderr_file.relative_to(results_dir),
"Run Stdout File": run_stdout_file.relative_to(results_dir),
"Run Stderr File": run_stderr_file.relative_to(results_dir),
}
)

print(f"\nResults saved to {csv_filename.resolve()}")
print(f"Output files saved in {results_dir.resolve()}")


if __name__ == "__main__":
# This ensures compatibility across platforms
multiprocessing.set_start_method("spawn")
main()
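With `read_results_from_csv` now loading the per-stream log files, a previously saved results CSV can be re-printed with token counts included. A minimal usage sketch; the CSV path is hypothetical, actual runs write to a timestamped results directory:

    # Hypothetical usage: re-print a saved eval run, including token counts
    # recovered from the per-test gen/run log files stored next to the CSV.
    from gptme.eval.main import (
        print_model_results,
        print_model_results_table,
        read_results_from_csv,
    )

    results = read_results_from_csv("eval_results/2024-09-18/results.csv")
    print_model_results(results)        # per-test durations and token counts
    print_model_results_table(results)  # model-vs-test comparison table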
