More benchmark fixes: let read_df accept a DataFrame and supplement partial benchmarks from raw_summary.csv

Signed-off-by: Yu Chin Fabian Lim <[email protected]>
foundation-model-stack · Jul 31, 2024 · a2d5f8b · a2d5f8b
1 parent b04e2c0
commit a2d5f8b
Showing 1 changed file with 25 additions and 6 deletions.
diff --git a/scripts/benchmarks/compare_with_reference.py b/scripts/benchmarks/compare_with_reference.py
@@ -34,6 +34,7 @@
 
 DEFAULT_REFERENCE_FILEPATH = "scripts/benchmarks/refs/a100_80gb.csv"
 BENCHMARK_FILENAME = "benchmarks.csv"
+RAW_FILENAME = "raw_summary.csv"
 OUTLIERS_FILENAME = "outliers.csv"
 
 def plot_chart(ax, x, y, title, xlabel, ylabel):
@@ -82,8 +83,11 @@ def compare_results(df, ref, plot_columns, threshold_ratio=0.1):
     return outliers_df, outliers, charts
 
 
-def read_df(file_path, indices, plot_columns):
-    df = pd.read_csv(file_path)
+def read_df(file_path_or_dataframe, indices, plot_columns):
+    if isinstance(file_path_or_dataframe, str):
+        df = pd.read_csv(file_path_or_dataframe)
+    else:
+        df = file_path_or_dataframe
     df.set_index(indices, inplace=True)
     # all other columns not for plotting or explicitly ignored are hyperparameters
     argument_columns = [
@@ -98,10 +102,25 @@ def main(
     result_dir, reference_benchmark_filepath, plot_columns, threshold_ratio, indices
 ):
     ref, args_ref = read_df(reference_benchmark_filepath, indices, plot_columns)
-    new_benchmark_filepath = os.path.join(result_dir, BENCHMARK_FILENAME)
-    df, args_df = read_df(
-        new_benchmark_filepath, indices, plot_columns
-    )
+
+    # NOTE: this is a bit of a hack. If the new bench is a smaller (partial) bench,
+    # read_df raises a KeyError, and we supplement the missing columns from the raw summary.
+    new_benchmark_filepath = os.path.join(result_dir, BENCHMARK_FILENAME) 
+    try:
+        df, args_df  = read_df(new_benchmark_filepath, indices, plot_columns)
+    except KeyError:
+        raw_filepath = os.path.join(result_dir, RAW_FILENAME)
+        print (
+            f"New '{new_benchmark_filepath}' is probably a partial bench. Supplementing "
+            f"missing columns from raw data '{raw_filepath}'."
+        )
+        df2 = pd.read_csv(new_benchmark_filepath)
+        df = pd.read_csv(raw_filepath)
+        df, args_df = read_df(
+            pd.concat([df, df2[[x for x in df2.columns if x not in df.columns]]], axis=1),
+            indices, plot_columns
+        )
+
     # Analyse between both sets of results and retrieve outliers
     # - this has a side effect of plotting the charts
     outliers_df, outliers, charts = compare_results(