From b39262910b05e2b0ea7de92e3759683c654c8920 Mon Sep 17 00:00:00 2001
From: Colin Smith
Date: Fri, 20 Dec 2024 16:58:52 -0500
Subject: [PATCH] feat: visualize similarity metrics by configuration

Implement a visualization to assess the accuracy of different OntoGPT
configurations relative to a baseline. Use a simple box plot to display
and compare configurations.
---
 src/spinneret/benchmark.py | 45 ++++++++++++++++++++++++++++++++++++++
 tests/test_benchmark.py    | 10 +++++++++
 2 files changed, 55 insertions(+)

diff --git a/src/spinneret/benchmark.py b/src/spinneret/benchmark.py
index d4f1e90..63f6236 100644
--- a/src/spinneret/benchmark.py
+++ b/src/spinneret/benchmark.py
@@ -564,3 +564,48 @@ def plot_similarity_scores_by_predicate(
     if output_file:
         plt.savefig(output_file, dpi=300)
     plt.show()
+
+
+def plot_similarity_scores_by_configuration(
+    benchmark_results: pd.DataFrame,
+    metric: str,
+    output_file: str = None,
+) -> None:
+    """
+    Visualize similarity scores across OntoGPT configurations for a metric.
+
+    :param benchmark_results: The return value from the
+        `benchmark_against_standard` function.
+    :param metric: The metric to plot. This should be a column name from the
+        benchmark_results DataFrame, e.g. "average_score", "best_score", etc.
+    :param output_file: The path to save the plot to, as a PNG file.
+    :return: None
+    """
+    # Subset the benchmark results dataframe to only include the desired
+    # columns: test_dir, metric
+    df = benchmark_results[["test_dir", metric]]
+
+    # Remove rows where the metric is 0 or NaN to avoid plotting them
+    df = df.dropna(subset=[metric])
+    df = df[df[metric] != 0]
+
+    plt.figure(figsize=(10, 6))
+    grouped_data_long = df.groupby("test_dir")[metric].apply(list)
+    plt.boxplot(
+        grouped_data_long.values, labels=grouped_data_long.index, showmeans=True
+    )
+
+    # Add individual data points (jittered)
+    for i, group_data in enumerate(grouped_data_long):
+        x = np.random.normal(i + 1, 0.08, size=len(group_data))  # Jitter x-values
+        plt.plot(x, group_data, "o", alpha=0.25, color="grey")
+
+    plt.xlabel("Configuration")
+    plt.ylabel("Score")
+    title = f"Similarity Score '{metric}' Across Configurations"
+    plt.title(title)
+    plt.xticks(rotation=-20)
+    plt.tight_layout()
+    if output_file:
+        plt.savefig(output_file, dpi=300)
+    plt.show()
diff --git a/tests/test_benchmark.py b/tests/test_benchmark.py
index eb432f2..5890997 100644
--- a/tests/test_benchmark.py
+++ b/tests/test_benchmark.py
@@ -19,6 +19,7 @@
     get_grounding_rates,
     is_grounded,
     plot_similarity_scores_by_predicate,
+    plot_similarity_scores_by_configuration,
 )
 
 from spinneret.utilities import is_url
@@ -298,3 +299,12 @@ def test_plot_similarity_scores_by_predicate(termset_similarity_score_dataframe)
         test_dir_path="tests/data/benchmark/test_a",
         metric="average_score",
     )
+
+
+@pytest.mark.skip(reason="Manual inspection required")
+def test_plot_similarity_scores_by_configuration(termset_similarity_score_dataframe):
+    """Test the plot_similarity_scores_by_configuration function"""
+    plot_similarity_scores_by_configuration(
+        benchmark_results=termset_similarity_score_dataframe,
+        metric="average_score",
+    )
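
A minimal usage sketch of the new function, assuming `benchmark_results` is the
return value of `benchmark_against_standard` and that the "test_dir" column
identifies the configuration; the DataFrame values below are illustrative only,
not real benchmark output.

    import pandas as pd

    from spinneret.benchmark import plot_similarity_scores_by_configuration

    # Hypothetical benchmark results; in practice this DataFrame comes from
    # benchmark_against_standard()
    benchmark_results = pd.DataFrame(
        {
            "test_dir": ["config_a", "config_a", "config_b", "config_b"],
            "average_score": [0.72, 0.65, 0.81, 0.78],
        }
    )

    # Draws one box per configuration with jittered points overlaid;
    # output_file is optional and, when given, saves the figure as a PNG
    plot_similarity_scores_by_configuration(
        benchmark_results=benchmark_results,
        metric="average_score",
        output_file="similarity_by_configuration.png",
    )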