From b39262910b05e2b0ea7de92e3759683c654c8920 Mon Sep 17 00:00:00 2001
From: Colin Smith
Date: Fri, 20 Dec 2024 16:58:52 -0500
Subject: [PATCH] feat: visualize similarity metrics by configuration

Implement a visualization to assess the accuracy of different OntoGPT
configurations relative to a baseline. Use a simple box plot to display
and compare configurations.
---
 src/spinneret/benchmark.py | 45 ++++++++++++++++++++++++++++++++++++++
 tests/test_benchmark.py    | 10 +++++++++
 2 files changed, 55 insertions(+)

diff --git a/src/spinneret/benchmark.py b/src/spinneret/benchmark.py
index d4f1e90..63f6236 100644
--- a/src/spinneret/benchmark.py
+++ b/src/spinneret/benchmark.py
@@ -564,3 +564,48 @@ def plot_similarity_scores_by_predicate(
     if output_file:
         plt.savefig(output_file, dpi=300)
     plt.show()
+
+
+def plot_similarity_scores_by_configuration(
+    benchmark_results: pd.DataFrame,
+    metric: str,
+    output_file: str = None,
+) -> None:
+    """
+    Visualize similarity scores across OntoGPT configurations for a metric.
+
+    :param benchmark_results: The return value from the
+        `benchmark_against_standard` function.
+    :param metric: The metric to plot. This should be a column name from the
+        benchmark_results DataFrame, e.g. "average_score", "best_score", etc.
+    :param output_file: The path to save the plot to, as a PNG file.
+    :return: None
+    """
+    # Subset the benchmark results dataframe to only include the desired
+    # columns: test_dir, metric
+    df = benchmark_results[["test_dir", metric]]
+
+    # Remove rows where the metric is 0 or NaN to avoid plotting them
+    df = df.dropna(subset=[metric])
+    df = df[df[metric] != 0]
+
+    plt.figure(figsize=(10, 6))
+    grouped_data_long = df.groupby("test_dir")[metric].apply(list)
+    plt.boxplot(
+        grouped_data_long.values, labels=grouped_data_long.index, showmeans=True
+    )
+
+    # Add individual data points (jittered)
+    for i, group_data in enumerate(grouped_data_long):
+        x = np.random.normal(i + 1, 0.08, size=len(group_data))  # Jitter x-values
+        plt.plot(x, group_data, "o", alpha=0.25, color="grey")
+
+    plt.xlabel("Configuration")
+    plt.ylabel("Score")
+    title = f"Similarity Score '{metric}' Across Configurations"
+    plt.title(title)
+    plt.xticks(rotation=-20)
+    plt.tight_layout()
+    if output_file:
+        plt.savefig(output_file, dpi=300)
+    plt.show()
diff --git a/tests/test_benchmark.py b/tests/test_benchmark.py
index eb432f2..5890997 100644
--- a/tests/test_benchmark.py
+++ b/tests/test_benchmark.py
@@ -19,6 +19,7 @@
     get_grounding_rates,
     is_grounded,
     plot_similarity_scores_by_predicate,
+    plot_similarity_scores_by_configuration,
 )
 
 from spinneret.utilities import is_url
@@ -298,3 +299,12 @@ def test_plot_similarity_scores_by_predicate(termset_similarity_score_dataframe)
         test_dir_path="tests/data/benchmark/test_a",
         metric="average_score",
     )
+
+
+@pytest.mark.skip(reason="Manual inspection required")
+def test_plot_similarity_scores_by_configuration(termset_similarity_score_dataframe):
+    """Test the plot_similarity_scores_by_configuration function"""
+    plot_similarity_scores_by_configuration(
+        benchmark_results=termset_similarity_score_dataframe,
+        metric="average_score",
+    )
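
A minimal usage sketch of the new function, assuming `benchmark_results` is the
return value of `benchmark_against_standard` and that the "test_dir" column
identifies the configuration; the DataFrame values below are illustrative only,
not real benchmark output.

    import pandas as pd

    from spinneret.benchmark import plot_similarity_scores_by_configuration

    # Hypothetical benchmark results; in practice this DataFrame comes from
    # benchmark_against_standard()
    benchmark_results = pd.DataFrame(
        {
            "test_dir": ["config_a", "config_a", "config_b", "config_b"],
            "average_score": [0.72, 0.65, 0.81, 0.78],
        }
    )

    # Draws one box per configuration with jittered points overlaid;
    # output_file is optional and, when given, saves the figure as a PNG
    plot_similarity_scores_by_configuration(
        benchmark_results=benchmark_results,
        metric="average_score",
        output_file="similarity_by_configuration.png",
    )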