Lite Unify Metric Formatting (#803)
czaloom authored Oct 17, 2024
1 parent 19e639a commit 8aa7c02
Showing 45 changed files with 3,720 additions and 3,046 deletions.
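Most of this commit unifies the metric calling convention exercised by the tests below: as_dict=True is removed from evaluate() and compute_confusion_matrix(), metrics come back as objects that serialize via .to_dict(), and each score threshold now produces its own metric entry instead of list-valued fields. A minimal sketch of the new pattern follows; the import path and the Classification field names are assumptions for illustration and are not shown in this diff.

from valor_lite.classification import Classification, DataLoader, MetricType  # assumed import path

# Hypothetical single-datum fixture; the Classification field names are assumptions.
loader = DataLoader()
loader.add_data(
    [
        Classification(
            uid="uid0",
            groundtruth="dog",
            predictions=["dog", "cat"],
            scores=[0.8, 0.2],
        )
    ]
)
evaluator = loader.finalize()

# Before #803: evaluator.evaluate(score_thresholds=[0.25, 0.75], as_dict=True)
# returned dicts whose "value" was a list aligned with "score_thresholds".
# After #803: one metric object per threshold; serialize explicitly with .to_dict().
metrics = evaluator.evaluate(score_thresholds=[0.25, 0.75])
for m in metrics[MetricType.Accuracy]:
    print(m.to_dict())
    # e.g. {"type": "Accuracy", "value": 1.0, "parameters": {"score_threshold": 0.25, "hardmax": True}}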
2 changes: 1 addition & 1 deletion .github/workflows/build-and-publish.yml
@@ -15,7 +15,7 @@ jobs:
- uses: actions/checkout@v3
- uses: actions/setup-python@v4
with:
python-version: "3.8"
python-version: "3.10"
- name: Build wheel
run: pip install build && python -m build
- name: Publish to PyPI
2 changes: 1 addition & 1 deletion lite/benchmarks/benchmark_classification.py
@@ -211,7 +211,7 @@ def run_benchmarking_analysis(
)

# evaluate
eval_time, _ = time_it(evaluator.compute_precision_recall)()
eval_time, _ = time_it(evaluator.compute_precision_recall_rocauc)()
if eval_time > evaluation_timeout and evaluation_timeout != -1:
raise TimeoutError(
f"Base evaluation timed out with {evaluator.n_datums} datums."
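The benchmark above only swaps in the renamed method; for context, time_it wraps a callable and returns the elapsed time alongside the result. The sketch below is an assumed implementation for illustration, not the helper defined in lite/benchmarks, and the timeout value is a stand-in.

import time
from typing import Any, Callable


def time_it(fn: Callable[..., Any]) -> Callable[..., tuple[float, Any]]:
    """Assumed helper: call fn and return (elapsed_seconds, result)."""

    def wrapper(*args, **kwargs) -> tuple[float, Any]:
        start = time.perf_counter()
        result = fn(*args, **kwargs)
        return time.perf_counter() - start, result

    return wrapper


# Usage mirroring the benchmark line above, with a stand-in for the evaluator method:
evaluation_timeout = 30.0  # the benchmark passes -1 to disable the timeout
eval_time, _ = time_it(lambda: sum(range(1_000_000)))()
if eval_time > evaluation_timeout and evaluation_timeout != -1:
    raise TimeoutError(f"Base evaluation timed out after {eval_time:.1f}s.")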
186 changes: 95 additions & 91 deletions lite/examples/object-detection.ipynb

Large diffs are not rendered by default.

101 changes: 58 additions & 43 deletions lite/examples/tabular_classification.ipynb

Large diffs are not rendered by default.

52 changes: 30 additions & 22 deletions lite/tests/classification/test_accuracy.py
@@ -3,7 +3,7 @@
Classification,
DataLoader,
MetricType,
compute_metrics,
compute_precision_recall_rocauc,
)


@@ -44,7 +44,7 @@ def test_accuracy_computation():

score_thresholds = np.array([0.25, 0.75], dtype=np.float64)

(_, _, _, accuracy, _, _, _) = compute_metrics(
(_, _, _, accuracy, _, _, _) = compute_precision_recall_rocauc(
data=data,
label_metadata=label_metadata,
score_thresholds=score_thresholds,
Expand Down Expand Up @@ -75,15 +75,23 @@ def test_accuracy_basic(basic_classifications: list[Classification]):
"missing_prediction_labels": [],
}

metrics = evaluator.evaluate(score_thresholds=[0.25, 0.75], as_dict=True)
metrics = evaluator.evaluate(score_thresholds=[0.25, 0.75])

actual_metrics = [m for m in metrics[MetricType.Accuracy]]
actual_metrics = [m.to_dict() for m in metrics[MetricType.Accuracy]]
expected_metrics = [
{
"type": "Accuracy",
"value": [2 / 3, 1 / 3],
"value": 2 / 3,
"parameters": {
"score_thresholds": [0.25, 0.75],
"score_threshold": 0.25,
"hardmax": True,
},
},
{
"type": "Accuracy",
"value": 1 / 3,
"parameters": {
"score_threshold": 0.75,
"hardmax": True,
},
},
@@ -102,15 +110,15 @@ def test_accuracy_with_animal_example(
loader.add_data(classifications_animal_example)
evaluator = loader.finalize()

metrics = evaluator.evaluate(score_thresholds=[0.5], as_dict=True)
metrics = evaluator.evaluate(score_thresholds=[0.5])

actual_metrics = [m for m in metrics[MetricType.Accuracy]]
actual_metrics = [m.to_dict() for m in metrics[MetricType.Accuracy]]
expected_metrics = [
{
"type": "Accuracy",
"value": [2.0 / 6.0],
"value": 2 / 6,
"parameters": {
"score_thresholds": [0.5],
"score_threshold": 0.5,
"hardmax": True,
},
},
@@ -129,15 +137,15 @@ def test_accuracy_color_example(
loader.add_data(classifications_color_example)
evaluator = loader.finalize()

metrics = evaluator.evaluate(score_thresholds=[0.5], as_dict=True)
metrics = evaluator.evaluate(score_thresholds=[0.5])

actual_metrics = [m for m in metrics[MetricType.Accuracy]]
actual_metrics = [m.to_dict() for m in metrics[MetricType.Accuracy]]
expected_metrics = [
{
"type": "Accuracy",
"value": [2 / 6],
"value": 2 / 6,
"parameters": {
"score_thresholds": [0.5],
"score_threshold": 0.5,
"hardmax": True,
},
},
@@ -164,15 +172,15 @@ def test_accuracy_with_image_example(
"missing_prediction_labels": [],
}

metrics = evaluator.evaluate(as_dict=True)
metrics = evaluator.evaluate()

actual_metrics = [m for m in metrics[MetricType.Accuracy]]
actual_metrics = [m.to_dict() for m in metrics[MetricType.Accuracy]]
expected_metrics = [
{
"type": "Accuracy",
"value": [0.5],
"value": 0.5,
"parameters": {
"score_thresholds": [0.0],
"score_threshold": 0.0,
"hardmax": True,
},
},
@@ -199,15 +207,15 @@ def test_accuracy_with_tabular_example(
"missing_prediction_labels": [],
}

metrics = evaluator.evaluate(as_dict=True)
metrics = evaluator.evaluate()

actual_metrics = [m for m in metrics[MetricType.Accuracy]]
actual_metrics = [m.to_dict() for m in metrics[MetricType.Accuracy]]
expected_metrics = [
{
"type": "Accuracy",
"value": [5 / 10],
"value": 0.5,
"parameters": {
"score_thresholds": [0.0],
"score_threshold": 0.0,
"hardmax": True,
},
},
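The accuracy expectations above capture the shape change: a single Accuracy entry with list-valued "value" and "score_thresholds" becomes one entry per threshold with a scalar "value" and a single "score_threshold". A self-contained sketch of the comparison pattern these tests use, with the values from test_accuracy_basic:

# Old serialization (single entry, list-valued):
#   {"type": "Accuracy", "value": [2 / 3, 1 / 3],
#    "parameters": {"score_thresholds": [0.25, 0.75], "hardmax": True}}
# New serialization: one dict per score threshold.
actual_metrics = [
    {
        "type": "Accuracy",
        "value": 2 / 3,
        "parameters": {"score_threshold": 0.25, "hardmax": True},
    },
    {
        "type": "Accuracy",
        "value": 1 / 3,
        "parameters": {"score_threshold": 0.75, "hardmax": True},
    },
]
expected_metrics = list(actual_metrics)  # stand-in for the hand-written expectations
for m in actual_metrics:
    assert m in expected_metrics
for m in expected_metrics:
    assert m in actual_metrics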
59 changes: 44 additions & 15 deletions lite/tests/classification/test_confusion_matrix.py
@@ -117,9 +117,9 @@ def test_confusion_matrix_basic(basic_classifications: list[Classification]):
actual_metrics = evaluator.compute_confusion_matrix(
score_thresholds=[0.25, 0.75],
number_of_examples=1,
as_dict=True,
)

actual_metrics = [m.to_dict() for m in actual_metrics]
expected_metrics = [
{
"type": "ConfusionMatrix",
@@ -146,7 +146,10 @@ def test_confusion_matrix_basic(basic_classifications: list[Classification]):
},
"missing_predictions": {},
},
"parameters": {"score_threshold": 0.25},
"parameters": {
"score_threshold": 0.25,
"maximum_number_of_examples": 1,
},
},
{
"type": "ConfusionMatrix",
@@ -167,7 +170,10 @@ def test_confusion_matrix_basic(basic_classifications: list[Classification]):
"3": {"count": 1, "examples": [{"datum": "uid2"}]}
},
},
"parameters": {"score_threshold": 0.75},
"parameters": {
"score_threshold": 0.75,
"maximum_number_of_examples": 1,
},
},
]
for m in actual_metrics:
@@ -190,9 +196,9 @@ def test_confusion_matrix_unit(

actual_metrics = evaluator.compute_confusion_matrix(
score_thresholds=[0.5],
as_dict=True,
)

actual_metrics = [m.to_dict() for m in actual_metrics]
expected_metrics = [
{
"type": "ConfusionMatrix",
@@ -208,7 +214,10 @@ def test_confusion_matrix_unit(
},
"missing_predictions": {},
},
"parameters": {"score_threshold": 0.5},
"parameters": {
"score_threshold": 0.5,
"maximum_number_of_examples": 0,
},
},
]
for m in actual_metrics:
@@ -232,9 +241,9 @@ def test_confusion_matrix_with_animal_example(
actual_metrics = evaluator.compute_confusion_matrix(
score_thresholds=[0.5],
number_of_examples=6,
as_dict=True,
)

actual_metrics = [m.to_dict() for m in actual_metrics]
expected_metrics = [
{
"type": "ConfusionMatrix",
@@ -277,7 +286,10 @@ def test_confusion_matrix_with_animal_example(
"dog": {"count": 1, "examples": [{"datum": "uid5"}]}
},
},
"parameters": {"score_threshold": 0.5},
"parameters": {
"score_threshold": 0.5,
"maximum_number_of_examples": 6,
},
},
]
for m in actual_metrics:
@@ -301,9 +313,9 @@ def test_confusion_matrix_with_color_example(
actual_metrics = evaluator.compute_confusion_matrix(
score_thresholds=[0.5],
number_of_examples=6,
as_dict=True,
)

actual_metrics = [m.to_dict() for m in actual_metrics]
expected_metrics = [
{
"type": "ConfusionMatrix",
@@ -348,7 +360,10 @@ def test_confusion_matrix_with_color_example(
"red": {"count": 1, "examples": [{"datum": "uid2"}]}
},
},
"parameters": {"score_threshold": 0.5},
"parameters": {
"score_threshold": 0.5,
"maximum_number_of_examples": 6,
},
},
]
for m in actual_metrics:
@@ -380,9 +395,9 @@ def test_confusion_matrix_multiclass(
actual_metrics = evaluator.compute_confusion_matrix(
score_thresholds=[0.05, 0.5, 0.85],
number_of_examples=5,
as_dict=True,
)

actual_metrics = [m.to_dict() for m in actual_metrics]
expected_metrics = [
{
"type": "ConfusionMatrix",
@@ -427,6 +442,7 @@ def test_confusion_matrix_multiclass(
},
"parameters": {
"score_threshold": 0.05,
"maximum_number_of_examples": 5,
},
},
{
@@ -458,7 +474,10 @@ def test_confusion_matrix_multiclass(
"bee": {"count": 1, "examples": [{"datum": "uid1"}]},
},
},
"parameters": {"score_threshold": 0.5},
"parameters": {
"score_threshold": 0.5,
"maximum_number_of_examples": 5,
},
},
{
"type": "ConfusionMatrix",
@@ -478,6 +497,7 @@ def test_confusion_matrix_multiclass(
},
"parameters": {
"score_threshold": 0.85,
"maximum_number_of_examples": 5,
},
},
]
@@ -511,9 +531,9 @@ def test_confusion_matrix_without_hardmax_animal_example(
score_thresholds=[0.05, 0.4, 0.5],
number_of_examples=6,
hardmax=False,
as_dict=True,
)

actual_metrics = [m.to_dict() for m in actual_metrics]
expected_metrics = [
{
"type": "ConfusionMatrix",
@@ -542,7 +562,10 @@ def test_confusion_matrix_without_hardmax_animal_example(
},
"missing_predictions": {},
},
"parameters": {"score_threshold": 0.05},
"parameters": {
"score_threshold": 0.05,
"maximum_number_of_examples": 6,
},
},
{
"type": "ConfusionMatrix",
@@ -559,7 +582,10 @@ def test_confusion_matrix_without_hardmax_animal_example(
},
"missing_predictions": {},
},
"parameters": {"score_threshold": 0.4},
"parameters": {
"score_threshold": 0.4,
"maximum_number_of_examples": 6,
},
},
{
"type": "ConfusionMatrix",
@@ -576,7 +602,10 @@ def test_confusion_matrix_without_hardmax_animal_example(
}
},
},
"parameters": {"score_threshold": 0.5},
"parameters": {
"score_threshold": 0.5,
"maximum_number_of_examples": 6,
},
},
]
for m in actual_metrics:
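The confusion-matrix tests follow the same convention: as_dict=True is gone, results serialize via .to_dict(), and the requested example cap is echoed back as maximum_number_of_examples in the parameters. A minimal sketch under the same assumptions as the accuracy sketch near the top of this page (the import path and Classification fields are not shown in this diff):

from valor_lite.classification import Classification, DataLoader  # assumed import path

loader = DataLoader()
loader.add_data(
    [
        Classification(  # hypothetical datum; field names are assumptions
            uid="uid0",
            groundtruth="dog",
            predictions=["dog", "cat"],
            scores=[0.8, 0.2],
        )
    ]
)
evaluator = loader.finalize()

matrices = evaluator.compute_confusion_matrix(
    score_thresholds=[0.5],
    number_of_examples=6,  # keep at most 6 example datums per matrix cell
)
for m in matrices:
    print(m.to_dict()["parameters"])
    # e.g. {"score_threshold": 0.5, "maximum_number_of_examples": 6}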