Lite Unify Metric Formatting (#803)
czaloom authored Oct 17, 2024
1 parent 19e639a commit 8aa7c02
Showing 45 changed files with 3,720 additions and 3,046 deletions.
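Most of this commit unifies the metric calling convention exercised by the tests below: as_dict=True is removed from evaluate() and compute_confusion_matrix(), metrics come back as objects that serialize via .to_dict(), and each score threshold now produces its own metric entry instead of list-valued fields. A minimal sketch of the new pattern follows; the import path and the Classification field names are assumptions for illustration and are not shown in this diff.

from valor_lite.classification import Classification, DataLoader, MetricType  # assumed import path

# Hypothetical single-datum fixture; the Classification field names are assumptions.
loader = DataLoader()
loader.add_data(
    [
        Classification(
            uid="uid0",
            groundtruth="dog",
            predictions=["dog", "cat"],
            scores=[0.8, 0.2],
        )
    ]
)
evaluator = loader.finalize()

# Before #803: evaluator.evaluate(score_thresholds=[0.25, 0.75], as_dict=True)
# returned dicts whose "value" was a list aligned with "score_thresholds".
# After #803: one metric object per threshold; serialize explicitly with .to_dict().
metrics = evaluator.evaluate(score_thresholds=[0.25, 0.75])
for m in metrics[MetricType.Accuracy]:
    print(m.to_dict())
    # e.g. {"type": "Accuracy", "value": 1.0, "parameters": {"score_threshold": 0.25, "hardmax": True}}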
2 changes: 1 addition & 1 deletion .github/workflows/build-and-publish.yml
@@ -15,7 +15,7 @@ jobs:
- uses: actions/checkout@v3
- uses: actions/setup-python@v4
with:
python-version: "3.8"
python-version: "3.10"
- name: Build wheel
run: pip install build && python -m build
- name: Publish to PyPI
2 changes: 1 addition & 1 deletion lite/benchmarks/benchmark_classification.py
@@ -211,7 +211,7 @@ def run_benchmarking_analysis(
)

# evaluate
eval_time, _ = time_it(evaluator.compute_precision_recall)()
eval_time, _ = time_it(evaluator.compute_precision_recall_rocauc)()
if eval_time > evaluation_timeout and evaluation_timeout != -1:
raise TimeoutError(
f"Base evaluation timed out with {evaluator.n_datums} datums."
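The benchmark above only swaps in the renamed method; for context, time_it wraps a callable and returns the elapsed time alongside the result. The sketch below is an assumed implementation for illustration, not the helper defined in lite/benchmarks, and the timeout value is a stand-in.

import time
from typing import Any, Callable


def time_it(fn: Callable[..., Any]) -> Callable[..., tuple[float, Any]]:
    """Assumed helper: call fn and return (elapsed_seconds, result)."""

    def wrapper(*args, **kwargs) -> tuple[float, Any]:
        start = time.perf_counter()
        result = fn(*args, **kwargs)
        return time.perf_counter() - start, result

    return wrapper


# Usage mirroring the benchmark line above, with a stand-in for the evaluator method:
evaluation_timeout = 30.0  # the benchmark passes -1 to disable the timeout
eval_time, _ = time_it(lambda: sum(range(1_000_000)))()
if eval_time > evaluation_timeout and evaluation_timeout != -1:
    raise TimeoutError(f"Base evaluation timed out after {eval_time:.1f}s.")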
186 changes: 95 additions & 91 deletions lite/examples/object-detection.ipynb

Large diffs are not rendered by default.

101 changes: 58 additions & 43 deletions lite/examples/tabular_classification.ipynb

Large diffs are not rendered by default.

52 changes: 30 additions & 22 deletions lite/tests/classification/test_accuracy.py
@@ -3,7 +3,7 @@
Classification,
DataLoader,
MetricType,
compute_metrics,
compute_precision_recall_rocauc,
)


@@ -44,7 +44,7 @@ def test_accuracy_computation():

score_thresholds = np.array([0.25, 0.75], dtype=np.float64)

(_, _, _, accuracy, _, _, _) = compute_metrics(
(_, _, _, accuracy, _, _, _) = compute_precision_recall_rocauc(
data=data,
label_metadata=label_metadata,
score_thresholds=score_thresholds,
Expand Down Expand Up @@ -75,15 +75,23 @@ def test_accuracy_basic(basic_classifications: list[Classification]):
"missing_prediction_labels": [],
}

metrics = evaluator.evaluate(score_thresholds=[0.25, 0.75], as_dict=True)
metrics = evaluator.evaluate(score_thresholds=[0.25, 0.75])

actual_metrics = [m for m in metrics[MetricType.Accuracy]]
actual_metrics = [m.to_dict() for m in metrics[MetricType.Accuracy]]
expected_metrics = [
{
"type": "Accuracy",
"value": [2 / 3, 1 / 3],
"value": 2 / 3,
"parameters": {
"score_thresholds": [0.25, 0.75],
"score_threshold": 0.25,
"hardmax": True,
},
},
{
"type": "Accuracy",
"value": 1 / 3,
"parameters": {
"score_threshold": 0.75,
"hardmax": True,
},
},
@@ -102,15 +110,15 @@ def test_accuracy_with_animal_example(
loader.add_data(classifications_animal_example)
evaluator = loader.finalize()

metrics = evaluator.evaluate(score_thresholds=[0.5], as_dict=True)
metrics = evaluator.evaluate(score_thresholds=[0.5])

actual_metrics = [m for m in metrics[MetricType.Accuracy]]
actual_metrics = [m.to_dict() for m in metrics[MetricType.Accuracy]]
expected_metrics = [
{
"type": "Accuracy",
"value": [2.0 / 6.0],
"value": 2 / 6,
"parameters": {
"score_thresholds": [0.5],
"score_threshold": 0.5,
"hardmax": True,
},
},
@@ -129,15 +137,15 @@ def test_accuracy_color_example(
loader.add_data(classifications_color_example)
evaluator = loader.finalize()

metrics = evaluator.evaluate(score_thresholds=[0.5], as_dict=True)
metrics = evaluator.evaluate(score_thresholds=[0.5])

actual_metrics = [m for m in metrics[MetricType.Accuracy]]
actual_metrics = [m.to_dict() for m in metrics[MetricType.Accuracy]]
expected_metrics = [
{
"type": "Accuracy",
"value": [2 / 6],
"value": 2 / 6,
"parameters": {
"score_thresholds": [0.5],
"score_threshold": 0.5,
"hardmax": True,
},
},
@@ -164,15 +172,15 @@ def test_accuracy_with_image_example(
"missing_prediction_labels": [],
}

metrics = evaluator.evaluate(as_dict=True)
metrics = evaluator.evaluate()

actual_metrics = [m for m in metrics[MetricType.Accuracy]]
actual_metrics = [m.to_dict() for m in metrics[MetricType.Accuracy]]
expected_metrics = [
{
"type": "Accuracy",
"value": [0.5],
"value": 0.5,
"parameters": {
"score_thresholds": [0.0],
"score_threshold": 0.0,
"hardmax": True,
},
},
@@ -199,15 +207,15 @@ def test_accuracy_with_tabular_example(
"missing_prediction_labels": [],
}

metrics = evaluator.evaluate(as_dict=True)
metrics = evaluator.evaluate()

actual_metrics = [m for m in metrics[MetricType.Accuracy]]
actual_metrics = [m.to_dict() for m in metrics[MetricType.Accuracy]]
expected_metrics = [
{
"type": "Accuracy",
"value": [5 / 10],
"value": 0.5,
"parameters": {
"score_thresholds": [0.0],
"score_threshold": 0.0,
"hardmax": True,
},
},
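The accuracy expectations above capture the shape change: a single Accuracy entry with list-valued "value" and "score_thresholds" becomes one entry per threshold with a scalar "value" and a single "score_threshold". A self-contained sketch of the comparison pattern these tests use, with the values from test_accuracy_basic:

# Old serialization (single entry, list-valued):
#   {"type": "Accuracy", "value": [2 / 3, 1 / 3],
#    "parameters": {"score_thresholds": [0.25, 0.75], "hardmax": True}}
# New serialization: one dict per score threshold.
actual_metrics = [
    {
        "type": "Accuracy",
        "value": 2 / 3,
        "parameters": {"score_threshold": 0.25, "hardmax": True},
    },
    {
        "type": "Accuracy",
        "value": 1 / 3,
        "parameters": {"score_threshold": 0.75, "hardmax": True},
    },
]
expected_metrics = list(actual_metrics)  # stand-in for the hand-written expectations
for m in actual_metrics:
    assert m in expected_metrics
for m in expected_metrics:
    assert m in actual_metrics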
59 changes: 44 additions & 15 deletions lite/tests/classification/test_confusion_matrix.py
@@ -117,9 +117,9 @@ def test_confusion_matrix_basic(basic_classifications: list[Classification]):
actual_metrics = evaluator.compute_confusion_matrix(
score_thresholds=[0.25, 0.75],
number_of_examples=1,
as_dict=True,
)

actual_metrics = [m.to_dict() for m in actual_metrics]
expected_metrics = [
{
"type": "ConfusionMatrix",
@@ -146,7 +146,10 @@ def test_confusion_matrix_basic(basic_classifications: list[Classification]):
},
"missing_predictions": {},
},
"parameters": {"score_threshold": 0.25},
"parameters": {
"score_threshold": 0.25,
"maximum_number_of_examples": 1,
},
},
{
"type": "ConfusionMatrix",
@@ -167,7 +170,10 @@ def test_confusion_matrix_basic(basic_classifications: list[Classification]):
"3": {"count": 1, "examples": [{"datum": "uid2"}]}
},
},
"parameters": {"score_threshold": 0.75},
"parameters": {
"score_threshold": 0.75,
"maximum_number_of_examples": 1,
},
},
]
for m in actual_metrics:
@@ -190,9 +196,9 @@ def test_confusion_matrix_unit(

actual_metrics = evaluator.compute_confusion_matrix(
score_thresholds=[0.5],
as_dict=True,
)

actual_metrics = [m.to_dict() for m in actual_metrics]
expected_metrics = [
{
"type": "ConfusionMatrix",
@@ -208,7 +214,10 @@ def test_confusion_matrix_unit(
},
"missing_predictions": {},
},
"parameters": {"score_threshold": 0.5},
"parameters": {
"score_threshold": 0.5,
"maximum_number_of_examples": 0,
},
},
]
for m in actual_metrics:
@@ -232,9 +241,9 @@ def test_confusion_matrix_with_animal_example(
actual_metrics = evaluator.compute_confusion_matrix(
score_thresholds=[0.5],
number_of_examples=6,
as_dict=True,
)

actual_metrics = [m.to_dict() for m in actual_metrics]
expected_metrics = [
{
"type": "ConfusionMatrix",
@@ -277,7 +286,10 @@ def test_confusion_matrix_with_animal_example(
"dog": {"count": 1, "examples": [{"datum": "uid5"}]}
},
},
"parameters": {"score_threshold": 0.5},
"parameters": {
"score_threshold": 0.5,
"maximum_number_of_examples": 6,
},
},
]
for m in actual_metrics:
@@ -301,9 +313,9 @@ def test_confusion_matrix_with_color_example(
actual_metrics = evaluator.compute_confusion_matrix(
score_thresholds=[0.5],
number_of_examples=6,
as_dict=True,
)

actual_metrics = [m.to_dict() for m in actual_metrics]
expected_metrics = [
{
"type": "ConfusionMatrix",
@@ -348,7 +360,10 @@ def test_confusion_matrix_with_color_example(
"red": {"count": 1, "examples": [{"datum": "uid2"}]}
},
},
"parameters": {"score_threshold": 0.5},
"parameters": {
"score_threshold": 0.5,
"maximum_number_of_examples": 6,
},
},
]
for m in actual_metrics:
@@ -380,9 +395,9 @@ def test_confusion_matrix_multiclass(
actual_metrics = evaluator.compute_confusion_matrix(
score_thresholds=[0.05, 0.5, 0.85],
number_of_examples=5,
as_dict=True,
)

actual_metrics = [m.to_dict() for m in actual_metrics]
expected_metrics = [
{
"type": "ConfusionMatrix",
@@ -427,6 +442,7 @@ def test_confusion_matrix_multiclass(
},
"parameters": {
"score_threshold": 0.05,
"maximum_number_of_examples": 5,
},
},
{
@@ -458,7 +474,10 @@ def test_confusion_matrix_multiclass(
"bee": {"count": 1, "examples": [{"datum": "uid1"}]},
},
},
"parameters": {"score_threshold": 0.5},
"parameters": {
"score_threshold": 0.5,
"maximum_number_of_examples": 5,
},
},
{
"type": "ConfusionMatrix",
@@ -478,6 +497,7 @@ def test_confusion_matrix_multiclass(
},
"parameters": {
"score_threshold": 0.85,
"maximum_number_of_examples": 5,
},
},
]
@@ -511,9 +531,9 @@ def test_confusion_matrix_without_hardmax_animal_example(
score_thresholds=[0.05, 0.4, 0.5],
number_of_examples=6,
hardmax=False,
as_dict=True,
)

actual_metrics = [m.to_dict() for m in actual_metrics]
expected_metrics = [
{
"type": "ConfusionMatrix",
@@ -542,7 +562,10 @@ def test_confusion_matrix_without_hardmax_animal_example(
},
"missing_predictions": {},
},
"parameters": {"score_threshold": 0.05},
"parameters": {
"score_threshold": 0.05,
"maximum_number_of_examples": 6,
},
},
{
"type": "ConfusionMatrix",
@@ -559,7 +582,10 @@ def test_confusion_matrix_without_hardmax_animal_example(
},
"missing_predictions": {},
},
"parameters": {"score_threshold": 0.4},
"parameters": {
"score_threshold": 0.4,
"maximum_number_of_examples": 6,
},
},
{
"type": "ConfusionMatrix",
@@ -576,7 +602,10 @@ def test_confusion_matrix_without_hardmax_animal_example(
}
},
},
"parameters": {"score_threshold": 0.5},
"parameters": {
"score_threshold": 0.5,
"maximum_number_of_examples": 6,
},
},
]
for m in actual_metrics:
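The confusion-matrix tests follow the same convention: as_dict=True is gone, results serialize via .to_dict(), and the requested example cap is echoed back as maximum_number_of_examples in the parameters. A minimal sketch under the same assumptions as the accuracy sketch near the top of this page (the import path and Classification fields are not shown in this diff):

from valor_lite.classification import Classification, DataLoader  # assumed import path

loader = DataLoader()
loader.add_data(
    [
        Classification(  # hypothetical datum; field names are assumptions
            uid="uid0",
            groundtruth="dog",
            predictions=["dog", "cat"],
            scores=[0.8, 0.2],
        )
    ]
)
evaluator = loader.finalize()

matrices = evaluator.compute_confusion_matrix(
    score_thresholds=[0.5],
    number_of_examples=6,  # keep at most 6 example datums per matrix cell
)
for m in matrices:
    print(m.to_dict()["parameters"])
    # e.g. {"score_threshold": 0.5, "maximum_number_of_examples": 6}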