import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import precision_recall_fscore_support, accuracy_score

# Candidate thresholds spanning the range of the ID energy scores
thresholds = np.linspace(id_energy_scores.min(), id_energy_scores.max(), 50)

# Store evaluation metrics for each threshold
accuracies = []
precisions = []
recalls = []
f1_scores = []

# True labels for the OOD data: -1 marks "out-of-distribution"
# (OOD samples have no label in the original label set)
ood_true_labels = np.full(len(ood_energy_scores), -1)

# Align the test labels with the ID data
id_true_labels = test_labels[:len(id_energy_scores)]

for threshold in thresholds:
    # OOD examples: energy >= threshold -> flagged as OOD (-1);
    # otherwise (incorrectly) accepted as ID and given the placeholder label 0
    ood_classifications = np.where(ood_energy_scores >= threshold, -1, 0)

    # ID examples: energy >= threshold -> flagged as OOD (-1);
    # otherwise accepted as ID with their true labels, so only the
    # OOD decision (not the underlying classifier) is being evaluated
    id_classifications = np.where(id_energy_scores >= threshold, -1, id_true_labels)

    # Combine OOD and ID predictions and true labels
    all_predictions = np.concatenate([ood_classifications, id_classifications])
    all_true_labels = np.concatenate([ood_true_labels, id_true_labels])

    # Macro-average precision/recall/F1 over the ID classes (0 and 1).
    # The OOD label (-1) is excluded from the average, but OOD errors still
    # lower these metrics through the misclassifications they cause.
    precision, recall, f1, _ = precision_recall_fscore_support(
        all_true_labels, all_predictions, labels=[0, 1], average='macro', zero_division=0)
    accuracy = accuracy_score(all_true_labels, all_predictions)

    accuracies.append(accuracy)
    precisions.append(precision)
    recalls.append(recall)
    f1_scores.append(f1)

# Find the best threshold for each metric
best_f1_index = np.argmax(f1_scores)
best_f1_threshold = thresholds[best_f1_index]

best_precision_index = np.argmax(precisions)
best_precision_threshold = thresholds[best_precision_index]

best_recall_index = np.argmax(recalls)
best_recall_threshold = thresholds[best_recall_index]

print(f"Best F1 threshold: {best_f1_threshold}, F1 Score: {f1_scores[best_f1_index]}")
print(f"Best Precision threshold: {best_precision_threshold}, Precision: {precisions[best_precision_index]}")
print(f"Best Recall threshold: {best_recall_threshold}, Recall: {recalls[best_recall_index]}")

# Plot metrics as functions of the threshold
plt.figure(figsize=(12, 8))
plt.plot(thresholds, precisions, label='Precision', color='g')
plt.plot(thresholds, recalls, label='Recall', color='b')
plt.plot(thresholds, f1_scores, label='F1 Score', color='r')

# Mark the best threshold for each metric
plt.axvline(x=best_f1_threshold, color='r', linestyle='--', label=f'Best F1 Threshold: {best_f1_threshold:.2f}')
plt.axvline(x=best_precision_threshold, color='g', linestyle='--', label=f'Best Precision Threshold: {best_precision_threshold:.2f}')
plt.axvline(x=best_recall_threshold, color='b', linestyle='--', label=f'Best Recall Threshold: {best_recall_threshold:.2f}')

plt.xlabel('Threshold')
plt.ylabel('Metric Value')
plt.title('Evaluation Metrics as Functions of Threshold (Energy-Based OOD Detection)')
plt.legend()
plt.show()

import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

# Threshold for the energy score: reuse the best F1 threshold found above
upper_threshold = best_f1_threshold

# OOD examples: energy >= threshold -> flagged as OOD (-1);
# otherwise (incorrectly) accepted as ID with the placeholder label 0
ood_classifications = np.where(ood_energy_scores >= upper_threshold, -1, 0)

# ID examples: energy >= threshold -> flagged as OOD (-1);
# otherwise accepted as ID with their true labels
id_classifications = np.where(id_energy_scores >= upper_threshold, -1, id_true_labels)

# Combine OOD and ID predictions and true labels
all_predictions = np.concatenate([ood_classifications, id_classifications])
all_true_labels = np.concatenate([ood_true_labels, id_true_labels])

# Confusion matrix over the two ID classes and the OOD class
cm = confusion_matrix(all_true_labels, all_predictions, labels=[0, 1, -1])

# Plot the confusion matrix
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=["Shirt", "Pants", "OOD"])
disp.plot(cmap=plt.cm.Blues)
plt.title('Confusion Matrix for OOD and ID Classification (Energy-Based)')
plt.show()
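
Both snippets above assume that id_energy_scores and ood_energy_scores are
already available. As a point of reference, the following is a minimal sketch
of how such energy scores could be computed from a classifier's logits; the
arrays id_logits and ood_logits (shape (n_samples, n_classes)) are hypothetical
and are not defined anywhere in this document.

import numpy as np
from scipy.special import logsumexp

def energy_score(logits, temperature=1.0):
    # Energy score: E(x) = -T * log(sum_k exp(f_k(x) / T)).
    # Higher energy indicates a less confident, more OOD-like input,
    # which matches the "energy >= threshold -> OOD" rule used above.
    logits = np.asarray(logits, dtype=float)
    return -temperature * logsumexp(logits / temperature, axis=-1)

# Hypothetical usage with logits produced by the trained classifier:
# id_energy_scores = energy_score(id_logits)
# ood_energy_scores = energy_score(ood_logits)
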
ID/OOD: In-distribution, out-of-distribution. Generally, OOD instances can be
defined as instances (x, y) sampled from an underlying distribution other than
the training distribution P(X_train, Y_train), where X_train and Y_train are
the training corpus and the training label set, respectively.

OOD instances with semantic shift: OOD instances with semantic shift are
instances whose labels do not belong to y_train. More specifically, instances
with semantic shift may come from unknown categories or irrelevant tasks.

OOD instances with covariate shift: OOD instances with covariate shift (also
called non-semantic shift) are instances whose labels belong to y_train but
whose inputs are sampled from a distribution other than that of x_train, e.g.,
a different domain/corpus/location.

Closed-world assumption: the assumption that the training and test data are
sampled from the same distribution. However, training data can rarely capture
the entire distribution, and in real-world scenarios out-of-distribution (OOD)
instances, which come from categories unknown to the model, are often present
at inference time.

Inference-time OOD detection: after training, a scoring function is applied to
test inputs to decide whether each one is OOD.
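
This is the pattern already used in the code above: compute a score for each
input, then compare it to a threshold. A minimal, generic form of the decision
rule (assuming higher scores indicate more OOD-like inputs) might look like:

import numpy as np

def flag_ood(scores, threshold):
    # Generic inference-time rule: an input is flagged as OOD when its
    # score (e.g., an energy score) is at or above the chosen threshold.
    return np.asarray(scores) >= threshold

# Hypothetical usage: boolean OOD mask for a batch of scores.
# ood_mask = flag_ood(test_scores, best_f1_threshold)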

Output-based OOD: Output-based OOD detection methods leverage the model's
output distribution to identify OOD instances. These methods typically involve
analyzing the softmax scores, confidence scores, or other output statistics to
detect anomalies.
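
As a concrete output-based example, the maximum softmax probability (MSP)
baseline scores an input by the confidence of its predicted class. A rough
sketch follows; the test_logits array and the 0.9 cut-off are illustrative
assumptions, not values taken from this document.

import numpy as np
from scipy.special import softmax

def msp_score(logits):
    # Maximum softmax probability: confidence of the predicted class.
    # Lower values suggest the input may be OOD.
    return softmax(np.asarray(logits, dtype=float), axis=-1).max(axis=-1)

# Hypothetical usage: flag inputs whose top softmax probability falls
# below a confidence threshold chosen on held-out validation data.
# is_ood = msp_score(test_logits) < 0.9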