# utils.py

from zipfile import ZipFile
import os 
import hashlib
from sklearn.metrics import accuracy_score, hamming_loss, confusion_matrix, recall_score, precision_score, roc_auc_score, roc_curve
from mlflow import log_metric
import numpy as np

def rename_files_to_sha256(path):
    """
        Rename every regular file directly inside `path` to the SHA256
        hex digest of its contents.

        Entries that are not regular files (e.g. sub-directories) are left
        untouched. Each digest is printed as it is computed.

        Args:
            path: directory whose files are renamed in place.

        Raises:
            OSError: if the directory cannot be listed, or a file cannot
                be read or renamed.
    """
    # os.listdir() returns a snapshot, so renaming while looping is safe.
    for filename in os.listdir(path):
        src = os.path.join(path, filename)

        # The listing also contains sub-directories; open() would fail on them.
        if not os.path.isfile(src):
            continue

        digest = hashlib.sha256()
        with open(src, "rb") as f:
            # Hash in fixed-size chunks so large binaries never sit fully in memory.
            for chunk in iter(lambda: f.read(1024 * 1024), b""):
                digest.update(chunk)

        readable_hash = digest.hexdigest()
        print(readable_hash)

        # Rename only after the handle is closed: renaming a file that is
        # still open fails on Windows. Skip files that already carry their hash.
        if filename != readable_hash:
            os.rename(src, os.path.join(path, readable_hash))
    

def compress_files(file_list):
    """
        Take a list of file paths and write them all into a zip archive
        named `test.zip` in the current working directory (an existing
        archive with that name is overwritten).

        Files that cannot be added are reported on stdout and skipped, so
        one bad entry does not abort the whole archive.

        NOTE(review): an earlier version of this docstring said the file
        path is removed, but no `arcname` is passed to ZipFile.write(), so
        entries are stored under the path they were given. Confirm which
        behaviour is intended before relying on a flat archive.

        Args:
            file_list: iterable of paths of the files to archive.

        Returns:
            None. The result is the `test.zip` file on disk.
    """
    with ZipFile('test.zip', mode='w') as zf:
        for f in file_list:
            try:
                zf.write(f)
            except FileNotFoundError:
                print(f"{f} does not exist.")
            except (OSError, ValueError) as err:
                # Still best-effort for other I/O problems, but say what
                # actually went wrong instead of claiming the file is missing.
                print(f"{f} could not be added: {err}")
            

def get_fpr(y_true, y_pred):
    """
        Compute the false positive rate, FP / (TN + FP), from the true and
        predicted labels.

        The counts are read from scikit-learn's confusion_matrix(): the
        flattened matrix is unpacked into four cells, of which the first
        two are used as TN and FP. The unpacking requires exactly four
        cells, i.e. a 2x2 matrix.
    """
    matrix = confusion_matrix(y_true, y_pred)
    true_neg, false_pos, _false_neg, _true_pos = matrix.ravel()
    negatives = true_neg + false_pos
    return false_pos / negatives

def get_tpr_at_fpr(y_true, y_pred, target_fpr):
    """
    Return the TPR of the model at the requested FPR level.

    The ROC curve is built with roc_curve() from the true labels and the
    model's probabilities (`y_pred`), and the TPR at `target_fpr` is read
    off that curve by linear interpolation between its points.
    """
    roc_fpr, roc_tpr, _thresholds = roc_curve(y_true, y_pred)
    tpr_at_target = np.interp(target_fpr, roc_fpr, roc_tpr)
    return tpr_at_target


def find_threshold(y_true, y_pred, fpr_target):
    """
    Return the decision threshold that corresponds to the requested FPR.

    The ROC curve is built with roc_curve() from the true labels and the
    model's probabilities (`y_pred`); the threshold at `fpr_target` is
    obtained by linear interpolation between the curve's
    (FPR, threshold) points.

    NOTE(review): the first threshold returned by roc_curve() is a
    sentinel rather than an observed score - confirm the interpolation
    behaves as intended for very small `fpr_target` values.
    """
    roc_fpr, _roc_tpr, roc_thresholds = roc_curve(y_true, y_pred)
    threshold = np.interp(fpr_target, roc_fpr, roc_thresholds)
    return threshold


def init_scores():
    """
    Create the empty score table: a dict holding one fresh, independent
    list per tracked metric name.
    """
    metric_names = (
        "acc",
        "agg",
        "fpr",
        "rec",
        "pres",
        "auc",
        "confs",
        "threshold",
        # "rocs",
        "nums",
    )
    return {name: [] for name in metric_names}


def log_and_score(y_proba, y_pred_target, y_test, scores, fpr_target, logging=True):
    """
    Threshold the model's probabilities at a target FPR, compute the
    evaluation metrics, print them, append them to `scores` and optionally
    log them to MLflow.

    Args:
        y_proba: probabilities/scores produced by the model, one per sample.
        y_pred_target: labels the thresholded predictions are compared with
            to obtain the agreement score (1 - Hamming loss).
        y_test: true labels.
        scores: dict of lists with the keys created by init_scores();
            it is mutated in place.
        fpr_target: FPR level used by find_threshold() to pick the
            decision threshold.
        logging: when True, every metric is also sent to MLflow with
            log_metric(), using the last entry of scores["nums"] as step.

    Returns:
        The same `scores` dict, with one new entry appended to each metric.

    Raises:
        IndexError: if `logging` is True and scores["nums"] is empty.
    """
    thresh = find_threshold(y_test, y_proba, fpr_target)
    y_pred = [int(p > thresh) for p in y_proba]

    test_score = accuracy_score(y_test, y_pred)
    agg_score = 1.0 - hamming_loss(y_pred_target, y_pred)
    fpr_score = get_fpr(y_test, y_pred)
    rec_score = recall_score(y_test, y_pred)
    pres_score = precision_score(y_test, y_pred)
    # AUC is a ranking metric: it must be computed from the continuous
    # scores, not from the already-thresholded 0/1 predictions.
    auc_score = roc_auc_score(y_test, y_proba)
    conf_mat = confusion_matrix(y_test, y_pred)

    print(f"Threshold for target FPR {fpr_target}: {thresh}")
    print("Accuracy score:", test_score)
    print("Agreement:", agg_score)
    print("FPR:", fpr_score)
    print("Recall:", rec_score)
    print("Precision:", pres_score)
    print("AUC:", auc_score)
    print("Confusion matrix:", conf_mat)

    scores["acc"].append(test_score)
    scores["agg"].append(agg_score)
    scores["fpr"].append(fpr_score)
    scores["rec"].append(rec_score)
    scores["pres"].append(pres_score)
    scores["auc"].append(auc_score)
    scores["threshold"].append(thresh)
    scores["confs"].append(conf_mat)

    if logging:
        # The most recent sample count serves as the MLflow step.
        num_samples = scores["nums"][-1]
        log_metric("accuracy", test_score, step=num_samples)
        log_metric("agreement", agg_score, step=num_samples)
        log_metric("FPR", fpr_score, step=num_samples)
        log_metric("Recall", rec_score, step=num_samples)
        log_metric("Precision", pres_score, step=num_samples)
        log_metric("AUC", auc_score, step=num_samples)
        log_metric("Threshold", thresh, step=num_samples)

    return scores