Commit

initial checkin
Breck Baldwin committed Aug 19, 2024
1 parent 5c413c0 commit aa21eb7
Showing 5 changed files with 656 additions and 0 deletions.
234 changes: 234 additions & 0 deletions calculate_statistics.py
@@ -0,0 +1,234 @@
import os
import sys

import numpy as np
import pandas as pd

from helper_functions import parse_file_name

pd.options.display.float_format = "{:.6f}".format

"""Runs scoring. Example call:
`python calculate_statistics.py release_data/few_shot fewshot_scores`
Writes to
`fewshot_scores/detail_accuracy.csv`: Parsed outputs for each model/task/run
`fewshot_scores/low_med_high.csv` Accuracy, low, median, high
`fewshot_scores/TARa.csv` TARa
`fewshot_scores/TARr.csv` TARr
Note that mixtral-8x7b-instruct_bbh_navigate_2.csv and mixtral-8x7b-instruct_bbh_navigate_3.csv had the last row removed to match the
counts of runs 0, 1 and 4.
"""


def check_element_wise_equality(lists):
    """Count the positions at which every run in `lists` produced the same value."""
    counter = 0
    for n_tuple in zip(*lists, strict=True):
        # A position counts only if all runs agree on it.
        if all(value == n_tuple[0] for value in n_tuple):
            counter += 1
    return counter
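
# A minimal sanity check on hypothetical toy data (not part of the original script):
# check_element_wise_equality([["(a)", "yes"], ["(a)", "no"]]) returns 1, since
# position 0 agrees across both runs and position 1 does not.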


folder_path = sys.argv[1]
dest_folder = sys.argv[2]
if not os.path.exists(dest_folder):
print(f"making dir {dest_folder}")
os.makedirs(dest_folder)

all_tasks = set()
model_list = set()
for file_name in os.listdir(folder_path):
if not file_name.endswith(".csv"):
continue
model_name, task, experiment_run = parse_file_name(file_name)
all_tasks.add(task)
model_list.add(model_name)

model_list = list(model_list)
results = {}
results_intersection = {}
results_raw_responses = {}
aggregated_results = {}

for model_name in model_list:
results[model_name] = {}
results_intersection[model_name] = {}
results_raw_responses[model_name] = {}
aggregated_results[model_name] = {}

last_run_name = ""
truth_count = -1
last_run_questions = []
for file_name in os.listdir(folder_path):
if not file_name.endswith(".csv"):
continue
model_name, task, experiment_run = parse_file_name(file_name)
run_name = f"{model_name} {task}"
dump_file = pd.read_csv(f"{folder_path}/{file_name}")
    dump_file = dump_file.fillna("")
targets = dump_file["gt"]
if run_name != last_run_name: # reset data checks with change of run_name
last_run_name = run_name
truth_count = len(targets)
last_run_questions = []
if len(targets) != truth_count:
raise RuntimeError(
f"Truth counts has changed, had {truth_count}, got {len(targets)} for {file_name}"
)
    # Two passes of prediction are available; we look at both.
    # "new_extracted_pred" comes from running `postprocess_responses.py`.
preds_1 = dump_file["pred"].apply(lambda x: x.lower().strip().replace(".", ""))
preds_2 = dump_file.get(
"new_extracted_pred", pd.Series(data=[""] * len(targets))
).apply(lambda x: x.lower().strip().replace(".", ""))
truths = dump_file["gt"].apply(lambda x: x.lower().strip())
    # Note: accepting a match from either prediction pass is loose; bugs can slip through here.
correct = 0
total = 0
preds_1_counter = 0
used_preds = []
for i, truth in enumerate(truths):
if "question" in dump_file: # test that questions match if there
question = dump_file["question"].iloc[i]
if i == len(last_run_questions):
last_run_questions.append(question)
elif question != last_run_questions[i]:
raise RuntimeError(
f"Question does not match other runs {i} in {file_name}"
)
if truth == preds_2.iloc[i]:
correct += 1
used_preds.append(preds_1.iloc[i])
elif truth == preds_1.iloc[i]:
correct += 1
preds_1_counter += 1
used_preds.append(preds_2.iloc[i])
else:
used_preds.append(
f"Wrong pred1: {preds_1.iloc[i]} .... pred2: {preds_2.iloc[i]}"
)
total += 1
acc = correct / total
if preds_1_counter > 0:
print(f"** Preds 1 {preds_1_counter}/{total} {file_name}")

    # Accumulate per-run results keyed by model and task.
    results[model_name].setdefault(task, []).append(100 * acc)
    results_intersection[model_name].setdefault(task, []).append(used_preds)
    results_raw_responses[model_name].setdefault(task, []).append(
        [response.lower().strip() for response in dump_file["raw_response"]]
    )
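
# At this point each accumulator holds, per model and task, one entry per run:
# results[model][task] is a list of accuracies (in percent),
# results_intersection[model][task] a list of per-question extracted answers, and
# results_raw_responses[model][task] a list of normalized raw responses.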

# ACCURACY FOR EACH FILE DUMP
tasks = []
runs = []
models = []
accs = []

for model_name in results:
for task in results[model_name]:
tasks.extend([task] * len(results[model_name][task]))
runs.extend(list(range(len(results[model_name][task]))))
models.extend([model_name] * len(results[model_name][task]))
accs.extend(results[model_name][task])


pd.DataFrame({"model": models, "run": runs, "task": tasks, "accuracy": accs}).to_csv(
os.path.join(dest_folder, "detail_accuracy.csv")
)

# Calculate low, median, and high accuracy for each model/task.
for model_name in results:
for task in results[model_name]:
median = np.median(results[model_name][task])
low = np.min(results[model_name][task])
high = np.max(results[model_name][task])
aggregated_results[model_name][task] = (low, median, high)

# NOW CREATE A DATAFRAME
tasks = []
low_results = {}
median_results = {}
high_results = {}
intersection_percentages = {}
raw_intersection_percentages = {}

for model_name in model_list:
    low_results[model_name] = []
    median_results[model_name] = []
    high_results[model_name] = []
    intersection_percentages[f"{model_name}_TARa"] = []
    raw_intersection_percentages[f"{model_name}_TARr"] = []


for task in all_tasks:
    tasks.append(task)
    for model in model_list:
        low_results[model].append(aggregated_results[model][task][0])
        median_results[model].append(aggregated_results[model][task][1])
        high_results[model].append(aggregated_results[model][task][2])

df_dict = {"task": tasks}
for model in model_list:
    df_dict[f"{model}_low"] = low_results[model]
    df_dict[f"{model}_median"] = median_results[model]
    df_dict[f"{model}_high"] = high_results[model]


results_df = pd.DataFrame(df_dict)
results_df.to_csv(os.path.join(dest_folder, "low_med_high.csv"))

# Calculate intersection results
tasks = []

for task in all_tasks:
    tasks.append(task)
    for model in model_list:
        lists = results_intersection[model][task]
        total_intersection_count = check_element_wise_equality(lists)
        intersection_percentages[f"{model}_TARa"].append(
            100 * total_intersection_count / len(lists[0])
        )
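
# TARa is thus the percentage of questions on which every run of a model produced the
# same extracted answer for a task: 100 * (positions where all runs agree) / (questions).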

answ_TAR_df = pd.DataFrame({"task": tasks, **intersection_percentages})
answ_TAR_df = answ_TAR_df.reindex(sorted(answ_TAR_df.columns), axis=1)
answ_TAR_df.to_csv(os.path.join(dest_folder, "TARa.csv"))


# Calculate exact match intersection
tasks = []

for task in all_tasks:
tasks.append(task)
    for model in model_list:
lists = results_raw_responses[model][task]
total_intersection_count = check_element_wise_equality(lists)
raw_intersection_percentages[f"{model}_TARr"].append(
100 * total_intersection_count / len(lists[0])
)

raw_TAR_df = pd.DataFrame({"task": tasks, **raw_intersection_percentages})
raw_TAR_df = raw_TAR_df.reindex(sorted(raw_TAR_df.columns), axis=1)
raw_TAR_df.to_csv(os.path.join(dest_folder, "TARr.csv"))
88 changes: 88 additions & 0 deletions main.py
@@ -0,0 +1,88 @@
import argparse
import re

import pandas as pd
from datasets import load_dataset
from tqdm import tqdm

from helper_functions import (
    get_fewshot_examples,
    convert_mmlu_data,
    configure_model,
    set_seed,
    discard_text_after_answer,
    make_inference,
)

"""Top level main function to run LLMs."""

parser = argparse.ArgumentParser(
    description="Run an LLM over a BBH or MMLU task and write its predictions to CSV."
)
parser.add_argument("--task", type=str, help="task name, e.g. bbh_navigate or mmlu_<subset>")
parser.add_argument("--model", type=str, help="model name, e.g. llama-3-70b")
parser.add_argument("--num_fewshot", type=int, help="number of few-shot examples to prepend")
parser.add_argument("--experiment_run", type=int, help="run index used in the output file name")

args = parser.parse_args()
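
# A hypothetical example invocation (argument values are illustrative only):
#   python main.py --task bbh_navigate --model llama-3-70b --num_fewshot 3 --experiment_run 0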
set_seed(123)

if "bbh" in args.task:
subset_name = args.task.split("_")
subset_name = "_".join(subset_name[1:])
test_data = load_dataset("lukaemon/bbh", name=subset_name, split="test")
elif "mmlu" in args.task:
subset_name = args.task.split("_")
subset_name = "_".join(subset_name[1:])
test_data = load_dataset("cais/mmlu", name=subset_name, split="test")
test_data = convert_mmlu_data(test_data)
else:
    raise ValueError(f"{args.task} is not supported")

if "llama-3-70b" == args.model:
model_name = "databricks-meta-llama-3-70b-instruct"
elif "mixtral-8x7b-instruct" == args.model:
model_name = "databricks-mixtral-8x7b-instruct"
elif "finetuned-3.5" == args.model:
model_name = "mmlu"
else:
model_name = args.model

model_client = configure_model(args.model)
few_shot_data = get_fewshot_examples(subset_name)
tokenizer = None

raw_responses = []
predictions = []
ground_truths = []
need_check = False

raw_inputs = []

for example in tqdm(test_data):
few_shot_text = ""
for ex in few_shot_data["samples"][: args.num_fewshot]:
few_shot_text = few_shot_text + ex["input"] + f"A:{ex['target']}\n"
message = f"{few_shot_text} {example['input']} A:"
raw_response = make_inference(model_client, model_name, message)
raw_inputs.append(message)
ground_truths.append(example["target"])
raw_responses.append(raw_response)
    try:
        # First pass: take whatever token follows "answer is " in the response.
        pred = re.findall(r"(?<=answer is )(\S*)", raw_response)[0]
        pred = discard_text_after_answer(pred)
    except Exception:
        try:
            # Second pass: fall back to a bare "(A)"-style option following " is ".
            pred = re.findall(r"(?<= is )(\([ABCDabcd]\))(?=.)", raw_response)[0]
            pred = discard_text_after_answer(pred)
        except Exception:
            need_check = True
            print("CHECK the Raw responses!!!")
            pred = raw_response
    predictions.append(pred)
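
# Extraction sketch on a hypothetical response (discard_text_after_answer is assumed to
# trim trailing text): for a response ending in "... the answer is (A).", the first regex
# captures "(A)." and the cleaned prediction is "(A)"; downstream scoring lowercases it
# and strips "." before comparing against the ground truth.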


pd.DataFrame(
{
"pred": predictions,
"gt": ground_truths,
"raw_response": raw_responses,
"raw_input": raw_inputs,
}
).to_csv(f"{args.model}_{args.task}_{args.experiment_seed}.csv")
