Commit

initial checkin
Breck Baldwin committed Aug 19, 2024
1 parent 5c413c0 commit aa21eb7
Showing 5 changed files with 656 additions and 0 deletions.
234 changes: 234 additions & 0 deletions calculate_statistics.py
@@ -0,0 +1,234 @@
import os
import sys

import numpy as np
import pandas as pd

from helper_functions import parse_file_name

pd.options.display.float_format = "{:.6f}".format

"""Runs scoring. Example call:
`python calculate_statistics.py release_data/few_shot fewshot_scores`
Writes to
`fewshot_scores/detail_accuracy.csv`: Parsed outputs for each model/task/run
`fewshot_scores/low_med_high.csv` Accuracy, low, median, high
`fewshot_scores/TARa.csv` TARa
`fewshot_scores/TARr.csv` TARr
Note that mixtral-8x7b-instruct_bbh_navigate_2.csv and mixtral-8x7b-instruct_bbh_navigate_3.csv had the last row removed to match the
counts of runs 0, 1 and 4.
"""


def check_element_wise_equality(lists):
    """Count the positions at which every run in `lists` produced the same value."""
    counter = 0
    for n_tuple in zip(*lists, strict=True):
        # A position counts only if all runs agree on it.
        if all(value == n_tuple[0] for value in n_tuple):
            counter += 1
    return counter
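
# A minimal sanity check on hypothetical toy data (not part of the original script):
# check_element_wise_equality([["(a)", "yes"], ["(a)", "no"]]) returns 1, since
# position 0 agrees across both runs and position 1 does not.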


folder_path = sys.argv[1]
dest_folder = sys.argv[2]
if not os.path.exists(dest_folder):
print(f"making dir {dest_folder}")
os.makedirs(dest_folder)

all_tasks = set()
model_list = set()
for file_name in os.listdir(folder_path):
if not file_name.endswith(".csv"):
continue
model_name, task, experiment_run = parse_file_name(file_name)
all_tasks.add(task)
model_list.add(model_name)

model_list = list(model_list)
results = {}
results_intersection = {}
results_raw_responses = {}
aggregated_results = {}

for model_name in model_list:
results[model_name] = {}
results_intersection[model_name] = {}
results_raw_responses[model_name] = {}
aggregated_results[model_name] = {}

last_run_name = ""
truth_count = -1
last_run_questions = []
for file_name in os.listdir(folder_path):
if not file_name.endswith(".csv"):
continue
model_name, task, experiment_run = parse_file_name(file_name)
run_name = f"{model_name} {task}"
dump_file = pd.read_csv(f"{folder_path}/{file_name}")
    dump_file = dump_file.fillna("")
targets = dump_file["gt"]
if run_name != last_run_name: # reset data checks with change of run_name
last_run_name = run_name
truth_count = len(targets)
last_run_questions = []
if len(targets) != truth_count:
raise RuntimeError(
f"Truth counts has changed, had {truth_count}, got {len(targets)} for {file_name}"
)
    # Two passes of prediction are available; we look at both.
    # "new_extracted_pred" comes from running `postprocess_responses.py`.
preds_1 = dump_file["pred"].apply(lambda x: x.lower().strip().replace(".", ""))
preds_2 = dump_file.get(
"new_extracted_pred", pd.Series(data=[""] * len(targets))
).apply(lambda x: x.lower().strip().replace(".", ""))
truths = dump_file["gt"].apply(lambda x: x.lower().strip())
    # Note: accepting a match from either prediction pass is loose; bugs can slip through here.
correct = 0
total = 0
preds_1_counter = 0
used_preds = []
for i, truth in enumerate(truths):
if "question" in dump_file: # test that questions match if there
question = dump_file["question"].iloc[i]
if i == len(last_run_questions):
last_run_questions.append(question)
elif question != last_run_questions[i]:
raise RuntimeError(
f"Question does not match other runs {i} in {file_name}"
)
if truth == preds_2.iloc[i]:
correct += 1
used_preds.append(preds_1.iloc[i])
elif truth == preds_1.iloc[i]:
correct += 1
preds_1_counter += 1
used_preds.append(preds_2.iloc[i])
else:
used_preds.append(
f"Wrong pred1: {preds_1.iloc[i]} .... pred2: {preds_2.iloc[i]}"
)
total += 1
acc = correct / total
if preds_1_counter > 0:
print(f"** Preds 1 {preds_1_counter}/{total} {file_name}")

    # Accumulate per-run results keyed by model and task.
    results[model_name].setdefault(task, []).append(100 * acc)
    results_intersection[model_name].setdefault(task, []).append(used_preds)
    results_raw_responses[model_name].setdefault(task, []).append(
        [response.lower().strip() for response in dump_file["raw_response"]]
    )
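
# At this point each accumulator holds, per model and task, one entry per run:
# results[model][task] is a list of accuracies (in percent),
# results_intersection[model][task] a list of per-question extracted answers, and
# results_raw_responses[model][task] a list of normalized raw responses.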

# ACCURACY FOR EACH FILE DUMP
tasks = []
runs = []
models = []
accs = []

for model_name in results:
for task in results[model_name]:
tasks.extend([task] * len(results[model_name][task]))
runs.extend(list(range(len(results[model_name][task]))))
models.extend([model_name] * len(results[model_name][task]))
accs.extend(results[model_name][task])


pd.DataFrame({"model": models, "run": runs, "task": tasks, "accuracy": accs}).to_csv(
os.path.join(dest_folder, "detail_accuracy.csv")
)

# Calculate low, median, and high accuracy for each model/task.
for model_name in results:
for task in results[model_name]:
median = np.median(results[model_name][task])
low = np.min(results[model_name][task])
high = np.max(results[model_name][task])
aggregated_results[model_name][task] = (low, median, high)

# NOW CREATE A DATAFRAME
tasks = []
low_results = {}
median_results = {}
high_results = {}
intersection_percentages = {}
raw_intersection_percentages = {}

for model_name in model_list:
    low_results[model_name] = []
    median_results[model_name] = []
    high_results[model_name] = []
    intersection_percentages[f"{model_name}_TARa"] = []
    raw_intersection_percentages[f"{model_name}_TARr"] = []


for task in all_tasks:
    tasks.append(task)
    for model in model_list:
        low_results[model].append(aggregated_results[model][task][0])
        median_results[model].append(aggregated_results[model][task][1])
        high_results[model].append(aggregated_results[model][task][2])

df_dict = {"task": tasks}
for model in model_list:
    df_dict[f"{model}_low"] = low_results[model]
    df_dict[f"{model}_median"] = median_results[model]
    df_dict[f"{model}_high"] = high_results[model]


results_df = pd.DataFrame(df_dict)
results_df.to_csv(os.path.join(dest_folder, "low_med_high.csv"))

# Calculate intersection results
tasks = []

for task in all_tasks:
    tasks.append(task)
    for model in model_list:
        lists = results_intersection[model][task]
        total_intersection_count = check_element_wise_equality(lists)
        intersection_percentages[f"{model}_TARa"].append(
            100 * total_intersection_count / len(lists[0])
        )
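
# TARa is thus the percentage of questions on which every run of a model produced the
# same extracted answer for a task: 100 * (positions where all runs agree) / (questions).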

answ_TAR_df = pd.DataFrame({"task": tasks, **intersection_percentages})
answ_TAR_df = answ_TAR_df.reindex(sorted(answ_TAR_df.columns), axis=1)
answ_TAR_df.to_csv(os.path.join(dest_folder, "TARa.csv"))


# Calculate exact match intersection
tasks = []

for task in all_tasks:
tasks.append(task)
    for model in model_list:
lists = results_raw_responses[model][task]
total_intersection_count = check_element_wise_equality(lists)
raw_intersection_percentages[f"{model}_TARr"].append(
100 * total_intersection_count / len(lists[0])
)

raw_TAR_df = pd.DataFrame({"task": tasks, **raw_intersection_percentages})
raw_TAR_df = raw_TAR_df.reindex(sorted(raw_TAR_df.columns), axis=1)
raw_TAR_df.to_csv(os.path.join(dest_folder, "TARr.csv"))
88 changes: 88 additions & 0 deletions main.py
@@ -0,0 +1,88 @@
import argparse
import re

import pandas as pd
from datasets import load_dataset
from tqdm import tqdm

from helper_functions import (
    get_fewshot_examples,
    convert_mmlu_data,
    configure_model,
    set_seed,
    discard_text_after_answer,
    make_inference,
)

"""Top level main function to run LLMs."""

parser = argparse.ArgumentParser(
    description="Run an LLM over a BBH or MMLU task and write its predictions to CSV."
)
parser.add_argument("--task", type=str, help="task name, e.g. bbh_navigate or mmlu_<subset>")
parser.add_argument("--model", type=str, help="model name, e.g. llama-3-70b")
parser.add_argument("--num_fewshot", type=int, help="number of few-shot examples to prepend")
parser.add_argument("--experiment_run", type=int, help="run index used in the output file name")

args = parser.parse_args()
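
# A hypothetical example invocation (argument values are illustrative only):
#   python main.py --task bbh_navigate --model llama-3-70b --num_fewshot 3 --experiment_run 0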
set_seed(123)

if "bbh" in args.task:
subset_name = args.task.split("_")
subset_name = "_".join(subset_name[1:])
test_data = load_dataset("lukaemon/bbh", name=subset_name, split="test")
elif "mmlu" in args.task:
subset_name = args.task.split("_")
subset_name = "_".join(subset_name[1:])
test_data = load_dataset("cais/mmlu", name=subset_name, split="test")
test_data = convert_mmlu_data(test_data)
else:
    raise ValueError(f"{args.task} is not supported")

if "llama-3-70b" == args.model:
model_name = "databricks-meta-llama-3-70b-instruct"
elif "mixtral-8x7b-instruct" == args.model:
model_name = "databricks-mixtral-8x7b-instruct"
elif "finetuned-3.5" == args.model:
model_name = "mmlu"
else:
model_name = args.model

model_client = configure_model(args.model)
few_shot_data = get_fewshot_examples(subset_name)
tokenizer = None

raw_responses = []
predictions = []
ground_truths = []
need_check = False

raw_inputs = []

for example in tqdm(test_data):
few_shot_text = ""
for ex in few_shot_data["samples"][: args.num_fewshot]:
few_shot_text = few_shot_text + ex["input"] + f"A:{ex['target']}\n"
message = f"{few_shot_text} {example['input']} A:"
raw_response = make_inference(model_client, model_name, message)
raw_inputs.append(message)
ground_truths.append(example["target"])
raw_responses.append(raw_response)
    try:
        # First pass: take whatever token follows "answer is " in the response.
        pred = re.findall(r"(?<=answer is )(\S*)", raw_response)[0]
        pred = discard_text_after_answer(pred)
    except Exception:
        try:
            # Second pass: fall back to a bare "(A)"-style option following " is ".
            pred = re.findall(r"(?<= is )(\([ABCDabcd]\))(?=.)", raw_response)[0]
            pred = discard_text_after_answer(pred)
        except Exception:
            need_check = True
            print("CHECK the Raw responses!!!")
            pred = raw_response
    predictions.append(pred)
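
# Extraction sketch on a hypothetical response (discard_text_after_answer is assumed to
# trim trailing text): for a response ending in "... the answer is (A).", the first regex
# captures "(A)." and the cleaned prediction is "(A)"; downstream scoring lowercases it
# and strips "." before comparing against the ground truth.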


pd.DataFrame(
{
"pred": predictions,
"gt": ground_truths,
"raw_response": raw_responses,
"raw_input": raw_inputs,
}
).to_csv(f"{args.model}_{args.task}_{args.experiment_seed}.csv")
