From febabdcb6b53601369a6cfe72a25964421488755 Mon Sep 17 00:00:00 2001 From: Jathurshan0330 Date: Mon, 4 Nov 2024 12:19:07 -0600 Subject: [PATCH] completed --- arrange_labels.py | 27 +++++++++++++++++++ clinical_trial_linkage/match_fda_approvals.py | 3 ++- .../clean_and_extract_final_outcomes.py | 8 ++++++ pipeline.sh | 13 +++++---- 4 files changed, 45 insertions(+), 6 deletions(-) create mode 100644 arrange_labels.py diff --git a/arrange_labels.py b/arrange_labels.py new file mode 100644 index 0000000..d512d0a --- /dev/null +++ b/arrange_labels.py @@ -0,0 +1,27 @@ +import argparse +import os + + + +if __name__ == '__main__': + parser = argparse.ArgumentParser(description='Arrange labels for CTOD dataset') + parser.add_argument('--save_path', type=str, help='Path to the folder to save the arranged labels') + + args = parser.parse_args() + + label_save_path = os.path.join(args.save_path, 'outcome_labels') + if not os.path.exists(label_save_path): + os.makedirs(label_save_path) + #copy gpt labels to label_save_path + gpt_labels_path = os.path.join(args.save_path, 'llm_predictions_on_pubmed/pubmed_gpt_outcomes.csv') + + os.system(f'cp {gpt_labels_path} {label_save_path}/pubmed_gpt_outcomes.csv') + + # copy trial linkage labels to label_save_path + trial_linkage_labels_path = os.path.join(args.save_path, 'clinical_trial_linkage/trial_linkages/outcome_labels/Merged_all_trial_linkage_outcome_df__FDA_updated.csv ') + trial_linkage_save_path = str(os.path.join(label_save_path, 'Merged_all_trial_linkage_outcome_df__FDA_updated.csv')) + os.system(f"cp {trial_linkage_labels_path} {trial_linkage_save_path}") + + + + \ No newline at end of file diff --git a/clinical_trial_linkage/match_fda_approvals.py b/clinical_trial_linkage/match_fda_approvals.py index e254d8c..48ea6c5 100644 --- a/clinical_trial_linkage/match_fda_approvals.py +++ b/clinical_trial_linkage/match_fda_approvals.py @@ -176,7 +176,8 @@ def match_FDA_approvals_main(save_path,merged_all_pd_path,cross_encoder,dev=Fals if merged_all_pd.iloc[i]['outcome'] != 'Success': merged_all_pd.at[i, 'outcome'] = 'Success' - merge_all_save_path = merged_all_pd_path.split('.csv')[0] + '_FDA_updated.csv' + # merge_all_save_path = merged_all_pd_path.split('.csv')[0] + '_FDA_updated.csv' + merge_all_save_path = os.path.join(save_path, 'outcome_labels','Merged_all_trial_linkage_outcome_df__FDA_updated.csv' ) merged_all_pd.to_csv(merge_all_save_path, index=False) print('Finished updating merged_all_pd with FDA approvals') diff --git a/llm_prediction_on_pubmed/clean_and_extract_final_outcomes.py b/llm_prediction_on_pubmed/clean_and_extract_final_outcomes.py index 211278c..92dc39c 100644 --- a/llm_prediction_on_pubmed/clean_and_extract_final_outcomes.py +++ b/llm_prediction_on_pubmed/clean_and_extract_final_outcomes.py @@ -33,6 +33,14 @@ def main(gpt_decisions_path,top_2_pubmed_path): # add to gpt_trial_outcomes using concat gpt_trial_outcomes = pd.concat([gpt_trial_outcomes, pd.DataFrame({'nct_id': [trial], 'outcome': [trial_outcome]})]) + elif os.path.exists(os.path.join(gpt_decisions_path,f'{trial}_gpt_response.txt')): + with open(os.path.join(gpt_decisions_path,f'{trial}_gpt_response.txt'), 'r') as f: + trial_outcome = f.read() + f.close() + trial_outcome = trial_outcome.split(""""outcome":""")[-1].split(',')[0].split('"')[1].split('"')[0] + + # add to gpt_trial_outcomes using concat + gpt_trial_outcomes = pd.concat([gpt_trial_outcomes, pd.DataFrame({'nct_id': [trial], 'outcome': [trial_outcome]})]) # get common nct_ids in top2_pubmed_pd and gpt_trial_outcomes common_nct_ids = list(set(top2_pubmed_pd['nct_id'].values).intersection(set(gpt_trial_outcomes['nct_id'].values))) diff --git a/pipeline.sh b/pipeline.sh index b3944fe..9c0f806 100755 --- a/pipeline.sh +++ b/pipeline.sh @@ -16,8 +16,8 @@ cd llm_prediction_on_pubmed # python extract_pubmed_abstracts.py --data_path $DATA_PATH --save_path $SAVE_PATH #--dev # echo "Search Pubmed and extract abstracts" # python extract_pubmed_abstracts_through_search.py --data_path $DATA_PATH --save_path $SAVE_PATH #--dev -echo "Retrieving top 2 relevant abstracts" -python retrieve_top2_abstracts.py --data_path $DATA_PATH --save_path $SAVE_PATH #--dev +# echo "Retrieving top 2 relevant abstracts" +# python retrieve_top2_abstracts.py --data_path $DATA_PATH --save_path $SAVE_PATH #--dev # echo "Obtaining LLM predictions" # python get_llm_predictions.py --save_path $SAVE_PATH --azure #--dev # python clean_and_extract_final_outcomes.py --save_path $SAVE_PATH @@ -25,8 +25,8 @@ python retrieve_top2_abstracts.py --data_path $DATA_PATH --save_path $SAVE_PATH # # # # Getting Clinical Trial Linkage # echo "Getting Clinical Trial Linkage" -# cd .. -# cd clinical_trial_linkage +cd .. +cd clinical_trial_linkage # echo "Downloading FDA orange book and drug code dictionary" # python download_data.py --save_path $SAVE_PATH # centralize the links in the .sh @@ -55,6 +55,7 @@ python retrieve_top2_abstracts.py --data_path $DATA_PATH --save_path $SAVE_PATH # python match_fda_approvals.py --save_path $SAVE_PATH #--dev + # News @@ -62,6 +63,8 @@ python retrieve_top2_abstracts.py --data_path $DATA_PATH --save_path $SAVE_PATH # Labeling - +# echo "Copy all labeling results to the labeling folder" +cd .. +python arrange_labels.py --save_path $SAVE_PATH # limit it to drugs \ No newline at end of file