Commit

added download_ctti

Jathurshan0330 committed Sep 26, 2024
1 parent ba7ea26 commit c36442d
Showing 4 changed files with 216 additions and 19 deletions.
100 changes: 100 additions & 0 deletions download_ctti.py
@@ -0,0 +1,100 @@
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait, Select
from selenium.webdriver.support import expected_conditions as EC
import time
import os

import argparse
import zipfile



if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='Download CTTI data')

    # save path
    parser.add_argument('--save_path', type=str, default='', help='path to save the data')

    args = parser.parse_args()
    # Desired download path
    download_path = os.path.join(args.save_path, 'CTTI_raw')

    new_path = os.path.join(args.save_path, 'CTTI_new')
    old_path = os.path.join(args.save_path, 'CTTI_old')

    if not os.path.exists(download_path):
        os.makedirs(download_path)
    if not os.path.exists(new_path):
        os.makedirs(new_path)
    if not os.path.exists(old_path):
        os.makedirs(old_path)

    # if CTTI_new is not empty, move all files to the CTTI_old folder
    if os.listdir(new_path):
        print("Moving files from CTTI_new to CTTI_old")
        for file in os.listdir(new_path):
            os.rename(os.path.join(new_path, file), os.path.join(old_path, file))

    # delete any files left in the CTTI_new folder
    for file in os.listdir(new_path):
        print(f"Deleting {file}")
        os.remove(os.path.join(new_path, file))

    # delete any files in the download folder
    for file in os.listdir(download_path):
        print(f"Deleting old {file}")
        os.remove(os.path.join(download_path, file))

    # Set Chrome options to configure headless mode
    chrome_options = Options()
    chrome_options.add_argument("--headless")
    chrome_options.add_argument("--no-sandbox")
    chrome_options.add_argument("--disable-dev-shm-usage")

    # Configure Chrome to download files to the specified directory
    prefs = {"download.default_directory": download_path}
    chrome_options.add_experimental_option("prefs", prefs)

    # Initialize the Chrome driver with headless options
    driver = webdriver.Chrome(options=chrome_options)

    # Go to the AACT pipe-delimited files page
    driver.get('https://aact.ctti-clinicaltrials.org/pipe_files')

    # Wait until the dropdown is populated (wait up to 20 seconds)
    wait = WebDriverWait(driver, 20)
    dropdown = wait.until(EC.presence_of_element_located((By.XPATH, '//select[@class="form-select"]')))

    # Create a Select object
    select = Select(dropdown)

    # Select the option at index 1; index 0 appears to be a placeholder entry
    select.select_by_index(1)

    # Wait for the file to download (adjust the timeout as needed):
    # Chrome keeps an in-progress download as <name>.crdownload, so poll until none remain
    time.sleep(10)
    start = time.time()
    while any(fname.endswith('.crdownload') for fname in os.listdir(download_path)):
        time.sleep(10)
        if time.time() - start > 600:
            print("File download timed out")
            break

    # Close the driver
    driver.quit()

    print(f"File downloaded to: {download_path}")

    # get the name of the downloaded zip file
    file_name = os.listdir(download_path)[0]

    with zipfile.ZipFile(os.path.join(download_path, file_name), 'r') as zip_ref:
        zip_ref.extractall(new_path)

    print(f"File extracted to: {new_path}")


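Note on the download wait: the fixed `time.sleep(10)` plus 10-second polling works, but `os.listdir(download_path)[0]` can pick up a leftover `.crdownload` file if the timeout fired before the download finished. A minimal sketch of a stricter wait, assuming the same `download_path` layout (the helper name `wait_for_zip` is illustrative, not part of the script):

```python
import os
import time

def wait_for_zip(download_path: str, timeout: float = 600.0, poll: float = 2.0) -> str:
    """Block until a finished .zip appears in download_path, then return its file name."""
    start = time.time()
    while time.time() - start < timeout:
        names = os.listdir(download_path)
        # Chrome keeps an in-progress download as <name>.crdownload until it completes
        if not any(n.endswith('.crdownload') for n in names):
            zips = [n for n in names if n.endswith('.zip')]
            if zips:
                return zips[0]
        time.sleep(poll)
    raise TimeoutError(f"no completed .zip in {download_path} after {timeout:.0f}s")
```

With a helper like this, `file_name = wait_for_zip(download_path)` would replace both the polling loop and the bare `os.listdir(...)[0]`.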
2 changes: 1 addition & 1 deletion llm_prediction_on_pubmed/README.md
@@ -21,7 +21,7 @@ python extract_pubmed_abstracts.py --data_path <Path to CTTI data> --NCBI_api_ke
- To make the process efficient, we initially retrieve the top 2 most relevant abstracts (as shown in the figure above) and save them in a data frame.

```sh
python retrieve_top2_abstracts.py --data_path <Path to CTTI data> --pubmed_path <Path to the extracted pubmed data>
python retrieve_top2_abstracts.py --data_path <Path to CTTI data> --save_path <Path to save data>
```

The resultant data frame will be saved at <save_path>/llm_predict/top_2_extracted_pubmed_articles.csv
113 changes: 101 additions & 12 deletions llm_prediction_on_pubmed/retrieve_top2_abstracts.py
@@ -17,22 +17,32 @@
- Number of publications before trial completion
'''

import json
import glob
from tqdm import tqdm
from sentence_transformers import SentenceTransformer, util, models
import os
import time
import pandas as pd
import argparse
from support_functions import extract_study_basic_info, filter_articles, extract_similar_pubmed_articles




def main(data_path,pubmed_path):
def main(data_path,save_path,dev = False):
    # read the extracted pubmed articles
    pubmed_path = os.path.join(save_path,'extracted_pubmed')
    save_path = os.path.join(save_path,'llm_predict')
    os.makedirs(save_path, exist_ok=True)  # output folder for the top-2 dataframe

    # pubmed_path already points at the extracted_pubmed folder
    pubmed_files = glob.glob(os.path.join(pubmed_path,'*_pubmed_abs.json'))
    print(f"Total number of pubmed files: {len(pubmed_files)}")
    embeddings = models.Transformer(
@@ -51,13 +61,26 @@ def main(data_path,pubmed_path):

    # Extract top 2 similar articles based on trial title and abstract title and create a dataframe
    new_rows = []
    # read the previously extracted top-2 csv from the llm_predict folder, if it exists
    top_2_exists = False
    if os.path.exists(os.path.join(save_path,'top_2_extracted_pubmed_articles.csv')):
        print('Reading previously extracted top 2 similar articles')
        top_2_exists = True
        top_2_prev_df = pd.read_csv(os.path.join(save_path,'top_2_extracted_pubmed_articles.csv'))
        # convert to a list of dictionaries
        new_rows = top_2_prev_df.to_dict('records')

    updated_nct_id = []
    new_nct_id = []
    # # read data frame and append the rows to new_rows
    # pubmed_df = pd.read_csv('./top_2_extracted_pubmed_articles.csv')
    # print(len(pubmed_df))
    # for i in range(len(pubmed_df)):
    #     new_rows.append(pubmed_df.iloc[i].to_dict())

    # counter for development mode
    num = 0

    for jsonfile in tqdm(pubmed_files):
        nct_id = jsonfile.split('/')[-1].split('_')[0]
        # if nct_id in pubmed_df['nct_id'].values:
@@ -97,32 +120,98 @@
                for article in pubmed_all_data['References']:
                    if article['Reference type'] == article_type:
                        row[article_type] += 1
        new_rows.append(row)

        # check if the PMID is in the new_rows
        # check if the nct_id exists in the new_rows
        if top_2_exists:
            nct_id_exists = False
            for i in range(len(new_rows)):
                if new_rows[i]['nct_id'] == nct_id:
                    prev_row = new_rows[i]
                    nct_id_exists = True
                    break
            if nct_id_exists:
                # check if the PMIDs exist in the new_rows
                prev_pmid_list = []
                for i in range(1,3):
                    prev_pmid_list.append(prev_row[f'top_{i}_similar_article_PMID'])
                if row['top_1_similar_article_PMID'] not in prev_pmid_list or row['top_2_similar_article_PMID'] not in prev_pmid_list:
                    # delete the previous row from new_rows and append the new row
                    new_rows.remove(prev_row)
                    new_rows.append(row)
                    updated_nct_id.append(nct_id)
                else:
                    continue
            else:
                new_rows.append(row)
                new_nct_id.append(nct_id)
        else:
            new_rows.append(row)
            new_nct_id.append(nct_id)

        num += 1
        # development mode: stop after 5000 trials
        if dev and num == 5000:
            print('Development mode: break')
            break

        if len(new_rows) % 10000 == 0:
            pubmed_df = pd.DataFrame(new_rows)
            pubmed_df.to_csv('./top_2_extracted_pubmed_articles.csv', index = False)
        # if len(new_rows) % 10000 == 0:
        #     pubmed_df = pd.DataFrame(new_rows)
        #     pubmed_df.to_csv('./top_2_extracted_pubmed_articles.csv', index = False)

    # save alongside the copy that is read back at startup, so incremental updates can find it
    pubmed_df = pd.DataFrame(new_rows)
    pubmed_df.to_csv(os.path.join(save_path,'top_2_extracted_pubmed_articles.csv'), index = False)

    # log all updated nct_ids with the date to the log file
    os.makedirs('./logs', exist_ok=True)  # the logs folder lives under the working directory (save_path)
    if top_2_exists:
        with open('./logs/pubmed_reference_logs.txt', 'a') as f:
            f.write('====================\n')
            f.write(f'Update time: {time.ctime()}\n')
            f.write('Top 2 similar articles updated for the following nct_ids:\n')
            f.write(f'Updated {len(updated_nct_id)} nct_id: {updated_nct_id}\n')
            f.write('Following nct_ids are new:\n')
            f.write(f'New {len(new_nct_id)} nct_id: {new_nct_id}\n')
        print(f'{time.ctime()}: Updated {len(updated_nct_id)} nct_id: {updated_nct_id}')
        print(f'{time.ctime()}: New {len(new_nct_id)} nct_id: {new_nct_id}')
        print('Top 2 similar articles updated')
    else:
        with open('./logs/pubmed_reference_logs.txt', 'a') as f:
            f.write('====================\n')
            f.write(f'Update time: {time.ctime()}\n')
            f.write('Top 2 similar articles extracted\n')
            f.write(f'Number of nct_ids: {len(new_nct_id)}\n')
        print(f'{time.ctime()}: New {len(new_nct_id)} nct_id: {new_nct_id}')
        print('Top 2 similar articles extracted')



if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--data_path', type=str, default=None, help='Path to the CTTI data folder')
    parser.add_argument('--pubmed_path', type=str, default=None, help='Path to the extracted pubmed data')
    # parser.add_argument('--pubmed_path', type=str, default=None, help='Path to the extracted pubmed data')
    parser.add_argument('--save_path', type=str, default=None, help='Path to save the extracted data')
    parser.add_argument('--dev', action='store_true', help='Run in development mode')

    args = parser.parse_args()

    if args.data_path is None:
        raise ValueError('Please provide the path to the CTTI data folder')
    if args.pubmed_path is None:
        raise ValueError('Please provide the path to the extracted pubmed data')
    if args.save_path is None:
        raise ValueError('Please provide the path to the main folder where the extracted pubmed data is saved')

    data_path = args.data_path
    # pubmed_path = args.pubmed_path
    save_path = args.save_path

    # os.chdir(args.pubmed_path)
    os.chdir(args.save_path)

    # main(data_path,pubmed_path)
    main(data_path,save_path,args.dev)
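The incremental-update pass above rescans `new_rows` for every trial, which turns quadratic once the CSV holds hundreds of thousands of registrations. A sketch of the same bookkeeping keyed by `nct_id` instead (the function and variable names are illustrative; `row` carries the same `top_1`/`top_2_similar_article_PMID` fields as in the script):

```python
from typing import Dict, List

def upsert_row(rows_by_id: Dict[str, dict], row: dict,
               updated_nct_id: List[str], new_nct_id: List[str]) -> None:
    """Insert a fresh row, or replace a stored one whose top-2 PMIDs changed."""
    nct_id = row['nct_id']
    prev = rows_by_id.get(nct_id)
    if prev is None:
        # trial not seen before: plain insert
        rows_by_id[nct_id] = row
        new_nct_id.append(nct_id)
        return
    prev_pmids = {prev['top_1_similar_article_PMID'], prev['top_2_similar_article_PMID']}
    if (row['top_1_similar_article_PMID'] not in prev_pmids
            or row['top_2_similar_article_PMID'] not in prev_pmids):
        # the retrieved pair changed: overwrite the stale row and record the update
        rows_by_id[nct_id] = row
        updated_nct_id.append(nct_id)
```

At the end, `pd.DataFrame(list(rows_by_id.values()))` would yield the same dataframe the script builds from `new_rows`.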

20 changes: 14 additions & 6 deletions pipeline.sh
@@ -1,11 +1,19 @@
DATA_PATH=/srv/local/data/jp65/CTO/raw_data/76a6hbbrw9v9fpi6oycfq7x6ofgx
NCBI_API_KEY=''
NCBI_API_KEY='558a8ec64b0df1607941d0261d0a5d273308'
SAVE_PATH=/srv/local/data/jp65/CTO


# Getting LLM predictions on Pubmed data
echo "Getting LLM predictions on Pubmed data"
cd llm_prediction_on_pubmed
# Downloading CTTI new data
echo "Downloading CTTI new data"
python download_ctti.py --save_path $SAVE_PATH

echo "Extracting and Updating Pubmed data"
python extract_pubmed_abstracts.py --data_path $DATA_PATH --NCBI_api_key $NCBI_API_KEY --save_path $SAVE_PATH --dev


# # Getting LLM predictions on Pubmed data
# echo "Getting LLM predictions on Pubmed data"
# cd llm_prediction_on_pubmed

# echo "Extracting and Updating Pubmed data"
# python extract_pubmed_abstracts.py --data_path $DATA_PATH --NCBI_api_key $NCBI_API_KEY --save_path $SAVE_PATH --dev
# echo
# python retrieve_top2_abstracts.py --data_path $DATA_PATH --save_path $SAVE_PATH --dev
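One gap in the wiring above: if the Selenium download in download_ctti.py times out, the script still exits 0 and the extraction step runs against stale data. A small guard that could sit between the two stages, assuming only the folder layout that download_ctti.py creates (the guard script itself is hypothetical):

```python
import os
import sys

# save_path must match the --save_path given to download_ctti.py
save_path = sys.argv[1] if len(sys.argv) > 1 else '.'
new_path = os.path.join(save_path, 'CTTI_new')

# download_ctti.py extracts the AACT zip into CTTI_new, so an empty folder means the download failed
if not os.path.isdir(new_path) or not os.listdir(new_path):
    sys.exit(f"CTTI_new is missing or empty under {save_path}; aborting pipeline")

print(f"CTTI_new contains {len(os.listdir(new_path))} files; continuing")
```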
