Commit

added download_ctti

Jathurshan0330 committed Sep 26, 2024
1 parent ba7ea26 commit c36442d
Showing 4 changed files with 216 additions and 19 deletions.
100 changes: 100 additions & 0 deletions download_ctti.py
@@ -0,0 +1,100 @@
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait, Select
from selenium.webdriver.support import expected_conditions as EC
import time
import os

import argparse
import zipfile



if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='Download CTTI data')

    # save path
    parser.add_argument('--save_path', type=str, default='', help='path to save the data')

    args = parser.parse_args()
    # Desired download path
    download_path = os.path.join(args.save_path, 'CTTI_raw')

    new_path = os.path.join(args.save_path, 'CTTI_new')
    old_path = os.path.join(args.save_path, 'CTTI_old')

    if not os.path.exists(download_path):
        os.makedirs(download_path)
    if not os.path.exists(new_path):
        os.makedirs(new_path)
    if not os.path.exists(old_path):
        os.makedirs(old_path)

    # if CTTI_new is not empty, move all files to the CTTI_old folder
    if os.listdir(new_path):
        print("Moving files from CTTI_new to CTTI_old")
        for file in os.listdir(new_path):
            os.rename(os.path.join(new_path, file), os.path.join(old_path, file))

    # delete any files left in the CTTI_new folder
    for file in os.listdir(new_path):
        print(f"Deleting {file}")
        os.remove(os.path.join(new_path, file))

    # delete any files in the download folder
    for file in os.listdir(download_path):
        print(f"Deleting old {file}")
        os.remove(os.path.join(download_path, file))

    # Set Chrome options to configure headless mode
    chrome_options = Options()
    chrome_options.add_argument("--headless")
    chrome_options.add_argument("--no-sandbox")
    chrome_options.add_argument("--disable-dev-shm-usage")

    # Configure Chrome to download files to the specified directory
    prefs = {"download.default_directory": download_path}
    chrome_options.add_experimental_option("prefs", prefs)

    # Initialize the Chrome driver with headless options
    driver = webdriver.Chrome(options=chrome_options)

    # Go to the AACT pipe-delimited files page
    driver.get('https://aact.ctti-clinicaltrials.org/pipe_files')

    # Wait until the dropdown is populated (wait up to 20 seconds)
    wait = WebDriverWait(driver, 20)
    dropdown = wait.until(EC.presence_of_element_located((By.XPATH, '//select[@class="form-select"]')))

    # Create a Select object
    select = Select(dropdown)

    # Select the option at index 1; index 0 appears to be a placeholder entry
    select.select_by_index(1)

    # Wait for the file to download (adjust the timeout as needed):
    # Chrome keeps an in-progress download as <name>.crdownload, so poll until none remain
    time.sleep(10)
    start = time.time()
    while any(fname.endswith('.crdownload') for fname in os.listdir(download_path)):
        time.sleep(10)
        if time.time() - start > 600:
            print("File download timed out")
            break

    # Close the driver
    driver.quit()

    print(f"File downloaded to: {download_path}")

    # get the name of the downloaded zip file
    file_name = os.listdir(download_path)[0]

    with zipfile.ZipFile(os.path.join(download_path, file_name), 'r') as zip_ref:
        zip_ref.extractall(new_path)

    print(f"File extracted to: {new_path}")


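Note on the download wait: the fixed `time.sleep(10)` plus 10-second polling works, but `os.listdir(download_path)[0]` can pick up a leftover `.crdownload` file if the timeout fired before the download finished. A minimal sketch of a stricter wait, assuming the same `download_path` layout (the helper name `wait_for_zip` is illustrative, not part of the script):

```python
import os
import time

def wait_for_zip(download_path: str, timeout: float = 600.0, poll: float = 2.0) -> str:
    """Block until a finished .zip appears in download_path, then return its file name."""
    start = time.time()
    while time.time() - start < timeout:
        names = os.listdir(download_path)
        # Chrome keeps an in-progress download as <name>.crdownload until it completes
        if not any(n.endswith('.crdownload') for n in names):
            zips = [n for n in names if n.endswith('.zip')]
            if zips:
                return zips[0]
        time.sleep(poll)
    raise TimeoutError(f"no completed .zip in {download_path} after {timeout:.0f}s")
```

With a helper like this, `file_name = wait_for_zip(download_path)` would replace both the polling loop and the bare `os.listdir(...)[0]`.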
2 changes: 1 addition & 1 deletion llm_prediction_on_pubmed/README.md
@@ -21,7 +21,7 @@ python extract_pubmed_abstracts.py --data_path <Path to CTTI data> --NCBI_api_ke
- To make the process efficient, we initially retrieve the top 2 most relevant abstracts (as shown in the figure above) and save them in a data frame.

```sh
python retrieve_top2_abstracts.py --data_path <Path to CTTI data> --pubmed_path <Path to the extracted pubmed data>
python retrieve_top2_abstracts.py --data_path <Path to CTTI data> --save_path <Path to save data>
```

The resultant data frame will be saved at <save_path>/llm_predict/top_2_extracted_pubmed_articles.csv
113 changes: 101 additions & 12 deletions llm_prediction_on_pubmed/retrieve_top2_abstracts.py
@@ -17,22 +17,32 @@
- Number of publications before trial completion
'''

import json
import glob
from tqdm import tqdm
from sentence_transformers import SentenceTransformer, util, models
import os
import time
import pandas as pd
import argparse
from support_functions import extract_study_basic_info, filter_articles, extract_similar_pubmed_articles




def main(data_path,pubmed_path):
def main(data_path,save_path,dev = False):
    # read the extracted pubmed articles
    pubmed_path = os.path.join(save_path,'extracted_pubmed')
    save_path = os.path.join(save_path,'llm_predict')
    os.makedirs(save_path, exist_ok=True)  # output folder for the top-2 dataframe

    # pubmed_path already points at the extracted_pubmed folder
    pubmed_files = glob.glob(os.path.join(pubmed_path,'*_pubmed_abs.json'))
    print(f"Total number of pubmed files: {len(pubmed_files)}")
    embeddings = models.Transformer(
@@ -51,13 +61,26 @@ def main(data_path,pubmed_path):

    # Extract top 2 similar articles based on trial title and abstract title and create a dataframe
    new_rows = []
    # read the previously extracted top-2 csv from the llm_predict folder, if it exists
    top_2_exists = False
    if os.path.exists(os.path.join(save_path,'top_2_extracted_pubmed_articles.csv')):
        print('Reading previously extracted top 2 similar articles')
        top_2_exists = True
        top_2_prev_df = pd.read_csv(os.path.join(save_path,'top_2_extracted_pubmed_articles.csv'))
        # convert to a list of dictionaries
        new_rows = top_2_prev_df.to_dict('records')

    updated_nct_id = []
    new_nct_id = []
    # # read data frame and append the rows to new_rows
    # pubmed_df = pd.read_csv('./top_2_extracted_pubmed_articles.csv')
    # print(len(pubmed_df))
    # for i in range(len(pubmed_df)):
    #     new_rows.append(pubmed_df.iloc[i].to_dict())

    # counter for development mode
    num = 0

    for jsonfile in tqdm(pubmed_files):
        nct_id = jsonfile.split('/')[-1].split('_')[0]
        # if nct_id in pubmed_df['nct_id'].values:
@@ -97,32 +120,98 @@
                for article in pubmed_all_data['References']:
                    if article['Reference type'] == article_type:
                        row[article_type] += 1
        new_rows.append(row)

        # check if the PMID is in the new_rows
        # check if the nct_id exists in the new_rows
        if top_2_exists:
            nct_id_exists = False
            for i in range(len(new_rows)):
                if new_rows[i]['nct_id'] == nct_id:
                    prev_row = new_rows[i]
                    nct_id_exists = True
                    break
            if nct_id_exists:
                # check if the PMIDs exist in the new_rows
                prev_pmid_list = []
                for i in range(1,3):
                    prev_pmid_list.append(prev_row[f'top_{i}_similar_article_PMID'])
                if row['top_1_similar_article_PMID'] not in prev_pmid_list or row['top_2_similar_article_PMID'] not in prev_pmid_list:
                    # delete the previous row from new_rows and append the new row
                    new_rows.remove(prev_row)
                    new_rows.append(row)
                    updated_nct_id.append(nct_id)
                else:
                    continue
            else:
                new_rows.append(row)
                new_nct_id.append(nct_id)
        else:
            new_rows.append(row)
            new_nct_id.append(nct_id)

        num += 1
        # development mode: stop after 5000 trials
        if dev and num == 5000:
            print('Development mode: break')
            break

        if len(new_rows) % 10000 == 0:
            pubmed_df = pd.DataFrame(new_rows)
            pubmed_df.to_csv('./top_2_extracted_pubmed_articles.csv', index = False)
        # if len(new_rows) % 10000 == 0:
        #     pubmed_df = pd.DataFrame(new_rows)
        #     pubmed_df.to_csv('./top_2_extracted_pubmed_articles.csv', index = False)

    # save alongside the copy that is read back at startup, so incremental updates can find it
    pubmed_df = pd.DataFrame(new_rows)
    pubmed_df.to_csv(os.path.join(save_path,'top_2_extracted_pubmed_articles.csv'), index = False)

    # log all updated nct_ids with the date to the log file
    os.makedirs('./logs', exist_ok=True)  # the logs folder lives under the working directory (save_path)
    if top_2_exists:
        with open('./logs/pubmed_reference_logs.txt', 'a') as f:
            f.write('====================\n')
            f.write(f'Update time: {time.ctime()}\n')
            f.write('Top 2 similar articles updated for the following nct_ids:\n')
            f.write(f'Updated {len(updated_nct_id)} nct_id: {updated_nct_id}\n')
            f.write('Following nct_ids are new:\n')
            f.write(f'New {len(new_nct_id)} nct_id: {new_nct_id}\n')
        print(f'{time.ctime()}: Updated {len(updated_nct_id)} nct_id: {updated_nct_id}')
        print(f'{time.ctime()}: New {len(new_nct_id)} nct_id: {new_nct_id}')
        print('Top 2 similar articles updated')
    else:
        with open('./logs/pubmed_reference_logs.txt', 'a') as f:
            f.write('====================\n')
            f.write(f'Update time: {time.ctime()}\n')
            f.write('Top 2 similar articles extracted\n')
            f.write(f'Number of nct_ids: {len(new_nct_id)}\n')
        print(f'{time.ctime()}: New {len(new_nct_id)} nct_id: {new_nct_id}')
        print('Top 2 similar articles extracted')



if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--data_path', type=str, default=None, help='Path to the CTTI data folder')
    parser.add_argument('--pubmed_path', type=str, default=None, help='Path to the extracted pubmed data')
    # parser.add_argument('--pubmed_path', type=str, default=None, help='Path to the extracted pubmed data')
    parser.add_argument('--save_path', type=str, default=None, help='Path to save the extracted data')
    parser.add_argument('--dev', action='store_true', help='Run in development mode')

    args = parser.parse_args()

    if args.data_path is None:
        raise ValueError('Please provide the path to the CTTI data folder')
    if args.pubmed_path is None:
        raise ValueError('Please provide the path to the extracted pubmed data')
    if args.save_path is None:
        raise ValueError('Please provide the path to the main folder where the extracted pubmed data is saved')

    data_path = args.data_path
    # pubmed_path = args.pubmed_path
    save_path = args.save_path

    # os.chdir(args.pubmed_path)
    os.chdir(args.save_path)

    # main(data_path,pubmed_path)
    main(data_path,save_path,args.dev)
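The incremental-update pass above rescans `new_rows` for every trial, which turns quadratic once the CSV holds hundreds of thousands of registrations. A sketch of the same bookkeeping keyed by `nct_id` instead (the function and variable names are illustrative; `row` carries the same `top_1`/`top_2_similar_article_PMID` fields as in the script):

```python
from typing import Dict, List

def upsert_row(rows_by_id: Dict[str, dict], row: dict,
               updated_nct_id: List[str], new_nct_id: List[str]) -> None:
    """Insert a fresh row, or replace a stored one whose top-2 PMIDs changed."""
    nct_id = row['nct_id']
    prev = rows_by_id.get(nct_id)
    if prev is None:
        # trial not seen before: plain insert
        rows_by_id[nct_id] = row
        new_nct_id.append(nct_id)
        return
    prev_pmids = {prev['top_1_similar_article_PMID'], prev['top_2_similar_article_PMID']}
    if (row['top_1_similar_article_PMID'] not in prev_pmids
            or row['top_2_similar_article_PMID'] not in prev_pmids):
        # the retrieved pair changed: overwrite the stale row and record the update
        rows_by_id[nct_id] = row
        updated_nct_id.append(nct_id)
```

At the end, `pd.DataFrame(list(rows_by_id.values()))` would yield the same dataframe the script builds from `new_rows`.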

20 changes: 14 additions & 6 deletions pipeline.sh
@@ -1,11 +1,19 @@
DATA_PATH=/srv/local/data/jp65/CTO/raw_data/76a6hbbrw9v9fpi6oycfq7x6ofgx
NCBI_API_KEY=''
NCBI_API_KEY='558a8ec64b0df1607941d0261d0a5d273308'
SAVE_PATH=/srv/local/data/jp65/CTO


# Getting LLM predictions on Pubmed data
echo "Getting LLM predictions on Pubmed data"
cd llm_prediction_on_pubmed
# Downloading CTTI new data
echo "Downloading CTTI new data"
python download_ctti.py --save_path $SAVE_PATH

echo "Extracting and Updating Pubmed data"
python extract_pubmed_abstracts.py --data_path $DATA_PATH --NCBI_api_key $NCBI_API_KEY --save_path $SAVE_PATH --dev


# # Getting LLM predictions on Pubmed data
# echo "Getting LLM predictions on Pubmed data"
# cd llm_prediction_on_pubmed

# echo "Extracting and Updating Pubmed data"
# python extract_pubmed_abstracts.py --data_path $DATA_PATH --NCBI_api_key $NCBI_API_KEY --save_path $SAVE_PATH --dev
# echo
# python retrieve_top2_abstracts.py --data_path $DATA_PATH --save_path $SAVE_PATH --dev
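One gap in the wiring above: if the Selenium download in download_ctti.py times out, the script still exits 0 and the extraction step runs against stale data. A small guard that could sit between the two stages, assuming only the folder layout that download_ctti.py creates (the guard script itself is hypothetical):

```python
import os
import sys

# save_path must match the --save_path given to download_ctti.py
save_path = sys.argv[1] if len(sys.argv) > 1 else '.'
new_path = os.path.join(save_path, 'CTTI_new')

# download_ctti.py extracts the AACT zip into CTTI_new, so an empty folder means the download failed
if not os.path.isdir(new_path) or not os.listdir(new_path):
    sys.exit(f"CTTI_new is missing or empty under {save_path}; aborting pipeline")

print(f"CTTI_new contains {len(os.listdir(new_path))} files; continuing")
```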
