Skip to content

Commit

Permalink
automating pipeline: extract_pubmed_abstracts
Browse files Browse the repository at this point in the history
  • Loading branch information
Jathurshan0330 committed Sep 15, 2024
1 parent 94ed728 commit ba7ea26
Show file tree
Hide file tree
Showing 2 changed files with 50 additions and 5 deletions.
44 changes: 39 additions & 5 deletions llm_prediction_on_pubmed/extract_pubmed_abstracts.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,16 +48,17 @@ def get_all_data(article):
return article_data


def main(data_path,NCBI_api_key):
def main(data_path,NCBI_api_key, dev = False):
study_ref_path = os.path.join(data_path,'study_references.txt')

study_ref_file = open(study_ref_path, "r").read().split('\n')
# study_ref_file[-1]


# extract id, nct_id, pmid, reference_type and citation from each row and store them in a dictionary
print('Extracting data from study_references.txt')
study_ref_dict = {'id':{},'nct_id':{},'pmid':{},'reference_type':{},'citation':{}} # create a dictionary with empty lists as values
for i in range(1,len(study_ref_file)-1):
for i in tqdm(range(1,len(study_ref_file)-1)):
study_ref_dict['id'][i-1] = study_ref_file[i].split('|')[0]
study_ref_dict['nct_id'][i-1] = study_ref_file[i].split('|')[1]
study_ref_dict['pmid'][i-1] = study_ref_file[i].split('|')[2]
Expand Down Expand Up @@ -88,6 +89,7 @@ def main(data_path,NCBI_api_key):
num = 0
total = len(nct_id_dict)
missed_nct_id = []
updated_nct_id = []
for k,v in tqdm(nct_id_dict.items()):

try:
Expand All @@ -98,11 +100,22 @@ def main(data_path,NCBI_api_key):

reference_list = []
#check whether the file exists
if os.path.exists(os.path.join('./extracted_pubmed',f'{nct_id}_pubmed_abs.json')):
continue
trial_ref_exists_in_data = False
if os.path.exists(os.path.join('./extracted_pubmed',f'{nct_id}_pubmed_abs.json')): # checking if the trial references scraped previously
existing_reference_dict = json.load(open(os.path.join('./extracted_pubmed',f'{nct_id}_pubmed_abs.json')))
reference_list = existing_reference_dict['References']
#get PMID from existing references
existing_pmids = [reference_list[i]['PMID'] for i in range(len(reference_list))]
trial_ref_exists_in_data = True
# continue

is_file_updated = False
for i in range(len(v)):
pmid= v[i][0]
if trial_ref_exists_in_data: # if the trial reference exists in the data, skip the reference
if pmid in existing_pmids:
continue
is_file_updated = True
ref_type = v[i][1]
# print(nct_id, pmid, ref_type)
text_path = './pubmed_data.txt'
Expand All @@ -118,19 +131,40 @@ def main(data_path,NCBI_api_key):
reference_data['References'] = reference_list
with open(os.path.join('./extracted_pubmed',f'{nct_id}_pubmed_abs.json'), 'w') as f:
json.dump(reference_data, f)
if is_file_updated:
updated_nct_id.append(k)
except:
missed_nct_id.append(k)
print(f'Error with {k}')
time.sleep(5)
continue

# for development mode
num += 1
if dev and num == 5000:
print('Development mode: break')
break

# log all updated nct_id with date to log file
if not os.path.exists('./logs'):
os.makedirs('./logs')
with open('./logs/pubmed_reference_logs.txt', 'a') as f:
f.write('====================\n')
f.write(f'Update time: {time.ctime()}\n')
f.write('Extracting pubmed abstracts\n')
f.write(f'Updated {len(updated_nct_id)} nct_id: {updated_nct_id}\n')
f.close()
print(f'{time.ctime()} - Updated {len(updated_nct_id)} nct_id: {updated_nct_id}')
print('Pubmed abstracts extraction completed')



if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument('--data_path', type=str, default= None, help='Path to the CITI data folder')
parser.add_argument('--NCBI_api_key', type=str, default= None, help='NCBI API key')
parser.add_argument('--save_path', type=str, default= None, help='Path to save the extracted data')
parser.add_argument('--dev', action='store_true', help='Run in development mode')
args = parser.parse_args()

data_path = args.data_path
Expand All @@ -150,7 +184,7 @@ def main(data_path,NCBI_api_key):


print('Extracting PubMed abstracts')
main(data_path,NCBI_api_key)
main(data_path,NCBI_api_key, args.dev)
print('Done')


11 changes: 11 additions & 0 deletions pipeline.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
# Pipeline configuration.
#   DATA_PATH    - passed to the extraction script as --data_path
#   NCBI_API_KEY - passed as --NCBI_api_key; fill this in before running
#   SAVE_PATH    - passed as --save_path
DATA_PATH=/srv/local/data/jp65/CTO/raw_data/76a6hbbrw9v9fpi6oycfq7x6ofgx
NCBI_API_KEY=''
SAVE_PATH=/srv/local/data/jp65/CTO


# Getting LLM predictions on Pubmed data
echo "Getting LLM predictions on Pubmed data"
# Stop here if the directory is missing; otherwise the python call below
# would run from the wrong working directory.
cd llm_prediction_on_pubmed || exit 1

echo " Extracting and Updating Pubmed data"
# Every expansion is quoted: an unquoted empty NCBI_API_KEY vanishes during
# word splitting, leaving --NCBI_api_key without a value, and unquoted paths
# containing spaces would be split into several arguments.
python extract_pubmed_abstracts.py --data_path "$DATA_PATH" --NCBI_api_key "$NCBI_API_KEY" --save_path "$SAVE_PATH" --dev

0 comments on commit ba7ea26

Please sign in to comment.