Commit

merge
Jathurshan0330 committed Oct 29, 2024
2 parents e6679d1 + 2364368 commit de0d5ba
Showing 3 changed files with 74 additions and 63 deletions.
34 changes: 18 additions & 16 deletions labeling/lfs.py
@@ -21,22 +21,22 @@ def reorder_columns(df, cols_in_front):
return df[columns]

def lf_results_reported(path):
df = pd.read_csv(path + 'calculated_values.txt', sep='|', low_memory=False)
df = pd.read_csv(path + 'calculated_values.txt.zip', sep='|', low_memory=False)
df['lf'] = df['were_results_reported'] == 't'
df['lf'] = df['lf'].astype('int')
df = reorder_columns(df, ['nct_id', 'lf'])
return df

def lf_num_sponsors(path, quantile=.5):
df = pd.read_csv(path + 'sponsors.txt', sep='|',low_memory=False)
df = pd.read_csv(path + 'sponsors.txt.zip', sep='|',low_memory=False)
df = df.groupby('nct_id')['name'].count().reset_index()
df['lf'] = df['name'] > df['name'].quantile(quantile)
df['lf'] = df['lf'].fillna(-1).astype('int')
df = reorder_columns(df, ['nct_id', 'lf'])
return df

def lf_num_patients(path, quantile=.5):
df = pd.read_csv(path + 'outcome_counts.txt', sep='|', low_memory=False)
df = pd.read_csv(path + 'outcome_counts.txt.zip', sep='|', low_memory=False)
df = df.groupby('nct_id').sum().reset_index() # pd df (NCTID, values, num_patients)
df['lf'] = df['count'] > df['count'].quantile(quantile)
df['lf'] = df['lf'].fillna(-1).astype('int')
@@ -45,7 +45,7 @@ def lf_num_patients(path, quantile=.5):

def lf_patient_drop(path, quantile=.5):
# patient dropout
df = pd.read_csv(os.path.join(path, 'drop_withdrawals.txt'), sep='|',low_memory=False)
df = pd.read_csv(os.path.join(path, 'drop_withdrawals.txt.zip'), sep='|',low_memory=False)
df = df.groupby('nct_id').sum().reset_index() # pd df (NCTID, values, patient_drop)
df['lf'] = df['count'] < df['count'].quantile(quantile)
df['lf'] = df['lf'].fillna(-1).astype('int')
@@ -54,7 +54,7 @@ def lf_patient_drop(path, quantile=.5):

def lf_sites(path, quantile=.5):
# sites
df = pd.read_csv(os.path.join(path, 'facilities.txt'), sep='|',low_memory=False)
df = pd.read_csv(os.path.join(path, 'facilities.txt.zip'), sep='|',low_memory=False)
df = df.groupby('nct_id')['name'].count().sort_values(ascending=False).reset_index()
df = df.groupby('nct_id').mean().reset_index() # pd df (NCTID, values, sites)
df['lf'] = df['name'] > df['name'].quantile(quantile)
@@ -64,8 +64,8 @@ def lf_sites(path, quantile=.5):

def lf_pvalues(path, quantile=.5):
# pvalues
df = pd.read_csv(os.path.join(path, 'outcome_analyses.txt'), sep='|', low_memory=False)
outcomes_df = pd.read_csv('../CTTI_20241017/outcomes.txt', sep='|')
df = pd.read_csv(os.path.join(path, 'outcome_analyses.txt.zip'), sep='|', low_memory=False)
outcomes_df = pd.read_csv(os.path.join(path, 'outcomes.txt.zip'), sep='|')
primary_outcomes = outcomes_df[outcomes_df['outcome_type']=='PRIMARY']
df = df[df['outcome_id'].isin(primary_outcomes['id'])]

@@ -77,7 +77,7 @@ def lf_pvalues(path, quantile=.5):
return df

def lf_update_more_recent(path, quantile=.5): #TODO clarify what this does
df = pd.read_csv(os.path.join(path, 'studies.txt'), sep='|', low_memory=False)
df = pd.read_csv(os.path.join(path, 'studies.txt.zip'), sep='|', low_memory=False)
df['last_update_submitted_date'] = pd.to_datetime(df['last_update_submitted_date'])
df['completion_date'] = pd.to_datetime(df['completion_date'])
df['update_days'] = (df['last_update_submitted_date'] - df['completion_date']).dt.days
@@ -89,7 +89,7 @@ def lf_update_more_recent(path, quantile=.5): #TODO clarify what this does
return df

def lf_death_ae(path, quantile=.5):
df = pd.read_csv(path+'reported_event_totals.txt', sep = '|', low_memory=False)
df = pd.read_csv(path+'reported_event_totals.txt.zip', sep = '|', low_memory=False)
df = df[df['event_type'] == 'deaths'].fillna(0)
df = df.groupby('nct_id')['subjects_affected'].sum().reset_index()
df['lf'] = df['subjects_affected'] <= df['subjects_affected'].quantile(quantile)
@@ -98,7 +98,7 @@ def lf_death_ae(path, quantile=.5):
return df

def lf_serious_ae(path, quantile=.5):
df = pd.read_csv(path+'reported_event_totals.txt', sep = '|', low_memory=False)
df = pd.read_csv(path+'reported_event_totals.txt.zip', sep = '|', low_memory=False)
df = df[df['event_type'] == 'serious'].fillna(0)
df = df.groupby('nct_id')['subjects_affected'].sum().reset_index()
df['lf'] = df['subjects_affected'] <= df['subjects_affected'].quantile(quantile)
@@ -107,18 +107,20 @@ def lf_serious_ae(path, quantile=.5):
return df

def lf_all_ae(path, quantile=.5):
df = pd.read_csv(path+'reported_event_totals.txt', sep = '|', low_memory=False).fillna(0)
df = pd.read_csv(path+'reported_event_totals.txt.zip', sep = '|', low_memory=False).fillna(0)
df = df.groupby('nct_id')['subjects_affected'].sum().reset_index()
df['lf'] = df['subjects_affected'] <= df['subjects_affected'].quantile(quantile)
df['lf'] = df['lf'].fillna(-1).astype('int')
df = reorder_columns(df, ['nct_id', 'lf'])
return df

def lf_status(path):
df = pd.read_csv(path+'studies.txt', sep='|', low_memory=False)
df = pd.read_csv(path+'studies.txt.zip', sep='|', low_memory=False)
df['lf'] = -1
df.loc[df['overall_status'].isin(['Terminated', 'Withdrawn', 'Suspended', 'Withheld', 'No longer available', 'Temporarily not available']),['lf']] = 0
df.loc[df['overall_status'].isin(['Approved for marketing']),['lf']] = 1
# lower case all status and replace '_' with ' '
df['overall_status'] = df['overall_status'].str.lower().str.replace('_', ' ')
df.loc[df['overall_status'].isin(['terminated', 'withdrawn', 'suspended', 'withheld', 'no longer available', 'temporarily not available']),['lf']] = 0
df.loc[df['overall_status'].isin(['approved for marketing']),['lf']] = 1
df['lf'] = df['lf'].fillna(-1).astype('int')
df = reorder_columns(df, ['nct_id', 'lf'])
return df
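
Note (not part of the diff): the lower-casing step added to lf_status makes the matching robust to both the legacy capitalized labels and the newer upper-snake-case CTTI labels. A minimal sketch of the effect, with assumed example values:

import pandas as pd

# hypothetical mix of legacy and current CTTI status spellings
df = pd.DataFrame({'overall_status': ['TERMINATED', 'Withdrawn', 'APPROVED_FOR_MARKETING', 'COMPLETED']})
df['overall_status'] = df['overall_status'].str.lower().str.replace('_', ' ')
df['lf'] = -1
df.loc[df['overall_status'].isin(['terminated', 'withdrawn', 'suspended', 'withheld',
                                  'no longer available', 'temporarily not available']), 'lf'] = 0
df.loc[df['overall_status'] == 'approved for marketing', 'lf'] = 1
print(df['lf'].tolist())  # -> [0, 0, 1, -1]
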
@@ -324,10 +326,10 @@ def get_lfs(lf_each_thresh_path,
hint.rename(columns={'nctid': 'nct_id'}, inplace=True)
print(f"hint['label'].value_counts() {hint['label'].value_counts()}")

study_df = pd.read_csv(os.path.join(args.CTTI_PATH, 'studies.txt'), sep='|', low_memory=False)
study_df = pd.read_csv(os.path.join(args.CTTI_PATH, 'studies.txt.zip'), sep='|', low_memory=False)
study_df.dropna(subset=['phase'], inplace=True)

intervention_df = pd.read_csv(os.path.join(args.CTTI_PATH, 'interventions.txt'), sep='|', low_memory=False)
intervention_df = pd.read_csv(os.path.join(args.CTTI_PATH, 'interventions.txt.zip'), sep='|', low_memory=False)
intervention_df = intervention_df[intervention_df['intervention_type'].isin(['Drug', 'Biological'])]
study_df = study_df[study_df['nct_id'].isin(set(intervention_df['nct_id']))]

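Note (not part of the diff): the .txt -> .txt.zip renames above work without any other changes because pandas infers compression from the file extension, so the zipped pipe-delimited CTTI dumps are read directly. A minimal sketch, with a placeholder path for a local CTTI/AACT download:

import pandas as pd

CTTI_PATH = './CTTI_20241017/'  # hypothetical local dump directory

studies = pd.read_csv(CTTI_PATH + 'studies.txt.zip', sep='|', low_memory=False)
print(studies[['nct_id', 'overall_status', 'phase']].head())
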
98 changes: 53 additions & 45 deletions news_headlines/get_news2.py
@@ -1,15 +1,12 @@
import os
import sys
from tqdm.auto import tqdm, trange
from tqdm import tqdm, trange
from datetime import datetime, timedelta
import time
import pandas as pd
import numpy as np
import random
import json
import torch
import argparse
from transformers import pipeline
from sentence_transformers import SentenceTransformer, CrossEncoder
from gnews import GNews
# # append GNews to path, append the path to the GNews folder, in this case, the GNews folder is in the directory of the script
# sys.path.append(os.path.join(os.path.dirname(os.path.realpath(__file__)), 'GNews/'))
@@ -29,6 +26,7 @@
]

def getNewsData(query):
# getNewsData('NCT04994509 clinical trial')
headers = {
'User-Agent': random.choice(USERAGENT_LIST)
}
@@ -51,32 +49,30 @@ def getNewsData(query):
# print(json.dumps(news_results, indent=2))
return news_results

# getNewsData('NCT04994509 clinical trial')


def get_top_sponsors(sponsors, studies):
"""
Get the top 1000 most popular phase 3 industry sponsors
# def get_top_sponsors(sponsors, studies):
# """
# Get the top 1000 most popular phase 3 industry sponsors

sponsors: pd.DataFrame, sponsors.txt
studies: pd.DataFrame, studies.txt
Returns: pd.DataFrame, top 1000 most popular phase 3 industry sponsors
"""
# sponsors = pd.read_csv(args.CTTI_PATH + './CTTI/sponsors.txt', sep='|')
# studies = pd.read_csv(args.CTTI_PATH + './CTTI/studies.txt', sep='|', low_memory=False)
studies['study_first_submitted_date'] = pd.to_datetime(studies['study_first_submitted_date'])
sponsors = pd.merge(sponsors, studies[['nct_id', 'phase', 'study_first_submitted_date']], on='nct_id', how='left')
sponsors = sponsors[sponsors['agency_class']=='INDUSTRY']
sponsors.dropna(inplace=True)
sponsors = sponsors[sponsors['phase'].str.contains('Phase 3')]
top_sponsors = sponsors['name'].value_counts().head(1000)
# coverage_ = top_sponsors.sum() / sponsors['name'].value_counts().sum()
# print(coverage_) # 0.8548555767913166
combined = pd.merge(top_sponsors.reset_index(),
sponsors.groupby('name')['study_first_submitted_date'].min().reset_index(),
on='name', how='left')
return combined
# sponsors: pd.DataFrame, sponsors.txt.zip
# studies: pd.DataFrame, studies.txt.zip

# Returns: pd.DataFrame, top 1000 most popular phase 3 industry sponsors
# """
# # sponsors = pd.read_csv(args.CTTI_PATH + './CTTI/sponsors.txt.zip', sep='|')
# # studies = pd.read_csv(args.CTTI_PATH + './CTTI/studies.txt.zip', sep='|', low_memory=False)
# studies['study_first_submitted_date'] = pd.to_datetime(studies['study_first_submitted_date'])
# sponsors = pd.merge(sponsors, studies[['nct_id', 'phase', 'study_first_submitted_date']], on='nct_id', how='left')
# sponsors = sponsors[sponsors['agency_class']=='INDUSTRY']
# sponsors.dropna(inplace=True)
# sponsors = sponsors[sponsors['phase'].str.contains('Phase 3')]
# top_sponsors = sponsors['name'].value_counts().head(1000)
# # coverage_ = top_sponsors.sum() / sponsors['name'].value_counts().sum()
# # print(coverage_) # 0.8548555767913166
# combined = pd.merge(top_sponsors.reset_index(),
# sponsors.groupby('name')['study_first_submitted_date'].min().reset_index(),
# on='name', how='left')
# # return combined

if __name__ == '__main__':
parser = argparse.ArgumentParser()
@@ -91,40 +87,48 @@ def get_top_sponsors(sponsors, studies):
parser.add_argument('--SAVE_STUDY_TITLE_EMBEDDING_PATH', type=str, default='./studies_title2_embeddings.npy')
parser.add_argument('--SAVE_NEWS_PATH', type=str, default='./news.csv')
parser.add_argument('--SAVE_STUDY_NEWS_PATH', type=str, default='./studies_with_news.csv')
parser.add_argument('--NCT_IDS_TO_PROCESS', type=str, default=None)
args = parser.parse_args()
print(args)
assert args.mode in ['get_news', 'process_news', 'correspond_news_and_studies']

print(f'args.mode: {args.mode}')

continue_from_prev_log = args.continue_from_prev_log
studies = pd.read_csv(args.CTTI_PATH + 'studies.txt', sep='|', low_memory=False)
interventions = pd.read_csv(args.CTTI_PATH + 'interventions.txt', sep='|', low_memory=False)
print('Loading CTTI data')
studies = pd.read_csv(args.CTTI_PATH + 'studies.txt.zip', sep='|', low_memory=False)
interventions = pd.read_csv(args.CTTI_PATH + 'interventions.txt.zip', sep='|', low_memory=False)
print('Loaded CTTI data')

interventions = interventions[interventions['intervention_type'].isin(['DRUG', 'BIOLOGICAL'])]
studies = studies[studies['nct_id'].isin(interventions['nct_id'])]
studies = studies[studies['overall_status']=='COMPLETED']
studies.dropna(subset=['phase'], inplace=True)
studies = studies[studies['phase'].str.contains('1') | studies['phase'].str.contains('2') | studies['phase'].str.contains('3')]

device = 'cuda' if torch.cuda.is_available() else 'cpu'
sentiment_model = pipeline("text-classification", model=args.SENTIMENT_MODEL, device=device)
encoder = SentenceTransformer(args.SENTENCE_ENCODER)
crossencoder = CrossEncoder(args.SENTENCE_CROSSENCODER, max_length=512)

if args.NCT_IDS_TO_PROCESS is not None:
nct_ids_to_process = pd.read_csv(args.NCT_IDS_TO_PROCESS)
studies = studies[studies['nct_id'].isin(nct_ids_to_process['nct_id'])]
print(f'Processing {studies.shape[0]} studies')
# device = 'cuda' if torch.cuda.is_available() else 'cpu'
# sentiment_model = pipeline("text-classification", model=args.SENTIMENT_MODEL, device=device)
# encoder = SentenceTransformer(args.SENTENCE_ENCODER)
# crossencoder = CrossEncoder(args.SENTENCE_CROSSENCODER, max_length=512)
print('Loading GNews')
google_news = GNews()

print('Loaded GNews, ready to get news')
if args.mode == 'get_news': # warning: this will take a long time (multiple weeks)
for nct_id in tqdm(studies['nct_id']):
time.sleep(np.random.uniform(1, 5))
# print(f'Getting news for {nct_id}')
time.sleep(random.random()*5+4)
if os.path.exists(os.path.join(args.SAVE_NEWS_LOG_PATH, nct_id+".json")):
print(f'{nct_id} already exists')
# print(f'{nct_id} already exists')
with open(os.path.join(args.SAVE_NEWS_LOG_PATH, nct_id+".json"), 'rb') as f:
news = json.load(f)
if len(news) > 0:
print(f'Loaded {len(news)} news for {nct_id}')
continue

# news = google_news.get_news(f"{nct_id} clinical trial")
news = getNewsData(f"{nct_id} clinical trial")
news = google_news.get_news(f"{nct_id} clinical trial")
# news = getNewsData(f"{nct_id} clinical trial")
if news is None:
news = []
if len(news) > 0:
@@ -136,6 +140,8 @@ def get_top_sponsors(sponsors, studies):
# # ======================== Process the news data ========================
# elif args.mode == 'process_news':
# print('Processing news data')
# from transformers import pipeline
# from sentence_transformers import SentenceTransformer, CrossEncoder

# all_company_dfs = []
# for company in sorted(os.listdir(args.SAVE_NEWS_LOG_PATH)):
@@ -183,11 +189,13 @@ def get_top_sponsors(sponsors, studies):
# all_company_dfs.to_csv(args.SAVE_NEWS_PATH, index=False)

# elif args.mode == 'correspond_news_and_studies':
# from transformers import pipeline
# from sentence_transformers import SentenceTransformer, CrossEncoder
# news_df = pd.read_csv(args.SAVE_NEWS_PATH)
# news_title_embedding = np.load(args.SAVE_NEWS_EMBEDDING_PATH)
# top_sponsors = combined
# interventions = pd.read_csv(args.CTTI_PATH+'interventions.txt', sep='|')
# conditions = pd.read_csv(args.CTTI_PATH+'conditions.txt', sep='|')
# interventions = pd.read_csv(args.CTTI_PATH+'interventions.txt.zip', sep='|')
# conditions = pd.read_csv(args.CTTI_PATH+'conditions.txt.zip', sep='|')

# studies = studies[studies['nct_id'].isin(top_sponsors['nct_id'])]
# studies = studies[studies['nct_id'].isin(interventions['nct_id'])]
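Note (not part of the diff): the get_news branch above boils down to one GNews query per NCT ID, a randomized 4-9 second delay between requests, and a per-ID JSON log so the multi-week job can be resumed. A self-contained sketch under those assumptions (log directory and ID list are placeholders):

import json, os, random, time
from gnews import GNews

google_news = GNews()
log_dir = './news_headlines/news_logs'   # matches SAVE_NEWS_LOG_PATH in update_news.sh
nct_ids = ['NCT04994509']                # normally taken from studies.txt.zip

os.makedirs(log_dir, exist_ok=True)
for nct_id in nct_ids:
    out_path = os.path.join(log_dir, nct_id + '.json')
    if os.path.exists(out_path):         # resume: skip IDs already fetched
        continue
    time.sleep(random.random() * 5 + 4)  # randomized delay, roughly 4-9 s
    news = google_news.get_news(f'{nct_id} clinical trial') or []
    with open(out_path, 'w') as f:
        json.dump(news, f)
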
5 changes: 3 additions & 2 deletions update_news.sh
@@ -1,3 +1,4 @@
#!/bin/bash
# get_news paths

# HF_HOME="/srv/local/data/chufan2/huggingface/"
@@ -10,14 +11,14 @@ SAVE_NEWS_EMBEDDING_PATH="./news_headlines/news_title_embeddings.npy"
SAVE_STUDY_TITLE_EMBEDDING_PATH="./news_headlines/studies_title_embeddings.npy"
SAVE_NEWS_PATH="./news_headlines/news.csv"
SAVE_STUDY_NEWS_PATH="./news_headlines/studies_with_news.csv"
NCT_IDS_TO_PROCESS="./manual_labels/nctids.csv"

# SAVE_NEWS_LOG_PATH="./supplementary/news_headlines/news_logs"
# SAVE_NEWS_EMBEDDING_PATH="./supplementary/news_headlines/news_title_embeddings.npy"
# SAVE_STUDY_TITLE_EMBEDDING_PATH="./supplementary/news_headlines/studies_title_embeddings.npy"
# SAVE_NEWS_PATH="./supplementary/news_headlines/news.csv"
# SAVE_STUDY_NEWS_PATH="./supplementary/news_headlines/studies_with_news.csv"

# python news_headlines/get_news2.py --mode=get_news --continue_from_prev_log=$continue_from_prev_log --CTTI_PATH=$CTTI_PATH --SENTIMENT_MODEL=$SENTIMENT_MODEL --SENTENCE_ENCODER=$SENTENCE_ENCODER --SAVE_NEWS_LOG_PATH=$SAVE_NEWS_LOG_PATH --SAVE_NEWS_EMBEDDING_PATH=$SAVE_NEWS_EMBEDDING_PATH --SAVE_NEWS_PATH=$SAVE_NEWS_PATH --SAVE_STUDY_NEWS_PATH=$SAVE_STUDY_NEWS_PATH
python news_headlines/get_news2.py --mode=get_news --CTTI_PATH=$CTTI_PATH --SENTIMENT_MODEL=$SENTIMENT_MODEL --SENTENCE_ENCODER=$SENTENCE_ENCODER --SAVE_NEWS_LOG_PATH=$SAVE_NEWS_LOG_PATH --SAVE_NEWS_EMBEDDING_PATH=$SAVE_NEWS_EMBEDDING_PATH --SAVE_NEWS_PATH=$SAVE_NEWS_PATH --SAVE_STUDY_NEWS_PATH=$SAVE_STUDY_NEWS_PATH
python news_headlines/get_news2.py --mode=get_news --CTTI_PATH=$CTTI_PATH --SENTIMENT_MODEL=$SENTIMENT_MODEL --SENTENCE_ENCODER=$SENTENCE_ENCODER --SAVE_NEWS_LOG_PATH=$SAVE_NEWS_LOG_PATH --SAVE_NEWS_EMBEDDING_PATH=$SAVE_NEWS_EMBEDDING_PATH --SAVE_NEWS_PATH=$SAVE_NEWS_PATH --SAVE_STUDY_NEWS_PATH=$SAVE_STUDY_NEWS_PATH --NCT_IDS_TO_PROCESS=$NCT_IDS_TO_PROCESS
# python news_headlines/get_news.py --mode=process_news --CTTI_PATH=$CTTI_PATH --SENTIMENT_MODEL=$SENTIMENT_MODEL --SENTENCE_ENCODER=$SENTENCE_ENCODER --SAVE_NEWS_LOG_PATH=$SAVE_NEWS_LOG_PATH --SAVE_NEWS_EMBEDDING_PATH=$SAVE_NEWS_EMBEDDING_PATH --SAVE_NEWS_PATH=$SAVE_NEWS_PATH --SAVE_STUDY_NEWS_PATH=$SAVE_STUDY_NEWS_PATH
# python news_headlines/get_news.py --mode=correspond_news_and_studies --CTTI_PATH=$CTTI_PATH --SENTIMENT_MODEL=$SENTIMENT_MODEL --SENTENCE_ENCODER=$SENTENCE_ENCODER --SAVE_NEWS_LOG_PATH=$SAVE_NEWS_LOG_PATH --SAVE_NEWS_EMBEDDING_PATH=$SAVE_NEWS_EMBEDDING_PATH --SAVE_NEWS_PATH=$SAVE_NEWS_PATH --SAVE_STUDY_NEWS_PATH=$SAVE_STUDY_NEWS_PATH
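
Note (not part of the diff): the new --NCT_IDS_TO_PROCESS file is loaded with pd.read_csv and filtered on its nct_id column, so a minimal ./manual_labels/nctids.csv only needs that one column. Hypothetical contents (placeholder IDs):

nct_id
NCT04994509
NCT01234567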
