Commit

merge
Jathurshan0330 committed Oct 29, 2024
2 parents e6679d1 + 2364368 commit de0d5ba
Showing 3 changed files with 74 additions and 63 deletions.
34 changes: 18 additions & 16 deletions labeling/lfs.py
@@ -21,22 +21,22 @@ def reorder_columns(df, cols_in_front):
return df[columns]

def lf_results_reported(path):
df = pd.read_csv(path + 'calculated_values.txt', sep='|', low_memory=False)
df = pd.read_csv(path + 'calculated_values.txt.zip', sep='|', low_memory=False)
df['lf'] = df['were_results_reported'] == 't'
df['lf'] = df['lf'].astype('int')
df = reorder_columns(df, ['nct_id', 'lf'])
return df

def lf_num_sponsors(path, quantile=.5):
df = pd.read_csv(path + 'sponsors.txt', sep='|',low_memory=False)
df = pd.read_csv(path + 'sponsors.txt.zip', sep='|',low_memory=False)
df = df.groupby('nct_id')['name'].count().reset_index()
df['lf'] = df['name'] > df['name'].quantile(quantile)
df['lf'] = df['lf'].fillna(-1).astype('int')
df = reorder_columns(df, ['nct_id', 'lf'])
return df

def lf_num_patients(path, quantile=.5):
df = pd.read_csv(path + 'outcome_counts.txt', sep='|', low_memory=False)
df = pd.read_csv(path + 'outcome_counts.txt.zip', sep='|', low_memory=False)
df = df.groupby('nct_id').sum().reset_index() # pd df (NCTID, values, num_patients)
df['lf'] = df['count'] > df['count'].quantile(quantile)
df['lf'] = df['lf'].fillna(-1).astype('int')
@@ -45,7 +45,7 @@ def lf_num_patients(path, quantile=.5):

def lf_patient_drop(path, quantile=.5):
# patient dropout
df = pd.read_csv(os.path.join(path, 'drop_withdrawals.txt'), sep='|',low_memory=False)
df = pd.read_csv(os.path.join(path, 'drop_withdrawals.txt.zip'), sep='|',low_memory=False)
df = df.groupby('nct_id').sum().reset_index() # pd df (NCTID, values, patient_drop)
df['lf'] = df['count'] < df['count'].quantile(quantile)
df['lf'] = df['lf'].fillna(-1).astype('int')
@@ -54,7 +54,7 @@ def lf_patient_drop(path, quantile=.5):

def lf_sites(path, quantile=.5):
# sites
df = pd.read_csv(os.path.join(path, 'facilities.txt'), sep='|',low_memory=False)
df = pd.read_csv(os.path.join(path, 'facilities.txt.zip'), sep='|',low_memory=False)
df = df.groupby('nct_id')['name'].count().sort_values(ascending=False).reset_index()
df = df.groupby('nct_id').mean().reset_index() # pd df (NCTID, values, sites)
df['lf'] = df['name'] > df['name'].quantile(quantile)
@@ -64,8 +64,8 @@ def lf_sites(path, quantile=.5):

def lf_pvalues(path, quantile=.5):
# pvalues
df = pd.read_csv(os.path.join(path, 'outcome_analyses.txt'), sep='|', low_memory=False)
outcomes_df = pd.read_csv('../CTTI_20241017/outcomes.txt', sep='|')
df = pd.read_csv(os.path.join(path, 'outcome_analyses.txt.zip'), sep='|', low_memory=False)
outcomes_df = pd.read_csv(os.path.join(path, 'outcomes.txt.zip'), sep='|')
primary_outcomes = outcomes_df[outcomes_df['outcome_type']=='PRIMARY']
df = df[df['outcome_id'].isin(primary_outcomes['id'])]

@@ -77,7 +77,7 @@ def lf_pvalues(path, quantile=.5):
return df

def lf_update_more_recent(path, quantile=.5): #TODO clarify what this does
df = pd.read_csv(os.path.join(path, 'studies.txt'), sep='|', low_memory=False)
df = pd.read_csv(os.path.join(path, 'studies.txt.zip'), sep='|', low_memory=False)
df['last_update_submitted_date'] = pd.to_datetime(df['last_update_submitted_date'])
df['completion_date'] = pd.to_datetime(df['completion_date'])
df['update_days'] = (df['last_update_submitted_date'] - df['completion_date']).dt.days
@@ -89,7 +89,7 @@ def lf_update_more_recent(path, quantile=.5): #TODO clarify what this does
return df

def lf_death_ae(path, quantile=.5):
df = pd.read_csv(path+'reported_event_totals.txt', sep = '|', low_memory=False)
df = pd.read_csv(path+'reported_event_totals.txt.zip', sep = '|', low_memory=False)
df = df[df['event_type'] == 'deaths'].fillna(0)
df = df.groupby('nct_id')['subjects_affected'].sum().reset_index()
df['lf'] = df['subjects_affected'] <= df['subjects_affected'].quantile(quantile)
@@ -98,7 +98,7 @@ def lf_death_ae(path, quantile=.5):
return df

def lf_serious_ae(path, quantile=.5):
df = pd.read_csv(path+'reported_event_totals.txt', sep = '|', low_memory=False)
df = pd.read_csv(path+'reported_event_totals.txt.zip', sep = '|', low_memory=False)
df = df[df['event_type'] == 'serious'].fillna(0)
df = df.groupby('nct_id')['subjects_affected'].sum().reset_index()
df['lf'] = df['subjects_affected'] <= df['subjects_affected'].quantile(quantile)
@@ -107,18 +107,20 @@ def lf_serious_ae(path, quantile=.5):
return df

def lf_all_ae(path, quantile=.5):
df = pd.read_csv(path+'reported_event_totals.txt', sep = '|', low_memory=False).fillna(0)
df = pd.read_csv(path+'reported_event_totals.txt.zip', sep = '|', low_memory=False).fillna(0)
df = df.groupby('nct_id')['subjects_affected'].sum().reset_index()
df['lf'] = df['subjects_affected'] <= df['subjects_affected'].quantile(quantile)
df['lf'] = df['lf'].fillna(-1).astype('int')
df = reorder_columns(df, ['nct_id', 'lf'])
return df

def lf_status(path):
df = pd.read_csv(path+'studies.txt', sep='|', low_memory=False)
df = pd.read_csv(path+'studies.txt.zip', sep='|', low_memory=False)
df['lf'] = -1
df.loc[df['overall_status'].isin(['Terminated', 'Withdrawn', 'Suspended', 'Withheld', 'No longer available', 'Temporarily not available']),['lf']] = 0
df.loc[df['overall_status'].isin(['Approved for marketing']),['lf']] = 1
# lower case all status and replace '_' with ' '
df['overall_status'] = df['overall_status'].str.lower().str.replace('_', ' ')
df.loc[df['overall_status'].isin(['terminated', 'withdrawn', 'suspended', 'withheld', 'no longer available', 'temporarily not available']),['lf']] = 0
df.loc[df['overall_status'].isin(['approved for marketing']),['lf']] = 1
df['lf'] = df['lf'].fillna(-1).astype('int')
df = reorder_columns(df, ['nct_id', 'lf'])
return df
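
Note (not part of the diff): the lower-casing step added to lf_status makes the matching robust to both the legacy capitalized labels and the newer upper-snake-case CTTI labels. A minimal sketch of the effect, with assumed example values:

import pandas as pd

# hypothetical mix of legacy and current CTTI status spellings
df = pd.DataFrame({'overall_status': ['TERMINATED', 'Withdrawn', 'APPROVED_FOR_MARKETING', 'COMPLETED']})
df['overall_status'] = df['overall_status'].str.lower().str.replace('_', ' ')
df['lf'] = -1
df.loc[df['overall_status'].isin(['terminated', 'withdrawn', 'suspended', 'withheld',
                                  'no longer available', 'temporarily not available']), 'lf'] = 0
df.loc[df['overall_status'] == 'approved for marketing', 'lf'] = 1
print(df['lf'].tolist())  # -> [0, 0, 1, -1]
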
@@ -324,10 +326,10 @@ def get_lfs(lf_each_thresh_path,
hint.rename(columns={'nctid': 'nct_id'}, inplace=True)
print(f"hint['label'].value_counts() {hint['label'].value_counts()}")

study_df = pd.read_csv(os.path.join(args.CTTI_PATH, 'studies.txt'), sep='|', low_memory=False)
study_df = pd.read_csv(os.path.join(args.CTTI_PATH, 'studies.txt.zip'), sep='|', low_memory=False)
study_df.dropna(subset=['phase'], inplace=True)

intervention_df = pd.read_csv(os.path.join(args.CTTI_PATH, 'interventions.txt'), sep='|', low_memory=False)
intervention_df = pd.read_csv(os.path.join(args.CTTI_PATH, 'interventions.txt.zip'), sep='|', low_memory=False)
intervention_df = intervention_df[intervention_df['intervention_type'].isin(['Drug', 'Biological'])]
study_df = study_df[study_df['nct_id'].isin(set(intervention_df['nct_id']))]

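Note (not part of the diff): the .txt -> .txt.zip renames above work without any other changes because pandas infers compression from the file extension, so the zipped pipe-delimited CTTI dumps are read directly. A minimal sketch, with a placeholder path for a local CTTI/AACT download:

import pandas as pd

CTTI_PATH = './CTTI_20241017/'  # hypothetical local dump directory

studies = pd.read_csv(CTTI_PATH + 'studies.txt.zip', sep='|', low_memory=False)
print(studies[['nct_id', 'overall_status', 'phase']].head())
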
98 changes: 53 additions & 45 deletions news_headlines/get_news2.py
@@ -1,15 +1,12 @@
import os
import sys
from tqdm.auto import tqdm, trange
from tqdm import tqdm, trange
from datetime import datetime, timedelta
import time
import pandas as pd
import numpy as np
import random
import json
import torch
import argparse
from transformers import pipeline
from sentence_transformers import SentenceTransformer, CrossEncoder
from gnews import GNews
# # append GNews to path, append the path to the GNews folder, in this case, the GNews folder is in the directory of the script
# sys.path.append(os.path.join(os.path.dirname(os.path.realpath(__file__)), 'GNews/'))
@@ -29,6 +26,7 @@
]

def getNewsData(query):
# getNewsData('NCT04994509 clinical trial')
headers = {
'User-Agent': random.choice(USERAGENT_LIST)
}
@@ -51,32 +49,30 @@ def getNewsData(query):
# print(json.dumps(news_results, indent=2))
return news_results

# getNewsData('NCT04994509 clinical trial')


def get_top_sponsors(sponsors, studies):
"""
Get the top 1000 most popular phase 3 industry sponsors
# def get_top_sponsors(sponsors, studies):
# """
# Get the top 1000 most popular phase 3 industry sponsors

sponsors: pd.DataFrame, sponsors.txt
studies: pd.DataFrame, studies.txt
Returns: pd.DataFrame, top 1000 most popular phase 3 industry sponsors
"""
# sponsors = pd.read_csv(args.CTTI_PATH + './CTTI/sponsors.txt', sep='|')
# studies = pd.read_csv(args.CTTI_PATH + './CTTI/studies.txt', sep='|', low_memory=False)
studies['study_first_submitted_date'] = pd.to_datetime(studies['study_first_submitted_date'])
sponsors = pd.merge(sponsors, studies[['nct_id', 'phase', 'study_first_submitted_date']], on='nct_id', how='left')
sponsors = sponsors[sponsors['agency_class']=='INDUSTRY']
sponsors.dropna(inplace=True)
sponsors = sponsors[sponsors['phase'].str.contains('Phase 3')]
top_sponsors = sponsors['name'].value_counts().head(1000)
# coverage_ = top_sponsors.sum() / sponsors['name'].value_counts().sum()
# print(coverage_) # 0.8548555767913166
combined = pd.merge(top_sponsors.reset_index(),
sponsors.groupby('name')['study_first_submitted_date'].min().reset_index(),
on='name', how='left')
return combined
# sponsors: pd.DataFrame, sponsors.txt.zip
# studies: pd.DataFrame, studies.txt.zip

# Returns: pd.DataFrame, top 1000 most popular phase 3 industry sponsors
# """
# # sponsors = pd.read_csv(args.CTTI_PATH + './CTTI/sponsors.txt.zip', sep='|')
# # studies = pd.read_csv(args.CTTI_PATH + './CTTI/studies.txt.zip', sep='|', low_memory=False)
# studies['study_first_submitted_date'] = pd.to_datetime(studies['study_first_submitted_date'])
# sponsors = pd.merge(sponsors, studies[['nct_id', 'phase', 'study_first_submitted_date']], on='nct_id', how='left')
# sponsors = sponsors[sponsors['agency_class']=='INDUSTRY']
# sponsors.dropna(inplace=True)
# sponsors = sponsors[sponsors['phase'].str.contains('Phase 3')]
# top_sponsors = sponsors['name'].value_counts().head(1000)
# # coverage_ = top_sponsors.sum() / sponsors['name'].value_counts().sum()
# # print(coverage_) # 0.8548555767913166
# combined = pd.merge(top_sponsors.reset_index(),
# sponsors.groupby('name')['study_first_submitted_date'].min().reset_index(),
# on='name', how='left')
# # return combined

if __name__ == '__main__':
parser = argparse.ArgumentParser()
@@ -91,40 +87,48 @@ def get_top_sponsors(sponsors, studies):
parser.add_argument('--SAVE_STUDY_TITLE_EMBEDDING_PATH', type=str, default='./studies_title2_embeddings.npy')
parser.add_argument('--SAVE_NEWS_PATH', type=str, default='./news.csv')
parser.add_argument('--SAVE_STUDY_NEWS_PATH', type=str, default='./studies_with_news.csv')
parser.add_argument('--NCT_IDS_TO_PROCESS', type=str, default=None)
args = parser.parse_args()
print(args)
assert args.mode in ['get_news', 'process_news', 'correspond_news_and_studies']

print(f'args.mode: {args.mode}')

continue_from_prev_log = args.continue_from_prev_log
studies = pd.read_csv(args.CTTI_PATH + 'studies.txt', sep='|', low_memory=False)
interventions = pd.read_csv(args.CTTI_PATH + 'interventions.txt', sep='|', low_memory=False)
print('Loading CTTI data')
studies = pd.read_csv(args.CTTI_PATH + 'studies.txt.zip', sep='|', low_memory=False)
interventions = pd.read_csv(args.CTTI_PATH + 'interventions.txt.zip', sep='|', low_memory=False)
print('Loaded CTTI data')

interventions = interventions[interventions['intervention_type'].isin(['DRUG', 'BIOLOGICAL'])]
studies = studies[studies['nct_id'].isin(interventions['nct_id'])]
studies = studies[studies['overall_status']=='COMPLETED']
studies.dropna(subset=['phase'], inplace=True)
studies = studies[studies['phase'].str.contains('1') | studies['phase'].str.contains('2') | studies['phase'].str.contains('3')]

device = 'cuda' if torch.cuda.is_available() else 'cpu'
sentiment_model = pipeline("text-classification", model=args.SENTIMENT_MODEL, device=device)
encoder = SentenceTransformer(args.SENTENCE_ENCODER)
crossencoder = CrossEncoder(args.SENTENCE_CROSSENCODER, max_length=512)

if args.NCT_IDS_TO_PROCESS is not None:
nct_ids_to_process = pd.read_csv(args.NCT_IDS_TO_PROCESS)
studies = studies[studies['nct_id'].isin(nct_ids_to_process['nct_id'])]
print(f'Processing {studies.shape[0]} studies')
# device = 'cuda' if torch.cuda.is_available() else 'cpu'
# sentiment_model = pipeline("text-classification", model=args.SENTIMENT_MODEL, device=device)
# encoder = SentenceTransformer(args.SENTENCE_ENCODER)
# crossencoder = CrossEncoder(args.SENTENCE_CROSSENCODER, max_length=512)
print('Loading GNews')
google_news = GNews()

print('Loaded GNews, ready to get news')
if args.mode == 'get_news': # warning: this will take a long time (multiple weeks)
for nct_id in tqdm(studies['nct_id']):
time.sleep(np.random.uniform(1, 5))
# print(f'Getting news for {nct_id}')
time.sleep(random.random()*5+4)
if os.path.exists(os.path.join(args.SAVE_NEWS_LOG_PATH, nct_id+".json")):
print(f'{nct_id} already exists')
# print(f'{nct_id} already exists')
with open(os.path.join(args.SAVE_NEWS_LOG_PATH, nct_id+".json"), 'rb') as f:
news = json.load(f)
if len(news) > 0:
print(f'Loaded {len(news)} news for {nct_id}')
continue

# news = google_news.get_news(f"{nct_id} clinical trial")
news = getNewsData(f"{nct_id} clinical trial")
news = google_news.get_news(f"{nct_id} clinical trial")
# news = getNewsData(f"{nct_id} clinical trial")
if news is None:
news = []
if len(news) > 0:
@@ -136,6 +140,8 @@ def get_top_sponsors(sponsors, studies):
# # ======================== Process the news data ========================
# elif args.mode == 'process_news':
# print('Processing news data')
# from transformers import pipeline
# from sentence_transformers import SentenceTransformer, CrossEncoder

# all_company_dfs = []
# for company in sorted(os.listdir(args.SAVE_NEWS_LOG_PATH)):
@@ -183,11 +189,13 @@ def get_top_sponsors(sponsors, studies):
# all_company_dfs.to_csv(args.SAVE_NEWS_PATH, index=False)

# elif args.mode == 'correspond_news_and_studies':
# from transformers import pipeline
# from sentence_transformers import SentenceTransformer, CrossEncoder
# news_df = pd.read_csv(args.SAVE_NEWS_PATH)
# news_title_embedding = np.load(args.SAVE_NEWS_EMBEDDING_PATH)
# top_sponsors = combined
# interventions = pd.read_csv(args.CTTI_PATH+'interventions.txt', sep='|')
# conditions = pd.read_csv(args.CTTI_PATH+'conditions.txt', sep='|')
# interventions = pd.read_csv(args.CTTI_PATH+'interventions.txt.zip', sep='|')
# conditions = pd.read_csv(args.CTTI_PATH+'conditions.txt.zip', sep='|')

# studies = studies[studies['nct_id'].isin(top_sponsors['nct_id'])]
# studies = studies[studies['nct_id'].isin(interventions['nct_id'])]
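Note (not part of the diff): the get_news branch above boils down to one GNews query per NCT ID, a randomized 4-9 second delay between requests, and a per-ID JSON log so the multi-week job can be resumed. A self-contained sketch under those assumptions (log directory and ID list are placeholders):

import json, os, random, time
from gnews import GNews

google_news = GNews()
log_dir = './news_headlines/news_logs'   # matches SAVE_NEWS_LOG_PATH in update_news.sh
nct_ids = ['NCT04994509']                # normally taken from studies.txt.zip

os.makedirs(log_dir, exist_ok=True)
for nct_id in nct_ids:
    out_path = os.path.join(log_dir, nct_id + '.json')
    if os.path.exists(out_path):         # resume: skip IDs already fetched
        continue
    time.sleep(random.random() * 5 + 4)  # randomized delay, roughly 4-9 s
    news = google_news.get_news(f'{nct_id} clinical trial') or []
    with open(out_path, 'w') as f:
        json.dump(news, f)
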
5 changes: 3 additions & 2 deletions update_news.sh
@@ -1,3 +1,4 @@
#!/bin/bash
# get_news paths

# HF_HOME="/srv/local/data/chufan2/huggingface/"
@@ -10,14 +11,14 @@ SAVE_NEWS_EMBEDDING_PATH="./news_headlines/news_title_embeddings.npy"
SAVE_STUDY_TITLE_EMBEDDING_PATH="./news_headlines/studies_title_embeddings.npy"
SAVE_NEWS_PATH="./news_headlines/news.csv"
SAVE_STUDY_NEWS_PATH="./news_headlines/studies_with_news.csv"
NCT_IDS_TO_PROCESS="./manual_labels/nctids.csv"

# SAVE_NEWS_LOG_PATH="./supplementary/news_headlines/news_logs"
# SAVE_NEWS_EMBEDDING_PATH="./supplementary/news_headlines/news_title_embeddings.npy"
# SAVE_STUDY_TITLE_EMBEDDING_PATH="./supplementary/news_headlines/studies_title_embeddings.npy"
# SAVE_NEWS_PATH="./supplementary/news_headlines/news.csv"
# SAVE_STUDY_NEWS_PATH="./supplementary/news_headlines/studies_with_news.csv"

# python news_headlines/get_news2.py --mode=get_news --continue_from_prev_log=$continue_from_prev_log --CTTI_PATH=$CTTI_PATH --SENTIMENT_MODEL=$SENTIMENT_MODEL --SENTENCE_ENCODER=$SENTENCE_ENCODER --SAVE_NEWS_LOG_PATH=$SAVE_NEWS_LOG_PATH --SAVE_NEWS_EMBEDDING_PATH=$SAVE_NEWS_EMBEDDING_PATH --SAVE_NEWS_PATH=$SAVE_NEWS_PATH --SAVE_STUDY_NEWS_PATH=$SAVE_STUDY_NEWS_PATH
python news_headlines/get_news2.py --mode=get_news --CTTI_PATH=$CTTI_PATH --SENTIMENT_MODEL=$SENTIMENT_MODEL --SENTENCE_ENCODER=$SENTENCE_ENCODER --SAVE_NEWS_LOG_PATH=$SAVE_NEWS_LOG_PATH --SAVE_NEWS_EMBEDDING_PATH=$SAVE_NEWS_EMBEDDING_PATH --SAVE_NEWS_PATH=$SAVE_NEWS_PATH --SAVE_STUDY_NEWS_PATH=$SAVE_STUDY_NEWS_PATH
python news_headlines/get_news2.py --mode=get_news --CTTI_PATH=$CTTI_PATH --SENTIMENT_MODEL=$SENTIMENT_MODEL --SENTENCE_ENCODER=$SENTENCE_ENCODER --SAVE_NEWS_LOG_PATH=$SAVE_NEWS_LOG_PATH --SAVE_NEWS_EMBEDDING_PATH=$SAVE_NEWS_EMBEDDING_PATH --SAVE_NEWS_PATH=$SAVE_NEWS_PATH --SAVE_STUDY_NEWS_PATH=$SAVE_STUDY_NEWS_PATH --NCT_IDS_TO_PROCESS=$NCT_IDS_TO_PROCESS
# python news_headlines/get_news.py --mode=process_news --CTTI_PATH=$CTTI_PATH --SENTIMENT_MODEL=$SENTIMENT_MODEL --SENTENCE_ENCODER=$SENTENCE_ENCODER --SAVE_NEWS_LOG_PATH=$SAVE_NEWS_LOG_PATH --SAVE_NEWS_EMBEDDING_PATH=$SAVE_NEWS_EMBEDDING_PATH --SAVE_NEWS_PATH=$SAVE_NEWS_PATH --SAVE_STUDY_NEWS_PATH=$SAVE_STUDY_NEWS_PATH
# python news_headlines/get_news.py --mode=correspond_news_and_studies --CTTI_PATH=$CTTI_PATH --SENTIMENT_MODEL=$SENTIMENT_MODEL --SENTENCE_ENCODER=$SENTENCE_ENCODER --SAVE_NEWS_LOG_PATH=$SAVE_NEWS_LOG_PATH --SAVE_NEWS_EMBEDDING_PATH=$SAVE_NEWS_EMBEDDING_PATH --SAVE_NEWS_PATH=$SAVE_NEWS_PATH --SAVE_STUDY_NEWS_PATH=$SAVE_STUDY_NEWS_PATH
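
Note (not part of the diff): the new --NCT_IDS_TO_PROCESS file is loaded with pd.read_csv and filtered on its nct_id column, so a minimal ./manual_labels/nctids.csv only needs that one column. Hypothetical contents (placeholder IDs):

nct_id
NCT04994509
NCT01234567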
