-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathsimilar_work.py
185 lines (141 loc) · 9.2 KB
/
similar_work.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
"""
IMPORTS
"""
import numpy as np
import pandas as pd
import requests
import re
import io
import time
from datetime import datetime as dt
"""
STUDY FIELDS TO QUERY
"""
# Set a search expression. Spaces are denoted as '+'. Boolean logic applies.
search_terms = """autism+OR+autism+spectrum+disorder+OR+Fragile+X+OR+Rett+syndrome+OR+tuberous+sclerosis+OR+Williams+syndrome+OR+
Praeder+Willi+syndrome+OR+Phelan+McDermid+syndrome+OR+Dup15q+OR+Angelman+OR+Timothy+syndrome+OR+16p+deletion+OR+16p+duplication+OR+ADNP"""
# Set study fields to query in the FDA's API. Each of these fields is a column in the final DataFrame.
search_fields_1 = ("""NCTId,Acronym,ArmGroupDescription,ArmGroupInterventionName,ArmGroupLabel,ArmGroupType,BriefSummary,BriefTitle,CentralContactEMail,CentralContactName,
CompletionDate,CompletionDateType,Condition,ConditionBrowseLeafAsFound,DesignAllocation,DesignInterventionModel,DesignMasking,DesignPrimaryPurpose,DesignWhoMasked""").replace(',','%2C')
search_fields_2 = ("""DetailedDescription,EligibilityCriteria,EnrollmentCount,EnrollmentType,Gender,HealthyVolunteers,
IPDSharing,InterventionArmGroupLabel,InterventionDescription,InterventionName,InterventionType,LastUpdatePostDate,LastUpdatePostDateType,LastUpdateSubmitDate,
LeadSponsorClass,LeadSponsorName,LocationCity,LocationCountry,LocationFacility""").replace(',','%2C')
search_fields_3 = ("""LocationState,MaximumAge,MinimumAge,OfficialTitle,OrgFullName,OverallOfficialAffiliation,OverallStatus,
OversightHasDMC,Phase,PrimaryCompletionDate,PrimaryCompletionDateType,PrimaryOutcomeDescription,PrimaryOutcomeMeasure,PrimaryOutcomeTimeFrame,
ReferenceCitation,ResponsiblePartyType,
ResultsFirstPostDate""").replace(',','%2C')
search_fields_4 = ("""ResultsFirstPostDateType,ResultsFirstSubmitDate,ResultsFirstSubmitQCDate,SecondaryOutcomeDescription,SecondaryOutcomeMeasure,SecondaryOutcomeTimeFrame,
StartDate,StartDateType,StatusVerifiedDate,StdAge,StudyFirstPostDate,StudyFirstPostDateType,StudyFirstSubmitDate,StudyFirstSubmitQCDate,StudyType,VersionHolder,WhyStopped""").replace(',','%2C')
# Set headers and format_type. CSV is used here, but this could also be changed to JSON.
headers = {"User-Agent": "Mozilla/5.0 (X11; CrOS x86_64 12871.102.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.141 Safari/537.36"}
format_type = 'csv'
"""
FUNCTIONS
"""
def compile_df(min_rank, max_rank):
"""
This function executes the requests to the clinical trials API. It assembles the 'URLs' to query, saves requests to variables,
and then builds dataframes to store the data. The FDA's API is limited in several respects; one can only query 20 study fields at a time.
Additionally, one can only pull data on 1000 trials per request. Thus, DataFrames are concatenated in both dimensions to assemble the final
unfiltered dataset.
INPUTS:
min_rank (int): The first row, in the returned dataset, to query.
max_rank (int): The maximum row, in the returned dataset, to query.
OUTPUTS:
df1, df2, df3, df4 (df): Four Pandas DataFrames; one for each 'search_fields' string.
"""
# Assemble the URLs to query. The Clinical Trials API uses these endpoints, and we just plug in the search_terms, search_fields, and the slice of rows to query.
base_url_1 = f"https://clinicaltrials.gov/api/query/study_fields?expr={search_terms}&fields={search_fields_1}&min_rnk={min_rank}&max_rnk={max_rank}&fmt={format_type}"
base_url_2 = f"https://clinicaltrials.gov/api/query/study_fields?expr={search_terms}&fields={search_fields_2}&min_rnk={min_rank}&max_rnk={max_rank}&fmt={format_type}"
base_url_3 = f"https://clinicaltrials.gov/api/query/study_fields?expr={search_terms}&fields={search_fields_3}&min_rnk={min_rank}&max_rnk={max_rank}&fmt={format_type}"
base_url_4 = f"https://clinicaltrials.gov/api/query/study_fields?expr={search_terms}&fields={search_fields_4}&min_rnk={min_rank}&max_rnk={max_rank}&fmt={format_type}"
# Use requests to get each of the URLs, and save the returned data to a set of four variables, one for each search_terms string.
response_1 = requests.get(base_url_1, headers=headers).content
time.sleep(3)
response_2 = requests.get(base_url_2, headers=headers).content
time.sleep(3)
response_3 = requests.get(base_url_3, headers=headers).content
time.sleep(3)
response_4 = requests.get(base_url_4, headers=headers).content
time.sleep(3)
# Build the Pandas DataFrames from the returned data.
df1 = pd.read_csv(io.StringIO(response_1.decode('utf-8')), skiprows=10)
df2 = pd.read_csv(io.StringIO(response_2.decode('utf-8')), skiprows=10)
df3 = pd.read_csv(io.StringIO(response_3.decode('utf-8')), skiprows=10)
df4 = pd.read_csv(io.StringIO(response_4.decode('utf-8')), skiprows=10)
# Return the DataFrames.
return df1, df2, df3, df4
def build_dataframes():
"""
This function determines the number of trials, based on a 'test request,' and then executes the compile_dataframes function in an iterative manner.
INPUTS:
min_rank (int): The first row, in the returned dataset, to query.
max_rank (int): The maximum row, in the returned dataset, to query.
OUTPUTS:
df (DataFrame): A single DataFrame containing all rows and columns corresponding to the scraped clinical trial data.
"""
test_url = """https://clinicaltrials.gov/api/query/study_fields?expr=autism+OR+autism+spectrum+disorder+OR+Fragile+X+OR+Rett+syndrome+OR+tuberous+sclerosis+OR+Williams+syndrome+OR+
Praeder+Willi+syndrome+OR+Phelan+McDermid+syndrome+OR+Dup15q+OR+Angelman+OR+Timothy+syndrome+OR+16p+deletion+OR+16p+duplication+OR+ADNP+&fields=NCTId&min_rnk=1&max_rnk=3&fmt=csv"""
response_test = requests.get(test_url, headers=headers).content.decode('utf-8')
number_of_studies = int(re.search("(?<=NStudiesFound: )\d\d\d\d", response_test).group(0))
print("Number of studies found: " + str(number_of_studies))
min_rank = 1
if 1000 <= number_of_studies:
max_rank = 1000
else:
max_rank = number_of_studies
print("Max rank set to: " + str(max_rank))
df1x, df2x, df3x, df4x = compile_df(min_rank, max_rank)
df_x = pd.concat((df1x, df2x, df3x, df4x), axis=1, ignore_index=False) # First 1000 studies
print("Length of df_x: " + str(len(df_x)))
if 2000 <= number_of_studies:
max_rank += 1000
min_rank += 1000
else:
min_rank += 1000
max_rank = number_of_studies
df1y, df2y, df3y, df4y = compile_df(min_rank, max_rank)
df_y = pd.concat((df1y, df2y, df3y, df4y), axis=1, ignore_index=False) # Up to study 2000
print("Max rank set to: " + str(max_rank))
print("Length of df_y: " + str(len(df_y)))
if 3000 <= number_of_studies:
max_rank += 1000
min_rank += 1000
else:
min_rank += 1000
max_rank = number_of_studies
df1z, df2z, df3z, df4z = compile_df(min_rank, max_rank)
df_z = pd.concat((df1z, df2z, df3z, df4z), axis=1, ignore_index=False) # Up to study 3000
print("Max rank set to: " + str(max_rank))
print("Length of df_z: " + str(len(df_z)))
df = pd.concat([df_x, df_y, df_z], sort=False)
df.reset_index(inplace=True)
return df
def clean_dataframes(df):
"""
This function takes a DataFrame as input and cleans the data. Specifically, it returns only trials that are Phase II+,
that include Drugs as the intervention (not behavioral interventions), and that contain details on one of the queried conditions.
It also creates a 'Placebo' column, based on text in the 'ArmGroupInterventionName' column.
INPUTS:
df (DataFrame): A DataFrame containing the raw clinical trial data scraped from ClinicalTrials.gov.
OUTPUTS:
df_phase_drugs_extra_filter (DataFrame): A single DataFrame containing cleaned data from the input DataFrame.
"""
# Remove all trials that do not have a 'Phase' explicitly listed
df_phase = df[df['Phase'].notna()]
# Remove trials that say 'Not Applicable' for Phase data.
df_phase = df_phase[~df_phase['Phase'].str.contains("Applicab")]
# Remove Phase 1 clinical trials
df_phase = df_phase[~df_phase['Phase'].str.contains("1")]
# Remove trials that do not include 'drug' as an intervention type
df_phase_drugs = df_phase[df_phase['InterventionType'].str.contains("Drug", na=False)]
# Remove open label / non-placebo trials
df_phase_drugs_placebo = df_phase_drugs[~df_phase_drugs['DesignMasking'].str.contains("None", na=False)]
# Perform extra filtering, as some trials appear for certain lymphomas, etc.
extra_filter_mask = df_phase_drugs_placebo['ConditionBrowseLeafAsFound'].str.contains('Autis|autis|Rett|Angelman|Tuberous|sclerosis|Phelan|McDermid|15q|Fragile X|Asperger|Williams|Pervasive Developmental Disorder|16p|ADNP', regex=True, na=False)
s = pd.Series(extra_filter_mask)
df_phase_drugs_extra_filter = df_phase_drugs_placebo[s.values]
# Populate a placebo column automatically, based solely on ArmGroupInterventionName column (maybe not flawless)
df_phase_drugs_extra_filter['Placebo'] = df_phase_drugs_extra_filter['ArmGroupInterventionName'].apply(lambda x: 'Yes' if 'Placebo' in str(x) else 'No')
# Export to a .csv file, name it as 'master' with the current date.
return df_phase_drugs_extra_filter