-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathXGBOOST.py
107 lines (88 loc) · 4.1 KB
/
XGBOOST.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
import keras as ks
from keras import Model
import pandas as pd
from sklearn.model_selection import train_test_split
from keras.layers import Input, Dense,Dropout,BatchNormalization
from sklearn.model_selection import KFold
from keras import regularizers
from keras.optimizers import SGD
from keras.constraints import UnitNorm
import os
from xgboost import XGBClassifier
import numpy as np
def get_threshold_metrics(y_true, y_pred, drop_intermediate=False,
                          disease='all'):
    """
    Retrieve true/false positive rates and auroc/aupr for class predictions

    Arguments:
    y_true - an array of gold standard mutation status
    y_pred - an array of predicted mutation status
    drop_intermediate - forwarded unchanged to sklearn's roc_curve; when True,
                        roc_curve may drop suboptimal thresholds from the
                        returned ROC table
    disease - a string that includes the corresponding TCGA study acronym

    Output:
    dict of AUROC, AUPR, pandas dataframes of ROC and PR data, and cancer-type
    """
    import pandas as pd
    from sklearn.metrics import roc_auc_score, roc_curve
    from sklearn.metrics import precision_recall_curve, average_precision_score

    roc_columns = ['fpr', 'tpr', 'threshold']
    pr_columns = ['precision', 'recall', 'threshold']

    # Pass the flag straight through. The previous branching handed
    # roc_curve the opposite of what the caller asked for.
    roc_items = zip(roc_columns,
                    roc_curve(y_true, y_pred,
                              drop_intermediate=drop_intermediate))
    roc_df = pd.DataFrame.from_dict(dict(roc_items))

    prec, rec, thresh = precision_recall_curve(y_true, y_pred)
    pr_df = pd.DataFrame.from_records([prec, rec]).T
    # The threshold series is concatenated column-wise; if it is shorter than
    # the precision/recall arrays, pandas pads the missing rows with NaN.
    pr_df = pd.concat([pr_df, pd.Series(thresh)], ignore_index=True, axis=1)
    pr_df.columns = pr_columns

    auroc = roc_auc_score(y_true, y_pred, average='weighted')
    aupr = average_precision_score(y_true, y_pred, average='weighted')

    return {'auroc': auroc, 'aupr': aupr, 'roc_df': roc_df,
            'pr_df': pr_df, 'disease': disease}
# Expose GPU devices 0 and 1 to this process and echo the setting to stdout.
gpu_id = '0,1'
os.environ['CUDA_VISIBLE_DEVICES'] = str(gpu_id)
os.system('echo $CUDA_VISIBLE_DEVICES')
ks.backend.clear_session()

print('reading')
# Features and labels stay as DataFrames: the cross-validation and reporting
# code further down selects rows with .iloc/.loc and reuses the row index,
# none of which exist on a bare ndarray.
x_df = pd.read_csv('genefile.csv', index_col=0, header=0)
y_df = pd.read_csv('statefile.csv', index_col=0, header=0)
strat = pd.read_csv('stratfile.csv', index_col=0, header=0)
print('read down')

# Only the first `input_dim` feature columns are used.
input_dim = 2048
# Hold out 10% of the samples, stratified on the contents of stratfile.csv.
x_train_all, x_test_all, y_train_all, y_test_all = train_test_split(
    x_df.iloc[:, :input_dim], y_df,
    test_size=0.1, random_state=0, stratify=strat)
# Seed NumPy's global RNG so repeated runs are comparable.
seed = 1
np.random.seed(seed)

# Accumulators kept from the original script; nothing in this file fills them.
all_specificity = list()
all_precision = list()
all_sensitivity = list()
accurancy_sum = 0
loss_sum = 0

# 5-fold shuffled cross-validation. `shuffle` is passed by keyword because
# recent scikit-learn releases reject it as a positional argument.
k_fold = KFold(n_splits=5, shuffle=True, random_state=1)
index = k_fold.split(X=x_train_all, y=y_train_all)

# Original row labels of the training split, used to label per-fold outputs.
index_train_all = np.array(x_train_all.index)
cv_results_df = list()
model = XGBClassifier()
# For every fold: fit the classifier on the fitting rows, predict the held-out
# rows, and store predictions next to the true labels, indexed by the original
# row labels of the held-out samples.
for fit_rows, holdout_rows in index:
    holdout_labels = np.array(index_train_all[holdout_rows])

    X_train = np.array(x_train_all.iloc[fit_rows, :])
    y_train = np.array(y_train_all.iloc[fit_rows, :])
    X_cv = np.array(x_train_all.iloc[holdout_rows, :])
    y_cv = np.array(y_train_all.iloc[holdout_rows, :])

    print("Training --------------------")
    model.fit(X_train, y_train)

    # Turn the flat prediction vector into a single column so it can sit
    # beside the label column.
    y_pred = np.array(model.predict(X_cv)).reshape(-1, 1)
    print(y_pred)

    print("\nTesting --------------------")
    cv_pred = np.hstack((y_pred, y_cv))
    cv_df = pd.DataFrame(
        data=cv_pred,
        columns=['dignosis', 'total_status'],
        index=holdout_labels,
    )
    cv_results_df.append(cv_df)
# Stack the per-fold held-out predictions into one table and save it.
cv_df = pd.concat(cv_results_df, axis=0)
cv_df.to_csv('xgbcv_results.csv')

# Predictions on the full training split. `model` is used as-is here; this
# section does not refit it.
train_features = np.array(x_train_all.loc[:, :])
train_labels = np.array(y_train_all.loc[:, :])
train_column = np.array(model.predict(train_features))[:, np.newaxis]
train_pred = np.concatenate((train_column, train_labels), axis=1)
train_df = pd.DataFrame(
    data=train_pred,
    columns=['dignosis', 'total_status'],
    index=x_train_all.index,
)
train_df.to_csv('xgbtrain_results.csv')

# Same layout for the held-out test split.
test_features = np.array(x_test_all.loc[:, :])
test_labels = np.array(y_test_all.loc[:, :])
test_column = np.array(model.predict(test_features))[:, np.newaxis]
test_pred = np.concatenate((test_column, test_labels), axis=1)
test_df = pd.DataFrame(
    data=test_pred,
    columns=['dignosis', 'total_status'],
    index=x_test_all.index,
)
test_df.to_csv('xgbtest_results.csv')