-
Notifications
You must be signed in to change notification settings - Fork 9
/
Watson-NLC-ICD10-2018-shared.R
229 lines (168 loc) · 8.82 KB
/
Watson-NLC-ICD10-2018-shared.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
######################################################
### Experimental Code. Experimental R Interface for IBM Watson
### NLC to help with INITIAL TRIAGE of Emails that may come into a bank or insurance company
### Focus: Natural Language Classifier - R Programming Language Interface
### GROUND TRUTH IS VERY LIGHT - HANDLE WITH CARE - was authored in 20 minutes and contains NO real world data
### Could be used to BOOTSTRAP actual data - best practices are to use REAL WORLD Data for training
### HANDBOOK Natural Language Classifier (NLC) Handbook: https://ibm.box.com/s/rdlog2sue79178816s0rabkbi7ifu5vg
### Video #1 - Training - https://www.youtube.com/watch?v=nrD37M39QnA
### Video #2 - Testing - https://youtu.be/kBx6reEj4Gg
### Training Data - Ground truth: https://github.com/rustyoldrake/NLC_ground_truth_wonderland/blob/master/ground_truth_gt_NLC_example_email_sort.csv
### This R code lives here https://github.com/rustyoldrake/NLC_ground_truth_wonderland
### NEW VIDEOS Part 1 https://www.youtube.com/watch?v=HC6O6HczqC0 - Part 2 - https://youtu.be/skWlP9U78rE
#######################################################
library(RCurl) # install.packages("RCurl") # if the package is not already installed
library(httr)
library(data.table)
library(dplyr)
library(reshape2)
library(Rtts)
library(splitstackshape)
library(stringr)
library(splitstackshape)
library(tidyr)
library(XML)
library(png)
######### Housekeeping And Authentication
# Set the working directory, then echo it back as a check
setwd("/Users/ryan/Documents/Project ICD-Codes/")
getwd()
# source("keys.r")
## keys.r holds the actual username:password for each IBM service.
## It is a separate R file that looks roughly like the lines below.
# NLC CREDENTIALS
## Go to IBM Cloud / Bluemix, create an NLC service, and GET SERVICE CREDENTIALS once it is active
username_NLC <- "#####"
password_NLC <- "#####"
username_password_NLC <- paste0(username_NLC, ":", password_NLC)
# Echo the combined credential string to check it was picked up
username_password_NLC
base_url_NLC <- "https://gateway.watsonplatform.net/natural-language-classifier/api/v1/classifiers/"
# Non-essential: checks that the service is reachable and the credentials authenticate
getURL(base_url_NLC, userpwd = username_password_NLC)
###### FUNCTION CREATE NEW CLASSIFIER - post /v1/classifiers - Creates a classifier with CSV data ## URL below no "/" after base url
watson.nlc.createnewclassifier <- function(file,classifiername) {
  # Start training a new classifier from a CSV file and return the POST response.
  #
  # file:           path of the CSV training data to upload.
  # classifiername: classifier name; it is spliced into the JSON metadata
  #                 verbatim, so it must already carry its own double quotes.
  #
  # Uses the file-level username_NLC / password_NLC for authentication.
  endpoint <- "https://gateway.watsonplatform.net/natural-language-classifier/api/v1/classifiers"
  metadata_json <- paste0("{\"language\":\"en\",\"name\":", classifiername, "}")
  request_body <- list(
    training_data = upload_file(file),
    training_metadata = metadata_json
  )
  POST(
    url = endpoint,
    authenticate(username_NLC, password_NLC),
    body = request_body
  )
}
###### end of function
###### FUNCTION - CHECK CLASSIFIER STATUS
watson.nlc.checkclassifierstatus <- function(classifier_id) {
  # Fetch the status document for one classifier and return it as raw text.
  #
  # classifier_id: id appended to the file-level base_url_NLC.
  status_url <- paste0(base_url_NLC, classifier_id)
  getURL(status_url, userpwd = username_password_NLC)
}
### end of function
###### FUNCTION - DELETE CLASSIFIER - Receives the ID of the classifier to delete; this may not be possible until training is complete
watson.nlc.deleteclassifier <- function(kill_classifier) {
  # Send a DELETE request for one classifier and return the response.
  #
  # kill_classifier: id of the classifier to remove, appended to base_url_NLC.
  target_url <- paste0(base_url_NLC, kill_classifier)
  DELETE(url = target_url, authenticate(username_NLC, password_NLC))
}
### end of function
###### FUNCTION: ACCEPT QUERY & RETURN RESULT: CLASSIFIER and % FROM TEXT INPUT AND PROCESS TO LOOK GOOD
watson.nlc.processtextreturnclass <- function(classifier_id,query_text){
  # Classify query_text with one classifier and return a table with one row per
  # class and two columns, class and confidence.
  #
  # classifier_id: id of a trained classifier, appended to base_url_NLC.
  # query_text:    free text to classify.
  #
  # Both columns are left unconverted (type.convert=FALSE below), so confidence
  # comes back as text rather than as a number.
  # Stops with the raw service reply when the reply contains no class_name entry.

  # reserved = TRUE also percent-encodes characters such as & ? # + = which
  # would otherwise terminate or corrupt the text= query parameter.
  query_text <- URLencode(query_text, reserved = TRUE)
  response <- getURL(paste0(base_url_NLC,classifier_id,"/classify","?text=", query_text),userpwd = username_password_NLC)

  # The reply is taken apart with plain string surgery: every chunk that follows
  # a class_name key holds one class and its confidence.
  data <- as.data.frame(strsplit(as.character(response),"class_name"))
  data <- data[-c(1), ] # remove dud first row (everything before the first class)
  if (length(data) == 0) {
    # Without this guard the reshaping below fails with an unrelated-looking
    # error; surface what the service actually said instead.
    stop("NLC response contains no classes: ", response, call. = FALSE)
  }

  # Strip the JSON punctuation and the confidence key so only
  # <class>,<confidence> remains in each chunk.
  data <- gsub("[{}]","", data)
  data <- gsub("confidence","", data)
  data <- data.frame(matrix(data))
  setnames(data,("V1"))
  data$V1 <- gsub("\"","", data$V1)
  data$V1 <- gsub(":","", data$V1)
  data$V1 <- gsub("]","", data$V1)

  # Split each chunk on the comma into separate columns.
  data <- cSplit(data, 'V1', sep=",", type.convert=FALSE)
  setnames(data,c("class","confidence"))
  return(data)
}
### end of function
###### FUNCTION: LIST ALL CLASSIFIERS AND RETURN NEAT LIST
watson.nlc.listallclassifiers <- function(){
  # List the classifiers of the service at base_url_NLC as a data.table with
  # columns classifier, name and date_created, sorted by date_created.
  #
  # When the reply contains no classifier_id entry, a warning carrying the raw
  # reply is raised and an empty table with the same three columns is returned
  # (previously this case crashed while naming columns).
  response <- getURL(base_url_NLC,userpwd = username_password_NLC )

  # Every chunk that follows a classifier_id key describes one classifier.
  data <- as.data.frame(strsplit(as.character(response),"classifier_id"))
  data <- data[-c(1), ] # remove dud first row (everything before the first id)
  if (length(data) == 0) {
    # Include the raw reply so an error message from the service is not hidden.
    warning("No classifiers found in NLC response: ", response, call. = FALSE)
    return(data.table(classifier = character(0),
                      name = character(0),
                      date_created = character(0)))
  }

  # Strip JSON punctuation so each chunk becomes a comma-separated record.
  data <- data.frame(matrix(data))
  colnames(data) <- "V1"
  data$V1 <- gsub("[{}]","", data$V1)
  data$V1 <- gsub("]","", data$V1)
  data$V1 <- gsub("\"","", data$V1)
  data$V1 <- gsub("name:","", data$V1)
  data$V1 <- gsub(":","", data$V1)

  data <- cSplit(data, 'V1', sep=",", type.convert=FALSE)
  # Drop the 2nd and 4th split columns; the three that remain are named below.
  data[,c(2,4)] <- NULL
  data <- as.data.table(data)
  setnames(data,c("classifier","name","date_created"))
  data <- data[order(date_created),]
  return(data)
}
# Check the credential string was picked up, re-set the base URL, and ping the service
username_password_NLC
base_url_NLC <- "https://gateway.watsonplatform.net/natural-language-classifier/api/v1/classifiers/"
# Non-essential: checks that the service is reachable and the credentials authenticate
getURL(base_url_NLC, userpwd = username_password_NLC)
##### ACTION: EXECUTE FUNCTION TO KILL (!!!) DELETE (!!!) CLASSIFIER - WARNING
# Inventory: find the classifier id we want to delete
watson.nlc.listallclassifiers()
# kill <- "842a87x335-nlc-527"
# watson.nlc.deleteclassifier(kill)  ## CAREFUL HERE - UNCOMMENT TO KILL CLASSIFIER
# Check it's gone
watson.nlc.listallclassifiers()
# Same connectivity check once more
username_password_NLC
base_url_NLC <- "https://gateway.watsonplatform.net/natural-language-classifier/api/v1/classifiers/"
getURL(base_url_NLC, userpwd = username_password_NLC)
######################################################### END OF FUNCTION DECLARATIONS
######################################################### OK LETS DO STUFF
# Training file / classifier name pairs, one pair per ICD-10 group (A to D).
# The name carries its own escaped double quotes because
# watson.nlc.createnewclassifier pastes it verbatim into the JSON metadata.
# NOTE(review): each pair overwrites the previous one, so running this section
# top-to-bottom submits only the GROUPD pair. Presumably one pair is selected
# and run interactively per classifier - confirm before sourcing the whole file.
thefile <- "ICD-10-GT-2018-GROUPA-0to20k.csv" # group A file
thename <- "\"ICD-10-GT-2018-GROUPAA\"" # NOTE(review): GROUPAA looks like a typo for GROUPA - confirm
thefile <- "ICD-10-GT-2018-GROUPB-20k-40k.csv" # group B file
thename <- "\"ICD-10-GT-2018-GROUPB\"" # group B name
thefile <- "ICD-10-GT-2018-GROUPC-40k-60k.csv" # group C file
thename <- "\"ICD-10-GT-2018-GROUPC\"" # group C name
thefile <- "ICD-10-GT-2018-GROUPD-60k-69k.csv" # group D file
thename <- "\"ICD-10-GT-2018-GROUPD\"" # group D name
watson.nlc.createnewclassifier(thefile,thename) # submits the file and name set above and starts training; might take 2 to 20+ minutes depending on complexity
###### ACTION: Create a new Classifier! (200 = Good outcome) -
# NOTE(review): this pairs an ICD-10 GROUPA name with the example email-sort
# ground truth file - presumably left over from an earlier script; confirm.
thename <- "\"ICD-10-GT-2018-GROUPA\"" # classifier name, pre-quoted for the JSON metadata
thefile <- "ground_truth_gt_NLC_example_email_sort.csv" # training CSV to upload
watson.nlc.createnewclassifier(thefile,thename) # submits the file and name set above and starts training; might take 2 to 20+ minutes depending on complexity
###### ACTION: Retrieve list of classifiers (NEAT VERSION) - oldest to newest
watson.nlc.listallclassifiers() # does not respond well when there are no classifiers (blank reply)
## ARE WE READY? (might take 10-15m or more if really complex - be patient for the magic!)
# Classifier ids, one per group; these are the ids queried below.
classifierA <- "51233dx385-nlc-22" # #### COPY PASTE *YOUR* CLASSIFIER ID HERE TO CHECK STATUS ####
classifierB <- "511c9dx382-nlc-34" # replace with your own id
classifierC <- "5120e7x384-nlc-38" # replace with your own id
classifierD <- "511c9dx382-nlc-35" # replace with your own id
# Ask the service for the training status of each classifier.
watson.nlc.checkclassifierstatus(classifierA)
watson.nlc.checkclassifierstatus(classifierB)
watson.nlc.checkclassifierstatus(classifierC)
watson.nlc.checkclassifierstatus(classifierD)
### READY OR NOT?
# if new will say "not yet ready to accept classify requests" - once done in a few minutes will say
# "The classifier instance is now available and is ready to take classifier requests" - then you can submit query below
# LIGHT MANUAL TESTING
# Send the same sample text to each of the four classifiers in turn.
query <- "Slipped on ice. Fractured skull"
watson.nlc.processtextreturnclass(classifierA,query)
watson.nlc.processtextreturnclass(classifierB,query)
watson.nlc.processtextreturnclass(classifierC,query)
watson.nlc.processtextreturnclass(classifierD,query)
four_scores <- function()
{
  # Classify the current global query with all four classifiers and return the
  # 12 highest-confidence rows across them.
  #
  # Takes no arguments: reads the globals query, classifierA, classifierB,
  # classifierC and classifierD at call time.
  # Returns a table with the columns produced by
  # watson.nlc.processtextreturnclass (class, confidence).

  ## gets each of four classifiers
  data_a <- watson.nlc.processtextreturnclass(classifierA,query)
  data_b <- watson.nlc.processtextreturnclass(classifierB,query)
  data_c <- watson.nlc.processtextreturnclass(classifierC,query)
  data_d <- watson.nlc.processtextreturnclass(classifierD,query)
  ## one ring to bind them all
  data <- rbind(data_a,data_b,data_c,data_d)
  ## sort descending by numeric value
  # confidence arrives as text (the parser splits with type.convert=FALSE), so
  # ordering on the raw column would rank strings, not numbers; a value in
  # exponent notation such as 9.1E-4 would then outrank 0.95.
  # The column itself is left as text so the returned shape is unchanged.
  data <- data[order(-as.numeric(confidence)),]
  return(head(data,12)) # return just top 12
}
# Run the combined four-classifier ranking for a series of sample descriptions.
# four_scores() takes no arguments and reads the global query, so each
# assignment below changes what the following call classifies.
four_scores() # uses the query assigned earlier in the script
query <- "Streptococcus pneumoniae"
four_scores()
query <- "left ear shot off with laser gun"
four_scores()
query <- "cat bite on the left leg"
four_scores()
query <- "bit on the ear from a horse"
four_scores()
query <- "rapid heart rate and chest pain"
four_scores()
query <- "skin rash due to exposure to poison ivy"
four_scores()
query <- "broken ankle from motorcycle accident"
four_scores()