From 2e5cd97a94391a05b5d7832042760ac5df6b2a85 Mon Sep 17 00:00:00 2001 From: Jaclyn Taroni Date: Sun, 24 Jun 2018 19:53:53 -0400 Subject: [PATCH] Add RTX prediction notebook (#18) * Ignore private data * Add exploratory data analyses for RTX * Add notebook for predicting response from baseline samples (RTX) --- 25-predict_response.Rmd | 266 ++++++++++ 25-predict_response.nb.html | 738 ++++++++++++++++++++++++++++ plots/25/total_accuracy_CI.pdf | Bin 0 -> 4913 bytes results/25/RTX_B_cv.glmnet.RDS | 3 + results/25/expression_cv.glmnet.RDS | 3 + results/25/recount2_B_cv.glmnet.RDS | 3 + 6 files changed, 1013 insertions(+) create mode 100644 25-predict_response.Rmd create mode 100644 25-predict_response.nb.html create mode 100644 plots/25/total_accuracy_CI.pdf create mode 100644 results/25/RTX_B_cv.glmnet.RDS create mode 100644 results/25/expression_cv.glmnet.RDS create mode 100644 results/25/recount2_B_cv.glmnet.RDS diff --git a/25-predict_response.Rmd b/25-predict_response.Rmd new file mode 100644 index 0000000..0f89e0d --- /dev/null +++ b/25-predict_response.Rmd @@ -0,0 +1,266 @@ +--- +title: "First pass at predicting response from baseline samples in RTX trial" +output: html_notebook +--- + +**J. Taroni 2018** + +## Install `caret` + +```{r} +# devtools::install_github('topepo/caret/pkg/caret', +# ref = "6546939345fe10649cefcbfee55d58fb682bc902") +# devtools::install_version("e1071", version = "1.6-8") +``` + + +## Functions and directory set up + +```{r} +# magrittr pipe +`%>%` <- dplyr::`%>%` +``` + +```{r} +# plot and result directory setup for this notebook +plot.dir <- file.path("plots", "25") +dir.create(plot.dir, recursive = TRUE, showWarnings = FALSE) +results.dir <- file.path("results", "25") +dir.create(results.dir, recursive = TRUE, showWarnings = FALSE) +``` + +## Read in data + +#### Covariates + +```{r} +covariate.df <- readr::read_tsv(file.path("data", "rtx", + "RTX_full_covariates.tsv")) +``` + +#### Gene expression data + +This is gene-level expression data that has been vst-transformed and filtered +to only genes that are in the recount2 PLIER model. + +```{r} +exprs <- readRDS(file.path("data", "rtx", "VST_blind_filtered.RDS")) +``` + +#### recount2 `B` + +The multiPLIER approach + +```{r} +recount.b <- readRDS(file.path("data", "rtx", "RTX_recount2_B.RDS")) +``` + +#### RTX PLIER model + +```{r} +rtx.plier <- readRDS(file.path("data", "rtx", "RTX_PLIER_model.RDS")) +rtx.b <- rtx.plier$B +``` + + +## LASSO + +### Prep data + +First, we'll change the sample names to match the barcodes in the covariates. +The first six characters of the current column/sample names should correspond +to a barcode. + +```{r} +# in the expression data +colnames(exprs) <- substr(colnames(exprs), start = 1, stop = 6) +all(covariate.df$barcode == colnames(exprs)) +``` + +```{r} +# in the recount B data +colnames(recount.b) <- substr(colnames(recount.b), start = 1, stop = 6) +all(covariate.df$barcode == colnames(recount.b)) +``` + +```{r} +# in the RTX B +colnames(rtx.b) <- substr(colnames(rtx.b), start = 1, stop = 6) +all(covariate.df$barcode == colnames(rtx.b)) +``` + +The `mainclass` column in `covariate.df` is what we are interested in +predicting; it contains whether or not a patient is a _nonresponder_ or +_responder_ (divided into _tolerant_ or _nontolerant_ depending on, I believe, +long-term outcome) to treatment. +(We'll exclude samples with `NA` in this column.) + +We'll want to try and predict this from baseline samples +(`covariate.df$timepoint == "BL"`). +We will not be adjusting for covariates at this point. +The earlier publications on this trial suggest that the majority of +covariates have no significant association with response. + +Let's take a look at the sample size and class balance. + +```{r} +table(covariate.df$mainclass, covariate.df$timepoint) +``` + +We can see that there are `r sum(covariate.df$timepoint == "BL")` baseline +samples and that the three classes (`Nonresponder`, `Nontolerant`, and +`Tolerant`) are pretty balanced. +If we use these three classes, we can likely use a metric like total accuracy +to evaluate performance. +Also, the small sample size lends itself to leave-one-out cross-validataion +(LOOCV). + +```{r} +# Do all baseline samples have response labels? No, one is NA +baseline.covariate.df <- covariate.df %>% + dplyr::filter(timepoint == "BL") %>% + dplyr::select(c("barcode", "timepoint", "mainclass")) %>% + dplyr::filter(complete.cases(.)) +``` + +```{r} +# we only want the baseline samples with a class label +baseline.samples <- baseline.covariate.df$barcode +``` + +```{r} +baseline.exprs <- t(exprs[, which(colnames(exprs) %in% baseline.samples)]) +dim(baseline.exprs) +``` + +```{r} +recount.baseline.b <- + t(recount.b[, which(colnames(recount.b) %in% baseline.samples)]) +dim(recount.baseline.b) +``` + +```{r} +rtx.baseline.b <- t(rtx.b[, which(colnames(rtx.b) %in% baseline.samples)]) +dim(rtx.baseline.b) +``` + +```{r} +all(rownames(recount.baseline.b) == baseline.covariate.df$barcode) +``` + +```{r} +all(rownames(baseline.exprs) == baseline.covariate.df$barcode) +``` + +```{r} +all(rownames(rtx.baseline.b) == baseline.covariate.df$barcode) +``` + + +### Prediction + +```{r} +set.seed(12345) +``` + + +#### Expression data + +```{r} +exprs.results <- glmnet::cv.glmnet(x = baseline.exprs, + y = baseline.covariate.df$mainclass, + type.measure = "class", + family = "multinomial", + nfolds = nrow(baseline.exprs)) # LOOCV +saveRDS(exprs.results, file.path(results.dir, "expression_cv.glmnet.RDS")) +``` + +```{r} +exprs.predicted.labels <- stats::predict(exprs.results, + baseline.exprs, + s = exprs.results$lambda.1se, + type = "class") +caret::confusionMatrix(data = as.factor(exprs.predicted.labels), + reference = as.factor(baseline.covariate.df$mainclass)) +``` + +#### recount2 `B` + +```{r} +recount.b.results <- glmnet::cv.glmnet(x = recount.baseline.b, + y = baseline.covariate.df$mainclass, + type.measure = "class", + family = "multinomial", + nfolds = nrow(recount.baseline.b)) # LOOCV +saveRDS(recount.b.results, file.path(results.dir, "recount2_B_cv.glmnet.RDS")) +``` + +```{r} +recount.b.predicted.labels <- stats::predict(recount.b.results, + recount.baseline.b, + s = recount.b.results$lambda.1se, + type = "class") +caret::confusionMatrix(data = as.factor(recount.b.predicted.labels), + reference = as.factor(baseline.covariate.df$mainclass)) +``` + +#### RTX `B` + +```{r} +rtx.b.results <- glmnet::cv.glmnet(x = rtx.baseline.b, + y = baseline.covariate.df$mainclass, + type.measure = "class", + family = "multinomial", + nfolds = nrow(rtx.baseline.b)) # LOOCV +saveRDS(rtx.b.results, file.path(results.dir, "RTX_B_cv.glmnet.RDS")) +``` + +```{r} +rtx.b.predicted.labels <- stats::predict(rtx.b.results, + rtx.baseline.b, + s = rtx.b.results$lambda.1se, + type = "class") +caret::confusionMatrix(data = as.factor(rtx.b.predicted.labels), + reference = as.factor(baseline.covariate.df$mainclass)) +``` + +### Plotting accuracy + +```{r} +acc.df <- data.frame(Model = c("Expression", "RTX LVs", "multiPLIER LVs"), + Accuracy = c(0.9444, 1, 0.3889), + Lower = c(0.8134, 0.9026, 0.2314), + Upper = c(0.9932, 1, 0.5654)) +``` + +```{r} +acc.df %>% + ggplot2::ggplot() + + ggplot2::geom_pointrange(mapping = ggplot2::aes(x = Model, y = Accuracy, + ymin = Lower, ymax = Upper)) + + ggplot2::theme_bw() + + ggplot2::labs(title = "Predicting response with LASSO") + + ggplot2::theme(plot.title = ggplot2::element_text(hjust = 0.5, + face = "bold")) + + ggplot2::theme(text = ggplot2::element_text(size = 15)) +``` + +```{r} +ggplot2::ggsave(file.path(plot.dir, "total_accuracy_CI.pdf"), + plot = ggplot2::last_plot()) +``` + +I wonder if the poor performance in the case of the multiPLIER LVs could be due +to a smaller range of values. + +```{r} +summary(as.vector(baseline.exprs)) +``` + +```{r} +summary(as.vector(recount.baseline.b)) +``` + +```{r} +summary(as.vector(rtx.baseline.b)) +``` \ No newline at end of file diff --git a/25-predict_response.nb.html b/25-predict_response.nb.html new file mode 100644 index 0000000..ee99611 --- /dev/null +++ b/25-predict_response.nb.html @@ -0,0 +1,738 @@ + + + + + + + + + + + + + +First pass at predicting response from baseline samples in RTX trial + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + + + + + + + + + + + + + + + + +

J. Taroni 2018

+
+

Install caret

+ + + +
# devtools::install_github('topepo/caret/pkg/caret', 
+#                          ref = "6546939345fe10649cefcbfee55d58fb682bc902")
+# devtools::install_version("e1071", version = "1.6-8")
+ + + +
+
+

Functions and directory set up

+ + + +
# magrittr pipe
+`%>%` <- dplyr::`%>%`
+ + + + + + +
# plot and result directory setup for this notebook
+plot.dir <- file.path("plots", "25")
+dir.create(plot.dir, recursive = TRUE, showWarnings = FALSE)
+results.dir <- file.path("results", "25")
+dir.create(results.dir, recursive = TRUE, showWarnings = FALSE)
+ + + +
+
+

Read in data

+
+

Covariates

+ + + +
covariate.df <- readr::read_tsv(file.path("data", "rtx", 
+                                          "RTX_full_covariates.tsv"))
+ + +
Parsed with column specification:
+cols(
+  .default = col_character(),
+  barcode = col_integer(),
+  AGE = col_integer(),
+  bcells = col_double(),
+  HGB = col_double(),
+  `Platelet Count` = col_double(),
+  WBC = col_double(),
+  Lymphs = col_double(),
+  Neutrophils = col_double(),
+  Eosinophils = col_double(),
+  Tscore = col_integer()
+)
+See spec(...) for full column specifications.
+ + + +
+
+

Gene expression data

+

This is gene-level expression data that has been vst-transformed and filtered to only genes that are in the recount2 PLIER model.

+ + + +
exprs <- readRDS(file.path("data", "rtx", "VST_blind_filtered.RDS"))
+ + + +
+
+

recount2 B

+

The multiPLIER approach

+ + + +
recount.b <- readRDS(file.path("data", "rtx", "RTX_recount2_B.RDS"))
+ + + +
+
+

RTX PLIER model

+ + + +
rtx.plier <- readRDS(file.path("data", "rtx", "RTX_PLIER_model.RDS"))
+rtx.b <- rtx.plier$B
+ + + +
+
+
+

LASSO

+
+

Prep data

+

First, we’ll change the sample names to match the barcodes in the covariates. The first six characters of the current column/sample names should correspond to a barcode.

+ + + +
# in the expression data
+colnames(exprs) <- substr(colnames(exprs), start = 1, stop = 6)
+all(covariate.df$barcode == colnames(exprs))
+ + +
[1] TRUE
+ + + + + + +
# in the recount B data
+colnames(recount.b) <- substr(colnames(recount.b), start = 1, stop = 6)
+all(covariate.df$barcode == colnames(recount.b))
+ + +
[1] TRUE
+ + + + + + +
# in the RTX B
+colnames(rtx.b) <- substr(colnames(rtx.b), start = 1, stop = 6)
+all(covariate.df$barcode == colnames(rtx.b))
+ + +
[1] TRUE
+ + + +

The mainclass column in covariate.df is what we are interested in predicting; it contains whether or not a patient is a nonresponder or responder (divided into tolerant or nontolerant depending on, I believe, long-term outcome) to treatment. (We’ll exclude samples with NA in this column.)

+

We’ll want to try and predict this from baseline samples (covariate.df$timepoint == "BL"). We will not be adjusting for covariates at this point. The earlier publications on this trial suggest that the majority of covariates have no significant association with response.

+

Let’s take a look at the sample size and class balance.

+ + + +
table(covariate.df$mainclass, covariate.df$timepoint)
+ + +
              
+               BL M18 M6
+  Nonresponder 14  10  7
+  Nontolerant  10   3 10
+  Tolerant     12  12 12
+ + + +

We can see that there are 37 baseline samples and that the three classes (Nonresponder, Nontolerant, and Tolerant) are pretty balanced. If we use these three classes, we can likely use a metric like total accuracy to evaluate performance. Also, the small sample size lends itself to leave-one-out cross-validataion (LOOCV).

+ + + +
# Do all baseline samples have response labels? No, one is NA 
+baseline.covariate.df <- covariate.df %>%
+  dplyr::filter(timepoint == "BL") %>%
+  dplyr::select(c("barcode", "timepoint", "mainclass")) %>%
+  dplyr::filter(complete.cases(.))
+ + + + + + +
# we only want the baseline samples with a class label
+baseline.samples <- baseline.covariate.df$barcode
+ + + + + + +
baseline.exprs <- t(exprs[, which(colnames(exprs) %in% baseline.samples)])
+dim(baseline.exprs)
+ + +
[1]   36 6690
+ + + + + + +
recount.baseline.b <- 
+  t(recount.b[, which(colnames(recount.b) %in% baseline.samples)])
+dim(recount.baseline.b)
+ + +
[1]  36 987
+ + + + + + +
rtx.baseline.b <- t(rtx.b[, which(colnames(rtx.b) %in% baseline.samples)])
+dim(rtx.baseline.b)
+ + +
[1] 36 23
+ + + + + + +
all(rownames(recount.baseline.b) == baseline.covariate.df$barcode)
+ + +
[1] TRUE
+ + + + + + +
all(rownames(baseline.exprs) == baseline.covariate.df$barcode)
+ + +
[1] TRUE
+ + + + + + +
all(rownames(rtx.baseline.b) == baseline.covariate.df$barcode)
+ + +
[1] TRUE
+ + + +
+
+

Prediction

+ + + +
set.seed(12345)
+ + + +
+

Expression data

+ + + +
exprs.results <- glmnet::cv.glmnet(x = baseline.exprs,
+                                   y = baseline.covariate.df$mainclass,
+                                   type.measure = "class",
+                                   family = "multinomial",
+                                   nfolds = nrow(baseline.exprs))  # LOOCV
+ + +
Option grouped=FALSE enforced in cv.glmnet, since < 3 observations per fold
+ + +
saveRDS(exprs.results, file.path(results.dir, "expression_cv.glmnet.RDS"))
+ + + + + + +
exprs.predicted.labels <- stats::predict(exprs.results, 
+                                         baseline.exprs,
+                                         s = exprs.results$lambda.1se,
+                                         type = "class")
+caret::confusionMatrix(data = as.factor(exprs.predicted.labels), 
+                       reference = as.factor(baseline.covariate.df$mainclass))
+ + +
Confusion Matrix and Statistics
+
+              Reference
+Prediction     Nonresponder Nontolerant Tolerant
+  Nonresponder           13           1        0
+  Nontolerant             0           9        0
+  Tolerant                1           0       12
+
+Overall Statistics
+                                          
+               Accuracy : 0.9444          
+                 95% CI : (0.8134, 0.9932)
+    No Information Rate : 0.3889          
+    P-Value [Acc > NIR] : 2.763e-12       
+                                          
+                  Kappa : 0.9157          
+ Mcnemar's Test P-Value : NA              
+
+Statistics by Class:
+
+                     Class: Nonresponder Class: Nontolerant Class: Tolerant
+Sensitivity                       0.9286             0.9000          1.0000
+Specificity                       0.9545             1.0000          0.9583
+Pos Pred Value                    0.9286             1.0000          0.9231
+Neg Pred Value                    0.9545             0.9630          1.0000
+Prevalence                        0.3889             0.2778          0.3333
+Detection Rate                    0.3611             0.2500          0.3333
+Detection Prevalence              0.3889             0.2500          0.3611
+Balanced Accuracy                 0.9416             0.9500          0.9792
+ + + +
+
+

recount2 B

+ + + +
recount.b.results <- glmnet::cv.glmnet(x = recount.baseline.b,
+                                       y = baseline.covariate.df$mainclass,
+                                       type.measure = "class",
+                                       family = "multinomial",
+                                       nfolds = nrow(recount.baseline.b))  # LOOCV
+ + +
Option grouped=FALSE enforced in cv.glmnet, since < 3 observations per fold
+ + +
saveRDS(recount.b.results, file.path(results.dir, "recount2_B_cv.glmnet.RDS"))
+ + + + + + +
recount.b.predicted.labels <- stats::predict(recount.b.results, 
+                                             recount.baseline.b, 
+                                             s = recount.b.results$lambda.1se,
+                                             type = "class")
+caret::confusionMatrix(data = as.factor(recount.b.predicted.labels), 
+                       reference = as.factor(baseline.covariate.df$mainclass))
+ + +
Levels are not in the same order for reference and data. Refactoring data to match.
+ + +
Confusion Matrix and Statistics
+
+              Reference
+Prediction     Nonresponder Nontolerant Tolerant
+  Nonresponder           14          10       12
+  Nontolerant             0           0        0
+  Tolerant                0           0        0
+
+Overall Statistics
+                                          
+               Accuracy : 0.3889          
+                 95% CI : (0.2314, 0.5654)
+    No Information Rate : 0.3889          
+    P-Value [Acc > NIR] : 0.5628          
+                                          
+                  Kappa : 0               
+ Mcnemar's Test P-Value : NA              
+
+Statistics by Class:
+
+                     Class: Nonresponder Class: Nontolerant Class: Tolerant
+Sensitivity                       1.0000             0.0000          0.0000
+Specificity                       0.0000             1.0000          1.0000
+Pos Pred Value                    0.3889                NaN             NaN
+Neg Pred Value                       NaN             0.7222          0.6667
+Prevalence                        0.3889             0.2778          0.3333
+Detection Rate                    0.3889             0.0000          0.0000
+Detection Prevalence              1.0000             0.0000          0.0000
+Balanced Accuracy                 0.5000             0.5000          0.5000
+ + + +
+
+

RTX B

+ + + +
rtx.b.results <- glmnet::cv.glmnet(x = rtx.baseline.b,
+                                   y = baseline.covariate.df$mainclass,
+                                   type.measure = "class",
+                                   family = "multinomial",
+                                   nfolds = nrow(rtx.baseline.b))  # LOOCV
+ + +
Option grouped=FALSE enforced in cv.glmnet, since < 3 observations per fold
+ + +
saveRDS(rtx.b.results, file.path(results.dir, "RTX_B_cv.glmnet.RDS"))
+ + + + + + +
rtx.b.predicted.labels <- stats::predict(rtx.b.results, 
+                                         rtx.baseline.b, 
+                                         s = rtx.b.results$lambda.1se,
+                                         type = "class")
+caret::confusionMatrix(data = as.factor(rtx.b.predicted.labels), 
+                       reference = as.factor(baseline.covariate.df$mainclass))
+ + +
Confusion Matrix and Statistics
+
+              Reference
+Prediction     Nonresponder Nontolerant Tolerant
+  Nonresponder           14           0        0
+  Nontolerant             0          10        0
+  Tolerant                0           0       12
+
+Overall Statistics
+                                     
+               Accuracy : 1          
+                 95% CI : (0.9026, 1)
+    No Information Rate : 0.3889     
+    P-Value [Acc > NIR] : 1.713e-15  
+                                     
+                  Kappa : 1          
+ Mcnemar's Test P-Value : NA         
+
+Statistics by Class:
+
+                     Class: Nonresponder Class: Nontolerant Class: Tolerant
+Sensitivity                       1.0000             1.0000          1.0000
+Specificity                       1.0000             1.0000          1.0000
+Pos Pred Value                    1.0000             1.0000          1.0000
+Neg Pred Value                    1.0000             1.0000          1.0000
+Prevalence                        0.3889             0.2778          0.3333
+Detection Rate                    0.3889             0.2778          0.3333
+Detection Prevalence              0.3889             0.2778          0.3333
+Balanced Accuracy                 1.0000             1.0000          1.0000
+ + + +
+
+
+

Plotting accuracy

+ + + +
acc.df <- data.frame(Model = c("Expression", "RTX LVs", "multiPLIER LVs"), 
+                     Accuracy = c(0.9444, 1, 0.3889), 
+                     Lower = c(0.8134, 0.9026, 0.2314), 
+                     Upper = c(0.9932, 1, 0.5654))
+ + + + + + +
acc.df %>%
+  ggplot2::ggplot() + 
+  ggplot2::geom_pointrange(mapping = ggplot2::aes(x = Model, y = Accuracy, 
+                                                  ymin = Lower, ymax = Upper)) + 
+  ggplot2::theme_bw() +
+  ggplot2::labs(title = "Predicting response with LASSO") +
+  ggplot2::theme(plot.title = ggplot2::element_text(hjust = 0.5, 
+                                                    face = "bold")) +
+  ggplot2::theme(text = ggplot2::element_text(size = 15))
+ + +

+ + + + + + +
ggplot2::ggsave(file.path(plot.dir, "total_accuracy_CI.pdf"),
+                plot = ggplot2::last_plot())
+ + +
Saving 7 x 7 in image
+ + + +

I wonder if the poor performance in the case of the multiPLIER LVs could be due to a smaller range of values.

+ + + +
summary(as.vector(baseline.exprs))
+ + +
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
+  3.399   6.428   8.460   8.419  10.282  19.637 
+ + + + + + +
summary(as.vector(recount.baseline.b))
+ + +
      Min.    1st Qu.     Median       Mean    3rd Qu.       Max. 
+-0.4666653 -0.0223182 -0.0020393 -0.0009504  0.0183729  0.8213279 
+ + + + + + +
summary(as.vector(rtx.baseline.b))
+ + +
     Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
+-2.612200 -0.470965 -0.123762  0.004056  0.327775  5.092082 
+ + +
+
+ +
---
title: "First pass at predicting response from baseline samples in RTX trial"
output: html_notebook
---

**J. Taroni 2018**

## Install `caret`

```{r}
# devtools::install_github('topepo/caret/pkg/caret', 
#                          ref = "6546939345fe10649cefcbfee55d58fb682bc902")
# devtools::install_version("e1071", version = "1.6-8")
```


## Functions and directory set up

```{r}
# magrittr pipe
`%>%` <- dplyr::`%>%`
```

```{r}
# plot and result directory setup for this notebook
plot.dir <- file.path("plots", "25")
dir.create(plot.dir, recursive = TRUE, showWarnings = FALSE)
results.dir <- file.path("results", "25")
dir.create(results.dir, recursive = TRUE, showWarnings = FALSE)
```

## Read in data

#### Covariates

```{r}
covariate.df <- readr::read_tsv(file.path("data", "rtx", 
                                          "RTX_full_covariates.tsv"))
```

#### Gene expression data

This is gene-level expression data that has been vst-transformed and filtered 
to only genes that are in the recount2 PLIER model.

```{r}
exprs <- readRDS(file.path("data", "rtx", "VST_blind_filtered.RDS"))
```

#### recount2 `B`

The multiPLIER approach

```{r}
recount.b <- readRDS(file.path("data", "rtx", "RTX_recount2_B.RDS"))
```

#### RTX PLIER model

```{r}
rtx.plier <- readRDS(file.path("data", "rtx", "RTX_PLIER_model.RDS"))
rtx.b <- rtx.plier$B
```


## LASSO

### Prep data

First, we'll change the sample names to match the barcodes in the covariates.
The first six characters of the current column/sample names should correspond
to a barcode.

```{r}
# in the expression data
colnames(exprs) <- substr(colnames(exprs), start = 1, stop = 6)
all(covariate.df$barcode == colnames(exprs))
```

```{r}
# in the recount B data
colnames(recount.b) <- substr(colnames(recount.b), start = 1, stop = 6)
all(covariate.df$barcode == colnames(recount.b))
```

```{r}
# in the RTX B
colnames(rtx.b) <- substr(colnames(rtx.b), start = 1, stop = 6)
all(covariate.df$barcode == colnames(rtx.b))
```

The `mainclass` column in `covariate.df` is what we are interested in 
predicting; it contains whether or not a patient is a _nonresponder_ or 
_responder_ (divided into _tolerant_ or _nontolerant_ depending on, I believe, 
long-term outcome) to treatment. 
(We'll exclude samples with `NA` in this column.)

We'll want to try and predict this from baseline samples 
(`covariate.df$timepoint == "BL"`).
We will not be adjusting for covariates at this point.
The earlier publications on this trial suggest that the majority of 
covariates have no significant association with response.

Let's take a look at the sample size and class balance.

```{r}
table(covariate.df$mainclass, covariate.df$timepoint)
```

We can see that there are `r sum(covariate.df$timepoint == "BL")` baseline 
samples and that the three classes (`Nonresponder`, `Nontolerant`, and 
`Tolerant`) are pretty balanced.
If we use these three classes, we can likely use a metric like total accuracy
to evaluate performance.
Also, the small sample size lends itself to leave-one-out cross-validataion 
(LOOCV).

```{r}
# Do all baseline samples have response labels? No, one is NA 
baseline.covariate.df <- covariate.df %>%
  dplyr::filter(timepoint == "BL") %>%
  dplyr::select(c("barcode", "timepoint", "mainclass")) %>%
  dplyr::filter(complete.cases(.))
```

```{r}
# we only want the baseline samples with a class label
baseline.samples <- baseline.covariate.df$barcode
```

```{r}
baseline.exprs <- t(exprs[, which(colnames(exprs) %in% baseline.samples)])
dim(baseline.exprs)
```

```{r}
recount.baseline.b <- 
  t(recount.b[, which(colnames(recount.b) %in% baseline.samples)])
dim(recount.baseline.b)
```

```{r}
rtx.baseline.b <- t(rtx.b[, which(colnames(rtx.b) %in% baseline.samples)])
dim(rtx.baseline.b)
```

```{r}
all(rownames(recount.baseline.b) == baseline.covariate.df$barcode)
```

```{r}
all(rownames(baseline.exprs) == baseline.covariate.df$barcode)
```

```{r}
all(rownames(rtx.baseline.b) == baseline.covariate.df$barcode)
```


### Prediction

```{r}
set.seed(12345)
```


#### Expression data

```{r}
exprs.results <- glmnet::cv.glmnet(x = baseline.exprs,
                                   y = baseline.covariate.df$mainclass,
                                   type.measure = "class",
                                   family = "multinomial",
                                   nfolds = nrow(baseline.exprs))  # LOOCV
saveRDS(exprs.results, file.path(results.dir, "expression_cv.glmnet.RDS"))
```

```{r}
exprs.predicted.labels <- stats::predict(exprs.results, 
                                         baseline.exprs,
                                         s = exprs.results$lambda.1se,
                                         type = "class")
caret::confusionMatrix(data = as.factor(exprs.predicted.labels), 
                       reference = as.factor(baseline.covariate.df$mainclass))
```

#### recount2 `B`

```{r}
recount.b.results <- glmnet::cv.glmnet(x = recount.baseline.b,
                                       y = baseline.covariate.df$mainclass,
                                       type.measure = "class",
                                       family = "multinomial",
                                       nfolds = nrow(recount.baseline.b))  # LOOCV
saveRDS(recount.b.results, file.path(results.dir, "recount2_B_cv.glmnet.RDS"))
```

```{r}
recount.b.predicted.labels <- stats::predict(recount.b.results, 
                                             recount.baseline.b, 
                                             s = recount.b.results$lambda.1se,
                                             type = "class")
caret::confusionMatrix(data = as.factor(recount.b.predicted.labels), 
                       reference = as.factor(baseline.covariate.df$mainclass))
```

#### RTX `B`

```{r}
rtx.b.results <- glmnet::cv.glmnet(x = rtx.baseline.b,
                                   y = baseline.covariate.df$mainclass,
                                   type.measure = "class",
                                   family = "multinomial",
                                   nfolds = nrow(rtx.baseline.b))  # LOOCV
saveRDS(rtx.b.results, file.path(results.dir, "RTX_B_cv.glmnet.RDS"))
```

```{r}
rtx.b.predicted.labels <- stats::predict(rtx.b.results, 
                                         rtx.baseline.b, 
                                         s = rtx.b.results$lambda.1se,
                                         type = "class")
caret::confusionMatrix(data = as.factor(rtx.b.predicted.labels), 
                       reference = as.factor(baseline.covariate.df$mainclass))
```

### Plotting accuracy

```{r}
acc.df <- data.frame(Model = c("Expression", "RTX LVs", "multiPLIER LVs"), 
                     Accuracy = c(0.9444, 1, 0.3889), 
                     Lower = c(0.8134, 0.9026, 0.2314), 
                     Upper = c(0.9932, 1, 0.5654))
```

```{r}
acc.df %>%
  ggplot2::ggplot() + 
  ggplot2::geom_pointrange(mapping = ggplot2::aes(x = Model, y = Accuracy, 
                                                  ymin = Lower, ymax = Upper)) + 
  ggplot2::theme_bw() +
  ggplot2::labs(title = "Predicting response with LASSO") +
  ggplot2::theme(plot.title = ggplot2::element_text(hjust = 0.5, 
                                                    face = "bold")) +
  ggplot2::theme(text = ggplot2::element_text(size = 15))
```

```{r}
ggplot2::ggsave(file.path(plot.dir, "total_accuracy_CI.pdf"),
                plot = ggplot2::last_plot())
```

I wonder if the poor performance in the case of the multiPLIER LVs could be due
to a smaller range of values. 

```{r}
summary(as.vector(baseline.exprs))
```

```{r}
summary(as.vector(recount.baseline.b))
```

```{r}
summary(as.vector(rtx.baseline.b))
```
+ + + +
+ + + + + + + + diff --git a/plots/25/total_accuracy_CI.pdf b/plots/25/total_accuracy_CI.pdf new file mode 100644 index 0000000000000000000000000000000000000000..f262015ef69f3bced595d68c0ba9fb7aff6c0152 GIT binary patch literal 4913 zcmb7I2{=^!_qT*hWnY?7E>R-x%osb_CPcCu*)qmpYRqVslI&!sP)c?R>GdMY7Ajdv zb}3~GMHI>!s^1;G_4fAu{{QFs&GXDW_kQmAo^!tE&gVSm^OZKo7|0_P&@k!X(crP* zj^G|gDhvrA0E+7gn5HHS)Ta_~bRva}!O;nT3`QM=K&m2?pt@pfaB-DxejZt~O#(n9zRM!GkaY*^QY6 z>Ss0pXnoe30D$^9I*vr~06=q`2Z07CLI#$=-wRa!wSZ~Iw5Afs^uNqOeF~Wl-O>Qn zjpd*vfkt6a@sL0CZ~mADKgw~0K=z<}0xBqF05l+y=+Fj00}@ml3;|DZBQVpZ(V>!g z!FG0LAVV@$^=r7EQmp5-Nl=LLJ-nq;dAtKG8O#NmFI#shGR9GBB^^X)t@irCUf0j=xyk<|aJYJmxVlHg;jAC^&y~YRC2? zTX;Mc21L^MSrlO3#zfn!m&!68SU9lA#3hIJ78tLBCF5B_`f_6L<^EW*S#e!Al^k*A zZowJ`WhY0myprUjI>bM`f7|@7o_tP~OjB1|k%5zRV7Z`wYeo0vfBMQZic_n%PzC*AB-+%21 zA7CzGkTtRkxCYlb6-DT0%}HUss=em7h%Fs_A(=a)Q=|BdTyUjR+BUX5B08ckzEK~W zx|d~-4*GnFUARzVRyn@uWH&J#FcE< z^Ea9CbxP}9dni0;=9*RRXH_ZluBmhL;WzD6m9*@+h3;yNDe&lEWd+PjJ6=~~X{$7C zPpU0=BwvzmJjV)NzbRE+4$jvgi;&tKkAI$y-87G|ldxZm6CqE%oV-mb-?|5$bE~dv zM#nBP&+25Zitw=9=vUg11tw}hisQ*b%Y{Fe_p&}&l=+{cr$P$@uH_gH_wr z0_!gf?R4mbZF?7O1QRuo5UrdX?yPYf&I#0?p$dG{WKTP#G7(aN5BRhepmp-v=fT3} zxWs_M8Yyh(n~SK1ky9TL^_t(jQ6V(n*F%29@aN{4wF<@MXlvyTjU8-*gSuZnRD~4_ zMo&CadzM`>HFvjjz+b#vgZ+7KOwBt|sTNA>^kd8V*3;wCFw^4#agJQo5lN57)XP7u zjduh}$nNPn5mQi6HEw5{C7bg^^I7X6Nj+2ZDOTI4Jaw(cMlm%1bAy?xw%7%2ujQaXSviQ4fBnDzVkeV)-^uZVo!?0bnh@P+fCIoJqb1-3k(R=M$cF?#Q-T{2r$_Mt z91zemB?OxJb^3+BO#c1g8`LK7Rs=fW078ulia_w8JHbE$h`WHMCPY0XfMj~Hgf3A4 zlG(VJmx=(A*)=wr0(i)WZfHekwl8P}#7`&_5`}8zjl*vsAkETH4^aAH{&yQ;Ld-NA zz{C3bdN>-v4NzuUIsIr%%)1|3*JZW>GBXH>A)F@S2~4NpQK;E~f3*yhlG=ao8O)r1 z_Kfu8GuBr9A`u<6Dz)Z`e9@OWPkLIi8EvaIN>GqRa`PGAzsMq~jIs8{SR~|aM#+o> z%-W7Yk6Hh|YL<)Zp&Sp7 z*l`DK&9)PIJ<%8$7vtf=#kzHlg`LDQ;$2pzC$eI|mcr)uB9@dQxFT9x%>#U1lkL%t zeZcZ9^HS9@NhFK;)!2zoEB%ge(Z{M<*fi-1v&UEOqD|Wo#_f{FLQu9zv`+^d{FYJ( zylV5PFim(Xt$ma*H$Jd$EKW_JE;hjhE;_M!r3zKI7avML9In$u883zV9aUzLNgPmk zOS5t#$>S4cP$BYTXIVy!9v`3U2JN192)6Pzwm-F35Rl4`gzt5<3Xe+?jCtMBVLB1h z_!PmT61Qg*`62~=Ozbt4#l2(wn9x}-_-#pz7piKi2rqfBI+@-;r}fmt<Ks z98Fj=PwodNXIC?M0`-skby(I7&c4C<%qOWP(vkYkDr9%xJj|n(_EGQ8Ti<3ztcY`c zW$YH=Uze(j1TP)JwB{WFjwIi4Xc3I$NYM+|S;@S%!rSomdVf`N)XNqA&SX2b8xlKo z0(P^=$npc+=iKBvXZ^P{8M1$Fi|I?`18 zWbvazmA$vqgYhG1RW60#R+mi!td8!L)<(`GffQ)pKGH{2~tx^b&le zxcMb+T@<*^M$~zEarbrcs%WEx{1yix&O?Wg=0`4Vc_J=+el#aJ+csM{x6p#iJiXvx zuT8Ja3D_dPslmydwyyR`7Wx5oczTS20X8eObJt|qB>#fysxW70Ha->^zdi7X*k0i> zwla9x&YpO+^YErvTwbZ&AL@N#)6v@+sU@x?1^&?@$z>hazvvz!@gp6P;_n%=#da^kA?8Ht3 zRg+2+A6+Sd#mJ+NymNvo!|e`hVl>Ufr0b+DjNGM^_XkT!cu9QNmo7OdSqmgRHgMym zY92c>q(|PHDKWgS-XcrZPIi;boRF=9$DDmu+F7hzYFg!N7?ins9CG&pDm-teByXxD0HAPa#a7dMYPPCjE4-~I;_x) zU{%%2=rx!+Hl?~vUonrP(~@|joZ=Z;zF#;xI^DS3IKw#Z$tmIeyRvt7u}{8RK&|3d z8LJvL@-}8RbyItvDL<1dEPnR3;B3LiXU4rRpBeNH-Bj%L@9pWWxOwL0aE}i0DlzQj z_=)#js<%GUdA#CV#b(kH)gmSa`ofgj(0kBW&zT;v&a}SnzE3?*x(mD2a~G{s|NM}h zURbNY7PfSq$EMlTFsn>|;(9wi9M+JO&|cuSfXt&SF}^UQs~4+7s}+5ld@X#>`+o3U zUf^Cm@I~s&uGQLAD^7Jz2~G~qBF>6!AHxqc2sJo`e+e&+i;Z)QRo>2kYoTlvc@XU? zWTm>3ddjC%j=6WcP}_v?Pf(_cgI>PN7bTrM=;_}a?kebM<=KOuzg3hX z|NL&#(o4rrj>A>ACQf(wlRe1I6|eATdNNLkyVgIbd;T>vG&i&*yA9XfbsyPP+V_z& zrU{1ka!)^_r*T%T=(PIVg}(j=$KLzB_$(@#XL8LXrZx9$)BMW(j)}OcsVb`Dw9^gp z)(T8TE!pyn)0y~lp=<2C{=6B6NzGyTH$4g`bUQqp`Zo~i zt}|j(@Qu|6+Ay|5rN83YnI|1Xl=H*;1Iq(1epCLolhvMeAn0w-O`YcZmxKLvuV-=S zKGWq5X$v{@Kuf~g$fafJV$J-vi7gXn&WCgcb$-ZRei+-3eBNI?o||_|F?Zm0oS+3? zSA9^m%j1w_j$+AL$yUi(ObX_3j&H7Tu3%1T?(t@y=1a{+_##(qlXG-Wq~Ie)Gkz?0 zEOj(rboIbMmAB@DgW0ODS0%>|osCgA&}Wxcx3VJ8D4iy4D9{y!jFOLu?m^SIYgcNFJQ5oJ%yqRXzgrd|2qs4NW=P$KB}$K0-I zME7^}20g;uc1B$rOSVdy->ag&Y?MLGKu}z&`ahYr?)&6FU+)}j|G=rsAH->fr?!+ScMKy{MGK;wKOig;p6x#8du)mGsj^qDq>1ZZkyP{9s8XK=6CK)`7?|c zjpDGT=Zf4v<&y_`LNia8*%PJNqo7#$BL5|*PYz!f>Vw|YR@irOMi%;-hFU~PkBZ2jr_55t+UJG zJzBOj$2UIx+z+F21T&Am%}hIZnKAY5-FOea*2H%}SG~U_G>3#LM_f6XUfUGgDHF0&$U7Q*bViDD#hSLl@L&a}2dhK!#wzfE< zM#TF=cguRh>uHPUlp&)^iQ30CpC@fsizX^)@;=(TIhI|I;PN}Z+v zt%Flq>T@m+LN)LMrTerA-xdPyO-g*mHHNig=J&iZY~Jg?57c&E@mW9Imz0vhXg57vH*P2($DT3tzDCVt3~&_bdWwBl6Q;-)gR& z5mc<2EuEFqz8Wz3dHcf0nfzF@%mDV~u=%xrP2g%j$57Z``uGqT(xOB%B#@b7>)&@+ zB3YMABmMwjAcpAfPJl!)Q}sFkXeH2#NM_IgHD&M=gF+{e2<~(k5{(AkD3Fe)(TD)( zLB*YhW?USe!5nq*L@J))9XxC0<{BS}0Y2uLK{FSsXz?17^)yhu0(9RMjF6f)r?6b-lGnZ}&loqjnW z*iZmJ2J!DF1RDYYv|_l@Hx3M#Af!Bw;=G`gAkDB*m+vDeLKWJ?Kkbl66-a3PhAAQ-(f2!s zhGg15F$9WPgWoZ=e_?0@lN-$~ZLuLlB{{Zb$GIam| literal 0 HcmV?d00001 diff --git a/results/25/RTX_B_cv.glmnet.RDS b/results/25/RTX_B_cv.glmnet.RDS new file mode 100644 index 0000000..d7fcce6 --- /dev/null +++ b/results/25/RTX_B_cv.glmnet.RDS @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:332b306d3737ad27524b2c860618306379b31a7efcdb79fa64c90e9710e6271c +size 23758 diff --git a/results/25/expression_cv.glmnet.RDS b/results/25/expression_cv.glmnet.RDS new file mode 100644 index 0000000..a8e0b3a --- /dev/null +++ b/results/25/expression_cv.glmnet.RDS @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d7937661811d92b89da062a87b18645b074c366480b3cfa71dccada39d120212 +size 88769 diff --git a/results/25/recount2_B_cv.glmnet.RDS b/results/25/recount2_B_cv.glmnet.RDS new file mode 100644 index 0000000..cac2cd3 --- /dev/null +++ b/results/25/recount2_B_cv.glmnet.RDS @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:479aac1618d591e773fa2b93fd14aef37d7350ece54b805389efc0b8de885bdb +size 44988