Summary of Findings:
As far as the TMB comparisons go, it doesn’t matter if we use Variant_Classification
filters or not. They are highly correlated no matter what and the same general TCGA to PBTA differences seem to persist.
Setup
# Magrittr pipe
`%>%` <- dplyr::`%>%`
source(file.path("..", "..", "tmb-compare-tcga", "util", "cdf-plot-function.R"))
dir.create("plots", showWarnings = FALSE)
Read in the TMB files
Read in PBTA data
tmb_pbta_no_filter <- data.table::fread(file.path(
"..",
"results",
"no_filter",
"pbta-snv-mutation-tmb-coding.tsv"
)) %>%
# This variable is weird when binding but we don't need it for the plot so we'll just remove it.
dplyr::select(-region_size) %>%
dplyr::filter(experimental_strategy != "Panel") %>%
dplyr::mutate(filter = "no filter")
tmb_pbta_with_filter <- data.table::fread(file.path(
"..",
"results",
"consensus",
"pbta-snv-mutation-tmb-coding.tsv"
)) %>%
# This variable is weird when binding but we don't need it for the plot so we'll just remove it.
dplyr::select(-region_size) %>%
dplyr::filter(experimental_strategy != "Panel") %>%
dplyr::mutate(filter = "filter")
tmb_pbta <- dplyr::inner_join(tmb_pbta_no_filter,
dplyr::select(tmb_pbta_with_filter, Tumor_Sample_Barcode, tmb),
by = "Tumor_Sample_Barcode",
suffix = c("_no_filter", "_filter")
)
Read in TCGA data
tmb_tcga_no_filter <- data.table::fread(file.path(
"..",
"results",
"no_filter",
"tcga-snv-mutation-tmb-coding.tsv"
)) %>%
dplyr::select(-region_size) %>%
dplyr::mutate(filter = "no filter")
tmb_tcga_with_filter <- data.table::fread(file.path(
"..",
"results",
"consensus",
"tcga-snv-mutation-tmb-coding.tsv"
)) %>%
dplyr::select(-region_size) %>%
dplyr::mutate(filter = "filter")
Join these together.
tmb_tcga <- dplyr::inner_join(tmb_tcga_no_filter,
dplyr::select(tmb_tcga_with_filter, Tumor_Sample_Barcode, tmb),
by = "Tumor_Sample_Barcode",
suffix = c("_no_filter", "_filter")
)
Does the filter change a participant’s TMB?
Plot PBTA data
cor.test(tmb_tcga$tmb_filter, tmb_tcga$tmb_no_filter)
Pearson's product-moment correlation
data: tmb_tcga$tmb_filter and tmb_tcga$tmb_no_filter
t = 4296.2, df = 316, p-value < 2.2e-16
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
0.9999893 0.9999931
sample estimates:
cor
0.9999914
tcga_cor_plot <- ggplot2::ggplot(
tmb_tcga,
ggplot2::aes(x = tmb_no_filter, y = tmb_filter, color = short_histology)
) +
ggplot2::geom_point() +
ggplot2::xlim(0, 5) +
ggplot2::ylim(0, 5) +
ggplot2::theme_classic() +
ggplot2::theme(legend.position = "none")
tcga_cor_plot

Plot TCGA data
cor.test(tmb_tcga$tmb_filter, tmb_tcga$tmb_no_filter)
Pearson's product-moment correlation
data: tmb_tcga$tmb_filter and tmb_tcga$tmb_no_filter
t = 4296.2, df = 316, p-value < 2.2e-16
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
0.9999893 0.9999931
sample estimates:
cor
0.9999914
all_data <- dplyr::bind_rows(list(tcga = tmb_tcga, pbta = tmb_pbta), .id = "dataset") %>%
dplyr::select(tmb_filter, tmb_no_filter, dataset, Tumor_Sample_Barcode, short_histology) %>%
tidyr::gather("filter", "tmb", -Tumor_Sample_Barcode, -dataset, -short_histology)
Are some histologies affected more than others?
pbta_cor_plot +
ggplot2::facet_wrap(~short_histology)

tcga_cor_plot +
ggplot2::facet_wrap(~short_histology)

Does the filter affect the TCGA-PBTA comparison change?
all_data <- dplyr::bind_rows(list(tcga = tmb_tcga, pbta = tmb_pbta), .id = "dataset") %>%
dplyr::select(tmb_filter, tmb_no_filter, dataset, Tumor_Sample_Barcode, short_histology) %>%
tidyr::gather("filter", "tmb", -Tumor_Sample_Barcode, -dataset, -short_histology)
ggplot2::ggplot(all_data, ggplot2::aes(x = dataset, y = tmb)) +
ggforce::geom_sina() +
ggplot2::theme_classic() +
ggplot2::ylim(0, 10) +
ggplot2::theme(legend.position = "none") +
ggplot2::facet_wrap("filter")

Plot the TMB plot with no filter data
This code is directly copied from tmb-compare-tcga/tmb-compare-tcga.Rmd.
pbta_plot <- cdf_plot(
df = tmb_pbta_no_filter,
plot_title = "PBTA",
num_col = "tmb",
group_col = "short_histology",
color = "#3BC8A2",
n_group = 5,
x_lim = c(-1.2, 1.2),
y_lim = c(0, 400),
x_lab = "",
y_lab = "Coding Mutations per Mb",
breaks = c(0, 3, 10, 30, 100, 300)
) +
ggplot2::theme(
strip.text.x = ggplot2::element_text(size = 12),
plot.margin = grid::unit(c(0.5, 0, 0.6, 0.5), "cm")
)
the condition has length > 1 and only the first element will be used
tcga_plot <- cdf_plot(
df = tmb_tcga_no_filter,
plot_title = "TCGA (Adult)",
num_col = "tmb",
group_col = "short_histology",
color = "#630882",
n_group = 5,
x_lim = c(-1.2, 1.2),
y_lim = c(0, 400),
x_lab = "",
y_lab = "Coding Mutations per Mb",
breaks = c()
) +
ggplot2::theme(
axis.title.y = ggplot2::element_blank(),
axis.text.y = ggplot2::element_blank(),
axis.ticks.y = ggplot2::element_blank(),
strip.text.x = ggplot2::element_text(size = 9),
plot.margin = grid::unit(c(0.5, 1, 0.1, 0), "cm")
)
the condition has length > 1 and only the first element will be used
# Put the plots together
tmb_plot <- cowplot::plot_grid(pbta_plot, tcga_plot,
align = "v",
axis = "left",
rel_widths = c(2.5, 1),
label_size = 12
)
Removed 3 rows containing missing values (geom_point).Removed 1 rows containing missing values (geom_point).
# Save the plot to a png
cowplot::save_plot(file.path("plots", "no_filter_tmb-cdf-pbta-tcga.png"),
plot = tmb_plot, base_width = 35, base_height = 20, unit = "cm"
)
Print from png
No filter TMB Plot

With filter TMB Plot
Session Info
sessionInfo()
R version 3.6.0 (2019-04-26)
Platform: x86_64-pc-linux-gnu (64-bit)
Running under: Debian GNU/Linux 9 (stretch)
Matrix products: default
BLAS/LAPACK: /usr/lib/libopenblasp-r0.2.19.so
locale:
[1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C LC_TIME=en_US.UTF-8
[4] LC_COLLATE=en_US.UTF-8 LC_MONETARY=en_US.UTF-8 LC_MESSAGES=C
[7] LC_PAPER=en_US.UTF-8 LC_NAME=C LC_ADDRESS=C
[10] LC_TELEPHONE=C LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
attached base packages:
[1] stats graphics grDevices utils datasets methods base
loaded via a namespace (and not attached):
[1] Rcpp_1.0.1 pillar_1.4.2 compiler_3.6.0 base64enc_0.1-3 tools_3.6.0
[6] digest_0.6.20 jsonlite_1.6 evaluate_0.14 tibble_2.1.3 gtable_0.3.0
[11] pkgconfig_2.0.2 rlang_0.4.0 cli_1.1.0 rstudioapi_0.10 yaml_2.2.0
[16] xfun_0.8 withr_2.1.2 styler_1.1.1 stringr_1.4.0 dplyr_0.8.3
[21] knitr_1.23 cowplot_0.9.4 grid_3.6.0 tidyselect_0.2.5 glue_1.3.1
[26] data.table_1.12.2 R6_2.4.0 rmarkdown_1.13 polyclip_1.10-0 rematch2_2.0.1
[31] ggplot2_3.2.0 purrr_0.3.2 tidyr_0.8.3 tweenr_1.0.1 farver_1.1.0
[36] magrittr_1.5 backports_1.1.4 scales_1.0.0 htmltools_0.3.6 MASS_7.3-51.4
[41] assertthat_0.2.1 ggforce_0.2.2 colorspace_1.4-1 labeling_0.3 stringi_1.4.3
[46] lazyeval_0.2.2 munsell_0.5.0 crayon_1.3.4
---
title: "Explore impact of Non Synonymous Filters"
output: 
  html_notebook:
    toc: TRUE
    toc_float: TRUE
author: C. Savonen for ALSF CCDL
date: 2020
---

### Summary of Findings:

As far as the TMB comparisons go, it doesn't matter if we use `Variant_Classification` filters or not. 
They are highly correlated no matter what and the same general TCGA to PBTA differences seem to persist. 

### Usage

On AWS, once both `run_caller_consensus_analysis-tcga.sh` and `run_caller_consensus_analysis-pbta.sh` have been run, you can run the following command to reproduce this analysis.

```
# bash run_explorations.sh
```

<!-- START doctoc generated TOC please keep comment here to allow auto update -->
<!-- DON'T EDIT THIS SECTION, INSTEAD RE-RUN doctoc TO UPDATE -->
**Table of Contents**  *generated with [DocToc](https://github.com/thlorenz/doctoc)*
- [Setup](#setup)
- [Read in the TMB files](#read-in-the-tmb-files)
  - [Read in PBTA data](#read-in-pbta-data)
  - [Read in TCGA data](#read-in-tcga-data)
- [Does the filter change a participant's TMB?](#does-the-filter-change-a-participants-tmb)
  - [Plot PBTA data](#plot-pbta-data)
  - [Plot TCGA data](#plot-tcga-data)
- [Are some histologies affected more than others?](#are-some-histologies-affected-more-than-others)
- [Does the filter affect the TCGA-PBTA comparison change?](#does-the-filter-affect-the-tcga-pbta-comparison-change)
- [Plot the TMB plot with no filter data](#plot-the-tmb-plot-with-no-filter-data)
- [Session Info](#session-info)

<!-- END doctoc generated TOC please keep comment here to allow auto update -->

## Setup

```{r}
# Magrittr pipe: bind `%>%` in this session from dplyr's export so the chunks
# below can chain calls while every other function is reached through `pkg::`.
`%>%` <- dplyr::`%>%`
```

```{r}
# Run the helper script that lives in the sibling `tmb-compare-tcga` analysis.
# It presumably defines `cdf_plot()`, which is called further down — confirm.
source(
  file.path("..", "..", "tmb-compare-tcga", "util", "cdf-plot-function.R")
)
```

```{r}
# Make sure the output folder for figures exists. Checking first (instead of
# silencing warnings) means a genuine failure to create the folder is still
# reported rather than hidden.
if (!dir.exists("plots")) {
  dir.create("plots")
}
```

## Read in the TMB files

### Read in PBTA data

```{r}
# Read the PBTA coding TMB table from the `no_filter` results folder and
# prepare it for the comparisons below.
tmb_pbta_no_filter <- file.path(
  "..", "results", "no_filter", "pbta-snv-mutation-tmb-coding.tsv"
) %>%
  data.table::fread() %>%
  # `region_size` misbehaves when the tables are bound together and no plot
  # uses it, so it is dropped here.
  dplyr::select(-region_size) %>%
  # Exclude rows whose `experimental_strategy` is "Panel".
  dplyr::filter(experimental_strategy != "Panel") %>%
  # Record which filter setting this table represents.
  dplyr::mutate(filter = "no filter")
```

```{r}
# Read the PBTA coding TMB table from the `consensus` results folder (the
# filtered counterpart of the `no_filter` table) and prepare it the same way.
tmb_pbta_with_filter <- file.path(
  "..", "results", "consensus", "pbta-snv-mutation-tmb-coding.tsv"
) %>%
  data.table::fread() %>%
  # `region_size` misbehaves when the tables are bound together and no plot
  # uses it, so it is dropped here.
  dplyr::select(-region_size) %>%
  # Exclude rows whose `experimental_strategy` is "Panel".
  dplyr::filter(experimental_strategy != "Panel") %>%
  # Record which filter setting this table represents.
  dplyr::mutate(filter = "filter")
```

```{r}
# Pair each PBTA sample's two TMB values. Only `Tumor_Sample_Barcode` and `tmb`
# are taken from the filtered table, so `tmb` is the single clashing column and
# the suffixes turn it into `tmb_no_filter` and `tmb_filter`. Samples missing
# from either table are dropped by the inner join.
tmb_pbta <- tmb_pbta_no_filter %>%
  dplyr::inner_join(
    y = dplyr::select(tmb_pbta_with_filter, Tumor_Sample_Barcode, tmb),
    by = "Tumor_Sample_Barcode",
    suffix = c("_no_filter", "_filter")
  )
```

### Read in TCGA data

```{r}
# Read the TCGA coding TMB table from the `no_filter` results folder.
tmb_tcga_no_filter <- file.path(
  "..", "results", "no_filter", "tcga-snv-mutation-tmb-coding.tsv"
) %>%
  data.table::fread() %>%
  # `region_size` is not needed downstream, so it is removed.
  dplyr::select(-region_size) %>%
  # Record which filter setting this table represents.
  dplyr::mutate(filter = "no filter")
```

```{r}
# Read the TCGA coding TMB table from the `consensus` results folder (the
# filtered counterpart of the `no_filter` table).
tmb_tcga_with_filter <- file.path(
  "..", "results", "consensus", "tcga-snv-mutation-tmb-coding.tsv"
) %>%
  data.table::fread() %>%
  # `region_size` is not needed downstream, so it is removed.
  dplyr::select(-region_size) %>%
  # Record which filter setting this table represents.
  dplyr::mutate(filter = "filter")
```

Join these together. 

```{r}
# Pair each TCGA sample's two TMB values. Only `Tumor_Sample_Barcode` and `tmb`
# are taken from the filtered table, so `tmb` is the single clashing column and
# the suffixes turn it into `tmb_no_filter` and `tmb_filter`. Samples missing
# from either table are dropped by the inner join.
tmb_tcga <- tmb_tcga_no_filter %>%
  dplyr::inner_join(
    y = dplyr::select(tmb_tcga_with_filter, Tumor_Sample_Barcode, tmb),
    by = "Tumor_Sample_Barcode",
    suffix = c("_no_filter", "_filter")
  )
```

## Does the filter change a participant's TMB?

### Plot PBTA data

```{r}
# Correlation test between the filtered and unfiltered TMB of the PBTA samples
# (default settings of `cor.test()`).
stats::cor.test(
  x = tmb_pbta$tmb_filter,
  y = tmb_pbta$tmb_no_filter
)
```

```{r}
# Scatter of unfiltered (x) against filtered (y) TMB for each PBTA sample,
# coloured by `short_histology`. Both axes are limited to 0-5 and the legend
# is hidden.
pbta_cor_plot <- ggplot2::ggplot(
  data = tmb_pbta,
  mapping = ggplot2::aes(
    x = tmb_no_filter,
    y = tmb_filter,
    color = short_histology
  )
) +
  ggplot2::geom_point() +
  ggplot2::xlim(0, 5) +
  ggplot2::ylim(0, 5) +
  ggplot2::theme_classic() +
  ggplot2::theme(legend.position = "none")

# Show the plot in the notebook.
pbta_cor_plot
```

### Plot TCGA data

```{r}
# Correlation test between the filtered and unfiltered TMB of the TCGA samples
# (default settings of `cor.test()`).
stats::cor.test(
  x = tmb_tcga$tmb_filter,
  y = tmb_tcga$tmb_no_filter
)
```

```{r}
# Scatter of unfiltered (x) against filtered (y) TMB for each TCGA sample,
# coloured by `short_histology`. Both axes are limited to 0-5 and the legend
# is hidden.
tcga_cor_plot <- ggplot2::ggplot(
  data = tmb_tcga,
  mapping = ggplot2::aes(
    x = tmb_no_filter,
    y = tmb_filter,
    color = short_histology
  )
) +
  ggplot2::geom_point() +
  ggplot2::xlim(0, 5) +
  ggplot2::ylim(0, 5) +
  ggplot2::theme_classic() +
  ggplot2::theme(legend.position = "none")

# Show the plot in the notebook.
tcga_cor_plot
```

## Are some histologies affected more than others? 

```{r}
# Same PBTA scatter, split into one panel per `short_histology`.
pbta_cor_plot +
  ggplot2::facet_wrap(facets = ~short_histology)
```

```{r}
# Same TCGA scatter, split into one panel per `short_histology`.
tcga_cor_plot +
  ggplot2::facet_wrap(facets = ~short_histology)
```

## Does the filter affect the TCGA-PBTA comparison change?

```{r}
# Stack the TCGA and PBTA tables (the list names become the `dataset` column),
# keep only the columns needed for plotting, and reshape to long format so each
# row holds one TMB value (`tmb`) labelled by the column it came from
# (`filter`, i.e. "tmb_filter" or "tmb_no_filter").
all_data <- list(tcga = tmb_tcga, pbta = tmb_pbta) %>%
  dplyr::bind_rows(.id = "dataset") %>%
  dplyr::select(
    tmb_filter,
    tmb_no_filter,
    dataset,
    Tumor_Sample_Barcode,
    short_histology
  ) %>%
  tidyr::gather(
    key = "filter",
    value = "tmb",
    tmb_filter, tmb_no_filter
  )
```

```{r}
# Sina plot of TMB by dataset, one panel per value of `filter`. The y axis is
# limited to 0-10.
ggplot2::ggplot(
  data = all_data,
  mapping = ggplot2::aes(x = dataset, y = tmb)
) +
  ggforce::geom_sina() +
  ggplot2::theme_classic() +
  ggplot2::ylim(0, 10) +
  ggplot2::theme(legend.position = "none") +
  ggplot2::facet_wrap(facets = "filter")
```

## Plot the TMB plot with no filter data

This code is directly copied from `tmb-compare-tcga/tmb-compare-tcga.Rmd`.

```{r}
# Build the PBTA panel from the unfiltered table with `cdf_plot()`. That helper
# is defined outside this notebook, so what each argument does is inferred from
# its name — confirm against util/cdf-plot-function.R.
pbta_plot <- cdf_plot(
  df = tmb_pbta_no_filter,
  plot_title = "PBTA",
  num_col = "tmb",
  group_col = "short_histology",
  color = "#3BC8A2",
  n_group = 5,
  x_lim = c(-1.2, 1.2),
  y_lim = c(0, 400),
  x_lab = "",
  y_lab = "Coding Mutations per Mb",
  breaks = c(0, 3, 10, 30, 100, 300)
)

# Theme tweaks applied on top: facet strip text size and plot margins (in cm).
pbta_plot <- pbta_plot +
  ggplot2::theme(
    strip.text.x = ggplot2::element_text(size = 12),
    plot.margin = grid::unit(x = c(0.5, 0, 0.6, 0.5), units = "cm")
  )
```

```{r}
# Build the TCGA panel from the unfiltered table with `cdf_plot()`. That helper
# is defined outside this notebook, so what each argument does is inferred from
# its name — confirm against util/cdf-plot-function.R. `breaks` is passed as an
# empty `c()` here, unlike the PBTA panel.
tcga_plot <- cdf_plot(
  df = tmb_tcga_no_filter,
  plot_title = "TCGA (Adult)",
  num_col = "tmb",
  group_col = "short_histology",
  color = "#630882",
  n_group = 5,
  x_lim = c(-1.2, 1.2),
  y_lim = c(0, 400),
  x_lab = "",
  y_lab = "Coding Mutations per Mb",
  breaks = c()
)

# Theme tweaks applied on top: the y-axis title, labels and ticks are blanked,
# and the facet strip text size and plot margins (in cm) are set.
tcga_plot <- tcga_plot +
  ggplot2::theme(
    axis.title.y = ggplot2::element_blank(),
    axis.text.y = ggplot2::element_blank(),
    axis.ticks.y = ggplot2::element_blank(),
    strip.text.x = ggplot2::element_text(size = 9),
    plot.margin = grid::unit(x = c(0.5, 1, 0.1, 0), units = "cm")
  )
```

```{r}
# Put the plots together: PBTA panel first, TCGA panel second, with the PBTA
# panel given 2.5x the relative width.
# `axis` takes the letter codes l/r/t/b, so the left margin is requested
# with "l".
# NOTE(review): the panels sit side by side, so `align = "h"` with
# `axis = "tb"` may have been the intent — confirm against the figure produced
# in tmb-compare-tcga before changing the alignment.
tmb_plot <- cowplot::plot_grid(
  pbta_plot, tcga_plot,
  align = "v",
  axis = "l",
  rel_widths = c(2.5, 1),
  label_size = 12
)
```

```{r}
# Save the combined plot to a png inside the `plots` folder.
# The size is given in centimetres; `units` is spelled out in full so it does
# not depend on partial argument matching when it is forwarded to `ggsave()`.
cowplot::save_plot(
  filename = file.path("plots", "no_filter_tmb-cdf-pbta-tcga.png"),
  plot = tmb_plot,
  base_width = 35,
  base_height = 20,
  units = "cm"
)
```

Print from png

### No filter TMB Plot
![](./plots/no_filter_tmb-cdf-pbta-tcga.png)

### With filter TMB Plot
![](../../tmb-compare-tcga/plots/tmb-cdf-pbta-tcga.png)

## Session Info

```{r}
# Print the R version, platform and package versions of the session that
# rendered this notebook.
sessionInfo()
```
