Summary of Findings:
As far as the TMB comparisons go, it doesn't matter whether or not we apply the Variant_Classification
filters. The filtered and unfiltered TMB values are highly correlated either way, and the same general TCGA-to-PBTA differences persist.
Setup
# Expose the magrittr pipe without attaching dplyr
`%>%` <- dplyr::`%>%`

# Run the shared plotting script from the tmb-compare-tcga analysis
# (presumably where cdf_plot(), used further down, is defined -- confirm)
cdf_plot_script <- file.path(
  "..", "..", "tmb-compare-tcga", "util", "cdf-plot-function.R"
)
source(cdf_plot_script)

# Folder for saved figures; stay quiet if it already exists
dir.create(path = "plots", showWarnings = FALSE)
Read in the TMB files
Read in PBTA data
# PBTA coding TMB table from the "no_filter" results folder
tmb_pbta_no_filter <- file.path(
  "..", "results", "no_filter", "pbta-snv-mutation-tmb-coding.tsv"
) %>%
  data.table::fread() %>%
  # Rows whose experimental_strategy is "Panel" are left out
  dplyr::filter(experimental_strategy != "Panel") %>%
  # region_size causes trouble when tables are bound together and the plots
  # never use it, so drop it
  dplyr::select(-region_size) %>%
  # Tag every row with the version of the data it came from
  dplyr::mutate(filter = "no filter")
# PBTA coding TMB table from the "consensus" results folder
tmb_pbta_with_filter <- file.path(
  "..", "results", "consensus", "pbta-snv-mutation-tmb-coding.tsv"
) %>%
  data.table::fread() %>%
  # Rows whose experimental_strategy is "Panel" are left out
  dplyr::filter(experimental_strategy != "Panel") %>%
  # region_size causes trouble when tables are bound together and the plots
  # never use it, so drop it
  dplyr::select(-region_size) %>%
  # Tag every row with the version of the data it came from
  dplyr::mutate(filter = "filter")
# Attach each sample's filtered TMB to its unfiltered row. Only samples
# present in both tables are kept, and the shared `tmb` column is split into
# tmb_no_filter / tmb_filter by the suffixes.
tmb_pbta <- tmb_pbta_with_filter %>%
  dplyr::select(Tumor_Sample_Barcode, tmb) %>%
  dplyr::inner_join(
    x = tmb_pbta_no_filter,
    y = .,
    by = "Tumor_Sample_Barcode",
    suffix = c("_no_filter", "_filter")
  )
Read in TCGA data
# TCGA coding TMB table from the "no_filter" results folder
tmb_tcga_no_filter <- file.path(
  "..", "results", "no_filter", "tcga-snv-mutation-tmb-coding.tsv"
) %>%
  data.table::fread() %>%
  # region_size is not needed for any of the plots
  dplyr::select(-region_size) %>%
  # Tag every row with the version of the data it came from
  dplyr::mutate(filter = "no filter")
# TCGA coding TMB table from the "consensus" results folder
tmb_tcga_with_filter <- file.path(
  "..", "results", "consensus", "tcga-snv-mutation-tmb-coding.tsv"
) %>%
  data.table::fread() %>%
  # region_size is not needed for any of the plots
  dplyr::select(-region_size) %>%
  # Tag every row with the version of the data it came from
  dplyr::mutate(filter = "filter")
Join these together.
# Attach each sample's filtered TMB to its unfiltered row. Only samples
# present in both tables are kept, and the shared `tmb` column is split into
# tmb_no_filter / tmb_filter by the suffixes.
tmb_tcga <- tmb_tcga_with_filter %>%
  dplyr::select(Tumor_Sample_Barcode, tmb) %>%
  dplyr::inner_join(
    x = tmb_tcga_no_filter,
    y = .,
    by = "Tumor_Sample_Barcode",
    suffix = c("_no_filter", "_filter")
  )
Does the filter change a participant’s TMB?
Plot PBTA data
# Correlation between filtered and unfiltered TMB across PBTA samples.
# This is the PBTA section, so the PBTA table is tested here; the TCGA table
# has its own test in the TCGA section.
# NOTE(review): printed results captured from a TCGA-based run of this line
# need to be regenerated.
cor.test(tmb_pbta$tmb_filter, tmb_pbta$tmb_no_filter)
Pearson's product-moment correlation
data: tmb_tcga$tmb_filter and tmb_tcga$tmb_no_filter
t = 4296.2, df = 316, p-value < 2.2e-16
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
0.9999893 0.9999931
sample estimates:
cor
0.9999914
# Scatter of unfiltered (x) against filtered (y) TMB, one point per row,
# colored by short_histology with the legend hidden. Both axes are limited to
# 0-5.
#
# tmb_df: a joined table with tmb_no_filter, tmb_filter and short_histology
#         columns (tmb_pbta or tmb_tcga).
# Returns the ggplot object without printing it.
tmb_cor_scatter <- function(tmb_df) {
  ggplot2::ggplot(
    tmb_df,
    ggplot2::aes(x = tmb_no_filter, y = tmb_filter, color = short_histology)
  ) +
    ggplot2::geom_point() +
    ggplot2::xlim(0, 5) +
    ggplot2::ylim(0, 5) +
    ggplot2::theme_classic() +
    ggplot2::theme(legend.position = "none")
}

# This is the PBTA section, so build and show the PBTA plot; the
# per-histology facets further down need pbta_cor_plot as well.
pbta_cor_plot <- tmb_cor_scatter(tmb_pbta)
pbta_cor_plot

# Keep the TCGA plot defined too, because the TCGA per-histology facets
# further down use it.
tcga_cor_plot <- tmb_cor_scatter(tmb_tcga)

Plot TCGA data
# Pearson correlation between filtered and unfiltered TMB across TCGA samples
cor.test(
  x = tmb_tcga$tmb_filter,
  y = tmb_tcga$tmb_no_filter
)
Pearson's product-moment correlation
data: tmb_tcga$tmb_filter and tmb_tcga$tmb_no_filter
t = 4296.2, df = 316, p-value < 2.2e-16
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
0.9999893 0.9999931
sample estimates:
cor
0.9999914
# Show the TCGA unfiltered-vs-filtered TMB scatter in the TCGA section
tcga_cor_plot

# Stack the TCGA and PBTA tables (origin recorded in `dataset`) and reshape to
# long format: `filter` holds the source column name (tmb_filter or
# tmb_no_filter) and `tmb` holds its value. The same table is rebuilt in the
# TCGA-PBTA comparison section.
all_data <- dplyr::bind_rows(list(tcga = tmb_tcga, pbta = tmb_pbta), .id = "dataset") %>%
  dplyr::select(tmb_filter, tmb_no_filter, dataset, Tumor_Sample_Barcode, short_histology) %>%
  tidyr::gather("filter", "tmb", -Tumor_Sample_Barcode, -dataset, -short_histology)
Are some histologies affected more than others?
# PBTA scatter split into one panel per short_histology value
pbta_cor_plot +
  ggplot2::facet_wrap("short_histology")

# TCGA scatter split into one panel per short_histology value
tcga_cor_plot +
  ggplot2::facet_wrap("short_histology")

Does the filter change the TCGA-PBTA comparison?
# Stack the TCGA and PBTA tables (origin recorded in `dataset`), keep only the
# columns needed for plotting, then reshape to long format: `filter` holds the
# source column name (tmb_filter or tmb_no_filter) and `tmb` holds its value.
all_data <- dplyr::bind_rows(tcga = tmb_tcga, pbta = tmb_pbta, .id = "dataset") %>%
  dplyr::select(
    tmb_filter, tmb_no_filter, dataset, Tumor_Sample_Barcode, short_histology
  ) %>%
  tidyr::gather(key = "filter", value = "tmb", tmb_filter, tmb_no_filter)
# Sina plot of TMB for each dataset, with one panel per value of `filter`.
# The y axis is limited to 0-10.
ggplot2::ggplot(
  data = all_data,
  mapping = ggplot2::aes(x = dataset, y = tmb)
) +
  ggforce::geom_sina() +
  ggplot2::facet_wrap(~filter) +
  ggplot2::ylim(0, 10) +
  ggplot2::theme_classic() +
  ggplot2::theme(legend.position = "none")

Plot TMB using the data with no filter
This code is copied directly from tmb-compare-tcga/tmb-compare-tcga.Rmd.
# PBTA panel: cdf_plot() applied to the unfiltered PBTA table, grouped by
# short_histology, followed by strip-text and margin adjustments.
pbta_plot <- cdf_plot(
  df = tmb_pbta_no_filter,
  num_col = "tmb",
  group_col = "short_histology",
  plot_title = "PBTA",
  color = "#3BC8A2",
  n_group = 5,
  x_lim = c(-1.2, 1.2),
  y_lim = c(0, 400),
  breaks = c(0, 3, 10, 30, 100, 300),
  x_lab = "",
  y_lab = "Coding Mutations per Mb"
) +
  ggplot2::theme(
    plot.margin = grid::unit(x = c(0.5, 0, 0.6, 0.5), units = "cm"),
    strip.text.x = ggplot2::element_text(size = 12)
  )
the condition has length > 1 and only the first element will be used
# TCGA panel: cdf_plot() applied to the unfiltered TCGA table, grouped by
# short_histology. The y-axis title, labels and ticks are blanked out; this
# panel is placed to the right of the PBTA panel when the two are combined.
tcga_plot <- cdf_plot(
  df = tmb_tcga_no_filter,
  num_col = "tmb",
  group_col = "short_histology",
  plot_title = "TCGA (Adult)",
  color = "#630882",
  n_group = 5,
  x_lim = c(-1.2, 1.2),
  y_lim = c(0, 400),
  breaks = c(),
  x_lab = "",
  y_lab = "Coding Mutations per Mb"
) +
  ggplot2::theme(
    plot.margin = grid::unit(x = c(0.5, 1, 0.1, 0), units = "cm"),
    strip.text.x = ggplot2::element_text(size = 9),
    axis.ticks.y = ggplot2::element_blank(),
    axis.text.y = ggplot2::element_blank(),
    axis.title.y = ggplot2::element_blank()
  )
the condition has length > 1 and only the first element will be used
# Combine the PBTA and TCGA panels into one figure, giving the PBTA panel
# 2.5 times the width of the TCGA panel
tmb_plot <- cowplot::plot_grid(
  plotlist = list(pbta_plot, tcga_plot),
  rel_widths = c(2.5, 1),
  align = "v",
  axis = "left",
  label_size = 12
)
Removed 3 rows containing missing values (geom_point).Removed 1 rows containing missing values (geom_point).
# Save the combined figure as a 35 x 20 cm PNG in the plots folder.
# `units` is written out in full rather than relying on partial matching of
# the abbreviated `unit` when the argument is passed on to ggsave().
cowplot::save_plot(
  filename = file.path("plots", "no_filter_tmb-cdf-pbta-tcga.png"),
  plot = tmb_plot,
  base_width = 35,
  base_height = 20,
  units = "cm"
)
Print from png
No filter TMB Plot

With filter TMB Plot
## Session Info
sessionInfo()
R version 3.6.0 (2019-04-26)
Platform: x86_64-pc-linux-gnu (64-bit)
Running under: Debian GNU/Linux 9 (stretch)
Matrix products: default
BLAS/LAPACK: /usr/lib/libopenblasp-r0.2.19.so
locale:
[1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C LC_TIME=en_US.UTF-8
[4] LC_COLLATE=en_US.UTF-8 LC_MONETARY=en_US.UTF-8 LC_MESSAGES=C
[7] LC_PAPER=en_US.UTF-8 LC_NAME=C LC_ADDRESS=C
[10] LC_TELEPHONE=C LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
attached base packages:
[1] stats graphics grDevices utils datasets methods base
loaded via a namespace (and not attached):
[1] Rcpp_1.0.1 pillar_1.4.2 compiler_3.6.0 base64enc_0.1-3 tools_3.6.0
[6] digest_0.6.20 jsonlite_1.6 evaluate_0.14 tibble_2.1.3 gtable_0.3.0
[11] pkgconfig_2.0.2 rlang_0.4.0 cli_1.1.0 rstudioapi_0.10 yaml_2.2.0
[16] xfun_0.8 withr_2.1.2 styler_1.1.1 stringr_1.4.0 dplyr_0.8.3
[21] knitr_1.23 cowplot_0.9.4 grid_3.6.0 tidyselect_0.2.5 glue_1.3.1
[26] data.table_1.12.2 R6_2.4.0 rmarkdown_1.13 polyclip_1.10-0 rematch2_2.0.1
[31] ggplot2_3.2.0 purrr_0.3.2 tidyr_0.8.3 tweenr_1.0.1 farver_1.1.0
[36] magrittr_1.5 backports_1.1.4 scales_1.0.0 htmltools_0.3.6 MASS_7.3-51.4
[41] assertthat_0.2.1 ggforce_0.2.2 colorspace_1.4-1 labeling_0.3 stringi_1.4.3
[46] lazyeval_0.2.2 munsell_0.5.0 crayon_1.3.4
