qc and de with new loc of test data

bcbio · Apr 23, 2024 · 8c9b26b · 8c9b26b
1 parent 3a7dee4
commit 8c9b26b
Show file tree

Hide file tree

Showing 7 changed files with 83 additions and 68 deletions.
diff --git a/inst/rmarkdown/templates/rnaseq/skeleton/params_de.R b/inst/rmarkdown/templates/rnaseq/skeleton/params_de.R
@@ -6,10 +6,10 @@ aim = 'short description'
 analyst = 'person in the core'
 
 # project params
-root = "../"
+root = "../.."
 date = "YYYYMMDD"
 column = "sample_type"
 subset_column = NA
-metadata_fn = "/n/data1/cores/bcbio/platform/test-data/rnaseq/GSE251845_colon_n3_tumor_normal/final/2024-04-18_samples/coldata.csv"
-counts_fn = '/n/data1/cores/bcbio/platform/test-data/rnaseq/GSE251845_colon_n3_tumor_normal/final/2024-04-18_samples/counts/tximport-counts.csv'
-basedir <- 'reports'
+metadata_fn = "/n/data1/cores/bcbio/platform/bcbioR-test-data/rnaseq/bcbio/coldata.csv"
+counts_fn = '/n/data1/cores/bcbio/platform/bcbioR-test-data/rnaseq/bcbio/tximport-counts.csv'
+basedir <- 'results'
diff --git a/inst/rmarkdown/templates/rnaseq/skeleton/params_qc.R b/inst/rmarkdown/templates/rnaseq/skeleton/params_qc.R
@@ -1,11 +1,10 @@
 # info params
-project = "Interon_rnaseq_hbc04916"
-PI = 'Donggi Paik, Ph.D.'
-experiment = '8 human cell lines from multiple origins. Treatment are 9 conditions including untreated.'
-aim = 'Determine treatment effect within cell line and in combination with TNF'
-analyst = 'Lorena Pantano, PhD'
+project = "name_hbcXXXXX"
+PI = 'person name'
+experiment = 'short description'
+aim = 'short description'
+analyst = 'person in the core'
 
 
-metadata_fn="../meta/hbc04916_metadata_bcbio.csv"
-se_object="../counts/bcbio-se.rds"
-cols_to_show=c("cell","treatment")
+metadata_fn="/n/data1/cores/bcbio/platform/bcbioR-test-data/rnaseq/bcbio/coldata.csv"
+se_object="/n/data1/cores/bcbio/platform/bcbioR-test-data/rnaseq/bcbio/bcbio-se.rds"
diff --git a/inst/rmarkdown/templates/rnaseq/skeleton/reports/DEG/DEG.Rmd b/inst/rmarkdown/templates/rnaseq/skeleton/reports/DEG/DEG.Rmd
@@ -74,7 +74,13 @@ sanitize_datatable = function(df, ...) {
 -   Comparison: `r paste0(params$subset_value, ': ', params$numerator, ' vs. ', params$denominator)`
 
 ```{r create_filenames}
-filenames = str_interp("${params$subset_value}_${params$numerator}_vs_${params$denominator}")
+
+if (!is.na(subset_value)){
+  filenames = str_interp("${params$subset_value}_${params$numerator}_vs_${params$denominator}")
+} else {
+    filenames = str_interp("${params$numerator}_vs_${params$denominator}")
+}
+
 contrasts = c(column,params$numerator,params$denominator)
 
 name_expression_fn=file.path(root,
@@ -182,7 +188,7 @@ resLFCS <- lfcShrink(de, coef=resultsNames(de)[ncol(vars)+2], type="apeglm")
 
 res <- as.data.frame(resLFCS) %>%
   rownames_to_column('gene_id') %>% left_join(rdata, by = 'gene_id') %>% 
-  relocate(gene_name) %>% rename(lfc = log2FoldChange) %>%
+  relocate(gene_name) %>% dplyr::rename(lfc = log2FoldChange) %>%
   mutate(pi = abs(lfc) * -log10(padj)) %>% arrange(-pi)
 
 res_sig <- res %>% filter(padj < 0.05) %>% arrange(padj)
@@ -228,15 +234,6 @@ universe=res %>%
   filter(!is.na(padj)) %>% pull(gene_id)
 mapping = AnnotationDbi::select(org.Hs.eg.db, universe, 'ENTREZID', 'ENSEMBL')
 
-ranks_df <- res %>% 
-  filter(padj < 0.01, !is.na(padj)) %>% # only if enough DE genes  
-  arrange((lfc)) %>% 
-  left_join(mapping, by = c("gene_id"="ENSEMBL")) %>% 
-  rename(entrez_id=ENTREZID) %>% 
-  filter(!is.na(entrez_id) & !is.na(padj))
-ranks <- ranks_df$lfc
-names(ranks) <- ranks_df$entrez_id
-
 all_in_life=list(
   msigdbr(species = "human", category = "H") %>% mutate(gs_subcat="Hallmark"),
   msigdbr(species = "human", category = "C2", subcategory = "CP:REACTOME"),
@@ -286,19 +283,27 @@ pathways_ora_all %>% sanitize_datatable()
 
 ```{r write-files}
 counts_norm=ruvset$normalizedCounts %>% as.data.frame() %>% 
-  rownames_to_column("gene_id")
-write_csv(counts_norm %>% 
-            mutate(subset = params$subset_value, 
-                   comparison = str_interp("${params$numerator}_vs_${params$denominator}")), 
-          name_expression_fn)
-write_csv(res %>% 
-            mutate(subset = params$subset_value, 
-                   comparison = str_interp("${params$numerator}_vs_${params$denominator}")), 
-          name_deg_fn)
-write_csv(pathways_long %>% 
-            mutate(subset = params$subset_value, 
-                   comparison = str_interp("${params$numerator}_vs_${params$denominator}")), 
-          name_pathways_fn)
+  rownames_to_column("gene_id") %>% 
+  mutate(comparison = str_interp("${params$numerator}_vs_${params$denominator}"))
+
+res_for_writing <- res %>% 
+  mutate(comparison = str_interp("${params$numerator}_vs_${params$denominator}"))
+
+pathways_for_writing <- pathways_long %>% 
+  mutate(comparison = str_interp("${params$numerator}_vs_${params$denominator}"))
+         
+if (!is.na(subset_value)){
+  counts_norm <- counts_norm %>% 
+    mutate(subset = params$subset_value)  
+  res_for_writing <- res_for_writing %>% 
+    mutate(subset = params$subset_value)
+  pathways_for_writing <- pathways_for_writing %>% 
+    mutate(subset = params$subset_value)
+}
+
+write_csv(counts_norm, name_expression_fn)
+write_csv(res_for_writing, name_deg_fn)
+write_csv(pathways_for_writing, name_pathways_fn)
 ```
 # R session
 

diff --git a/inst/rmarkdown/templates/rnaseq/skeleton/reports/DEG/run_markdown.R b/inst/rmarkdown/templates/rnaseq/skeleton/reports/DEG/run_markdown.R
@@ -4,7 +4,7 @@ render_de <- function(numerator, denominator, subset_value = NA,
                       params_file = '../../params_de.R'){
 
   rmarkdown::render(input = "./inst/rmarkdown/templates/rnaseq/skeleton/reports/DEG/DEG.Rmd",
-                    output_dir = "./inst/rmarkdown/templates/rnaseq/skeleton/reports/",
+                    output_dir = "./inst/rmarkdown/templates/rnaseq/skeleton/reports/DEG/",
                     output_format = "html_document",
                     output_file = ifelse(!is.na(subset_value), 
                                          paste0('DE_', subset_value, '_', numerator, '_vs_', denominator, '.html'),

diff --git a/inst/rmarkdown/templates/rnaseq/skeleton/reports/QC/QC.Rmd b/inst/rmarkdown/templates/rnaseq/skeleton/reports/QC/QC.Rmd
@@ -17,7 +17,7 @@ output:
 editor_options: 
   chunk_output_type: console
 params:
-  params_file: code/params_qc.R
+  params_file: ./inst/rmarkdown/templates/rnaseq/skeleton/params_qc.R
 ---
 
 
@@ -34,6 +34,7 @@ library(ggrepel)
 library(pheatmap)
 library(RColorBrewer)
 library(DT)
+library(pheatmap)
 ggplot2::theme_set(theme_light(base_size = 14))
 opts_chunk[["set"]](
     cache = FALSE,
@@ -113,9 +114,10 @@ sanitize_datatable = function(df, ...) {
 # Samples and metadata
 
 ```{r load_metadata}
-meta_df=read_csv(metadata_fn)
+meta_df=read_csv(metadata_fn) %>% mutate(sample = tolower(description)) %>%
+  dplyr::select(-description)
 
-ggplot(meta_df, aes(cell, fill=treatment)) +
+ggplot(meta_df, aes(sample_type, fill = sample_type)) +
   geom_bar() + ylab("") + xlab("") +
   scale_fill_brewer(palette = "Set1")
 ```
@@ -126,13 +128,13 @@ se <- readRDS(se_object) #local
 
 
 metrics <- metadata(se)$metrics %>% 
-    full_join(meta_df, by = c("sample" = "sample"))
+    full_join(meta_df , by = c("sample" = "sample"))
 
 meta_sm <- meta_df %>%
   as.data.frame() %>%
   column_to_rownames("sample")
 
-meta_sm[,cols_to_show] %>% sanitize_datatable()
+meta_sm %>% sanitize_datatable()
 
 ```
 
@@ -144,9 +146,9 @@ Here, we want to see consistency and a minimum of 20 million reads.
 
 ```{r plot_total_reads}
 metrics %>%
-    ggplot(aes(x = cell,
-               y = total_reads,
-               color = treatment )) +
+    ggplot(aes(x = sample_type,
+               y = total_reads, 
+               color = sample_type)) +
     geom_point(alpha=0.5) +
     coord_flip() +
     scale_y_continuous(name = "million reads") +
@@ -168,8 +170,9 @@ The genomic mapping rate represents the percentage of reads mapping to the refer
 ```{r plot_mapping_rate}
 metrics$mapped_reads_pct <- round(metrics$mapped_reads/metrics$total_reads*100,1)
 metrics %>%
-    ggplot(aes(x = cell, 
-               y = mapped_reads_pct, color = treatment)) +
+    ggplot(aes(x = sample_type, 
+               y = mapped_reads_pct, 
+               color = sample_type)) +
         geom_point() +
     coord_flip() +
     scale_color_brewer(palette = "Set1") +
@@ -193,8 +196,8 @@ genes_detected <- summarise(genes_detected,
                       
 metrics <- metrics %>%
   left_join(genes_detected, by = c("sample" = "name"))
-ggplot(metrics,aes(x = cell,
-           y = n_genes, color = treatment)) +
+ggplot(metrics,aes(x = sample_type,
+           y = n_genes, color = sample_type)) +
   geom_point() +
   coord_flip() +
   scale_color_brewer(palette = "Set1") +
@@ -213,7 +216,7 @@ This plot shows how complex the samples are. We expect samples with more reads t
 metrics %>% 
   ggplot(aes(x = total_reads, 
              y = n_genes,
-             color = treatment)) +
+             color = sample_type)) +
   geom_point()+
   scale_x_log10() +
   scale_color_brewer(palette = "Set1") +
@@ -227,9 +230,9 @@ Here we are looking for consistency, and exonic mapping rates around 70% or 75%
 
 ```{r plot_exonic_mapping_rate}
 metrics %>%
-  ggplot(aes(x = cell,
+  ggplot(aes(x = sample_type,
              y = exonic_rate * 100, 
-             color = treatment)) +
+             color = sample_type)) +
   geom_point() +
   ylab("Exonic rate %") + 
   ggtitle("Exonic mapping rate") + 
@@ -247,9 +250,9 @@ Here, we expect a low intronic mapping rate (≤ 15% - 20%)
 
 ```{r plot_intronic_mapping_rate}
 metrics %>%
-    ggplot(aes(x = cell,
+    ggplot(aes(x = sample_type,
                y = intronic_rate * 100, 
-               color = treatment)) +
+               color = sample_type)) +
   geom_point() +
   ylab("Intronic rate %") +
   ggtitle("Intronic mapping rate") + 
@@ -267,9 +270,9 @@ Here, we expect a low intergenic mapping rate, which is true for all samples.
 
 ```{r plot_intergenic_mapping_rate}
 metrics %>%
-    ggplot(aes(x = cell,
+    ggplot(aes(x = sample_type,
                y = intergenic_rate * 100, 
-               color = treatment)) +
+               color = sample_type)) +
         geom_point() +
     ylab("Intergenic rate %") +
     ggtitle("Intergenic mapping rate") + 
@@ -286,9 +289,9 @@ Samples should have a ribosomal RNA (rRNA) "contamination" rate below 10%
 # for some bad samples it could be > 50%
 rrna_ylim <- max(round(metrics$r_rna_rate*100, 2)) + 10
 metrics %>%
-    ggplot(aes(x = cell,
+    ggplot(aes(x = sample_type,
                y = r_rna_rate * 100, 
-               color = treatment)) +
+               color = sample_type)) +
         geom_point() +
     ylab("rRNA rate, %")+
     ylim(0, rrna_ylim) + 
@@ -303,9 +306,9 @@ There should be little bias, i.e. the values should be close to 1, or at least c
 
 ```{r plot_53_bias}
 metrics %>%
-  ggplot(aes(x = cell,
+  ggplot(aes(x = sample_type,
              y = x5_3_bias, 
-             color = treatment)) +
+             color = sample_type)) +
   geom_point() +
   ggtitle("5'-3' bias") + 
   coord_flip() +
@@ -319,7 +322,7 @@ metrics %>%
 We expect consistency in the box plots here between the samples, i.e. the distribution of counts across the genes is similar
 
 ```{r plot_counts_per_gene}
-metrics_small <- metrics %>% dplyr::select(sample, treatment)
+metrics_small <- metrics %>% dplyr::select(sample, sample_type)
 metrics_small <- left_join(sample_names, metrics_small)
 
 counts <- 
@@ -330,9 +333,9 @@ counts <-
 
 counts <- left_join(counts, metrics, by = c("name" = "sample"))
 
-ggplot(counts, aes(cell,
+ggplot(counts, aes(sample_type,
                    log2(counts+1),
-                   fill = treatment)) +
+                   fill = sample_type)) +
   geom_boxplot() + 
   scale_fill_brewer(palette = "Set1") +
   ggtitle("Counts per gene, all non-zero genes") +
@@ -355,17 +358,16 @@ raw_counts <- assays(se)[["raw"]] %>%
     filter(rowSums(.)!=0) %>%
     as.matrix()
 
-colnames(raw_counts) <- str_replace(colnames(raw_counts), "^x", "")
 vst <- vst(raw_counts)
 
 #fix samples names
 coldat_for_pca <- as.data.frame(metrics)
 rownames(coldat_for_pca) <- coldat_for_pca$sample
 coldat_for_pca <- coldat_for_pca[colnames(raw_counts),]
 pca1 <- degPCA(vst, coldat_for_pca,
-              condition = "treatment", data = T)[["plot"]]
+              condition = "sample_type", data = T)[["plot"]]
 pca2 <- degPCA(vst, coldat_for_pca,
-              condition = "cell", data = T)[["plot"]]
+              condition = "sample_type", data = T, pc1="PC3", pc2="PC4")[["plot"]]
 
 pca1 + scale_color_brewer(palette = "Set1")
 pca2 + scale_color_brewer(palette = "Set1")
@@ -377,13 +379,13 @@ variables=degCovariates(vst, coldat_for_pca)
 ```
 
 
-```{r clustering fig, fig.width = 10, fig.asp = .62, eval=FALSE}
+```{r clustering fig, fig.width = 10, fig.asp = .62}
 ## Hierarchical clustering
 
 vst_cor <- cor(vst)
 
 
-p <- pheatmap(vst_cor, annotation =coldat_for_pca[,c("cell", "treatment"), drop=F], show_rownames = T, show_colnames = T)
+p <- pheatmap(vst_cor, annotation =coldat_for_pca[,c("sample_type"), drop=F], show_rownames = T, show_colnames = T)
 p
 ```
 

diff --git a/inst/rmarkdown/templates/rnaseq/skeleton/reports/QC/run_markdown.R b/inst/rmarkdown/templates/rnaseq/skeleton/reports/QC/run_markdown.R
@@ -0,0 +1,9 @@
+library(rmarkdown)
+
+rmarkdown::render("./inst/rmarkdown/templates/rnaseq/skeleton/reports/QC/QC.Rmd",
+                  output_dir = "./inst/rmarkdown/templates/rnaseq/skeleton/reports/QC/", 
+                  clean = TRUE,
+                  output_format = "html_document",
+                  params = list(
+                    params_file = '../../params_qc.R')
+                  )
diff --git a/inst/rmarkdown/templates/rnaseq/skeleton/results/placeholder b/inst/rmarkdown/templates/rnaseq/skeleton/results/placeholder