updated Fig names and added new supplementary figure in response to r…

…eviewer 3's comment
powellgenomicslab · Nov 17, 2021 · 1be9d9c · 1be9d9c
1 parent 62c6c1d
commit 1be9d9c
Show file tree

Hide file tree

Showing 26 changed files with 376 additions and 270 deletions.
diff --git a/README.md b/README.md
@@ -2,7 +2,7 @@
 
 Droplet-based single cell RNA-sequencing methods utilise microfluidics to encapsulate individual cells in water-in-oil droplets, dramatically increasing throughput compared to plate-based protocols. While encapsulating cells, droplets also capture cell-free "ambient" RNA, a complex mixture of transcripts released from damaged, stressed or dying cells. High concentrations of ambient RNA hinder identification of cell-containing droplets and degrade downstream biological interpretation, by violating the assumption that a droplet contains RNA from a single cell. The concentration of ambient RNA depends on the nature of the input cell suspension, being more abundant in freshly dissociated solid tissue and samples containing fragile cell types such as brain tissue.
 
-We developed the [dropletQC R package](https://powellgenomicslab.github.io/DropletQC/ "DropletQC R package") which utilises a novel QC metric, the nuclear fraction, to  identify droplets containing ambient RNA or damaged cells. The nuclear fraction quantifies the amount of RNA in a droplet that originated from unspliced pre-mRNA. Ambient RNA consists mostly of mature cytoplasmic mRNA and is relatively depleted of unspliced nuclear precursor mRNA. Droplets containing only ambient RNA will have a low nuclear fraction score while damaged cells, due to loss of cytoplasmic RNA, will tend to have a higher score than intact cells.
+We developed the [DropletQC R package](https://powellgenomicslab.github.io/DropletQC/ "DropletQC R package") which utilises a novel QC metric, the nuclear fraction, to  identify droplets containing ambient RNA or damaged cells. The nuclear fraction quantifies the amount of RNA in a droplet that originated from unspliced pre-mRNA. Ambient RNA consists mostly of mature cytoplasmic mRNA and is relatively depleted of unspliced nuclear precursor mRNA. Droplets containing only ambient RNA will have a low nuclear fraction score while damaged cells, due to loss of cytoplasmic RNA, will tend to have a higher score than intact cells.
 
 The schematic below illustrates how the nuclear fraction score, in combination with total UMI counts, can be used to identify empty droplets and damaged cells.
 

diff --git a/code/supplementary_figure_3.R b/code/supplementary_figure_3.R
@@ -2,112 +2,92 @@
 
 # title: Plot supplementary figures
 # author: Walter Muskovic
-# date: 2021-01-28
-# description: In this script we are going to create the supplementary figures 
-# presented in the manuscript
+# date: 2021-08-26
+# description: In this script we are going to create an additional supplementary
+# figure
 
 
 
 # Import libraries -------------------------------------------------------------
 
 suppressPackageStartupMessages({
+  library(Seurat)
   library(tidyverse)
   library(patchwork)
-  library(DropletQC)
-  #library(ggExtra)
-  #library(ggpubr)
+  library(ggpubr)
 })
 
 
 
-# Load data --------------------------------------------------------------------
+# Load data ---------------------------------------------------------------
 
+# Load expression data
 load("data_track/qc_examples.Rdata")
+pbmc <- Read10X("data/PBMC/outs/raw_feature_bc_matrix/")[,qc_examples$cell_barcode[qc_examples$sample=="PBMC"]]
+gbm <- Read10X("data/GBM/outs/raw_feature_bc_matrix/")[,qc_examples$cell_barcode[qc_examples$sample=="GBM"]]
+hl <- Read10X("data/HL/outs/raw_feature_bc_matrix/")[,qc_examples$cell_barcode[qc_examples$sample=="HL"]]
+mb <- Read10X("data/MB/outs/raw_feature_bc_matrix/")[,qc_examples$cell_barcode[qc_examples$sample=="MB"]]
 
-# Simplify unresolved clusters for plotting
-qc_examples <- qc_examples %>%
+# Get proportion of UMIs mapped to lncRNAs
+lncRNA <- data.frame(sample=c(rep("GBM", sum(qc_examples$sample=="GBM")),
+                              rep("HL", sum(qc_examples$sample=="HL")),
+                              rep("PBMC", sum(qc_examples$sample=="PBMC")),
+                              rep("MB", sum(qc_examples$sample=="MB"))),
+                     cell_barcode = c(colnames(gbm), colnames(hl), colnames(pbmc), colnames(mb)),
+                     MALAT1_NEAT1 = c(colSums(gbm[c("MALAT1","NEAT1"),]),
+                                colSums(hl[c("MALAT1","NEAT1"),]),
+                                colSums(pbmc[c("MALAT1","NEAT1"),]),
+                                colSums(mb[c("Malat1","Neat1"),])))
+
+
+# Add to qc_examples data frame
+qc_examples <- merge(qc_examples, lncRNA, by = c("cell_barcode", "sample")) %>%
+  mutate(MALAT1_NEAT1_percentage = 100*(MALAT1_NEAT1/umi_count))
+
+
+
+# Plot --------------------------------------------------------------------
+
+# Change names for plotting
+qc_examples <-
   mutate(
-    cell_type = replace(
-      cell_type,
-      cell_type == "astrocyte_unresolved_1",
-      "astrocyte_unresolved"
-    ),
-    cell_type = replace(
-      cell_type,
-      cell_type == "astrocyte_unresolved_2",
-      "astrocyte_unresolved"
-    ),
-    cell_type = replace(
-      cell_type,
-      cell_type == "neuron_unresolved_1",
-      "neuron_unresolved"
+    qc_examples,
+    flag = case_when(
+      flag == "damaged_cell" ~ "damaged cell",
+      flag == "empty_droplet" ~ "empty droplet",
+      TRUE                      ~ "cell"
     ),
-    cell_type = replace(
-      cell_type,
-      cell_type == "neuron_unresolved_2",
-      "neuron_unresolved"
-    ),
-    cell_type = replace(
-      cell_type,
-      cell_type == "neuron_unresolved_3",
-      "neuron_unresolved"
-    ),
-    cell_type = replace(
-      cell_type,
-      cell_type == "neuron_unresolved_4",
-      "neuron_unresolved"
-    ),
-    cell_type = replace(
-      cell_type,
-      cell_type == "neuron_unresolved_5",
-      "neuron_unresolved"
-    ),
-    cell_type = replace(
-      cell_type,
-      cell_type == "neuron_unresolved_6",
-      "neuron_unresolved"
+    sample = case_when(
+      sample == "GBM" ~ "Glioblastoma",
+      sample == "HL" ~ "Hodgkin's Lymphoma",
+      sample == "PBMC" ~ "PBMCs",
+      sample == "MB" ~ "Mouse brain"
     )
   )
 
-# Wrap cell type names for plotting
-qc_examples <- mutate(qc_examples,
-                      cell_type = str_replace_all(cell_type, "_", " ")) %>%
-  mutate(cell_type = str_wrap(cell_type, width = 15))
-
-
-
-# Figure S3 --------------------------------------------------------------------
-
-# Create plots showing how both the distributions of the nuclear fraction and 
-# UMI counts can be quite different for different cell types in the same sample
-
-supp3 <- function(sample_name, plot_title){
-  p1 <- ggplot(filter(qc_examples, sample==sample_name),
-              aes(x=cell_type, y=nuclear_fraction_droplet_qc)) + 
-    geom_violin()+
-    labs(title=plot_title,
-         x="Cell type", 
-         y="Nuclear fraction") + 
-    theme(axis.text.x = element_text(angle = 45, vjust = 0.5))
-
-  p2 <- ggplot(filter(qc_examples, sample==sample_name),
-               aes(x=cell_type, y=log10_umi_count)) + 
-    geom_violin()+
-    labs(title=plot_title,
-         x="Cell type", 
-         y="log10(UMI count)") + 
-    theme(axis.text.x = element_text(angle = 45, vjust = 0.5))
-  return(p1 + p2)
-}
-
-# Save out plot
-sample_names <- c("MB", "GBM", "PBMC", "HL")
-plot_titles <- c("Mouse brain", "Glioblastoma", "PBMCs", "Hodgkin's lymphoma")
-s1 <- map(1:4, function(x) supp3(sample_names[x], plot_titles[x]))
+# Plot
+p1 <- lapply(c("Mouse brain", "Hodgkin's Lymphoma", "Glioblastoma","PBMCs"), function(i){
+  filter(qc_examples, sample==i) %>%
+    ggplot(aes(x=flag, y=MALAT1_NEAT1_percentage, fill=flag)) +
+    geom_boxplot() +
+    facet_wrap(~sample) +
+    scale_fill_manual(values=c("damaged cell"="#88419d",
+                               "empty droplet"="#2166ac",
+                               "cell"="#b2182b")) +
+    theme(legend.position = "none") +
+    xlab("") +
+    ylab("MALAT1/NEAT1 % of UMIs")
+}) %>%
+ggarrange(plotlist = ., labels =letters[1:4], ncol = 2, nrow = 2)
+
+# Save out figure
 ggsave(filename = str_glue('figures/Figure_S3.png'),
-       plot = s1[[1]] / s1[[2]] / s1[[3]] / s1[[4]],
-       device = "png", width = 35, height = 30, units = "cm")
+       plot = p1,
+       device = "png",
+       width = 30, height = 20, units = "cm")
+
 ggsave(filename = str_glue('figures/Figure_S3.pdf'),
-       plot = s1[[1]] / s1[[2]] / s1[[3]] / s1[[4]],
-       device = "pdf", width = 35, height = 30, units = "cm")
-print("Finished creating supplementary figures")
+       plot = p1,
+       device = "pdf",
+       width = 30, height = 20, units = "cm")
+
diff --git a/code/supplementary_figure_4.R b/code/supplementary_figure_4.R
@@ -0,0 +1,113 @@
+# Script information -----------------------------------------------------------
+
+# title: Plot supplementary figures
+# author: Walter Muskovic
+# date: 2021-01-28
+# description: In this script we are going to create the supplementary figures 
+# presented in the manuscript
+
+
+
+# Import libraries -------------------------------------------------------------
+
+suppressPackageStartupMessages({
+  library(tidyverse)
+  library(patchwork)
+  library(DropletQC)
+  #library(ggExtra)
+  #library(ggpubr)
+})
+
+
+
+# Load data --------------------------------------------------------------------
+
+load("data_track/qc_examples.Rdata")
+
+# Simplify unresolved clusters for plotting
+qc_examples <- qc_examples %>%
+  mutate(
+    cell_type = replace(
+      cell_type,
+      cell_type == "astrocyte_unresolved_1",
+      "astrocyte_unresolved"
+    ),
+    cell_type = replace(
+      cell_type,
+      cell_type == "astrocyte_unresolved_2",
+      "astrocyte_unresolved"
+    ),
+    cell_type = replace(
+      cell_type,
+      cell_type == "neuron_unresolved_1",
+      "neuron_unresolved"
+    ),
+    cell_type = replace(
+      cell_type,
+      cell_type == "neuron_unresolved_2",
+      "neuron_unresolved"
+    ),
+    cell_type = replace(
+      cell_type,
+      cell_type == "neuron_unresolved_3",
+      "neuron_unresolved"
+    ),
+    cell_type = replace(
+      cell_type,
+      cell_type == "neuron_unresolved_4",
+      "neuron_unresolved"
+    ),
+    cell_type = replace(
+      cell_type,
+      cell_type == "neuron_unresolved_5",
+      "neuron_unresolved"
+    ),
+    cell_type = replace(
+      cell_type,
+      cell_type == "neuron_unresolved_6",
+      "neuron_unresolved"
+    )
+  )
+
+# Wrap cell type names for plotting
+qc_examples <- mutate(qc_examples,
+                      cell_type = str_replace_all(cell_type, "_", " ")) %>%
+  mutate(cell_type = str_wrap(cell_type, width = 15))
+
+
+
+# Figure S3 --------------------------------------------------------------------
+
+# Create plots showing how both the distributions of the nuclear fraction and 
+# UMI counts can be quite different for different cell types in the same sample
+
+supp3 <- function(sample_name, plot_title){
+  p1 <- ggplot(filter(qc_examples, sample==sample_name),
+              aes(x=cell_type, y=nuclear_fraction_droplet_qc)) + 
+    geom_violin()+
+    labs(title=plot_title,
+         x="Cell type", 
+         y="Nuclear fraction") + 
+    theme(axis.text.x = element_text(angle = 45, vjust = 0.5))
+
+  p2 <- ggplot(filter(qc_examples, sample==sample_name),
+               aes(x=cell_type, y=log10_umi_count)) + 
+    geom_violin()+
+    labs(title=plot_title,
+         x="Cell type", 
+         y="log10(UMI count)") + 
+    theme(axis.text.x = element_text(angle = 45, vjust = 0.5))
+  return(p1 + p2)
+}
+
+# Save out plot
+sample_names <- c("MB", "GBM", "PBMC", "HL")
+plot_titles <- c("Mouse brain", "Glioblastoma", "PBMCs", "Hodgkin's lymphoma")
+s1 <- map(1:4, function(x) supp3(sample_names[x], plot_titles[x]))
+ggsave(filename = str_glue('figures/Figure_S4.png'),
+       plot = s1[[1]] / s1[[2]] / s1[[3]] / s1[[4]],
+       device = "png", width = 35, height = 30, units = "cm")
+ggsave(filename = str_glue('figures/Figure_S4.pdf'),
+       plot = s1[[1]] / s1[[2]] / s1[[3]] / s1[[4]],
+       device = "pdf", width = 35, height = 30, units = "cm")
+print("Finished creating supplementary figures")
diff --git a/code/supplementary_figure_4.sh b/code/supplementary_figure_4.sh
@@ -6,11 +6,8 @@
 #$ -q short.q
 #$ -r yes
 #$ -l mem_requested=4G
-#$ -N supp_figure_2
+#$ -N supp_figures
 
-
-echo "Started running scripts to create supplementary figure 4 for the manuscript"
-
-qsub get_cryo_microglia_data.R
-qsub -hold_jid get_data align_cryo_microglia_data.R
-Rscript plot_cryo_microglia_data.R
+conda activate Seurat
+echo "Started running script to create supplementary figure 4 for the manuscript"
+Rscript supplementary_figure_4.R 
diff --git a/code/supplementary_figure_5.R b/code/supplementary_figure_5.R