diff --git a/tools/gsc_filter_cells/.shed.yml b/tools/gsc_filter_cells/.shed.yml
index 788b0b18f..2d8fae13e 100644
--- a/tools/gsc_filter_cells/.shed.yml
+++ b/tools/gsc_filter_cells/.shed.yml
@@ -10,6 +10,6 @@ long_description: |
categories:
- Transcriptomics
homepage_url: http://artbio.fr
-remote_repository_url: https://github.com/ARTbio/tools-artbio/tree/master/tools/gsc_filter_cells
+remote_repository_url: https://github.com/ARTbio/tools-artbio/tree/main/tools/gsc_filter_cells
toolshed:
- toolshed
diff --git a/tools/gsc_filter_cells/filter_cells.R b/tools/gsc_filter_cells/filter_cells.R
index d6d07523d..d760e5e1c 100644
--- a/tools/gsc_filter_cells/filter_cells.R
+++ b/tools/gsc_filter_cells/filter_cells.R
@@ -1,96 +1,107 @@
# First step of the signature-based workflow
-# Remove low quality cells below a user-fixed cutoff of
+# Remove low quality cells below a user-fixed cutoff of
# percentiles or raw values of number of genes detected or
# total aligned reads
-# Example of command (that generates output files) :
-# Rscript filter_cells.R -f --sep "/t" --absolute_genes 1700 --absolute_counts 90000 --pdfplot --output --output_metada
+options(show.error.messages = FALSE,
+ error = function() {
+ cat(geterrmessage(), file = stderr())
+ q("no", 1, FALSE)
+ }
+)
-# load packages that are provided in the conda env
-options( show.error.messages=F,
- error = function () { cat( geterrmessage(), file=stderr() ); q( "no", 1, F ) } )
loc <- Sys.setlocale("LC_MESSAGES", "en_US.UTF-8")
warnings()
+
library(optparse)
library(ggplot2)
# Arguments
-option_list = list(
- make_option(c("-f", "--file"), default=NA, type='character',
- help="Input file that contains values to filter"),
- make_option("--sep", default="\t", type='character',
- help="File column separator [default : '%default' ]"),
- make_option("--percentile_genes", default=0, type='integer',
- help="nth Percentile of the number of genes detected by a cell distribution [default : '%default' ]"),
- make_option("--percentile_counts", default=0, type='integer',
- help="nth Percentile of the total counts per cell distribution [default : '%default' ]"),
- make_option("--absolute_genes", default=0, type='integer',
- help="Remove cells that didn't express at least this number of genes [default : '%default' ]"),
- make_option("--absolute_counts", default=0, type='integer',
- help="Number of transcript threshold for cell filtering [default : '%default' ]"),
- make_option("--manage_cutoffs", default="intersect", type='character',
- help="combine or intersect cutoffs for filtering"),
- make_option("--pdfplot", type = 'character',
- help="Path to pdf file of the plots"),
- make_option("--output", type = 'character',
- help="Path to tsv file of filtered cell data"),
- make_option("--output_metada", type = 'character',
- help="Path to tsv file of filtered cell metadata")
+option_list <- list(
+ make_option(c("-f", "--file"), default = NA, type = "character",
+ help = "Input file that contains values to filter"),
+ make_option("--sep", default = "\t", type = "character",
+ help = "File column separator [default : '%default' ]"),
+ make_option("--percentile_genes", default = 0, type = "integer",
+ help = "nth Percentile of the number of genes detected by a cell distribution [default : '%default' ]"),
+ make_option("--percentile_counts", default = 0, type = "integer",
+ help = "nth Percentile of the total counts per cell distribution [default : '%default' ]"),
+ make_option("--absolute_genes", default = 0, type = "integer",
+ help = "Remove cells that did not express at least this number of genes [default : '%default' ]"),
+ make_option("--absolute_counts", default = 0, type = "integer",
+ help = "Number of transcript threshold for cell filtering [default : '%default' ]"),
+ make_option("--manage_cutoffs", default = "intersect", type = "character",
+ help = "combine or intersect cutoffs for filtering"),
+ make_option("--pdfplot", type = "character",
+ help = "Path to pdf file of the plots"),
+ make_option("--output", type = "character",
+ help = "Path to tsv file of filtered cell data"),
+ make_option("--output_metada", type = "character",
+ help = "Path to tsv file of filtered cell metadata")
+)
+opt <- parse_args(OptionParser(option_list = option_list),
+ args = commandArgs(trailingOnly = TRUE)
)
-opt = parse_args(OptionParser(option_list=option_list), args = commandArgs(trailingOnly = TRUE))
-if (opt$sep == "tab") {opt$sep = "\t"}
-if (opt$sep == "comma") {opt$sep = ","}
-if (opt$sep == "space") {opt$sep = " "}
-
-
-# check consistency of filtering options
-if ((opt$percentile_counts > 0) & (opt$absolute_counts > 0)) {
- opt$percentile_counts = 0 } # since input parameters are not consistent (one or either method, not both), no filtering
-# if ((opt$percentile_counts == 0) & (opt$absolute_counts == 0)) {
-# opt$percentile_counts = 0 } # since input parameters are not consistent (one or either method, not both), no filtering
-if ((opt$percentile_genes > 0) & (opt$absolute_genes > 0)) {
- opt$percentile_genes = 0 } # since input parameters are not consistent (one or either method, not both), no filtering
-# if ((opt$percentile_genes == 0) & (opt$absolute_genes == 0)) {
-# opt$percentile_genes = 100 } # since input parameters are not consistent (one or either method, not both), no filtering
+if (opt$sep == "tab") {
+ opt$sep <- "\t"
+}
+if (opt$sep == "comma") {
+ opt$sep <- ","
+}
+if (opt$sep == "space") {
+ opt$sep <- " "
+}
+
+
+## check consistency of filtering options
+
+# if input parameters are not consistent (one or either method, not both), no filtering
+if ((opt$percentile_counts > 0) && (opt$absolute_counts > 0)) {
+ opt$percentile_counts <- 0
+}
+
+# if input parameters are not consistent (one or either method, not both), no filtering
+if ((opt$percentile_genes > 0) && (opt$absolute_genes > 0)) {
+ opt$percentile_genes <- 0
+}
# Import datasets
-data.counts <- read.table(
+data_counts <- read.delim(
opt$file,
header = TRUE,
- stringsAsFactors = F,
+ stringsAsFactors = FALSE,
sep = opt$sep,
check.names = FALSE,
row.names = 1
)
-QC_metrics <-
- data.frame(cell_id = colnames(data.counts),
- nGenes = colSums(data.counts != 0), # nGenes : Number of detected genes for each cell
- total_counts = colSums(data.counts), # total_counts : Total counts per cell
- stringsAsFactors = F)
-
-plot_hist <- function(mydata, variable, title, cutoff){
- mybinwidth = round(max(mydata[, variable]) * 5 / 100)
- mylabel = paste0("cutoff= ", cutoff)
- hist_plot <- qplot(
- mydata[, variable],
- main = title,
- xlab = variable,
- geom="histogram",
- binwidth = mybinwidth,
- col = I("white")) +
+QC_metrics <- data.frame(cell_id = colnames(data_counts),
+ nGenes = colSums(data_counts != 0), # nGenes is Number of detected genes for each cell
+ total_counts = colSums(data_counts), # total_counts is Total counts per cell
+ stringsAsFactors = FALSE)
+
+
+plot_hist <- function(mydata, variable, title, cutoff) {
+ mybinwidth <- round(max(mydata[, variable]) * 5 / 100)
+ mylabel <- paste0("cutoff= ", cutoff)
+ hist_plot <- ggplot(as.data.frame(mydata[, variable]),
+ aes(x = mydata[, variable], colour = I("white"))) +
+ geom_histogram(binwidth = mybinwidth) +
+ labs(title = title, x = variable, y = "count") +
+ scale_x_continuous() +
geom_vline(xintercept = cutoff) +
- annotate(geom="text",
- x=cutoff + mybinwidth, y=1,
- label=mylabel, color="white")
- plot(hist_plot)
+ annotate(geom = "text",
+ x = cutoff + mybinwidth, y = 1,
+ label = mylabel, color = "white")
+ plot(hist_plot)
+ return
}
-# returns the highest value such as the sum of the ordered values including this highest value
-# is lower (below) than the percentile threshold (n)
-percentile_cutoff <- function(n, qcmetrics, variable, plot_title, ...){
- p = n / 100
- percentile_threshold = quantile(qcmetrics[, variable], p)[[1]]
+# returns the highest value such as the sum of the ordered values including this highest
+# value is lower (below) than the percentile threshold (n)
+percentile_cutoff <- function(n, qcmetrics, variable, plot_title, ...) {
+ p <- n / 100
+ percentile_threshold <- quantile(qcmetrics[, variable], p)[[1]]
plot_hist(qcmetrics,
variable,
plot_title,
@@ -103,12 +114,11 @@ pdf(file = opt$pdfplot)
# Determine thresholds based on percentile
if (opt$percentile_counts > 0) {
- counts_threshold <- percentile_cutoff(
- opt$percentile_counts,
- QC_metrics,
- "total_counts",
- "Histogram of Aligned read counts per cell"
- )} else {
+ counts_threshold <- percentile_cutoff(opt$percentile_counts,
+ QC_metrics,
+ "total_counts",
+ "Histogram of Aligned read counts per cell")
+} else {
counts_threshold <- opt$absolute_counts
plot_hist(QC_metrics,
variable = "total_counts",
@@ -117,88 +127,94 @@ if (opt$percentile_counts > 0) {
}
if (opt$percentile_genes > 0) {
-
- genes_threshold <- percentile_cutoff(
- opt$percentile_genes,
- QC_metrics,
- "nGenes",
- "Histogram of Number of detected genes per cell"
- )} else {
+ genes_threshold <- percentile_cutoff(opt$percentile_genes,
+ QC_metrics,
+ "nGenes",
+ "Histogram of Number of detected genes per cell")
+} else {
genes_threshold <- opt$absolute_genes
plot_hist(QC_metrics,
variable = "nGenes",
- title = "Histogram of Number of detected genes per cell",
+ title = "Histogram of Number of detected genes per cell",
cutoff = genes_threshold)
}
# Filter out rows below thresholds (genes and read counts)
-if (opt$manage_cutoffs == 'union'){
- QC_metrics$filtered <- (QC_metrics$nGenes < genes_threshold) | (QC_metrics$total_counts < counts_threshold)
+if (opt$manage_cutoffs == "union") {
+ QC_metrics$filtered <- (QC_metrics$nGenes < genes_threshold) | (QC_metrics$total_counts < counts_threshold)
} else {
- QC_metrics$filtered <- (QC_metrics$nGenes < genes_threshold) & (QC_metrics$total_counts < counts_threshold)
+ QC_metrics$filtered <- (QC_metrics$nGenes < genes_threshold) & (QC_metrics$total_counts < counts_threshold)
}
## Plot the results
# Determine title from the parameter logics
-if (opt$percentile_counts > 0){
- part_one = paste0("Cells with aligned reads counts below the ",
- opt$percentile_counts,
- "th percentile of aligned read counts")} else {
- part_one = paste0("Cells with aligned read counts below ",
- opt$absolute_counts)
+if (opt$percentile_counts > 0) {
+ part_one <- paste0("Cells with aligned reads counts below the ",
+ opt$percentile_counts,
+ "th percentile of aligned read counts")
+} else {
+ part_one <- paste0("Cells with aligned read counts below ",
+ opt$absolute_counts)
}
-if(opt$percentile_genes > 0){
- part_two = paste0("with number of detected genes below the ",
- opt$percentile_genes,
- "th percentile of detected gene counts")} else {
- part_two = paste0("with number of detected genes below ",
- opt$absolute_genes)
+if (opt$percentile_genes > 0) {
+ part_two <- paste0("with number of detected genes below the ",
+ opt$percentile_genes,
+ "th percentile of detected gene counts")
+} else {
+ part_two <- paste0("with number of detected genes below ",
+ opt$absolute_genes)
}
+
if (opt$manage_cutoffs == "intersect") {
- conjunction = " and\n" } else {
- conjunction = " or\n"
+ conjunction <- " and\n"
+} else {
+ conjunction <- " or\n"
}
# plot with ggplot2
ggplot(QC_metrics, aes(nGenes, total_counts, colour = filtered)) +
- geom_point() + scale_y_log10() +
- scale_colour_discrete(name = "",
- breaks= c(FALSE, TRUE),
- labels= c(paste0("Not filtered (", table(QC_metrics$filtered)[1], " cells)"),
- paste0("Filtered (", table(QC_metrics$filtered)[2], " cells)"))) +
- xlab("Detected genes per cell") + ylab("Aligned reads per cell (log10 scale)") +
- geom_vline(xintercept = genes_threshold) + geom_hline(yintercept = counts_threshold) +
- ggtitle( paste0(part_one, conjunction, part_two, "\nwere filtered out")) +
- theme(plot.title = element_text(size = 8, face = "bold"))
+ geom_point() +
+ scale_y_log10() +
+ scale_colour_discrete(name = "",
+ breaks = c(FALSE, TRUE),
+ labels = c(paste0("Not filtered (", table(QC_metrics$filtered)[1], " cells)"),
+ paste0("Filtered (", table(QC_metrics$filtered)[2], " cells)"))
+ ) +
+ xlab("Detected genes per cell") +
+ ylab("Aligned reads per cell (log10 scale)") +
+ geom_vline(xintercept = genes_threshold) +
+ geom_hline(yintercept = counts_threshold) +
+ ggtitle(paste0(part_one, conjunction, part_two, "\nwere filtered out")) +
+ theme(plot.title = element_text(size = 8, face = "bold"))
dev.off()
-# Retrieve identifier of kept cells
-kept.cells <- QC_metrics$cell_id[!QC_metrics$filtered]
+# Retrieve identifier of kept_cells
+kept_cells <- QC_metrics$cell_id[!QC_metrics$filtered]
-data.counts <- data.frame(Genes=rownames(data.counts[,kept.cells]), data.counts[,kept.cells], check.names = FALSE)
+data_counts <- data.frame(Genes = rownames(data_counts[, kept_cells]),
+ data_counts[, kept_cells],
+ check.names = FALSE)
-# Save filtered cells
-write.table(
- data.counts,
+# Save filtered cells
+write.table(data_counts,
opt$output,
sep = "\t",
- quote = F,
- col.names = T,
- row.names = F
+ quote = FALSE,
+ col.names = TRUE,
+ row.names = FALSE
)
# Add QC metrics of filtered cells to a metadata file
metadata <- QC_metrics
# Save the metadata (QC metrics) file
-write.table(
- metadata,
+write.table(metadata,
opt$output_metada,
sep = "\t",
- quote = F,
- col.names = T,
- row.names = F
+ quote = FALSE,
+ col.names = TRUE,
+ row.names = FALSE
)
diff --git a/tools/gsc_filter_cells/filter_cells.xml b/tools/gsc_filter_cells/filter_cells.xml
index 0bf5a0f8c..e02bc2dad 100644
--- a/tools/gsc_filter_cells/filter_cells.xml
+++ b/tools/gsc_filter_cells/filter_cells.xml
@@ -1,8 +1,8 @@
-
+
on total aligned reads and/or number of detected genes
- r-optparse
- r-ggplot2
+ r-optparse
+ r-ggplot2
diff --git a/tools/gsc_filter_cells/test-data/absolute_counts-only.pdf b/tools/gsc_filter_cells/test-data/absolute_counts-only.pdf
index 46f10f0db..7561545ed 100644
Binary files a/tools/gsc_filter_cells/test-data/absolute_counts-only.pdf and b/tools/gsc_filter_cells/test-data/absolute_counts-only.pdf differ
diff --git a/tools/gsc_filter_cells/test-data/absolute_gene-and-counts.pdf b/tools/gsc_filter_cells/test-data/absolute_gene-and-counts.pdf
index 458ab34bc..025f58209 100644
Binary files a/tools/gsc_filter_cells/test-data/absolute_gene-and-counts.pdf and b/tools/gsc_filter_cells/test-data/absolute_gene-and-counts.pdf differ
diff --git a/tools/gsc_filter_cells/test-data/absolute_gene-only.pdf b/tools/gsc_filter_cells/test-data/absolute_gene-only.pdf
index dfec7e4ad..1ad701c22 100644
Binary files a/tools/gsc_filter_cells/test-data/absolute_gene-only.pdf and b/tools/gsc_filter_cells/test-data/absolute_gene-only.pdf differ
diff --git a/tools/gsc_filter_cells/test-data/intersect_percentile_gene-and-counts.pdf b/tools/gsc_filter_cells/test-data/intersect_percentile_gene-and-counts.pdf
index 8034316ab..d26055787 100644
Binary files a/tools/gsc_filter_cells/test-data/intersect_percentile_gene-and-counts.pdf and b/tools/gsc_filter_cells/test-data/intersect_percentile_gene-and-counts.pdf differ
diff --git a/tools/gsc_filter_cells/test-data/no-filtering.pdf b/tools/gsc_filter_cells/test-data/no-filtering.pdf
index 76944743b..9a0d689bd 100644
Binary files a/tools/gsc_filter_cells/test-data/no-filtering.pdf and b/tools/gsc_filter_cells/test-data/no-filtering.pdf differ
diff --git a/tools/gsc_filter_cells/test-data/percentile_counts-only.pdf b/tools/gsc_filter_cells/test-data/percentile_counts-only.pdf
index 949acb35a..271d0e8ba 100644
Binary files a/tools/gsc_filter_cells/test-data/percentile_counts-only.pdf and b/tools/gsc_filter_cells/test-data/percentile_counts-only.pdf differ
diff --git a/tools/gsc_filter_cells/test-data/percentile_gene-and-counts.pdf b/tools/gsc_filter_cells/test-data/percentile_gene-and-counts.pdf
index 859d4fc6d..ab3cf5799 100644
Binary files a/tools/gsc_filter_cells/test-data/percentile_gene-and-counts.pdf and b/tools/gsc_filter_cells/test-data/percentile_gene-and-counts.pdf differ
diff --git a/tools/gsc_filter_cells/test-data/percentile_gene-only.pdf b/tools/gsc_filter_cells/test-data/percentile_gene-only.pdf
index bb10bab76..5a00735bc 100644
Binary files a/tools/gsc_filter_cells/test-data/percentile_gene-only.pdf and b/tools/gsc_filter_cells/test-data/percentile_gene-only.pdf differ