diff --git a/tools/gsc_filter_cells/.shed.yml b/tools/gsc_filter_cells/.shed.yml index 788b0b18f..2d8fae13e 100644 --- a/tools/gsc_filter_cells/.shed.yml +++ b/tools/gsc_filter_cells/.shed.yml @@ -10,6 +10,6 @@ long_description: | categories: - Transcriptomics homepage_url: http://artbio.fr -remote_repository_url: https://github.com/ARTbio/tools-artbio/tree/master/tools/gsc_filter_cells +remote_repository_url: https://github.com/ARTbio/tools-artbio/tree/main/tools/gsc_filter_cells toolshed: - toolshed diff --git a/tools/gsc_filter_cells/filter_cells.R b/tools/gsc_filter_cells/filter_cells.R index d6d07523d..d760e5e1c 100644 --- a/tools/gsc_filter_cells/filter_cells.R +++ b/tools/gsc_filter_cells/filter_cells.R @@ -1,96 +1,107 @@ # First step of the signature-based workflow -# Remove low quality cells below a user-fixed cutoff of +# Remove low quality cells below a user-fixed cutoff of # percentiles or raw values of number of genes detected or # total aligned reads -# Example of command (that generates output files) : -# Rscript filter_cells.R -f --sep "/t" --absolute_genes 1700 --absolute_counts 90000 --pdfplot --output --output_metada +options(show.error.messages = FALSE, + error = function() { + cat(geterrmessage(), file = stderr()) + q("no", 1, FALSE) + } +) -# load packages that are provided in the conda env -options( show.error.messages=F, - error = function () { cat( geterrmessage(), file=stderr() ); q( "no", 1, F ) } ) loc <- Sys.setlocale("LC_MESSAGES", "en_US.UTF-8") warnings() + library(optparse) library(ggplot2) # Arguments -option_list = list( - make_option(c("-f", "--file"), default=NA, type='character', - help="Input file that contains values to filter"), - make_option("--sep", default="\t", type='character', - help="File column separator [default : '%default' ]"), - make_option("--percentile_genes", default=0, type='integer', - help="nth Percentile of the number of genes detected by a cell distribution [default : '%default' ]"), - make_option("--percentile_counts", default=0, type='integer', - help="nth Percentile of the total counts per cell distribution [default : '%default' ]"), - make_option("--absolute_genes", default=0, type='integer', - help="Remove cells that didn't express at least this number of genes [default : '%default' ]"), - make_option("--absolute_counts", default=0, type='integer', - help="Number of transcript threshold for cell filtering [default : '%default' ]"), - make_option("--manage_cutoffs", default="intersect", type='character', - help="combine or intersect cutoffs for filtering"), - make_option("--pdfplot", type = 'character', - help="Path to pdf file of the plots"), - make_option("--output", type = 'character', - help="Path to tsv file of filtered cell data"), - make_option("--output_metada", type = 'character', - help="Path to tsv file of filtered cell metadata") +option_list <- list( + make_option(c("-f", "--file"), default = NA, type = "character", + help = "Input file that contains values to filter"), + make_option("--sep", default = "\t", type = "character", + help = "File column separator [default : '%default' ]"), + make_option("--percentile_genes", default = 0, type = "integer", + help = "nth Percentile of the number of genes detected by a cell distribution [default : '%default' ]"), + make_option("--percentile_counts", default = 0, type = "integer", + help = "nth Percentile of the total counts per cell distribution [default : '%default' ]"), + make_option("--absolute_genes", default = 0, type = "integer", + help = "Remove cells that did not express at least this number of genes [default : '%default' ]"), + make_option("--absolute_counts", default = 0, type = "integer", + help = "Number of transcript threshold for cell filtering [default : '%default' ]"), + make_option("--manage_cutoffs", default = "intersect", type = "character", + help = "combine or intersect cutoffs for filtering"), + make_option("--pdfplot", type = "character", + help = "Path to pdf file of the plots"), + make_option("--output", type = "character", + help = "Path to tsv file of filtered cell data"), + make_option("--output_metada", type = "character", + help = "Path to tsv file of filtered cell metadata") +) +opt <- parse_args(OptionParser(option_list = option_list), + args = commandArgs(trailingOnly = TRUE) ) -opt = parse_args(OptionParser(option_list=option_list), args = commandArgs(trailingOnly = TRUE)) -if (opt$sep == "tab") {opt$sep = "\t"} -if (opt$sep == "comma") {opt$sep = ","} -if (opt$sep == "space") {opt$sep = " "} - - -# check consistency of filtering options -if ((opt$percentile_counts > 0) & (opt$absolute_counts > 0)) { - opt$percentile_counts = 0 } # since input parameters are not consistent (one or either method, not both), no filtering -# if ((opt$percentile_counts == 0) & (opt$absolute_counts == 0)) { -# opt$percentile_counts = 0 } # since input parameters are not consistent (one or either method, not both), no filtering -if ((opt$percentile_genes > 0) & (opt$absolute_genes > 0)) { - opt$percentile_genes = 0 } # since input parameters are not consistent (one or either method, not both), no filtering -# if ((opt$percentile_genes == 0) & (opt$absolute_genes == 0)) { -# opt$percentile_genes = 100 } # since input parameters are not consistent (one or either method, not both), no filtering +if (opt$sep == "tab") { + opt$sep <- "\t" +} +if (opt$sep == "comma") { + opt$sep <- "," +} +if (opt$sep == "space") { + opt$sep <- " " +} + + +## check consistency of filtering options + +# if input parameters are not consistent (one or either method, not both), no filtering +if ((opt$percentile_counts > 0) && (opt$absolute_counts > 0)) { + opt$percentile_counts <- 0 +} + +# if input parameters are not consistent (one or either method, not both), no filtering +if ((opt$percentile_genes > 0) && (opt$absolute_genes > 0)) { + opt$percentile_genes <- 0 +} # Import datasets -data.counts <- read.table( +data_counts <- read.delim( opt$file, header = TRUE, - stringsAsFactors = F, + stringsAsFactors = FALSE, sep = opt$sep, check.names = FALSE, row.names = 1 ) -QC_metrics <- - data.frame(cell_id = colnames(data.counts), - nGenes = colSums(data.counts != 0), # nGenes : Number of detected genes for each cell - total_counts = colSums(data.counts), # total_counts : Total counts per cell - stringsAsFactors = F) - -plot_hist <- function(mydata, variable, title, cutoff){ - mybinwidth = round(max(mydata[, variable]) * 5 / 100) - mylabel = paste0("cutoff= ", cutoff) - hist_plot <- qplot( - mydata[, variable], - main = title, - xlab = variable, - geom="histogram", - binwidth = mybinwidth, - col = I("white")) + +QC_metrics <- data.frame(cell_id = colnames(data_counts), + nGenes = colSums(data_counts != 0), # nGenes is Number of detected genes for each cell + total_counts = colSums(data_counts), # total_counts is Total counts per cell + stringsAsFactors = FALSE) + + +plot_hist <- function(mydata, variable, title, cutoff) { + mybinwidth <- round(max(mydata[, variable]) * 5 / 100) + mylabel <- paste0("cutoff= ", cutoff) + hist_plot <- ggplot(as.data.frame(mydata[, variable]), + aes(x = mydata[, variable], colour = I("white"))) + + geom_histogram(binwidth = mybinwidth) + + labs(title = title, x = variable, y = "count") + + scale_x_continuous() + geom_vline(xintercept = cutoff) + - annotate(geom="text", - x=cutoff + mybinwidth, y=1, - label=mylabel, color="white") - plot(hist_plot) + annotate(geom = "text", + x = cutoff + mybinwidth, y = 1, + label = mylabel, color = "white") + plot(hist_plot) + return } -# returns the highest value such as the sum of the ordered values including this highest value -# is lower (below) than the percentile threshold (n) -percentile_cutoff <- function(n, qcmetrics, variable, plot_title, ...){ - p = n / 100 - percentile_threshold = quantile(qcmetrics[, variable], p)[[1]] +# returns the highest value such as the sum of the ordered values including this highest +# value is lower (below) than the percentile threshold (n) +percentile_cutoff <- function(n, qcmetrics, variable, plot_title, ...) { + p <- n / 100 + percentile_threshold <- quantile(qcmetrics[, variable], p)[[1]] plot_hist(qcmetrics, variable, plot_title, @@ -103,12 +114,11 @@ pdf(file = opt$pdfplot) # Determine thresholds based on percentile if (opt$percentile_counts > 0) { - counts_threshold <- percentile_cutoff( - opt$percentile_counts, - QC_metrics, - "total_counts", - "Histogram of Aligned read counts per cell" - )} else { + counts_threshold <- percentile_cutoff(opt$percentile_counts, + QC_metrics, + "total_counts", + "Histogram of Aligned read counts per cell") +} else { counts_threshold <- opt$absolute_counts plot_hist(QC_metrics, variable = "total_counts", @@ -117,88 +127,94 @@ if (opt$percentile_counts > 0) { } if (opt$percentile_genes > 0) { - - genes_threshold <- percentile_cutoff( - opt$percentile_genes, - QC_metrics, - "nGenes", - "Histogram of Number of detected genes per cell" - )} else { + genes_threshold <- percentile_cutoff(opt$percentile_genes, + QC_metrics, + "nGenes", + "Histogram of Number of detected genes per cell") +} else { genes_threshold <- opt$absolute_genes plot_hist(QC_metrics, variable = "nGenes", - title = "Histogram of Number of detected genes per cell", + title = "Histogram of Number of detected genes per cell", cutoff = genes_threshold) } # Filter out rows below thresholds (genes and read counts) -if (opt$manage_cutoffs == 'union'){ - QC_metrics$filtered <- (QC_metrics$nGenes < genes_threshold) | (QC_metrics$total_counts < counts_threshold) +if (opt$manage_cutoffs == "union") { + QC_metrics$filtered <- (QC_metrics$nGenes < genes_threshold) | (QC_metrics$total_counts < counts_threshold) } else { - QC_metrics$filtered <- (QC_metrics$nGenes < genes_threshold) & (QC_metrics$total_counts < counts_threshold) + QC_metrics$filtered <- (QC_metrics$nGenes < genes_threshold) & (QC_metrics$total_counts < counts_threshold) } ## Plot the results # Determine title from the parameter logics -if (opt$percentile_counts > 0){ - part_one = paste0("Cells with aligned reads counts below the ", - opt$percentile_counts, - "th percentile of aligned read counts")} else { - part_one = paste0("Cells with aligned read counts below ", - opt$absolute_counts) +if (opt$percentile_counts > 0) { + part_one <- paste0("Cells with aligned reads counts below the ", + opt$percentile_counts, + "th percentile of aligned read counts") +} else { + part_one <- paste0("Cells with aligned read counts below ", + opt$absolute_counts) } -if(opt$percentile_genes > 0){ - part_two = paste0("with number of detected genes below the ", - opt$percentile_genes, - "th percentile of detected gene counts")} else { - part_two = paste0("with number of detected genes below ", - opt$absolute_genes) +if (opt$percentile_genes > 0) { + part_two <- paste0("with number of detected genes below the ", + opt$percentile_genes, + "th percentile of detected gene counts") +} else { + part_two <- paste0("with number of detected genes below ", + opt$absolute_genes) } + if (opt$manage_cutoffs == "intersect") { - conjunction = " and\n" } else { - conjunction = " or\n" + conjunction <- " and\n" +} else { + conjunction <- " or\n" } # plot with ggplot2 ggplot(QC_metrics, aes(nGenes, total_counts, colour = filtered)) + - geom_point() + scale_y_log10() + - scale_colour_discrete(name = "", - breaks= c(FALSE, TRUE), - labels= c(paste0("Not filtered (", table(QC_metrics$filtered)[1], " cells)"), - paste0("Filtered (", table(QC_metrics$filtered)[2], " cells)"))) + - xlab("Detected genes per cell") + ylab("Aligned reads per cell (log10 scale)") + - geom_vline(xintercept = genes_threshold) + geom_hline(yintercept = counts_threshold) + - ggtitle( paste0(part_one, conjunction, part_two, "\nwere filtered out")) + - theme(plot.title = element_text(size = 8, face = "bold")) + geom_point() + + scale_y_log10() + + scale_colour_discrete(name = "", + breaks = c(FALSE, TRUE), + labels = c(paste0("Not filtered (", table(QC_metrics$filtered)[1], " cells)"), + paste0("Filtered (", table(QC_metrics$filtered)[2], " cells)")) + ) + + xlab("Detected genes per cell") + + ylab("Aligned reads per cell (log10 scale)") + + geom_vline(xintercept = genes_threshold) + + geom_hline(yintercept = counts_threshold) + + ggtitle(paste0(part_one, conjunction, part_two, "\nwere filtered out")) + + theme(plot.title = element_text(size = 8, face = "bold")) dev.off() -# Retrieve identifier of kept cells -kept.cells <- QC_metrics$cell_id[!QC_metrics$filtered] +# Retrieve identifier of kept_cells +kept_cells <- QC_metrics$cell_id[!QC_metrics$filtered] -data.counts <- data.frame(Genes=rownames(data.counts[,kept.cells]), data.counts[,kept.cells], check.names = FALSE) +data_counts <- data.frame(Genes = rownames(data_counts[, kept_cells]), + data_counts[, kept_cells], + check.names = FALSE) -# Save filtered cells -write.table( - data.counts, +# Save filtered cells +write.table(data_counts, opt$output, sep = "\t", - quote = F, - col.names = T, - row.names = F + quote = FALSE, + col.names = TRUE, + row.names = FALSE ) # Add QC metrics of filtered cells to a metadata file metadata <- QC_metrics # Save the metadata (QC metrics) file -write.table( - metadata, +write.table(metadata, opt$output_metada, sep = "\t", - quote = F, - col.names = T, - row.names = F + quote = FALSE, + col.names = TRUE, + row.names = FALSE ) diff --git a/tools/gsc_filter_cells/filter_cells.xml b/tools/gsc_filter_cells/filter_cells.xml index 0bf5a0f8c..e02bc2dad 100644 --- a/tools/gsc_filter_cells/filter_cells.xml +++ b/tools/gsc_filter_cells/filter_cells.xml @@ -1,8 +1,8 @@ - + on total aligned reads and/or number of detected genes - r-optparse - r-ggplot2 + r-optparse + r-ggplot2 diff --git a/tools/gsc_filter_cells/test-data/absolute_counts-only.pdf b/tools/gsc_filter_cells/test-data/absolute_counts-only.pdf index 46f10f0db..7561545ed 100644 Binary files a/tools/gsc_filter_cells/test-data/absolute_counts-only.pdf and b/tools/gsc_filter_cells/test-data/absolute_counts-only.pdf differ diff --git a/tools/gsc_filter_cells/test-data/absolute_gene-and-counts.pdf b/tools/gsc_filter_cells/test-data/absolute_gene-and-counts.pdf index 458ab34bc..025f58209 100644 Binary files a/tools/gsc_filter_cells/test-data/absolute_gene-and-counts.pdf and b/tools/gsc_filter_cells/test-data/absolute_gene-and-counts.pdf differ diff --git a/tools/gsc_filter_cells/test-data/absolute_gene-only.pdf b/tools/gsc_filter_cells/test-data/absolute_gene-only.pdf index dfec7e4ad..1ad701c22 100644 Binary files a/tools/gsc_filter_cells/test-data/absolute_gene-only.pdf and b/tools/gsc_filter_cells/test-data/absolute_gene-only.pdf differ diff --git a/tools/gsc_filter_cells/test-data/intersect_percentile_gene-and-counts.pdf b/tools/gsc_filter_cells/test-data/intersect_percentile_gene-and-counts.pdf index 8034316ab..d26055787 100644 Binary files a/tools/gsc_filter_cells/test-data/intersect_percentile_gene-and-counts.pdf and b/tools/gsc_filter_cells/test-data/intersect_percentile_gene-and-counts.pdf differ diff --git a/tools/gsc_filter_cells/test-data/no-filtering.pdf b/tools/gsc_filter_cells/test-data/no-filtering.pdf index 76944743b..9a0d689bd 100644 Binary files a/tools/gsc_filter_cells/test-data/no-filtering.pdf and b/tools/gsc_filter_cells/test-data/no-filtering.pdf differ diff --git a/tools/gsc_filter_cells/test-data/percentile_counts-only.pdf b/tools/gsc_filter_cells/test-data/percentile_counts-only.pdf index 949acb35a..271d0e8ba 100644 Binary files a/tools/gsc_filter_cells/test-data/percentile_counts-only.pdf and b/tools/gsc_filter_cells/test-data/percentile_counts-only.pdf differ diff --git a/tools/gsc_filter_cells/test-data/percentile_gene-and-counts.pdf b/tools/gsc_filter_cells/test-data/percentile_gene-and-counts.pdf index 859d4fc6d..ab3cf5799 100644 Binary files a/tools/gsc_filter_cells/test-data/percentile_gene-and-counts.pdf and b/tools/gsc_filter_cells/test-data/percentile_gene-and-counts.pdf differ diff --git a/tools/gsc_filter_cells/test-data/percentile_gene-only.pdf b/tools/gsc_filter_cells/test-data/percentile_gene-only.pdf index bb10bab76..5a00735bc 100644 Binary files a/tools/gsc_filter_cells/test-data/percentile_gene-only.pdf and b/tools/gsc_filter_cells/test-data/percentile_gene-only.pdf differ