Skip to content

Commit

Permalink
upgrade gsc_filter_cells tool (#645)
Browse files Browse the repository at this point in the history
* update path in .shed.yml

* upgrade conda packages (R 4.3.1)

* lint xml

* Refactor a ggplot function (deprecated qplot) in  R script

* Replace read.table by read.delim in R script

* Intensively lint filter_cells.R

* fix nasty bug in options statement ! = not <-  !

* update pdf test files (which changed with the new ggplot version
  • Loading branch information
drosofff authored Oct 16, 2023
1 parent 41ba543 commit fdfb8de
Show file tree
Hide file tree
Showing 11 changed files with 143 additions and 127 deletions.
2 changes: 1 addition & 1 deletion tools/gsc_filter_cells/.shed.yml
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,6 @@ long_description: |
categories:
- Transcriptomics
homepage_url: http://artbio.fr
remote_repository_url: https://github.com/ARTbio/tools-artbio/tree/master/tools/gsc_filter_cells
remote_repository_url: https://github.com/ARTbio/tools-artbio/tree/main/tools/gsc_filter_cells
toolshed:
- toolshed
262 changes: 139 additions & 123 deletions tools/gsc_filter_cells/filter_cells.R
Original file line number Diff line number Diff line change
@@ -1,96 +1,107 @@
# First step of the signature-based workflow
# Remove low quality cells below a user-fixed cutoff of
# Remove low quality cells below a user-fixed cutoff of
# percentiles or raw values of number of genes detected or
# total aligned reads

# Example of command (that generates output files) :
# Rscript filter_cells.R -f <input> --sep "/t" --absolute_genes 1700 --absolute_counts 90000 --pdfplot <plotfile.pdf> --output <filtered_cells.tsv> --output_metada <filtered_metadata.tsv>
options(show.error.messages = FALSE,
error = function() {
cat(geterrmessage(), file = stderr())
q("no", 1, FALSE)
}
)

# load packages that are provided in the conda env
options( show.error.messages=F,
error = function () { cat( geterrmessage(), file=stderr() ); q( "no", 1, F ) } )
loc <- Sys.setlocale("LC_MESSAGES", "en_US.UTF-8")
warnings()

library(optparse)
library(ggplot2)

# Arguments
option_list = list(
make_option(c("-f", "--file"), default=NA, type='character',
help="Input file that contains values to filter"),
make_option("--sep", default="\t", type='character',
help="File column separator [default : '%default' ]"),
make_option("--percentile_genes", default=0, type='integer',
help="nth Percentile of the number of genes detected by a cell distribution [default : '%default' ]"),
make_option("--percentile_counts", default=0, type='integer',
help="nth Percentile of the total counts per cell distribution [default : '%default' ]"),
make_option("--absolute_genes", default=0, type='integer',
help="Remove cells that didn't express at least this number of genes [default : '%default' ]"),
make_option("--absolute_counts", default=0, type='integer',
help="Number of transcript threshold for cell filtering [default : '%default' ]"),
make_option("--manage_cutoffs", default="intersect", type='character',
help="combine or intersect cutoffs for filtering"),
make_option("--pdfplot", type = 'character',
help="Path to pdf file of the plots"),
make_option("--output", type = 'character',
help="Path to tsv file of filtered cell data"),
make_option("--output_metada", type = 'character',
help="Path to tsv file of filtered cell metadata")
option_list <- list(
make_option(c("-f", "--file"), default = NA, type = "character",
help = "Input file that contains values to filter"),
make_option("--sep", default = "\t", type = "character",
help = "File column separator [default : '%default' ]"),
make_option("--percentile_genes", default = 0, type = "integer",
help = "nth Percentile of the number of genes detected by a cell distribution [default : '%default' ]"),
make_option("--percentile_counts", default = 0, type = "integer",
help = "nth Percentile of the total counts per cell distribution [default : '%default' ]"),
make_option("--absolute_genes", default = 0, type = "integer",
help = "Remove cells that did not express at least this number of genes [default : '%default' ]"),
make_option("--absolute_counts", default = 0, type = "integer",
help = "Number of transcript threshold for cell filtering [default : '%default' ]"),
make_option("--manage_cutoffs", default = "intersect", type = "character",
help = "combine or intersect cutoffs for filtering"),
make_option("--pdfplot", type = "character",
help = "Path to pdf file of the plots"),
make_option("--output", type = "character",
help = "Path to tsv file of filtered cell data"),
make_option("--output_metada", type = "character",
help = "Path to tsv file of filtered cell metadata")
)
opt <- parse_args(OptionParser(option_list = option_list),
args = commandArgs(trailingOnly = TRUE)
)
opt = parse_args(OptionParser(option_list=option_list), args = commandArgs(trailingOnly = TRUE))
if (opt$sep == "tab") {opt$sep = "\t"}
if (opt$sep == "comma") {opt$sep = ","}
if (opt$sep == "space") {opt$sep = " "}


# check consistency of filtering options
if ((opt$percentile_counts > 0) & (opt$absolute_counts > 0)) {
opt$percentile_counts = 0 } # since input parameters are not consistent (one or either method, not both), no filtering
# if ((opt$percentile_counts == 0) & (opt$absolute_counts == 0)) {
# opt$percentile_counts = 0 } # since input parameters are not consistent (one or either method, not both), no filtering
if ((opt$percentile_genes > 0) & (opt$absolute_genes > 0)) {
opt$percentile_genes = 0 } # since input parameters are not consistent (one or either method, not both), no filtering
# if ((opt$percentile_genes == 0) & (opt$absolute_genes == 0)) {
# opt$percentile_genes = 100 } # since input parameters are not consistent (one or either method, not both), no filtering
if (opt$sep == "tab") {
opt$sep <- "\t"
}
if (opt$sep == "comma") {
opt$sep <- ","
}
if (opt$sep == "space") {
opt$sep <- " "
}


## check consistency of filtering options

# if input parameters are not consistent (one or either method, not both), no filtering
if ((opt$percentile_counts > 0) && (opt$absolute_counts > 0)) {
opt$percentile_counts <- 0
}

# if input parameters are not consistent (one or either method, not both), no filtering
if ((opt$percentile_genes > 0) && (opt$absolute_genes > 0)) {
opt$percentile_genes <- 0
}

# Import datasets
data.counts <- read.table(
data_counts <- read.delim(
opt$file,
header = TRUE,
stringsAsFactors = F,
stringsAsFactors = FALSE,
sep = opt$sep,
check.names = FALSE,
row.names = 1
)

QC_metrics <-
data.frame(cell_id = colnames(data.counts),
nGenes = colSums(data.counts != 0), # nGenes : Number of detected genes for each cell
total_counts = colSums(data.counts), # total_counts : Total counts per cell
stringsAsFactors = F)

plot_hist <- function(mydata, variable, title, cutoff){
mybinwidth = round(max(mydata[, variable]) * 5 / 100)
mylabel = paste0("cutoff= ", cutoff)
hist_plot <- qplot(
mydata[, variable],
main = title,
xlab = variable,
geom="histogram",
binwidth = mybinwidth,
col = I("white")) +
QC_metrics <- data.frame(cell_id = colnames(data_counts),
nGenes = colSums(data_counts != 0), # nGenes is Number of detected genes for each cell
total_counts = colSums(data_counts), # total_counts is Total counts per cell
stringsAsFactors = FALSE)


plot_hist <- function(mydata, variable, title, cutoff) {
mybinwidth <- round(max(mydata[, variable]) * 5 / 100)
mylabel <- paste0("cutoff= ", cutoff)
hist_plot <- ggplot(as.data.frame(mydata[, variable]),
aes(x = mydata[, variable], colour = I("white"))) +
geom_histogram(binwidth = mybinwidth) +
labs(title = title, x = variable, y = "count") +
scale_x_continuous() +
geom_vline(xintercept = cutoff) +
annotate(geom="text",
x=cutoff + mybinwidth, y=1,
label=mylabel, color="white")
plot(hist_plot)
annotate(geom = "text",
x = cutoff + mybinwidth, y = 1,
label = mylabel, color = "white")
plot(hist_plot)
return
}

# returns the highest value such as the sum of the ordered values including this highest value
# is lower (below) than the percentile threshold (n)
percentile_cutoff <- function(n, qcmetrics, variable, plot_title, ...){
p = n / 100
percentile_threshold = quantile(qcmetrics[, variable], p)[[1]]
# returns the highest value such as the sum of the ordered values including this highest
# value is lower (below) than the percentile threshold (n)
percentile_cutoff <- function(n, qcmetrics, variable, plot_title, ...) {
p <- n / 100
percentile_threshold <- quantile(qcmetrics[, variable], p)[[1]]
plot_hist(qcmetrics,
variable,
plot_title,
Expand All @@ -103,12 +114,11 @@ pdf(file = opt$pdfplot)
# Determine thresholds based on percentile

if (opt$percentile_counts > 0) {
counts_threshold <- percentile_cutoff(
opt$percentile_counts,
QC_metrics,
"total_counts",
"Histogram of Aligned read counts per cell"
)} else {
counts_threshold <- percentile_cutoff(opt$percentile_counts,
QC_metrics,
"total_counts",
"Histogram of Aligned read counts per cell")
} else {
counts_threshold <- opt$absolute_counts
plot_hist(QC_metrics,
variable = "total_counts",
Expand All @@ -117,88 +127,94 @@ if (opt$percentile_counts > 0) {
}

if (opt$percentile_genes > 0) {

genes_threshold <- percentile_cutoff(
opt$percentile_genes,
QC_metrics,
"nGenes",
"Histogram of Number of detected genes per cell"
)} else {
genes_threshold <- percentile_cutoff(opt$percentile_genes,
QC_metrics,
"nGenes",
"Histogram of Number of detected genes per cell")
} else {
genes_threshold <- opt$absolute_genes
plot_hist(QC_metrics,
variable = "nGenes",
title = "Histogram of Number of detected genes per cell",
title = "Histogram of Number of detected genes per cell",
cutoff = genes_threshold)
}

# Filter out rows below thresholds (genes and read counts)
if (opt$manage_cutoffs == 'union'){
QC_metrics$filtered <- (QC_metrics$nGenes < genes_threshold) | (QC_metrics$total_counts < counts_threshold)
if (opt$manage_cutoffs == "union") {
QC_metrics$filtered <- (QC_metrics$nGenes < genes_threshold) | (QC_metrics$total_counts < counts_threshold)
} else {
QC_metrics$filtered <- (QC_metrics$nGenes < genes_threshold) & (QC_metrics$total_counts < counts_threshold)
QC_metrics$filtered <- (QC_metrics$nGenes < genes_threshold) & (QC_metrics$total_counts < counts_threshold)
}

## Plot the results

# Determine title from the parameter logics
if (opt$percentile_counts > 0){
part_one = paste0("Cells with aligned reads counts below the ",
opt$percentile_counts,
"th percentile of aligned read counts")} else {
part_one = paste0("Cells with aligned read counts below ",
opt$absolute_counts)
if (opt$percentile_counts > 0) {
part_one <- paste0("Cells with aligned reads counts below the ",
opt$percentile_counts,
"th percentile of aligned read counts")
} else {
part_one <- paste0("Cells with aligned read counts below ",
opt$absolute_counts)
}

if(opt$percentile_genes > 0){
part_two = paste0("with number of detected genes below the ",
opt$percentile_genes,
"th percentile of detected gene counts")} else {
part_two = paste0("with number of detected genes below ",
opt$absolute_genes)
if (opt$percentile_genes > 0) {
part_two <- paste0("with number of detected genes below the ",
opt$percentile_genes,
"th percentile of detected gene counts")
} else {
part_two <- paste0("with number of detected genes below ",
opt$absolute_genes)
}

if (opt$manage_cutoffs == "intersect") {
conjunction = " and\n" } else {
conjunction = " or\n"
conjunction <- " and\n"
} else {
conjunction <- " or\n"
}

# plot with ggplot2
ggplot(QC_metrics, aes(nGenes, total_counts, colour = filtered)) +
geom_point() + scale_y_log10() +
scale_colour_discrete(name = "",
breaks= c(FALSE, TRUE),
labels= c(paste0("Not filtered (", table(QC_metrics$filtered)[1], " cells)"),
paste0("Filtered (", table(QC_metrics$filtered)[2], " cells)"))) +
xlab("Detected genes per cell") + ylab("Aligned reads per cell (log10 scale)") +
geom_vline(xintercept = genes_threshold) + geom_hline(yintercept = counts_threshold) +
ggtitle( paste0(part_one, conjunction, part_two, "\nwere filtered out")) +
theme(plot.title = element_text(size = 8, face = "bold"))
geom_point() +
scale_y_log10() +
scale_colour_discrete(name = "",
breaks = c(FALSE, TRUE),
labels = c(paste0("Not filtered (", table(QC_metrics$filtered)[1], " cells)"),
paste0("Filtered (", table(QC_metrics$filtered)[2], " cells)"))
) +
xlab("Detected genes per cell") +
ylab("Aligned reads per cell (log10 scale)") +
geom_vline(xintercept = genes_threshold) +
geom_hline(yintercept = counts_threshold) +
ggtitle(paste0(part_one, conjunction, part_two, "\nwere filtered out")) +
theme(plot.title = element_text(size = 8, face = "bold"))

dev.off()

# Retrieve identifier of kept cells
kept.cells <- QC_metrics$cell_id[!QC_metrics$filtered]
# Retrieve identifier of kept_cells
kept_cells <- QC_metrics$cell_id[!QC_metrics$filtered]

data.counts <- data.frame(Genes=rownames(data.counts[,kept.cells]), data.counts[,kept.cells], check.names = FALSE)
data_counts <- data.frame(Genes = rownames(data_counts[, kept_cells]),
data_counts[, kept_cells],
check.names = FALSE)

# Save filtered cells
write.table(
data.counts,
# Save filtered cells
write.table(data_counts,
opt$output,
sep = "\t",
quote = F,
col.names = T,
row.names = F
quote = FALSE,
col.names = TRUE,
row.names = FALSE
)

# Add QC metrics of filtered cells to a metadata file
metadata <- QC_metrics

# Save the metadata (QC metrics) file
write.table(
metadata,
write.table(metadata,
opt$output_metada,
sep = "\t",
quote = F,
col.names = T,
row.names = F
quote = FALSE,
col.names = TRUE,
row.names = FALSE
)
6 changes: 3 additions & 3 deletions tools/gsc_filter_cells/filter_cells.xml
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
<tool id="filter_cells" name="Filter cells data" version="0.9.2">
<tool id="filter_cells" name="Filter cells data" version="4.3.1+galaxy0" profile="21.01">
<description>on total aligned reads and/or number of detected genes</description>
<requirements>
<requirement type="package" version="1.3.2=r3.3.2_0">r-optparse</requirement>
<requirement type="package" version="2.2.1=r3.3.2_0">r-ggplot2</requirement>
<requirement type="package" version="1.7.3">r-optparse</requirement>
<requirement type="package" version="3.4.4">r-ggplot2</requirement>
</requirements>
<stdio>
<exit_code range="1:" level="fatal" description="Tool exception" />
Expand Down
Binary file modified tools/gsc_filter_cells/test-data/absolute_counts-only.pdf
Binary file not shown.
Binary file modified tools/gsc_filter_cells/test-data/absolute_gene-and-counts.pdf
Binary file not shown.
Binary file modified tools/gsc_filter_cells/test-data/absolute_gene-only.pdf
Binary file not shown.
Binary file not shown.
Binary file modified tools/gsc_filter_cells/test-data/no-filtering.pdf
Binary file not shown.
Binary file modified tools/gsc_filter_cells/test-data/percentile_counts-only.pdf
Binary file not shown.
Binary file modified tools/gsc_filter_cells/test-data/percentile_gene-and-counts.pdf
Binary file not shown.
Binary file modified tools/gsc_filter_cells/test-data/percentile_gene-only.pdf
Binary file not shown.

0 comments on commit fdfb8de

Please sign in to comment.