Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

reports dryclean in read count units and input signal retained in outputs #35

Merged
merged 2 commits into from
Sep 11, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
58 changes: 27 additions & 31 deletions R/dryclean.R
Original file line number Diff line number Diff line change
Expand Up @@ -114,8 +114,7 @@ dryclean <- R6::R6Class("dryclean",

if (length(cov) != pon.length & testing == FALSE) {
dt_mismatch = data.table(chr = c(), coverage = c(), pon = c())
for(chr in c("1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12", "13", "14", "15", "16", "17", "18", "19", "20", "21", "22", "\
X")){
for(chr in c(1:22, "X")){
dt_mismatch <- rbind(dt_mismatch,
data.table(
chr = chr,
Expand All @@ -140,7 +139,7 @@ X")){
private$history <- rbindlist(list(private$history, data.table(action = paste("Started drycleaning the coverage file"), date = as.character(Sys.time()))))

if (germline.filter & is.null(private$pon$get_inf_germ())){
stop("If germiline.filter is set to TRUE, pon must have a inf_germ element, see prepare_detergent for details")
stop("If germline.filter is set to TRUE, pon must have a inf_germ element, see prepare_detergent for details")
}

all.chr = c(as.character(1:22), "X")
Expand All @@ -149,23 +148,6 @@ X")){
cov = cov %Q% (seqnames %in% local.all.chr)
cov = cov[, field] %>% gr2dt() %>% setnames(., field, "signal")
cov = cov %>% dt2gr()

if(center == TRUE){


message("Median-centering the sample")
private$history <- rbindlist(list(private$history, data.table(action = paste("Median-normalization of coverage"), date = as.character(Sys.time()))))
mcols(cov)[which(is.na(mcols(cov)[, "signal"])), "signal"] = 0
mcols(cov)[which(is.infinite(mcols(cov)[, "signal"])), "signal"] = NA
if(centering == "mean"){
values(cov)[, "signal"] = values(cov)[, "signal"] / mean(values(cov)[, "signal"], na.rm = TRUE)
}
if(centering == "median"){
values(cov)[, "signal"] = values(cov)[, "signal"] / median(values(cov)[, "signal"], na.rm = TRUE)
}
}


cov = sortSeqlevels(cov)
cov = sort(cov)

Expand Down Expand Up @@ -196,12 +178,18 @@ X")){
})

}

message(paste0(centering, "-centering the sample"))
private$history <- rbindlist(list(private$history, data.table(action = paste0(centering, "normalization of coverage"), date = as.character(Sys.time()))))


m.vec = prep_cov(cov, use.blacklist = use.blacklist, blacklist = blacklist_cov)
cov = prep_cov(cov,
use.blacklist = use.blacklist,
blacklist = blacklist_cov,
center = center,
centering = centering)


m.vec = as.matrix(m.vec$signal)
m.vec = as.matrix(cov$signal)
L.burnin = private$pon$get_L()
S.burnin = private$pon$get_S()
r = private$pon$get_k()
Expand Down Expand Up @@ -231,32 +219,39 @@ X")){
}

cov = gr2dt(cov)
cov[, median.chr := median(.SD$signal, na.rm = T), by = seqnames]


if (use.blacklist){
cov_template <- cov
cov[is.na(signal), signal := median.chr]
cov[is.infinite(signal), signal := median.chr]
cov = cov[!blacklist_cov,]
}


setnames(cov, "signal", "input.read.counts")
#' retain input counts in input.read.counts field
setnames(cov, "og.signal", "input.read.counts")

#' append foreground and fix 0/NA values
cov = cbind(decomposed[[2]], cov)
colnames(cov)[1] = 'foreground.log'
cov[is.na(input.read.counts), foreground.log := NA]
cov[, foreground := exp(foreground.log)]
cov[, foreground := exp(foreground.log) * center.all]
cov[input.read.counts == 0, foreground := 0]
cov[is.na(input.read.counts), foreground := NA]

#' append background and fix 0/NA values
cov = cbind(decomposed[[1]], cov)
colnames(cov)[1] = 'background.log'
cov[is.na(input.read.counts), background.log := NA]
cov[, background := exp(background.log) ]
cov[, background := exp(background.log) * center.all]
cov[input.read.counts == 0, background := 0]
cov[is.na(input.read.counts), background := NA]

cov[, log.reads := log(input.read.counts)]
cov[is.infinite(log.reads), log.reads := NA]


scaling.factor <- sum(cov$input.read.counts) / sum(cov$foreground)
cov$foreground <- cov$foreground * scaling.factor
cov$background <- cov$background * scaling.factor

if (germline.filter){
germ.file = private$pon$get_inf_germ()
Expand All @@ -265,7 +260,8 @@ X")){
cov[germline.status == TRUE, foreground.log := NA]
cov = na.omit(cov)
}


#browser()
cov = dt2gr(cov)

if(use.blacklist){
Expand Down
46 changes: 23 additions & 23 deletions R/helper_functions.R
Original file line number Diff line number Diff line change
Expand Up @@ -223,27 +223,43 @@ wash_cycle <- function(m.vec, L.burnin, S.burnin , r, N, U.hat, V.hat, sigma.hat
##############################
#' @name prep_cov
#'
#' @title function to prepare covearge data for decomposition
#' @title function to prepare covearge data for decomposition and perform normalization
#' @description prepares the GC corrected coverage data for decomposition
#'
#' @keywords internal
#'
#' @param m.vec GRanges object. GRanges containing GC corrected reads as a cloumn in metadata
#' @param m.vec GRanges object. GRanges containing GC corrected reads as a column in metadata
#'
#' @param blacklist, boolean (default == FALSE). Whether to exclude off-target markers in case of Exomes or targeted sequqnecing. If set to TRUE, needs a GRange marking if each marker is set to be excluded or not.
#'
#' @param burnin.samples.path, character (default = NA). Path to balcklist markers file
#'
#' @param center, character (default == FALSE). Transform the data to centered?
#'
#' @param centering, enum of "mean" or "median" ONLY (default = "mean"). Paradigm of centering.
#'
#' @return vector of length m with processed coverage data
#'
#' @author Aditya Deshpande
#' @author Aditya Deshpande / Johnathan Rafailov

prep_cov <- function(m.vec = m.vec, use.blacklist = FALSE, blacklist = NA){
prep_cov <- function(m.vec = m.vec, use.blacklist = FALSE, blacklist = NA, center = FALSE, centering = "mean"){

mcols(m.vec)$signal[ which(is.na(mcols(m.vec)$signal)) ] <- 0
mcols(m.vec)$signal[ which(is.infinite(mcols(m.vec)$signal)) ] <- NA
mcols(m.vec)$og.signal <- mcols(m.vec)$signal

if(center == T){
if(centering == "mean"){
m.vec$center.all <- mean(m.vec$signal, na.rm = T)
} else {
m.vec$center.all <- median(m.vec$signal, na.rm = T)
}
m.vec$signal <- m.vec$signal / m.vec$center.all
}

m.vec = gr2dt(m.vec)
m.vec = m.vec[, .(seqnames, signal)]
#m.vec = m.vec[, .(seqnames, start, end, og.signal, signal)]
m.vec[, median.chr := median(.SD$signal, na.rm = T), by = seqnames]
#return(m.vec)
m.vec[is.na(signal), signal := median.chr]

min.cov = min(m.vec[signal > 0]$signal, na.rm = T)
Expand All @@ -262,23 +278,9 @@ prep_cov <- function(m.vec = m.vec, use.blacklist = FALSE, blacklist = NA){

m.vec[, signal := log(signal)]

return(m.vec)
return(dt2gr(m.vec))
}


# collapse_cov <- function(cov.gr, bin.size = target_resolution, this.field = field){
# BINSIZE.ROUGH = bin.size
# cov.gr = cov.gr[, this.field]
# cov.gr = gr2dt(cov.gr)
# setnames(cov.gr, this.field, "signal")
# cov.gr = cov.gr[!is.infinite(signal), .(signal = median(signal, na.rm = TRUE)),
# by = .(seqnames, start = floor(start/BINSIZE.ROUGH)*BINSIZE.ROUGH+1)]
# cov.gr[, end := (start + BINSIZE.ROUGH) - 1]
# setnames(cov.gr, "signal", this.field)
# cov.gr = dt2gr(cov.gr)
# return(cov.gr)
# }

collapse_cov <- function(cov.gr, bin.size = target_resolution, this.field = field){
BINSIZE.ROUGH = bin.size
cov.gr = cov.gr[, this.field]
Expand All @@ -293,8 +295,6 @@ collapse_cov <- function(cov.gr, bin.size = target_resolution, this.field = fiel
return(cov.gr)
}



generate_template <- function(cov, wgs = wgs, target_resolution = target_resolution, this.field = field, nochr = TRUE, all.chr = c(as.character(1:22), "X")){
if (wgs){
inferred_resolution = median(width(cov), na.rm = T)
Expand Down
Loading